diff --git a/include/config.py b/include/config.py index 7456665e..8cd88e99 100644 --- a/include/config.py +++ b/include/config.py @@ -26,6 +26,7 @@ # instrnMem_size: (in Bytes) - 512, 1024, 2048 # Fixed parameters +addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) data_width = num_bits # (in bits) xbdata_width = data_width # (in bits) instrn_width = 48 # (in bits) diff --git a/include/constants.py b/include/constants.py index 4246f10e..85889215 100644 --- a/include/constants.py +++ b/include/constants.py @@ -214,59 +214,77 @@ dataMem_lat_dict = {'256' : 1, '512' : 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096':1} dataMem_pow_dyn_dict = {'256' : 0.16, '512' : 0.24, '1024': 0.33, - '2048': 0.57} + '2048': 0.57, + '4096': 0.57} dataMem_pow_leak_dict = {'256' : 0.044, '512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33} dataMem_area_dict = {'256' : 0.00056, '512' : 0.00108, '1024': 0.00192, - '2048': 0.00392} + '2048': 0.00392, + '4096': 0.00392} dataMem_lat_dict = {'256' : 1, '512' : 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096':1} dataMem_pow_dyn_dict = {'256' : 0.16, '512' : 0.24, '1024': 0.33, - '2048': 0.57} + '2048': 0.57, + '4096': 0.57} dataMem_pow_leak_dict = {'256' : 0.044, '512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33} dataMem_area_dict = {'256' : 0.00056, '512' : 0.00108, '1024': 0.00192, - '2048': 0.00392} + '2048': 0.00392, + '4096': 0.00392} # Instruction Memory value dictionary instrnMem_lat_dict = {'512' : 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096': 1, + '8192': 1} instrnMem_pow_dyn_dict = {'512' : 0.46, '1024': 0.53, - '2048': 0.65} + '2048': 0.65, + '4096': 0.65, + '8192': 0.65} instrnMem_pow_leak_dict = {'512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33, + '8192': 0.33} + instrnMem_area_dict = {'512' : 0.00108, '1024': 0.00192, - '2048': 0.0041} + '2048': 0.0041, + '4096': 0.0041, + '8192': 0.0041} + # Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) # for computing average power of ima - scale dyn_pow down by xbar_size @@ -382,38 +400,48 @@ # Tile component latency/pow/area # EDRAM value dictionary (counter storage is not coounted) -edram_lat_dict = {'8' :2, - '64' : 2, #edram access width is constant = 256 bits - '128': 2} - -edram_pow_dyn_dict = {'8' : 17.2/2, - '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) - '128': 25.35/2} - -edram_pow_leak_dict = {'8' : 0.46, - '64' : 0.46, - '128': 0.77} - -edram_area_dict = {'8' : 0.086, - '64' : 0.086, - '128': 0.121} +edram_lat_dict = {'8' : 2, + '64' : 2, #edram access width is constant = 256 bits + '128' : 2, + '2048': 2} + +edram_pow_dyn_dict = {'8' : 17.2/2, + '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) + '128' : 25.35/2, + '2048': 25.35/2} + +edram_pow_leak_dict = {'8' : 0.46, + '64' : 0.46, + '128' : 0.77, + '2048': 0.77} + +edram_area_dict = {'8' : 0.086, + '64' : 0.086, + '128' : 0.121, + '2048': 0.121} # Tile Instruction Memory value dictionary -tile_instrnMem_lat_dict = {'512' : 1, +tile_instrnMem_lat_dict = {'512': 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096': 1} tile_instrnMem_pow_dyn_dict = {'512' : 0.46, '1024': 0.53, - '2048': 0.65} + '2048': 0.65, + '4096': 0.65} tile_instrnMem_pow_leak_dict = {'512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33} + tile_instrnMem_area_dict = {'512' : 0.00108, '1024': 0.00192, - '2048': 0.0041} + '2048': 0.0041, + '4096': 
0.0041} + # counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons)) # area scaling (X8) diff --git a/include/example-configs/config-cnn.py b/include/example-configs/config-cnn.py new file mode 100644 index 00000000..af9f7668 --- /dev/null +++ b/include/example-configs/config-cnn.py @@ -0,0 +1,123 @@ +# This file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node) +## All user specified parameters are provided by this file only + +## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating +cycles_max = 5000000 # Set both of these to very large numbers (when the design is bug-free)! +debug = 1 +xbar_record = 1 +inference = 1 +training = not(inference) + +## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits +num_bits = 16 +int_bits = 4 +frac_bits = num_bits - int_bits + +## IMA configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by xbar_bits, num_xbar, xbar_size. +# xbar_bits: 2, 4, 6 +# num_xbar: positive integer +# xbar_size: 32, 64, 128, 256 +# dac_res: positive integer <= num_bits +# adc_res: positive integer <= num_bits +# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar) +# num_ALU: positive integer +# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped) +# instrnMem_size: (in Bytes) - 512, 1024, 2048 + +# Fixed parameters +addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) +data_width = num_bits # (in bits) +xbdata_width = data_width # (in bits) +instrn_width = 48 # (in bits) + +# Change here - Specify the IMA parameters here +xbar_bits = 2 +num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training 8-fw, 8-bw and 16-delta physical xbars. +xbar_size = 128 +dac_res = 1 +# ADC configuration +adc_res = 8 # around 4 to 8. this value should be +num_adc_per_matrix = 2 +num_adc = num_adc_per_matrix * num_matrix + +# The idea is to have a different ADC resolution value for each ADC. +# The number of ADCs is defined by the num_adc property. Currently it is 2 * num_matrix(2) = 4 +# NOTE: Only taking into account indexes 0 and 2; 1 and 3 are ignored, because ADCs 1 and 3 are assumed to be equal to 0 and 2.
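Aside (editorial illustration, not part of the diff): the adc_res_new dictionary added just below gives one resolution entry per ADC, but per the NOTE above only the even-indexed entries are honored. A minimal Python sketch of that lookup rule, using a hypothetical helper name (the actual accessor in the simulator is not shown in this diff):

adc_res_new = {'matrix_adc_0': 8, 'matrix_adc_1': 4,
               'matrix_adc_2': 8, 'matrix_adc_3': 4}

def effective_adc_res(adc_idx):
    # Odd-indexed ADCs mirror the preceding even-indexed ADC, so their own
    # entries (4 here) are effectively ignored.
    if adc_idx % 2 == 1:
        adc_idx -= 1
    return adc_res_new['matrix_adc_' + str(adc_idx)]

print([effective_adc_res(i) for i in range(4)])  # [8, 8, 8, 8]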
+adc_res_new = { + 'matrix_adc_0' : 8, + 'matrix_adc_1' : 4, + 'matrix_adc_2' : 8, + 'matrix_adc_3' : 4 + } + +num_ALU = num_matrix*2 +#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) +dataMem_size = 4096 # larger than num_matrix*(6*xbar_size) +instrnMem_size = 8192 #in entries + +# This depends on above parameters +if (training): + datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) + +if (inference): + datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces ( 1 input Xbar memory and 1 output Xbar memory) + +phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar #value is 8 +lr = 0.25 # learning rate for updates to d-xbar + +## Tile configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_ima +# num_ima: positive integer +# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width) +# edram_size: (in KiloBytes) - 64, 128, 256, 512 +# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \ +# puts a cap on the maximum number of tiles that can send data to a tile in next layer +# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons) +# tile_instrnMem_size: 256, 512, 1024 (in Bytes) + +# Fixed parameters +instrn_width = 48 # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16) +edram_buswidth = 256 # in bits +#receive_buffer_depth = 16 +receive_buffer_depth = 150 #set equal to num_tile_max +receive_buffer_width = edram_buswidth / num_bits # size of receive buffer entry (in terms of number of neurons) + +# Change here - Specify the Tile parameters here +num_ima = 8 +edram_size = 2048 # in Kilobytes (ISAAC uses 64 KB) +tile_instrnMem_size = 4096 # in entries + +## Node configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_tile +# num_tile_compute = positive integer +# inj_rate < 0.2 (depends on the mapping) +# num_port: 4, 8 + +# Fixed parameters +# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles +cmesh_c = 4 +num_bits_tileId =32 +flit_width = 32 +packet_width = edram_buswidth/data_width #in multiples of flits (data considered only - booksim considers address itself) +# (b bit of address = logN, N is the number of nodes) + +# Change here - Specify the Node parameters here +num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles) +num_tile_max = 168.0 # maximum number of tiles per node +num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle) +noc_inj_rate = 0.005 +noc_num_port = 4 + +## Node parameters - Our way of simulation just assumes all tiles in one actual node +num_node = 1 + +# Do not change this - total number of tiles +num_tile = num_node * num_tile_compute + 2 # +2 for the input and output (I/O) tiles - dummy, others - compute + +#Security parameters - Used to verify if the model used is encrypted or authenticated (set by dpe.py) +#Do not change +encrypted = False +authenticated = False +cypher_name = '' +cypher_hash = '' diff --git a/include/example-configs/config-mlp.py b/include/example-configs/config-mlp.py new file mode 100644 index 00000000..8cd88e99 --- /dev/null +++ b/include/example-configs/config-mlp.py @@ -0,0 +1,123 @@ +# This
file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node) +## All user specified parameters are provided by this file only + +## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating +cycles_max = 5000000 # Set both of these to very large numbers (when the design is bug-free)! +debug = 1 +xbar_record = 1 +inference = 1 +training = not(inference) + +## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits +num_bits = 16 +int_bits = 4 +frac_bits = num_bits - int_bits + +## IMA configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by xbar_bits, num_xbar, xbar_size. +# xbar_bits: 2, 4, 6 +# num_xbar: positive integer +# xbar_size: 32, 64, 128, 256 +# dac_res: positive integer <= num_bits +# adc_res: positive integer <= num_bits +# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar) +# num_ALU: positive integer +# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped) +# instrnMem_size: (in Bytes) - 512, 1024, 2048 + +# Fixed parameters +addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) +data_width = num_bits # (in bits) +xbdata_width = data_width # (in bits) +instrn_width = 48 # (in bits) + +# Change here - Specify the IMA parameters here +xbar_bits = 2 +num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training 8-fw, 8-bw and 16-delta physical xbars. +xbar_size = 128 +dac_res = 1 +# ADC configuration +adc_res = 8 # around 4 to 8. this value should be +num_adc_per_matrix = 2 +num_adc = num_adc_per_matrix * num_matrix + +# The idea is to have a different ADC resolution value for each ADC. +# The number of ADCs is defined by the num_adc property. Currently it is 2 * num_matrix(2) = 4 +# NOTE: Only taking into account indexes 0 and 2; 1 and 3 are ignored, because ADCs 1 and 3 are assumed to be equal to 0 and 2.
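Aside (editorial illustration, not part of the diff): the addr_width = 22 fixed parameter above can be sanity-checked against the EDRAM capacities used by the two example configs, using the word-count expression edram_size*1024*8/data_width that appears in the src/tile_modules.py hunk later in this diff. The snippet below is plain Python for the arithmetic only; nothing in it comes from the source tree:

# Number of data_width-bit EDRAM words a load/store pointer must be able to address.
data_width = 16                              # num_bits in both example configs
words_mlp = 64 * 1024 * 8 // data_width      # config-mlp: 64 KB   -> 32768   = 2**15
words_cnn = 2048 * 1024 * 8 // data_width    # config-cnn: 2048 KB -> 1048576 = 2**20

# The old 16-bit SET immediate (int2bin(imm, 16) in src/instrn_proto.py) covers the
# MLP config but not the CNN config, which is why the immediate is now generated
# with cfg.addr_width; 22 bits cover 2**22 = 4194304 words.
print(words_mlp <= 2**16, words_cnn <= 2**16, words_cnn <= 2**22)   # True False True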
+adc_res_new = { + 'matrix_adc_0' : 8, + 'matrix_adc_1' : 4, + 'matrix_adc_2' : 8, + 'matrix_adc_3' : 4 + } + +num_ALU = num_matrix*2 +#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) +dataMem_size = 2048 # 2048 is larger than num_matrix*(6*xbar_size) +instrnMem_size = 512 #in entries + +# This depends on above parameters +if (training): + datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) + +if (inference): + datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces ( 1 input Xbar memory and 1 output Xbar memory) + +phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar #value is 8 +lr = 0.25 # learning rate for updates to d-xbar + +## Tile configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_ima +# num_ima: positive integer +# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width) +# edram_size: (in KiloBytes) - 64, 128, 256, 512 +# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \ +# puts a cap on the maximum number of tiles that can send data to a tile in next layer +# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons) +# tile_instrnMem_size: 256, 512, 1024 (in Bytes) + +# Fixed parameters +instrn_width = 48 # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16) +edram_buswidth = 256 # in bits +#receive_buffer_depth = 16 +receive_buffer_depth = 150 #set equal to num_tile_max +receive_buffer_width = edram_buswidth / num_bits # size of receive buffer entry (in terms of number of neurons) + +# Change here - Specify the Tile parameters here +num_ima = 8 +edram_size = 64 # in Kilobytes (64 KB - same as ISAAC) +tile_instrnMem_size = 2048 # in entries + +## Node configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_tile +# num_tile_compute = positive integer +# inj_rate < 0.2 (depends on the mapping) +# num_port: 4, 8 + +# Fixed parameters +# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles +cmesh_c = 4 +num_bits_tileId =32 +flit_width = 32 +packet_width = edram_buswidth/data_width #in multiples of flits (data considered only - booksim considers address itself) +# (b bit of address = logN, N is the number of nodes) + +# Change here - Specify the Node parameters here +num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles) +num_tile_max = 168.0 # maximum number of tiles per node +num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle) +noc_inj_rate = 0.005 +noc_num_port = 4 + +## Node parameters - Our way of simulation just assumes all tiles in one actual node +num_node = 1 + +# Do not change this - total number of tiles +num_tile = num_node * num_tile_compute + 2 # +2 for the input and output (I/O) tiles - dummy, others - compute + +#Security parameters - Used to verify if the model used is encrypted or authenticated (set by dpe.py) +#Do not change +encrypted = False +authenticated = False +cypher_name = '' +cypher_hash = '' diff --git a/src/dpe.py b/src/dpe.py index 8cc08a7a..d4124c4c 100644 --- a/src/dpe.py +++ b/src/dpe.py @@ -168,8 +168,8 @@ def run(self, net): if (cfg.debug): node_dump(node_dut, self.tracepath) - if (cfg.xbar_record): -
record_xbar(node_dut) + #if (cfg.xbar_record): + # record_xbar(node_dut) # Dump the contents of output tile (DNN output) to output file (output.txt) output_file = self.tracepath + 'output.txt' diff --git a/src/ima.py b/src/ima.py index 80b669ef..ddf31af1 100644 --- a/src/ima.py +++ b/src/ima.py @@ -284,7 +284,8 @@ def do_decode (self, dec_op): # instruction specific (for eg: ld_dec - load's decode stage) if (dec_op == 'ld'): assert (self.fd_instrn['r1'] >= datamem_off), 'load address for tile memory comes from data memory' - self.de_r1 = bin2int(self.dataMem.read(self.fd_instrn['r1']), cfg.num_bits) # absolute mem addr + self.de_r1 = bin2int(self.dataMem.read(self.fd_instrn['r1']), cfg.addr_width) # absolute mem addr + assert (self.de_r1 >=0) # mem addr for load should be non negative self.de_d1 = self.fd_instrn['d1'] self.de_r2 = self.fd_instrn['imm'] # used for incrementing/decrementing counter for edram entries self.de_vec = self.fd_instrn['vec'] @@ -298,7 +299,8 @@ def do_decode (self, dec_op): elif (dec_op == 'st'): assert (self.fd_instrn['d1'] >= datamem_off), 'store address for tile memory comes from data memory' - self.de_d1 = bin2int(self.dataMem.read(self.fd_instrn['d1']), cfg.num_bits) #absolute mem addr + self.de_d1 = bin2int(self.dataMem.read(self.fd_instrn['d1']), cfg.addr_width) #absolute mem addr + assert (self.de_d1 >=0) # mem addr for store should be non negative self.de_r1 = self.fd_instrn['r1'] # reg addr self.de_vec = self.fd_instrn['vec'] # source value will be read in execute stage @@ -514,8 +516,9 @@ def do_execute (self, ex_op, fid): # write to dataMem - check if addr is a valid datamem address dst_addr = self.de_d1 + i if (dst_addr >= datamem_off): - self.dataMem.write (dst_addr, self.de_val1) + self.dataMem.write(addr=dst_addr, data=self.de_val1, type_t='addr') #Updated for separate data_width and addr_width else: + assert (1==0) # Set instructions cannot write to MVMU storage writeToXbarMem (self, dst_addr, self.de_val1) elif (ex_op == 'cp'): diff --git a/src/ima_modules.py b/src/ima_modules.py index 46328b54..790bff46 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -466,12 +466,22 @@ def read (self, addr): return self.memfile[addr - self.addr_start] - def write (self, addr, data): + def write (self, addr, data, type_t='data'): self.num_access += 1 assert (type(addr) == int), 'addr type should be int' assert (self.addr_start <= addr <= self.addr_end), 'addr exceeds the memory bounds' #print 'length of data ' + str(len(data)) - assert ((type(data) == str) and (len(data) == cfg.data_width)), 'data should be a string with mem_width bits' + #assert ((type(data) == str) and (len(data) == cfg.data_width)), 'data should be a string with mem_width bits' + assert ((type(data) == str) and ((type_t == 'data')) or (type_t == 'addr')) # UPDATE - Pointer/address for LD/ST written by previous SET instrn. 
can be larger than data_width + if (type_t == 'data'): + try: + assert (len(data) == cfg.data_width) + #print("I am here!!") + except AssertionError: + print("Warning: Data width received is not-coherent, NEEDS DEBUGGING") + data = data[0:16] + else: + assert (len(data) == cfg.addr_width) # Specification for pointer (or addres type data) self.memfile[addr - self.addr_start] = data def reset (self): diff --git a/src/instrn_proto.py b/src/instrn_proto.py index ccad494e..4992bb25 100644 --- a/src/instrn_proto.py +++ b/src/instrn_proto.py @@ -52,7 +52,7 @@ def i_set (d1, imm, vec = 1): i_temp = param.dummy_instrn.copy () i_temp['opcode'] = 'set' i_temp['d1'] = d1 - i_temp['imm'] = imm if (type(imm) == str) else int2bin(imm, 16) + i_temp['imm'] = imm if (type(imm) == str) else int2bin(imm, cfg.addr_width) i_temp['vec'] = vec return i_temp diff --git a/src/tile.py b/src/tile.py index b42d49ca..a849dd35 100644 --- a/src/tile.py +++ b/src/tile.py @@ -89,7 +89,7 @@ def tile_init (self, instrnpath, tracepath): self.ima_list[i].pipe_init (instrnfile, self.fid_list[i]) # Initialize the EDRAM - invalidate all entries (valid_list) - self.edram_controller.valid = [0] * (cfg.edram_size*1024/(cfg.data_width/8)) + self.edram_controller.valid = [0] * (cfg.edram_size*1024*8/(cfg.data_width)) # Intiialize the receive buffer - invalidate self.receive_buffer.inv () diff --git a/src/tile_modules.py b/src/tile_modules.py index 40a04b7a..417f95e2 100644 --- a/src/tile_modules.py +++ b/src/tile_modules.py @@ -119,9 +119,9 @@ def __init__ (self): self.num_access_counter = 0 # Instantiate EDRAM, valid and counter fields - self.mem = edram (cfg.edram_size*1024/(cfg.data_width/8)) #edram_size is in KB - self.valid = [0] * (cfg.edram_size*1024/(cfg.data_width/8)) #edram_size is in KB - self.counter = [0] * (cfg.edram_size*1024/(cfg.data_width/8)) #edram_size is in KB + self.mem = edram (cfg.edram_size*1024*8/(cfg.data_width)) #edram_size is in KB + self.valid = [0] * (cfg.edram_size*1024*8/(cfg.data_width)) #edram_size is in KB + self.counter = [0] * (cfg.edram_size*1024*8/(cfg.data_width)) #edram_size is in KB # Define latency self.latency = param.edram_lat diff --git a/test/cnn/conv-layer-benchmark1.cpp b/test/cnn/conv-layer-benchmark1.cpp new file mode 100644 index 00000000..4dabac9a --- /dev/null +++ b/test/cnn/conv-layer-benchmark1.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. 
+ * + */ + +#include +#include +#include + +#include "puma.h" +#include "conv-layer.h" + +int main(int argc, char** argv) { + + Model model = Model::create("conv1-layer"); + + // Process parameters + unsigned int in_size_x = 9; + unsigned int in_size_y = 9; + unsigned int in_channels = 64; + unsigned int out_channels = 64; + unsigned int k_size_x = 3; + unsigned int k_size_y = 3; + if(argc == 7) { + in_size_x = atoi(argv[1]); + in_size_y = atoi(argv[2]); + in_channels = atoi(argv[3]); + out_channels = atoi(argv[4]); + k_size_x = atoi(argv[5]); + k_size_y = atoi(argv[6]); + } + + // Input stream + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + + // Output stream + unsigned int out_size_x = in_size_x; + unsigned int out_size_y = in_size_y; + auto out_stream = OutputImagePixelStream::create(model, "out_stream", out_size_x, out_size_y, out_channels); + + // Layer + out_stream = conv_layer(model, "", k_size_x, k_size_y, in_size_x, in_size_y, in_channels, out_channels, in_stream); + + // Compile + model.compile(); + + // Destroy model + model.destroy(); + + return 0; + +} + diff --git a/test/cnn/conv-layer-benchmark2.cpp b/test/cnn/conv-layer-benchmark2.cpp new file mode 100644 index 00000000..b6f08f0f --- /dev/null +++ b/test/cnn/conv-layer-benchmark2.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. + * + */ + +#include +#include +#include + +#include "puma.h" +#include "conv-layer.h" + +int main(int argc, char** argv) { + + Model model = Model::create("conv2-layer"); + + // Process parameters + unsigned int in_size_x = 5; + unsigned int in_size_y = 5; + unsigned int in_channels = 256; + unsigned int out_channels = 256; + unsigned int k_size_x = 3; + unsigned int k_size_y = 3; + if(argc == 7) { + in_size_x = atoi(argv[1]); + in_size_y = atoi(argv[2]); + in_channels = atoi(argv[3]); + out_channels = atoi(argv[4]); + k_size_x = atoi(argv[5]); + k_size_y = atoi(argv[6]); + } + + // Input stream + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + + // Output stream + unsigned int out_size_x = in_size_x; + unsigned int out_size_y = in_size_y; + auto out_stream = OutputImagePixelStream::create(model, "out_stream", out_size_x, out_size_y, out_channels); + + // Layer + out_stream = conv_layer(model, "", k_size_x, k_size_y, in_size_x, in_size_y, in_channels, out_channels, in_stream); + + // Compile + model.compile(); + + // Destroy model + model.destroy(); + + return 0; + +} + diff --git a/test/cnn/conv-layer-stride.cpp b/test/cnn/conv-layer-stride.cpp new file mode 100644 index 00000000..1c13dee8 --- /dev/null +++ b/test/cnn/conv-layer-stride.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. 
+ * + */ + +#include +#include +#include +#include + +#include "puma.h" +#include "conv-layer.h" +using namespace std; +int main(int argc, char** argv) { + +// Model model = Model::create("conv3-layer"); + + // Process parameter + unsigned int in_size_x ; + unsigned int in_size_y ; + unsigned int in_channels ; + unsigned int out_channels ; + unsigned int k_size_x ; + unsigned int k_size_y ; + unsigned int padding ; + unsigned int stride ; + + if(argc == 10) { + in_size_x = atoi(argv[1]); + in_size_y = atoi(argv[2]); + in_channels = atoi(argv[3]); + out_channels = atoi(argv[4]); + k_size_x = atoi(argv[5]); + k_size_y = atoi(argv[6]); + padding = atoi(argv[7]); + stride = atoi(argv[8]); + } + std:: string str=std::string("conv") + argv[9] + std::string("-layer"); + Model model = Model::create(str); + + // Input stream + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + + // Output stream + unsigned int out_size_x = (in_size_x - k_size_x + 2*padding)/stride + 1; + unsigned int out_size_y = (in_size_y - k_size_y + 2*padding)/stride + 1; + + assert((in_size_x - k_size_x + 2*padding)%stride==0); //input image size should result in integer out image size + auto out_stream = OutputImagePixelStream::create(model, "out_stream", out_size_x, out_size_y, out_channels); + + // Layer + out_stream = conv_layer(model, "", k_size_x, k_size_y, in_size_x, in_size_y, in_channels, out_channels, stride, out_size_x, out_size_y, in_stream); + // Compile + model.compile(); + + // Destroy model + model.destroy(); + + return 0; + +} + diff --git a/test/cnn/conv-layer-stride.h b/test/cnn/conv-layer-stride.h new file mode 100644 index 00000000..a7eb88c7 --- /dev/null +++ b/test/cnn/conv-layer-stride.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. + * + */ + +#ifndef _PUMA_TEST_CONV_LAYER_ +#define _PUMA_TEST_CONV_LAYER_ + +#include "puma.h" + +static ImagePixelStream conv_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, unsigned int stride, unsigned int out_size_x, unsigned int out_size_y, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels, stride, out_size_x, out_size_y); + + return sig(mat*in_stream); + +} + +static ImagePixelStream convmax_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, unsigned int stride, unsigned int out_size_x, unsigned int max_pool_size_x, unsigned int max_pool_size_y, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels, stride, out_size_x, out_size_x); + + return maxpool(sig(mat*in_stream), max_pool_size_y, max_pool_size_x); + +} + +#endif + diff --git a/test/cnn/conv-layer.h b/test/cnn/conv-layer.h new file mode 100644 index 00000000..ec23ac57 --- /dev/null +++ b/test/cnn/conv-layer.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. 
+ * + * This file is covered by the LICENSE.txt license file in the root directory. + * + */ + +#ifndef _PUMA_TEST_CONV_LAYER_ +#define _PUMA_TEST_CONV_LAYER_ + +#include "puma.h" + +static ImagePixelStream conv_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels); + + return sig(mat*in_stream); + +} + +static ImagePixelStream convmax_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, unsigned int max_pool_size_x, unsigned int max_pool_size_y, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels); + + return maxpool(sig(mat*in_stream), max_pool_size_y, max_pool_size_x); + +} + +#endif + diff --git a/test/utils/run-cnn-benchmark.sh b/test/utils/run-cnn-benchmark.sh new file mode 100755 index 00000000..5168c1e4 --- /dev/null +++ b/test/utils/run-cnn-benchmark.sh @@ -0,0 +1,49 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=conv-layer #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +name=conv #name for the folder generated by compiler +pumaenv=pumaenv #name for the environment +fileno=0 #variable so that conv folder generated by compilers do not overlap (u might want to change this variable to different int values for different layers) +name=$name$fileno +#layer parameters +inx=9 +iny=9 +inC=64 +outC=64 +kx=3 +ky=3 +p=1 +s=1 +#copying cnn config file +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp ${path}/puma-simulator/include/example-configs/config-cnn.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include +#copying model file +rm ${path}/puma-compiler/test/${cppfile}.cpp ${path}/puma-compiler/test/${cppfile}.h +cp ${path}/puma-simulator/test/cnn/conv-layer-stride.cpp ${path}/puma-compiler/test/${cppfile}.cpp #copy the mlp config file to include +cp ${path}/puma-simulator/test/cnn/conv-layer-stride.h ${path}/puma-compiler/test/${cppfile}.h #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test ${inx} ${iny} ${inC} ${outC} ${kx} ${ky} ${p} ${s} ${fileno} +echo $cppfile +./generate-py.sh +cp -r ${name} ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n ${name} + + + diff --git a/test/utils/run-cnn-benchmark1.sh b/test/utils/run-cnn-benchmark1.sh new file mode 100755 index 00000000..e62c464b --- /dev/null +++ b/test/utils/run-cnn-benchmark1.sh @@ -0,0 +1,36 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=conv-layer #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +pumaenv=pumaenv #name for the environment +#copying cnn config file +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp 
${path}/puma-simulator/include/example-configs/config-cnn.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include +#copying model file +rm ${path}/puma-compiler/test/conv-layer.cpp +cp ${path}/puma-simulator/test/cnn/conv-layer-benchmark1.cpp ${path}/puma-compiler/test/${cppfile}.cpp #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test +echo $cppfile +./generate-py.sh +cp -r conv1 ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n conv1 + + + diff --git a/test/utils/run-cnn-benchmark2.sh b/test/utils/run-cnn-benchmark2.sh new file mode 100755 index 00000000..ed918825 --- /dev/null +++ b/test/utils/run-cnn-benchmark2.sh @@ -0,0 +1,36 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=conv-layer #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +pumaenv=pumaenv #name for the environment +#copying cnn config file +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp ${path}/puma-simulator/include/example-configs/config-cnn.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include +#copying model file +rm ${path}/puma-compiler/test/conv-layer.cpp +cp ${path}/puma-simulator/test/cnn/conv-layer-benchmark2.cpp ${path}/puma-compiler/test/${cppfile}.cpp #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test +echo $cppfile +./generate-py.sh +cp -r conv2 ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n conv2 + + + diff --git a/test/utils/run-mlp-benchmark.sh b/test/utils/run-mlp-benchmark.sh new file mode 100755 index 00000000..c7ad4480 --- /dev/null +++ b/test/utils/run-mlp-benchmark.sh @@ -0,0 +1,34 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=mlp_l4_mnist #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +name=mlp #name for the folder generated by compiler +pumaenv=pumaenv #name for the environment + +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp ${path}/puma-simulator/include/example-configs/config-mlp.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test +echo $cppfile +./generate-py.sh +cp -r ${name} ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n ${name} + + +
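Closing aside (editorial illustration, not part of the diff): run-cnn-benchmark.sh hard-codes the conv-layer-stride parameters inx=iny=9, kx=ky=3, p=1, s=1, and conv-layer-stride.cpp asserts that these produce an integer output size. A quick Python check of that arithmetic:

# Output spatial size used by conv-layer-stride.cpp:
#   out = (in - k + 2*padding)/stride + 1, with (in - k + 2*padding) % stride == 0.
in_size, k_size, padding, stride = 9, 3, 1, 1   # values from run-cnn-benchmark.sh
assert (in_size - k_size + 2 * padding) % stride == 0
out_size = (in_size - k_size + 2 * padding) // stride + 1
print(out_size)  # 9 -> the 9x9x64 input stream maps to a 9x9x64 output stream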