Merge pull request #26 from negishubham/training

Bug-fixes for CNN and simulator enhancements.
Aayush-Ankit · Apr 15, 2020 · 090feed · 090feed
2 parents f4f8f41 + 994016b
commit 090feed
Show file tree

Hide file tree

Showing 19 changed files with 726 additions and 44 deletions.
diff --git a/include/config.py b/include/config.py
@@ -26,6 +26,7 @@
 # instrnMem_size: (in Bytes) - 512, 1024, 2048
 
 # Fixed parameters
+addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse)
 data_width = num_bits # (in bits)
 xbdata_width = data_width # (in bits)
 instrn_width = 48 # (in bits)

diff --git a/include/constants.py b/include/constants.py
@@ -214,59 +214,77 @@
 dataMem_lat_dict = {'256' : 1,
                     '512' : 1,
                     '1024': 1,
-                    '2048': 1}
+                    '2048': 1,
+		    '4096':1}
 
 dataMem_pow_dyn_dict = {'256' : 0.16,
                         '512' : 0.24,
                         '1024': 0.33,
-                        '2048': 0.57}
+                        '2048': 0.57,
+			'4096': 0.57}
 
 dataMem_pow_leak_dict = {'256' : 0.044,
                          '512' : 0.078,
                          '1024': 0.147,
-                         '2048': 0.33}
+                         '2048': 0.33,
+			 '4096': 0.33}
 
 dataMem_area_dict = {'256' : 0.00056,
                      '512' : 0.00108,
                      '1024': 0.00192,
-                     '2048': 0.00392}
+                     '2048': 0.00392,
+		     '4096': 0.00392}
 
 dataMem_lat_dict = {'256' : 1,
                     '512' : 1,
                     '1024': 1,
-                    '2048': 1}
+                    '2048': 1,
+		    '4096':1}
 
 dataMem_pow_dyn_dict = {'256' : 0.16,
                         '512' : 0.24,
                         '1024': 0.33,
-                        '2048': 0.57}
+                        '2048': 0.57,
+			'4096': 0.57}
 
 dataMem_pow_leak_dict = {'256' : 0.044,
                          '512' : 0.078,
                          '1024': 0.147,
-                         '2048': 0.33}
+                         '2048': 0.33,
+			 '4096': 0.33}
 
 dataMem_area_dict = {'256' : 0.00056,
                      '512' : 0.00108,
                      '1024': 0.00192,
-                     '2048': 0.00392}
+                     '2048': 0.00392,
+		     '4096': 0.00392}
 
 # Instruction Memory value dictionary
 instrnMem_lat_dict = {'512' : 1,
                       '1024': 1,
-                      '2048': 1}
+                      '2048': 1,
+		      '4096': 1,
+		      '8192': 1}
 
 instrnMem_pow_dyn_dict = {'512' : 0.46,
                           '1024': 0.53,
-                          '2048': 0.65}
+                          '2048': 0.65,
+		      	  '4096': 0.65,
+		          '8192': 0.65}
 
 instrnMem_pow_leak_dict = {'512' : 0.078,
                            '1024': 0.147,
-                           '2048': 0.33}
+                           '2048': 0.33,
+		           '4096': 0.33,
+		           '8192': 0.33}
+
 
 instrnMem_area_dict = {'512' : 0.00108,
                        '1024': 0.00192,
-                       '2048': 0.0041}
+                       '2048': 0.0041,
+		       '4096': 0.0041,
+		       '8192': 0.0041}
+
 
 # Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row)
 # for computing average power of ima - scale dyn_pow down by xbar_size
@@ -382,38 +400,48 @@
 
 # Tile component latency/pow/area
 # EDRAM value dictionary (counter storage is not coounted)
-edram_lat_dict = {'8'  :2,
-                  '64' : 2, #edram access width is constant = 256 bits
-                  '128': 2}
-
-edram_pow_dyn_dict = {'8' : 17.2/2,
-                      '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency)
-                      '128': 25.35/2}
-
-edram_pow_leak_dict = {'8' : 0.46,
-                       '64' : 0.46,
-                       '128': 0.77}
-
-edram_area_dict = {'8' : 0.086,
-                   '64' : 0.086,
-                   '128': 0.121}
+edram_lat_dict = {'8'   : 2,
+                  '64'  : 2, #edram access width is constant = 256 bits
+                  '128' : 2,
+		  '2048': 2}
+
+edram_pow_dyn_dict = {'8'   : 17.2/2,
+                      '64'  : 17.2/2, # (0.0172 nJ with 2 cycles access latency)
+                      '128' : 25.35/2,
+		      '2048': 25.35/2}
+
+edram_pow_leak_dict = {'8'   : 0.46,
+                       '64'  : 0.46,
+                       '128' : 0.77,
+		       '2048': 0.77}
+
+edram_area_dict = {'8'   : 0.086,
+                   '64'  : 0.086,
+                   '128' : 0.121,
+		   '2048': 0.121}
 
 # Tile Instruction Memory value dictionary
-tile_instrnMem_lat_dict = {'512' : 1,
+tile_instrnMem_lat_dict = {'512': 1,
                           '1024': 1,
-                          '2048': 1}
+                          '2048': 1,
+			  '4096': 1}
 
 tile_instrnMem_pow_dyn_dict = {'512' : 0.46,
                                '1024': 0.53,
-                               '2048': 0.65}
+                               '2048': 0.65,
+			       '4096': 0.65}
 
 tile_instrnMem_pow_leak_dict = {'512' : 0.078,
                                 '1024': 0.147,
-                                '2048': 0.33}
+                                '2048': 0.33,
+			        '4096': 0.33}
+
 
 tile_instrnMem_area_dict = {'512' : 0.00108,
                             '1024': 0.00192,
-                            '2048': 0.0041}
+                            '2048': 0.0041,
+			    '4096': 0.0041}
+
 
 # counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons))
 # area scaling (X8)

diff --git a/include/example-configs/config-cnn.py b/include/example-configs/config-cnn.py
@@ -0,0 +1,123 @@
+# This file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node)
+## All user specified parameters are provided by this file only
+
+## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating
+cycles_max = 5000000 # Put both these to very large numbers (when design is bug-free)!
+debug = 1
+xbar_record = 1
+inference = 1
+training = not(inference)
+
+## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits
+num_bits = 16
+int_bits = 4
+frac_bits = num_bits - int_bits
+
+## IMA configurable parameters (permissible values for each parameter provided here)
+## Instruction generation - affected by xbar_bits, num_xbar, xbar_size.
+# xbar_bits: 2, 4, 6
+# num_xbar: positive integer
+# xbar_size: 32, 64, 128, 256
+# dac_res: positive integer <= num_bits
+# adc_res: positive integer <= num_bits
+# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar)
+# num_ALU: positive integer
+# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped)
+# instrnMem_size: (in Bytes) - 512, 1024, 2048
+
+# Fixed parameters
+addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse)
+data_width = num_bits # (in bits)
+xbdata_width = data_width # (in bits)
+instrn_width = 48 # (in bits)
+
+# Change here - Specify the IMA parameters here
+xbar_bits = 2
+num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training  8-fw, 8-bw and 16-delta physical xbars.
+xbar_size = 128
+dac_res = 1
+# ADC configuration
+adc_res = 8 # this value should be around 4 to 8
+num_adc_per_matrix = 2
+num_adc = num_adc_per_matrix * num_matrix
+
+# The idea is to have different ADC resolution value for each ADC.
+# The number of ADCs is defined by the num_adc property. Currently it is num_adc_per_matrix (2) * num_matrix (2) = 4.
+# NOTE: Only indexes 0 and 2 are taken into account; 1 and 3 are ignored, because ADCs 1 and 3 are assumed to be equal to 0 and 2.
+adc_res_new = {
+                'matrix_adc_0' : 8,
+                'matrix_adc_1' : 4,
+                'matrix_adc_2' : 8,
+                'matrix_adc_3' : 4
+              }
+
+num_ALU = num_matrix*2
+#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d)
+dataMem_size = 4096 # 4096 is larger than num_matrix*(6*xbar_size)
+instrnMem_size = 8192 #in entries
+
+# This depends on above parameters
+if (training):
+    datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d)
+
+if (inference):
+    datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces ( 1 input Xbar memory and 1 output Xbar memory) 
+
+phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar (value is 8)
+lr = 0.25 # learning rate for updates to d-xbar
+
+## Tile configurable parameters (permissible values for each parameter provided here)
+## Instruction generation - affected by num_ima
+# num_ima: positive integer
+# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width)
+# edram_size: (in KiloBytes) - 64, 128, 256, 512
+# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \
+#        puts a cap on the maximum number of tiles that can send data to a tile in next layer
+# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons)
+# tile_instrnMem_size: 256, 512, 1024 (in Bytes)
+
+# Fixed parameters
+instrn_width = 48 # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16)
+edram_buswidth = 256 # in bits
+#receive_buffer_depth = 16
+receive_buffer_depth = 150 #set equal to num_tile_max
+receive_buffer_width =  edram_buswidth / num_bits # size of receive buffer entry (in terms of number of neurons)
+
+# Change here - Specify the Tile parameters here
+num_ima = 8
+edram_size = 2048 # in Kilobytes (ISAAC uses 64 KB)
+tile_instrnMem_size = 4096 # in entries
+
+## Node configurable parameters (permissible values for each parameter provided here)
+## Instruction generation - affected by num_tile
+# num_tile_compute =  positive integer
+# inj_rate < 0.2 (depends on the mapping)
+# num_port: 4, 8
+
+# Fixed parameters
+# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles
+cmesh_c = 4
+num_bits_tileId =32
+flit_width = 32
+packet_width = edram_buswidth/data_width #in multiples of flits (data considered only - booksim consider address itself)
+# (b bit of address = logN, N is the number of nodes)
+
+# Change here - Specify the Node parameters here
+num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles)
+num_tile_max = 168.0 # maximum number of tiles per node
+num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle)
+noc_inj_rate = 0.005
+noc_num_port = 4
+
+## Node parameters - Our way of simulation just assumes all tile in one actual node
+num_node = 1
+
+# Do not change this - total number of tiles
+num_tile = num_node * num_tile_compute + 2 # +2 for the non-compute input and output tiles; the others are compute tiles
+
+#Security parameters - Used to verify if the model used is encrypted or authenticated (set by dpe.py)
+#Do not change
+encrypted = False
+authenticated = False
+cypher_name = ''
+cypher_hash = ''