diff --git a/how_to_run.md b/how_to_run.md index bf1dc791..53491caa 100644 --- a/how_to_run.md +++ b/how_to_run.md @@ -99,6 +99,11 @@ num_tile_compute = 23 # number of tiles mapped by dnn (leaving input and output # Do not change this - total number of tiles num_tile = num_node * num_tile_compute + 2 # +1 for first tile (I/O tile) - dummy, others - compute -- (Line 95) ``` +#### 6.2 - Set up the constants file: + +Use the appropriate constants file from ```puma-simulator/include/example-constants/(constant file name)```. +For example, for a 128x128 crossbar use ```constants-128.py```. +Copy the file to ```puma-simulator/include/``` and rename it to ```constants.py```. ### 7. Run your model, in this example, the ```lstm-layer.cpp```: @@ -185,3 +190,12 @@ number of tiles mapped: 23 ### 10. To run Regression tests after running with weights for inference, go to simulator/test/val. ```python reg_test_1.py -n mlp``` + +### 11. Quantization: +Change the ```input_prec``` and ```weight_width``` parameters in the config file to see the effects of quantization. + +``` +# Input and Weight parameters +input_prec = 16 +weight_width = 16 +``` diff --git a/include/config.py b/include/config.py index e09ebedc..afea3bf5 100644 --- a/include/config.py +++ b/include/config.py @@ -7,7 +7,7 @@ xbar_record = 1 inference = 1 training = not(inference) -sparse_opt = 1 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) +sparse_opt = 0 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" @@ -35,7 +35,7 @@ # Fixed parameters addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) data_width = num_bits # (in bits) -xbdata_width = data_width # (in bits) +xbdata_width = data_width # (in bits), equivalent to input_prec instrn_width = 48 # (in bits) # Input and Weight parameters input_prec = 16 @@ -50,15 +50,19 @@ num_adc_per_matrix = 2 num_adc = num_adc_per_matrix * num_matrix +#uncomment this line for homogeneous ADC precision +adc_res_new ={} + +#uncomment adc_res_new for heterogenous adcs # The idea is to have different ADC resolution value for each ADC. # The number of ADC if defined by num_adc property. Currently it is 2 * num_matrix(2) = 4 # NOTE: Only taking in account indexes 0 and 2, 1 and 3 are ignored, because ADCs 1 and 3 are assumed t be equal to 0 and 2.
-adc_res_new = { - 'matrix_adc_0' : 8, - 'matrix_adc_1' : 4, - 'matrix_adc_2' : 8, - 'matrix_adc_3' : 4 - } +#adc_res_new = { +# 'matrix_adc_0' : 8, +# 'matrix_adc_1' : 4, +# 'matrix_adc_2' : 8, +# 'matrix_adc_3' : 4 +# } num_ALU = num_matrix*2 #dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) diff --git a/include/constants.py b/include/constants.py index a2cfe755..5072cc32 100644 --- a/include/constants.py +++ b/include/constants.py @@ -64,41 +64,50 @@ # IMA component latency/power/area dictionary (all values in ns, mw, mm2) # XBAR - Models from ISAAC paper -xbar_lat_dict = {'2': {'32' : 32, # first indexed by xbar_bits then by xbar_size +xbar_lat_dict = {'2': {'16' : 16, + '32' : 32, # first indexed by xbar_bits then by xbar_size '64' : 64, '128': 128, '256': 256}, - '4': {'32' : 32, + '4': {'16' : 16, + '32' : 32, '64' : 64, '128': 128, '256': 256}, - '6': {'32' : 32, + '6': {'16' : 16, + '32' : 32, '64' : 64, '128': 128, '256': 256}} -xbar_pow_dict = {'2': {'32' : 0.01875, +xbar_pow_dict = {'2': {'16' : 0.0046875, + '32' : 0.01875, '64' : 0.075, '128': 0.3, '256': 1.2}, - '4': {'32' : 0.01875, + '4': {'16' : 0.0046875, + '32' : 0.01875, '64' : 0.075, '128': 0.3, '256': 1.2}, - '6': {'32' : 0.01875, + '6': {'16' : 0.0046875, + '32' : 0.01875, '64' : 0.075, '128': 0.3, '256': 1.2}} -xbar_area_dict = {'2': {'32' : 1.5625 * 10**(-6), +xbar_area_dict = {'2': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), '64' : 6.25 * 10**(-6), '128': 2.5 * 10**(-5), '256': 1.0 * 10**(-4)}, - '4': {'32' : 1.5625 * 10**(-6), + '4': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), '64' : 6.25 * 10**(-6), '128': 2.5 * 10**(-5), '256': 1.0 * 10**(-4)}, - '6': {'32' : 1.5625 * 10**(-6), + '6': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), '64' : 6.25 * 10**(-6), '128': 2.5 * 10**(-5), '256': 1.0 * 10**(-4)}} @@ -110,7 +119,7 @@ xbar_ip_lat = 100.0 #xbar_ip_pow = (1.37*2.0) # xbar_ip_pow (includes all mvmu) -xbar_ip_pow = (1.37*2.0) - 1.04 # xbar_ip_pow (includes all mvmu except ADC - uncomment num_access for ADC object) +xbar_ip_pow = (1.37*2.0) - 1.04 if cfg.training else 1.37-1.04 # xbar_ip_pow (includes all mvmu except ADC - uncomment num_access for ADC object), # Note the read and write lat/pow are for entire xbar xbar_rd_lat = 328.0 * 1000 * (1/32.0) @@ -236,126 +245,110 @@ dataMem_lat_dict = {'256' : 1, '512' : 1, '1024': 1, - '2048': 1, - '4096':1} + '2048': 1} dataMem_pow_dyn_dict = {'256' : 0.16, '512' : 0.24, '1024': 0.33, - '2048': 0.57, - '4096': 0.57} + '2048': 0.57} dataMem_pow_leak_dict = {'256' : 0.044, '512' : 0.078, '1024': 0.147, - '2048': 0.33, - '4096': 0.33} + '2048': 0.33} dataMem_area_dict = {'256' : 0.00056, '512' : 0.00108, '1024': 0.00192, - '2048': 0.00392, - '4096': 0.00392} - -dataMem_lat_dict = {'256' : 1, - '512' : 1, - '1024': 1, - '2048': 1, - '4096':1} - -dataMem_pow_dyn_dict = {'256' : 0.16, - '512' : 0.24, - '1024': 0.33, - '2048': 0.57, - '4096': 0.57} - -dataMem_pow_leak_dict = {'256' : 0.044, - '512' : 0.078, - '1024': 0.147, - '2048': 0.33, - '4096': 0.33} - -dataMem_area_dict = {'256' : 0.00056, - '512' : 0.00108, - '1024': 0.00192, - '2048': 0.00392, - '4096': 0.00392} + '2048': 0.00392} # Instruction Memory value dictionary instrnMem_lat_dict = {'512' : 1, '1024': 1, - '2048': 1, - '4096': 1, - '8192': 1} + '2048': 1} instrnMem_pow_dyn_dict = {'512' : 0.46, '1024': 0.53, - '2048': 0.65, - '4096': 0.65, - '8192': 0.65} + '2048': 0.65} instrnMem_pow_leak_dict = {'512' : 0.078, '1024': 0.147, 
- '2048': 0.33, - '4096': 0.33, - '8192': 0.33} + '2048': 0.33} instrnMem_area_dict = {'512' : 0.00108, '1024': 0.00192, - '2048': 0.0041, - '4096': 0.0041, - '8192': 0.0041} + '2048': 0.0041} # Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) # for computing average power of ima - scale dyn_pow down by xbar_size -xbar_inMem_lat_dict = {'32' : 1, # indexed with xbar size +xbar_inMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size '64' : 1, '128' : 1, '256' : 1} -xbar_inMem_pow_dyn_read_dict = {'32' : 0.3, +xbar_inMem_pow_dyn_read_dict = {'16' : 0.3, #doesn't change much as we move from 32 to 16, because these are very small memories + '32' : 0.3, '64' : 0.7, '128' : 1.7, '256' : 4.7} -xbar_inMem_pow_dyn_write_dict = {'32' : 0.1, +xbar_inMem_pow_dyn_write_dict = {'16' : 0.1, + '32' : 0.1, '64' : 0.1, '128' : 0.16, '256' : 0.2} -xbar_inMem_pow_leak_dict = {'32' : 0.009, +xbar_inMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, '64' : 0.02, '128' : 0.04, '256' : 0.075} -xbar_inMem_area_dict = {'32' : 0.00015, +xbar_inMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, '64' : 0.00033, '128' : 0.00078, '256' : 0.0019} # Xbar_outMem value dictionary -xbar_outMem_lat_dict = {'32' : 1, # indexed with xbar size +xbar_outMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size '64' : 1, '128' : 1, '256' : 1} -xbar_outMem_pow_dyn_dict = {'32' : 0.1, +xbar_outMem_pow_dyn_dict = {'16' : 0.1, + '32' : 0.1, '64' : 0.1, '128' : 0.16, '256' : 0.2} -xbar_outMem_pow_leak_dict = {'32' : 0.009, +xbar_outMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, '64' : 0.02, '128' : 0.04, '256' : 0.075} -xbar_outMem_area_dict = {'32' : 0.00015, +xbar_outMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, '64' : 0.00033, '128' : 0.00078, '256' : 0.0019} +dataMem_size_max = '2048' +if str(cfg.dataMem_size) in dataMem_lat_dict: + dataMem_size_max = str(cfg.dataMem_size) +else: + print("Warning: No values for core data memory size provided. Using values for 2048 instead.") + +instrnMem_size_max = '2048' +if str(cfg.instrnMem_size) in instrnMem_lat_dict: + instrnMem_size_max = str(cfg.instrnMem_size) +else: + print("Warning: No values for core instruction memory size provided. 
Using values for 2048 instead.") # Chosen latency based on config file - only for components whose latency is parameter dependent #xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] @@ -373,10 +366,10 @@ adc_lat = adc_lat_dict [str(cfg.adc_res)] xbar_inMem_lat = xbar_inMem_lat_dict[str(cfg.xbar_size)] xbar_outMem_lat = xbar_outMem_lat_dict[str(cfg.xbar_size)] -instrnMem_lat = instrnMem_lat_dict[str(cfg.instrnMem_size)] -dataMem_lat = dataMem_lat_dict[str(cfg.dataMem_size)] +instrnMem_lat = instrnMem_lat_dict[str(instrnMem_size_max)] +dataMem_lat = dataMem_lat_dict[str(dataMem_size_max)] -# Chosen area based on config file - only for components whose area is parameter dependent +# Chosen area based on config file - only for components whose latency is parameter dependent if cfg.MVMU_ver == "Analog": xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] else: @@ -385,8 +378,8 @@ adc_area = adc_area_dict [str(cfg.adc_res)] xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] xbar_outMem_area = xbar_outMem_area_dict[str(cfg.xbar_size)] -instrnMem_area = instrnMem_area_dict[str(cfg.instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction -dataMem_area = dataMem_area_dict[str(cfg.dataMem_size)] +instrnMem_area = instrnMem_area_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_area = dataMem_area_dict[str(dataMem_size_max)] # Chosen dyn_power based on config file - only for components whose latency is parameter dependent #xbar_pow_dyn = xbar_pow_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] @@ -399,8 +392,8 @@ xbar_inMem_pow_dyn_read = xbar_inMem_pow_dyn_read_dict[str(cfg.xbar_size)] xbar_inMem_pow_dyn_write = xbar_inMem_pow_dyn_write_dict[str(cfg.xbar_size)] xbar_outMem_pow_dyn = xbar_outMem_pow_dyn_dict[str(cfg.xbar_size)] -instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(cfg.instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction -dataMem_pow_dyn = dataMem_pow_dyn_dict[str(cfg.dataMem_size)] +instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_dyn = dataMem_pow_dyn_dict[str(dataMem_size_max)] # Energy xbar_ip_energy_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} @@ -420,8 +413,8 @@ adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] xbar_outMem_pow_leak = xbar_outMem_pow_leak_dict[str(cfg.xbar_size)] -instrnMem_pow_leak = instrnMem_pow_leak_dict[str(cfg.instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction -dataMem_pow_leak = dataMem_pow_leak_dict[str(cfg.dataMem_size)] +instrnMem_pow_leak = instrnMem_pow_leak_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_leak = dataMem_pow_leak_dict[str(dataMem_size_max)] # Core Control unit (control unit and pipeline registers) ccu_pow = 1.25*0.2 #0.2 for activvity @@ -444,46 +437,50 @@ # EDRAM value dictionary (counter storage is not coounted) edram_lat_dict = {'8' : 2, '64' : 2, #edram access width is constant = 256 bits - '128' : 2, - '2048': 2} + '128' : 2} edram_pow_dyn_dict = {'8' : 17.2/2, '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) - '128' : 25.35/2, - '2048': 25.35/2} + '128' : 25.35/2} edram_pow_leak_dict = {'8' : 0.46, '64' : 0.46, - '128' : 0.77, - '2048': 0.77} + '128' : 0.77} edram_area_dict = {'8' : 0.086, '64' : 0.086, - '128' : 0.121, - '2048': 0.121} + 
'128' : 0.121} # Tile Instruction Memory value dictionary tile_instrnMem_lat_dict = {'512': 1, '1024': 1, - '2048': 1, - '4096': 1} + '2048': 1} tile_instrnMem_pow_dyn_dict = {'512' : 0.46, '1024': 0.53, - '2048': 0.65, - '4096': 0.65} + '2048': 0.65} tile_instrnMem_pow_leak_dict = {'512' : 0.078, '1024': 0.147, - '2048': 0.33, - '4096': 0.33} + '2048': 0.33} tile_instrnMem_area_dict = {'512' : 0.00108, '1024': 0.00192, - '2048': 0.0041, - '4096': 0.0041} + '2048': 0.0041} + +edram_size_max = '128' +if str(cfg.edram_size) in edram_lat_dict: + edram_size_max = str(cfg.edram_size) +else: + print("Warning: No values for edram memory size provided. Using values for 128 instead.") + +tile_instrnMem_size_max = '2048' +if str(cfg.tile_instrnMem_size) in tile_instrnMem_lat_dict: + tile_instrnMem_size_max = str(cfg.tile_instrnMem_size) +else: + print("Warning: No values for tile instrn memory size provided. Using values for 2048 instead.") # counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons)) # area scaling (X8) @@ -513,20 +510,20 @@ # Chosen latency based on config file - only for components whose latency is parameter dependent -edram_lat = edram_lat_dict[str(cfg.edram_size)] -tile_instrnMem_lat = tile_instrnMem_lat_dict[str(cfg.tile_instrnMem_size)] +edram_lat = edram_lat_dict[str(edram_size_max)] +tile_instrnMem_lat = tile_instrnMem_lat_dict[str(tile_instrnMem_size_max)] # Chosen area based on config file - only for components whose area is parameter dependent -edram_area = edram_area_dict[str(cfg.edram_size)] -tile_instrnMem_area = tile_instrnMem_area_dict[str(cfg.tile_instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction +edram_area = edram_area_dict[str(edram_size_max)] +tile_instrnMem_area = tile_instrnMem_area_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction # Chosen dynamic power based on config file - only for components whose dynamic power is parameter dependent -edram_pow_dyn = edram_pow_dyn_dict[str(cfg.edram_size)] -tile_instrnMem_pow_dyn = tile_instrnMem_pow_dyn_dict[str(cfg.tile_instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction +edram_pow_dyn = edram_pow_dyn_dict[str(edram_size_max)] +tile_instrnMem_pow_dyn = tile_instrnMem_pow_dyn_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction # Chosen leakage power based on config file - only for components whose leakage power is parameter dependent -edram_pow_leak = edram_pow_leak_dict[str(cfg.edram_size)] -tile_instrnMem_pow_leak = tile_instrnMem_pow_leak_dict[str(cfg.tile_instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction +edram_pow_leak = edram_pow_leak_dict[str(edram_size_max)] +tile_instrnMem_pow_leak = tile_instrnMem_pow_leak_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction # Tile Control unit tcu_pow = 0.25*0.2 @@ -555,7 +552,7 @@ noc_area_dict = {'4': 0.047, '8': 0.116} -# Router dynamic power - NOC will be used only if atleast oen of send_queue in node is non_empty +# Router dynamic power - NOC will be used only if atleast one of send_queue in node is non_empty noc_pow_dyn_dict = {'4': 16.13, '8': 51.48} @@ -563,7 +560,7 @@ noc_pow_leak_dict = {'4': 0.41, '8': 1.04} -# Enter component latency (Based on teh above NOC topological parameters) +# Enter component latency (Based on the above NOC topological parameters) # Inter-node Noc (router & channel) assert (cfg.noc_inj_rate <= noc_inj_rate_max), 
'Oops: reconsider NOC design and or DNN mapping, with this inj_rate, NOC data transfer throughput \ will be terrible!' diff --git a/include/example-configs/config-cnn.py b/include/example-configs/config-cnn.py index 4b2bdde5..f4ce24c3 100644 --- a/include/example-configs/config-cnn.py +++ b/include/example-configs/config-cnn.py @@ -7,13 +7,13 @@ xbar_record = 1 inference = 1 training = not(inference) -sparse_opt = 1 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) +sparse_opt = 0 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" # Digital_V1 has compressed inputs (Data+Offset style) # Digital_V2 has uncompressed inputs (Skips computations for 0 activation) -MVMU_ver = "Digital_V2" +MVMU_ver = "Analog" ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 @@ -50,15 +50,19 @@ num_adc_per_matrix = 2 num_adc = num_adc_per_matrix * num_matrix +#uncomment this line for homogeneous ADC precision +adc_res_new ={} + +#uncomment adc_res_new for heterogenous adcs # The idea is to have different ADC resolution value for each ADC. # The number of ADC if defined by num_adc property. Currently it is 2 * num_matrix(2) = 4 # NOTE: Only taking in account indexes 0 and 2, 1 and 3 are ignored, because ADCs 1 and 3 are assumed t be equal to 0 and 2. -adc_res_new = { - 'matrix_adc_0' : 8, - 'matrix_adc_1' : 4, - 'matrix_adc_2' : 8, - 'matrix_adc_3' : 4 - } +#adc_res_new = { +# 'matrix_adc_0' : 8, +# 'matrix_adc_1' : 4, +# 'matrix_adc_2' : 8, +# 'matrix_adc_3' : 4 +# } num_ALU = num_matrix*2 #dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) diff --git a/include/example-configs/config-mlp.py b/include/example-configs/config-mlp.py index 02e78dd1..3c3b1952 100644 --- a/include/example-configs/config-mlp.py +++ b/include/example-configs/config-mlp.py @@ -7,13 +7,13 @@ xbar_record = 1 inference = 1 training = not(inference) -sparse_opt = 1 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) +sparse_opt = 0 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" # Digital_V1 has compressed inputs (Data+Offset style) # Digital_V2 has uncompressed inputs (Skips computations for 0 activation) -MVMU_ver = "Digital_V2" +MVMU_ver = "Analog" ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 @@ -50,15 +50,19 @@ num_adc_per_matrix = 2 num_adc = num_adc_per_matrix * num_matrix +#uncomment this line for homogeneous ADC precision +adc_res_new ={} + +#uncomment adc_res_new for heterogenous adcs # The idea is to have different ADC resolution value for each ADC. # The number of ADC if defined by num_adc property. Currently it is 2 * num_matrix(2) = 4 # NOTE: Only taking in account indexes 0 and 2, 1 and 3 are ignored, because ADCs 1 and 3 are assumed t be equal to 0 and 2. 
-adc_res_new = { - 'matrix_adc_0' : 8, - 'matrix_adc_1' : 4, - 'matrix_adc_2' : 8, - 'matrix_adc_3' : 4 - } +#adc_res_new = { +# 'matrix_adc_0' : 8, +# 'matrix_adc_1' : 4, +# 'matrix_adc_2' : 8, +# 'matrix_adc_3' : 4 +# } num_ALU = num_matrix*2 #dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) diff --git a/include/example-constants/constants-128.py b/include/example-constants/constants-128.py new file mode 100644 index 00000000..141563d9 --- /dev/null +++ b/include/example-constants/constants-128.py @@ -0,0 +1,589 @@ +## This file contains the data structures used in differnet hierarchies. +## It also holds power, area and latency numbers of different component used in DPE design +import config as cfg +import math +import constants_digital as digi_param +# Limits the number of cycles an IMA runs in case it doesn't halt +infinity = 100000 + +############################################################################################################# +## Technology/Other constants for all the modules +############################################################################################################# +# IMA - folliwng parameters are not used currently, will be used when analog functionality is implemented +cycle_time = 1 # in nanoseconds (1ns) +vdd = 0.9 +xbar_out_min = -10e-10 +xbar_out_max = 1 # think about this - ??? + +############################################################################################################# +## Define commonly used data structures +############################################################################################################# +# List of supported opcodes for tile +op_list_tile = ['send', 'receive', 'compute', 'halt'] + +# Instruction format for Tile +dummy_instrn_tile = {'opcode' : op_list_tile[0], + 'mem_addr': 0, # send/receive - edram_addr + 'r1': 0, # send-send_width, receive-receive_width + 'r2': 0, # send-target_addr, receive-counter + 'vtile_id': 0, # send/receive-neuron_id + 'ima_nma': '', # compute - a bit for each ima + 'vec': 0} # vector width + +# List of supported opcodes/aluops for IMA - cp will copy data (from data memory of ima to xbarInmem) +op_list = ['ld', 'cp', 'st', 'set', 'nop', 'alu', 'alui', 'mvm', 'vvo', 'hlt', 'jmp', 'beq', 'alu_int', 'crs'] +aluop_list = ['add', 'sub', 'sna', 'mul', 'sigmoid'] # sna is also used by mvm isntruction + +# Instruction format for IMA +dummy_instrn = {'opcode' : op_list[0], # instrn op + 'aluop' : aluop_list[0], # alu function + 'd1' : 0, # destination + 'r1' : 0, # operand1 (stride for mvm) + 'r2' : 0, # operand2 + 'r3' : 0, # operand3 (shift) + 'vec' : 0, # vector width + 'imm' : 0, # immediate (scalar) data + 'xb_nma' : 0 } # xbar negative-mask, a xbar evaluates if neg-mask = 1 + +# List of pipeline stages - in order for IMA +stage_list = ['fet', 'dec', 'ex'] +last_stage = 'ex' + +############################################################################################################# +# IMA Hierarchy parameters + # Number of Xbars + # Crossbar Size + # Crossbar bits + # Bit resolution of ADCs and DACs + # Number of ADCs + # Number of ALUs + # Data memory size + # Size of Xbar in/out memory (Register) is dependent on Xbar size and num_bits + # Instruction memory size +############################################################################################################# + +# IMA component latency/power/area dictionary (all values in ns, mw, mm2) +# XBAR - Models from ISAAC paper +xbar_lat_dict = {'2': {'16' : 
16, + '32' : 32, # first indexed by xbar_bits then by xbar_size + '64' : 64, + '128': 128, + '256': 256}, + '4': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}, + '6': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}} + +xbar_pow_dict = {'2': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '4': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '6': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}} + +xbar_area_dict = {'2': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '4': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '6': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}} + +## New values added for xbar MVM/MTVM, OP (parallel write), serial read/write +# the following is lumped power for xbar inner/outer-product - includes peripherals +xbar_op_lat = 20.0*12.8 # with 4 VFUs +xbar_op_pow = 4.44 * 3.27 / (12.8) + +#hardcoded value +#xbar_ip_lat = 100.0 +#value depending on xb size +xbar_ip_lat = xbar_lat_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +#xbar_ip_pow = (1.37*2.0) # xbar_ip_pow (includes all mvmu) +#xbar_ip_pow = (1.37*2.0) - 1.04 if cfg.training else 1.37-1.04 # xbar_ip_pow (includes all mvmu except ADC - uncomment num_access for ADC object), + +#xbar inner product power dependence on xbar size +xbar_ip_pow = xbar_pow_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +# Note the read and write lat/pow are for entire xbar +xbar_rd_lat = 328.0 * 1000 * (1/32.0) +xbar_wr_lat = 351.0 * 1000 * (1/32.0) + +# the following is lumped power for xbar rd/wr (for whole array) - includes peripherals +xbar_rd_pow = 208.0 * 1000 * (1/32.0) / xbar_rd_lat +xbar_wr_pow = 676.0 * 1000 * (1/32.0) / xbar_rd_lat + +# DAC - Discuss exact values with ISSAC authors +dac_lat_dict = {'1' : 1, + '2' : 1, + '4' : 1, + '8' : 1, + '16': 1} + +dac_pow_dyn_dict = {'1' : 0.00350625, + '2' : 0.00350625, + '4' : 0.00350625, + '8' : 0.00350625, + '16': 0.00350625} + +dac_pow_leak_dict = {'1' : 0.000390625, + '2' : 0.000390625, + '4' : 0.000390625, + '8' : 0.000390625, + '16': 0.000390625} + +dac_area_dict = {'1' : 1.67 * 10**(-7), + '2' : 1.67 * 10**(-7), + '4' : 1.67 * 10**(-7), + '8' : 1.67 * 10**(-7), + '16': 1.67 * 10**(-7)} + +# ADC - Discuss exact values with ISSAC authors +# ADC Values for including sparsity +adc_lat_dict = {'1' : 12.5, + '2' : 25, + '3' : 37.5, + '4' : 50, + '5' : 62.5, + '6' : 75, + '7' : 87.5, + '8' : 100, + '9' : 112.5, + '16': 200} + +adc_pow_dyn_dict = {'1' : 0.225, + '2' : 0.45, + '3' : 0.675, + '4' : 0.9, + '5' : 1.125, + '6' : 1.35, + '7' : 1.575, + '8' : 1.8, + '9' : 2.025, + '16': 3.6} + +adc_pow_leak_dict = {'1' : 0.025, + '2' : 0.05, + '3' : 0.075, + '4' : 0.1, + '5' : 0.125, + '6' : 0.15, + '7' : 0.175, + '8' : 0.2, + '9' : 0.225, + '16': 0.4} + +adc_area_dict = {'1' : 0.0012, + '2' : 0.0012, + '3' : 0.0012, + '4' : 0.0012, + '5' : 0.00075, + '6' : 0.0009, + '7' : 0.00105, + '8' : 0.0012, + '9' : 0.0012, + '16': 0.0012} + +# SNH (MVM pipeline) +snh_lat = 1 +snh_pow_leak = 9.7 * 10**(-7) +snh_pow_dyn = 9.7 * 10**(-6) - snh_pow_leak +snh_area = 0.00004 / 8 / 128 + +# SNA (MVM pipeline) +sna_lat = 1 +sna_pow_leak = 0.005 +sna_pow_dyn = 0.05 - sna_pow_leak +sna_area = 0.00006 + +# ALU (Part of 
Vector Functional Unit) +alu_lat = 1 +alu_pow_dyn = 2.4 * 32/45 +alu_pow_div_dyn = 1.52 * 32/45 +alu_pow_mul_dyn = 0.795 * 32/45 +alu_pow_others_dyn = 0.373 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +alu_pow_leak = 0.27 * 32/45 +alu_area = 0.00567 * 32/45 + +# witout considering division +#alu_lat = 1 +#alu_pow_dyn = 1.15 * 32/45 +#alu_pow_mul_dyn = 0.796 * 32/45 +#alu_pow_others_dyn = 0.36 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +#alu_pow_leak = 0.05 * 32/45 +#alu_area = 0.002326 * 32/45 + +# Sigmoid/Tanh (Part of Vector Functional Unit) - Taken from ISAAC paper +act_lat = 1 # added for 4 exponential units +act_pow_leak = 0.026 +act_pow_dyn = 0.26 - act_pow_leak +act_area = 0.0003 # check this ??? + +# Multiplexer - These should be analog muxes +mux_lat = 0 +mux_pow_leak = 0 +mux_pow_dyn = 0 +mux_area = 0 + +# Data Memory value dictionary +dataMem_lat_dict = {'256' : 1, + '512' : 1, + '1024': 1, + '2048': 1} + +dataMem_pow_dyn_dict = {'256' : 0.16, + '512' : 0.24, + '1024': 0.33, + '2048': 0.57} + +dataMem_pow_leak_dict = {'256' : 0.044, + '512' : 0.078, + '1024': 0.147, + '2048': 0.33} + +dataMem_area_dict = {'256' : 0.00056, + '512' : 0.00108, + '1024': 0.00192, + '2048': 0.00392} + +# Instruction Memory value dictionary +instrnMem_lat_dict = {'512' : 1, + '1024': 1, + '2048': 1} + +instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65} + +instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33} + + +instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041} + + +# Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) +# for computing average power of ima - scale dyn_pow down by xbar_size +xbar_inMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_inMem_pow_dyn_read_dict = {'16' : 0.3, #doesn't change much as we move from 32 to 16, because these are very small memories + '32' : 0.3, + '64' : 0.7, + '128' : 1.7, + '256' : 4.7} + +xbar_inMem_pow_dyn_write_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_inMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_inMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +# Xbar_outMem value dictionary +xbar_outMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_outMem_pow_dyn_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_outMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_outMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +dataMem_size_max = '2048' +if str(cfg.dataMem_size) in dataMem_lat_dict: + dataMem_size_max = str(cfg.dataMem_size) +else: + print("Warning: No values for core data memory size provided. Using values for 2048 instead.") + +instrnMem_size_max = '2048' +if str(cfg.instrnMem_size) in instrnMem_lat_dict: + instrnMem_size_max = str(cfg.instrnMem_size) +else: + print("Warning: No values for core instruction memory size provided. 
Using values for 2048 instead.") + +# Chosen latency based on config file - only for components whose latency is parameter dependent +#xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +xbar_ip_lat_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key, value in xbar_ip_lat_dict.items(): + xbar_ip_lat_dict[key] = xbar_ip_lat +else: + xbar_ip_lat_dict = digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +xbar_op_lat = xbar_op_lat +xbar_rd_lat = xbar_rd_lat +xbar_wr_lat = xbar_wr_lat +dac_lat = dac_lat_dict [str(cfg.dac_res)] +#FIXME need to review it I can remove adc_lat property +adc_lat = adc_lat_dict [str(cfg.adc_res)] +xbar_inMem_lat = xbar_inMem_lat_dict[str(cfg.xbar_size)] +xbar_outMem_lat = xbar_outMem_lat_dict[str(cfg.xbar_size)] +instrnMem_lat = instrnMem_lat_dict[str(instrnMem_size_max)] +dataMem_lat = dataMem_lat_dict[str(dataMem_size_max)] + +# Chosen area based on config file - only for components whose latency is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] +else: + xbar_area = digi_param.Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +dac_area = dac_area_dict [str(cfg.dac_res)] +adc_area = adc_area_dict [str(cfg.adc_res)] +xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] +xbar_outMem_area = xbar_outMem_area_dict[str(cfg.xbar_size)] +instrnMem_area = instrnMem_area_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_area = dataMem_area_dict[str(dataMem_size_max)] + +# Chosen dyn_power based on config file - only for components whose latency is parameter dependent +#xbar_pow_dyn = xbar_pow_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +xbar_ip_pow_dyn = xbar_ip_pow +xbar_op_pow_dyn = xbar_op_pow +xbar_rd_pow_dyn = xbar_rd_pow +xbar_wr_pow_dyn = xbar_wr_pow +dac_pow_dyn = dac_pow_dyn_dict [str(cfg.dac_res)] +adc_pow_dyn = adc_pow_dyn_dict [str(cfg.adc_res)] +xbar_inMem_pow_dyn_read = xbar_inMem_pow_dyn_read_dict[str(cfg.xbar_size)] +xbar_inMem_pow_dyn_write = xbar_inMem_pow_dyn_write_dict[str(cfg.xbar_size)] +xbar_outMem_pow_dyn = xbar_outMem_pow_dyn_dict[str(cfg.xbar_size)] +instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_dyn = dataMem_pow_dyn_dict[str(dataMem_size_max)] + +# Energy +xbar_ip_energy_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key,value in xbar_ip_energy_dict.items(): + xbar_ip_energy_dict[key] = xbar_ip_lat*xbar_ip_pow_dyn +else: + xbar_ip_energy_dict = digi_param.Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +print('xbar_ip_energy_dict', xbar_ip_energy_dict) + +# Chosen leak_power based on config file - only for components whose latency is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_pow_leak = 0 +else: + xbar_pow_leak = digi_param.Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] +dac_pow_leak = dac_pow_leak_dict [str(cfg.dac_res)] +adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] +xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] +xbar_outMem_pow_leak = xbar_outMem_pow_leak_dict[str(cfg.xbar_size)] +instrnMem_pow_leak = instrnMem_pow_leak_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_leak = dataMem_pow_leak_dict[str(dataMem_size_max)] + +# Core Control unit 
(control unit and pipeline registers) +ccu_pow = 1.25*0.2 #0.2 for activity +ccu_area = 0.00145*2.25 #taken similar as edctrl (scaled by power) + +# Added here for simplicity now (***can need modification later***) +# The latency of mem access is dependent on when can the ima find edram bys non-busy +memInterface_lat = infinity # infinite latency + +############################################################################################################# +# Tile Hierarchy + # Number of IMAs + # EDRAM size + # Shared Bus width + # Instruction memory size + # Receive Buffer size +############################################################################################################# + +# Tile component latency/pow/area +# EDRAM value dictionary (counter storage is not coounted) +edram_lat_dict = {'8' : 2, + '64' : 2, #edram access width is constant = 256 bits + '128' : 2} + +edram_pow_dyn_dict = {'8' : 17.2/2, + '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) + '128' : 25.35/2} + +edram_pow_leak_dict = {'8' : 0.46, + '64' : 0.46, + '128' : 0.77} + +edram_area_dict = {'8' : 0.086, + '64' : 0.086, + '128' : 0.121} + +# Tile Instruction Memory value dictionary +tile_instrnMem_lat_dict = {'512': 1, + '1024': 1, + '2048': 1} + +tile_instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65} + +tile_instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33} + + +tile_instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041} + + +edram_size_max = '128' +if str(cfg.edram_size) in edram_lat_dict: + edram_size_max = str(cfg.edram_size) +else: + print("Warning: No values for edram memory size provided. Using values for 128 instead.") + +tile_instrnMem_size_max = '2048' +if str(cfg.tile_instrnMem_size) in tile_instrnMem_lat_dict: + tile_instrnMem_size_max = str(cfg.tile_instrnMem_size) +else: + print("Warning: No values for tile instrn memory size provided. 
Using values for 2048 instead.") + +# counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons)) +# area scaling (X8) +counter_buff_lat = 1 * math.sqrt(8) +counter_buff_pow_dyn = 0.65/2 * math.sqrt(8) +counter_buff_pow_leak = 0.33/2 * math.sqrt(8) +counter_buff_area = 0.0041 * math.sqrt(8) + +# EDRAM to IMA bus values +edram_bus_lat = 1 +edram_bus_pow_dyn = 6/2 #bus width = 384, same as issac (over two cycles) +edram_bus_pow_leak = 1/2 #bus width = 384, same as issac +edram_bus_area = 0.090 + +# EDRAM controller values +edram_ctrl_lat = 1 +edram_ctrl_pow_dyn = 0.475 +edram_ctrl_pow_leak = 0.05 +edram_ctrl_area = 0.00145 + +# Receive buffer value dictionary - 16 entries (Need to make this a dictionary) +# Increasing to 64 entries +receive_buffer_lat = 1 * math.sqrt(4) +receive_buffer_pow_dyn = 4.48 * math.sqrt(4) # (0.2*256/16) +receive_buffer_pow_leak = 0.09 * math.sqrt(4) +receive_buffer_area = 0.0022 *math.sqrt(4) + + +# Chosen latency based on config file - only for components whose latency is parameter dependent +edram_lat = edram_lat_dict[str(edram_size_max)] +tile_instrnMem_lat = tile_instrnMem_lat_dict[str(tile_instrnMem_size_max)] + +# Chosen area based on config file - only for components whose area is parameter dependent +edram_area = edram_area_dict[str(edram_size_max)] +tile_instrnMem_area = tile_instrnMem_area_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Chosen dynamic power based on config file - only for components whose dynamic power is parameter dependent +edram_pow_dyn = edram_pow_dyn_dict[str(edram_size_max)] +tile_instrnMem_pow_dyn = tile_instrnMem_pow_dyn_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Chosen leakage power based on config file - only for components whose leakage power is parameter dependent +edram_pow_leak = edram_pow_leak_dict[str(edram_size_max)] +tile_instrnMem_pow_leak = tile_instrnMem_pow_leak_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Tile Control unit +tcu_pow = 0.25*0.2 +tcu_area = 0.00145 #taken similar as edctrl + +############################################################################################################# +# Node Hierarchy + # Number of Tiles + # NOC - Topology (Currently assumes a cmesh (c=4, same as ISSAC)) + # n = number of dimension\ + # k = number of tiles in each dimension + # c = concentartion (tiles/router) + # average injection rate (0.25 - a tile injects a new packet for each destination in every four cycles) +############################################################################################################# + +# NOC latency dictionary (in terms of flit cycle) +# Note - if inj_rate (packet injection -1 packet - 16 neurons) exceeds 0.025 - there's a problem, NoC needs to be redesigned else network latency will be killing! 
+# Hence, not provided for +noc_inj_rate_max = 0.025 +noc_lat_dict = {'0.001': 29, + '0.005': 31, + '0.01' : 34, + '0.02' : 54, + '0.025': 115} + +noc_area_dict = {'4': 0.047, + '8': 0.116} + +# Router dynamic power - NOC will be used only if atleast one of send_queue in node is non_empty +noc_pow_dyn_dict = {'4': 16.13, + '8': 51.48} + +# Router leakage power - NOC will be used only if atleast oen of send_queue in node is non_empty +noc_pow_leak_dict = {'4': 0.41, + '8': 1.04} + +# Enter component latency (Based on the above NOC topological parameters) +# Inter-node Noc (router & channel) +assert (cfg.noc_inj_rate <= noc_inj_rate_max), 'Oops: reconsider NOC design and or DNN mapping, with this inj_rate, NOC data transfer throughput \ +will be terrible!' + +noc_intra_lat = noc_lat_dict[str(cfg.noc_inj_rate)] +noc_intra_pow_dyn = noc_pow_dyn_dict[str(cfg.noc_num_port)] # per router +noc_intra_pow_leak = noc_pow_leak_dict[str(cfg.noc_num_port)]# per router +noc_intra_area = noc_area_dict[str(cfg.noc_num_port)] # per router + +# Hypertransport network (HT) +# Note HT is external to a node, but we consider all tiles in one +# virtual node itself for simplicity +# HT numbers from ISAAC = 6.4GB/s = 6.4B/ ns = 1packet(16*2 Bytes) = 5ns +ht_lat = 5 #latency per packet +noc_inter_lat = ht_lat + noc_intra_lat #navigate to the node, then to tile within node +noc_inter_pow_dyn = 10400 #10.4W +noc_inter_pow_leak = 0 +noc_inter_area = 22.88 + diff --git a/include/example-constants/constants-16.py b/include/example-constants/constants-16.py new file mode 100644 index 00000000..54db2161 --- /dev/null +++ b/include/example-constants/constants-16.py @@ -0,0 +1,614 @@ +## This file contains the data structures used in differnet hierarchies. +## It also holds power, area and latency numbers of different component used in DPE design +import config as cfg +import math +import constants_digital as digi_param +# Limits the number of cycles an IMA runs in case it doesn't halt +infinity = 100000 + +############################################################################################################# +## Technology/Other constants for all the modules +############################################################################################################# +# IMA - folliwng parameters are not used currently, will be used when analog functionality is implemented +cycle_time = 1 # in nanoseconds (1ns) +vdd = 0.9 +xbar_out_min = -10e-10 +xbar_out_max = 1 # think about this - ??? 
+ +############################################################################################################# +## Define commonly used data structures +############################################################################################################# +# List of supported opcodes for tile +op_list_tile = ['send', 'receive', 'compute', 'halt'] + +# Instruction format for Tile +dummy_instrn_tile = {'opcode' : op_list_tile[0], + 'mem_addr': 0, # send/receive - edram_addr + 'r1': 0, # send-send_width, receive-receive_width + 'r2': 0, # send-target_addr, receive-counter + 'vtile_id': 0, # send/receive-neuron_id + 'ima_nma': '', # compute - a bit for each ima + 'vec': 0} # vector width + +# List of supported opcodes/aluops for IMA - cp will copy data (from data memory of ima to xbarInmem) +op_list = ['ld', 'cp', 'st', 'set', 'nop', 'alu', 'alui', 'mvm', 'vvo', 'hlt', 'jmp', 'beq', 'alu_int', 'crs'] +aluop_list = ['add', 'sub', 'sna', 'mul', 'sigmoid'] # sna is also used by mvm isntruction + +# Instruction format for IMA +dummy_instrn = {'opcode' : op_list[0], # instrn op + 'aluop' : aluop_list[0], # alu function + 'd1' : 0, # destination + 'r1' : 0, # operand1 (stride for mvm) + 'r2' : 0, # operand2 + 'r3' : 0, # operand3 (shift) + 'vec' : 0, # vector width + 'imm' : 0, # immediate (scalar) data + 'xb_nma' : 0 } # xbar negative-mask, a xbar evaluates if neg-mask = 1 + +# List of pipeline stages - in order for IMA +stage_list = ['fet', 'dec', 'ex'] +last_stage = 'ex' + +############################################################################################################# +# IMA Hierarchy parameters + # Number of Xbars + # Crossbar Size + # Crossbar bits + # Bit resolution of ADCs and DACs + # Number of ADCs + # Number of ALUs + # Data memory size + # Size of Xbar in/out memory (Register) is dependent on Xbar size and num_bits + # Instruction memory size +############################################################################################################# + +# IMA component latency/power/area dictionary (all values in ns, mw, mm2) +# XBAR - Models from ISAAC paper +xbar_lat_dict = {'2': {'16' : 16, + '32' : 32, # first indexed by xbar_bits then by xbar_size + '64' : 64, + '128': 128, + '256': 256}, + '4': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}, + '6': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}} + +xbar_pow_dict = {'2': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '4': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '6': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}} + +xbar_area_dict = {'2': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '4': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '6': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}} + +## New values added for xbar MVM/MTVM, OP (parallel write), serial read/write +# the following is lumped power for xbar inner/outer-product - includes peripherals +xbar_op_lat = 20.0*12.8 # with 4 VFUs +xbar_op_pow = 4.44 * 3.27 / (12.8) + +#hardcoded value +#xbar_ip_lat = 100.0 +#value depending on xb size +xbar_ip_lat = xbar_lat_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +#xbar_ip_pow = (1.37*2.0) # xbar_ip_pow 
(includes all mvmu) +#xbar_ip_pow = (1.37*2.0) - 1.04 if cfg.training else 1.37-1.04 # xbar_ip_pow (includes all mvmu except ADC - uncomment num_access for ADC object), + +#xbar inner product power dependence on xbar size +xbar_ip_pow = xbar_pow_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +# Note the read and write lat/pow are for entire xbar +xbar_rd_lat = 328.0 * 1000 * (1/32.0) +xbar_wr_lat = 351.0 * 1000 * (1/32.0) + +# the following is lumped power for xbar rd/wr (for whole array) - includes peripherals +xbar_rd_pow = 208.0 * 1000 * (1/32.0) / xbar_rd_lat +xbar_wr_pow = 676.0 * 1000 * (1/32.0) / xbar_rd_lat + +# DAC - Discuss exact values with ISSAC authors +dac_lat_dict = {'1' : 1, + '2' : 1, + '4' : 1, + '8' : 1, + '16': 1} + +dac_pow_dyn_dict = {'1' : 0.00350625, + '2' : 0.00350625, + '4' : 0.00350625, + '8' : 0.00350625, + '16': 0.00350625} + +dac_pow_leak_dict = {'1' : 0.000390625, + '2' : 0.000390625, + '4' : 0.000390625, + '8' : 0.000390625, + '16': 0.000390625} + +dac_area_dict = {'1' : 1.67 * 10**(-7), + '2' : 1.67 * 10**(-7), + '4' : 1.67 * 10**(-7), + '8' : 1.67 * 10**(-7), + '16': 1.67 * 10**(-7)} + +# ADC - Discuss exact values with ISSAC authors +# ADC Values for including sparsity +adc_lat_dict = {'1' : 12.5, + '2' : 25, + '3' : 37.5, + '4' : 50, + '5' : 62.5, + '6' : 75, + '7' : 87.5, + '8' : 100, + '9' : 112.5, + '16': 200} + +adc_pow_dyn_dict = {'1' : 0.225, + '2' : 0.45, + '3' : 0.675, + '4' : 0.9, + '5' : 1.125, + '6' : 1.35, + '7' : 1.575, + '8' : 1.8, + '9' : 2.025, + '16': 3.6} + +adc_pow_leak_dict = {'1' : 0.025, + '2' : 0.05, + '3' : 0.075, + '4' : 0.1, + '5' : 0.125, + '6' : 0.15, + '7' : 0.175, + '8' : 0.2, + '9' : 0.225, + '16': 0.4} + +adc_area_dict = {'1' : 0.0012, + '2' : 0.0012, + '3' : 0.0012, + '4' : 0.0012, + '5' : 0.00075, + '6' : 0.0009, + '7' : 0.00105, + '8' : 0.0012, + '9' : 0.0012, + '16': 0.0012} + +# SNH (MVM pipeline) +snh_lat = 1 +snh_pow_leak = 9.7 * 10**(-7) +snh_pow_dyn = 9.7 * 10**(-6) - snh_pow_leak +snh_area = 0.00004 / 8 / 128 + +# SNA (MVM pipeline) +sna_lat = 1 +sna_pow_leak = 0.005 +sna_pow_dyn = 0.05 - sna_pow_leak +sna_area = 0.00006 + +# ALU (Part of Vector Functional Unit) +alu_lat = 1 +alu_pow_dyn = 2.4 * 32/45 +alu_pow_div_dyn = 1.52 * 32/45 +alu_pow_mul_dyn = 0.795 * 32/45 +alu_pow_others_dyn = 0.373 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +alu_pow_leak = 0.27 * 32/45 +alu_area = 0.00567 * 32/45 + +# witout considering division +#alu_lat = 1 +#alu_pow_dyn = 1.15 * 32/45 +#alu_pow_mul_dyn = 0.796 * 32/45 +#alu_pow_others_dyn = 0.36 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +#alu_pow_leak = 0.05 * 32/45 +#alu_area = 0.002326 * 32/45 + +# Sigmoid/Tanh (Part of Vector Functional Unit) - Taken from ISAAC paper +act_lat = 1 # added for 4 exponential units +act_pow_leak = 0.026 +act_pow_dyn = 0.26 - act_pow_leak +act_area = 0.0003 # check this ??? 
+ +# Multiplexer - These should be analog muxes +mux_lat = 0 +mux_pow_leak = 0 +mux_pow_dyn = 0 +mux_area = 0 + +# Data Memory value dictionary +dataMem_lat_dict = {'256' : 1, + '512' : 1, + '1024': 1, + '2048': 1, + '4096':1, + '16384':1, + '65536':1} + +dataMem_pow_dyn_dict = {'256' : 0.16, + '512' : 0.24, + '1024': 0.33, + '2048': 0.57, + '4096': 0.74, + '16384':1.6, + '65536':3.4} + +dataMem_pow_leak_dict = {'256' : 0.044, + '512' : 0.078, + '1024': 0.147, + '2048': 0.33, + '4096': 0.489, + '16384':1.28, + '65536':2.741} + +dataMem_area_dict = {'256' : 0.00056, + '512' : 0.00108, + '1024': 0.00192, + '2048': 0.00392, + '4096': 0.020691, + '16384':0.0666, + '65536':0.2684} + +# Instruction Memory value dictionary +instrnMem_lat_dict = {'512' : 1, + '1024': 1, + '2048': 1, + '4096':1, + '16384':1, + '65536':1} + +instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65, + '4096':0.74, + '16384':1.6, + '65536':3.4} + +instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33, + '4096':0.489, + '16384':1.28, + '65536':2.741} + + +instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041, + '4096':0.020691, + '16384':0.0666, + '65536':0.2684} + + +# Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) +# for computing average power of ima - scale dyn_pow down by xbar_size +xbar_inMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_inMem_pow_dyn_read_dict = {'16' : 0.3, #doesn't change much as we move from 32 to 16, because these are very small memories + '32' : 0.3, + '64' : 0.7, + '128' : 1.7, + '256' : 4.7} + +xbar_inMem_pow_dyn_write_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_inMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_inMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +# Xbar_outMem value dictionary +xbar_outMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_outMem_pow_dyn_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_outMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_outMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +dataMem_size_max = '65536' +if str(cfg.dataMem_size) in dataMem_lat_dict: + dataMem_size_max = str(cfg.dataMem_size) +else: + print("Warning: No values for core data memory size provided. Using values for 2048 instead.") + +instrnMem_size_max = '65536' +if str(cfg.instrnMem_size) in instrnMem_lat_dict: + instrnMem_size_max = str(cfg.instrnMem_size) +else: + print("Warning: No values for core instruction memory size provided. 
Using values for 2048 instead.") + +# Chosen latency based on config file - only for components whose latency is parameter dependent +#xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +#xbar_ip_lat = xbar_ip_lat +xbar_ip_lat_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key, value in xbar_ip_lat_dict.items(): + xbar_ip_lat_dict[key] = xbar_ip_lat +else: + xbar_ip_lat_dict = digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +xbar_op_lat = xbar_op_lat +xbar_rd_lat = xbar_rd_lat +xbar_wr_lat = xbar_wr_lat +dac_lat = dac_lat_dict [str(cfg.dac_res)] +#FIXME need to review it I can remove adc_lat property +adc_lat = adc_lat_dict [str(cfg.adc_res)] +xbar_inMem_lat = xbar_inMem_lat_dict[str(cfg.xbar_size)] +xbar_outMem_lat = xbar_outMem_lat_dict[str(cfg.xbar_size)] +instrnMem_lat = instrnMem_lat_dict[str(instrnMem_size_max)] +dataMem_lat = dataMem_lat_dict[str(dataMem_size_max)] + +# Chosen area based on config file - only for components whose latency is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] +else: + xbar_area = digi_param.Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +dac_area = dac_area_dict [str(cfg.dac_res)] +adc_area = adc_area_dict [str(cfg.adc_res)] +xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] +xbar_outMem_area = xbar_outMem_area_dict[str(cfg.xbar_size)] +instrnMem_area = instrnMem_area_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_area = dataMem_area_dict[str(dataMem_size_max)] + +# Chosen dyn_power based on config file - only for components whose latency is parameter dependent +#xbar_pow_dyn = xbar_pow_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +xbar_ip_pow_dyn = xbar_ip_pow +xbar_op_pow_dyn = xbar_op_pow +xbar_rd_pow_dyn = xbar_rd_pow +xbar_wr_pow_dyn = xbar_wr_pow +dac_pow_dyn = dac_pow_dyn_dict [str(cfg.dac_res)] +adc_pow_dyn = adc_pow_dyn_dict [str(cfg.adc_res)] +xbar_inMem_pow_dyn_read = xbar_inMem_pow_dyn_read_dict[str(cfg.xbar_size)] +xbar_inMem_pow_dyn_write = xbar_inMem_pow_dyn_write_dict[str(cfg.xbar_size)] +xbar_outMem_pow_dyn = xbar_outMem_pow_dyn_dict[str(cfg.xbar_size)] +instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_dyn = dataMem_pow_dyn_dict[str(dataMem_size_max)] + +# Energy +xbar_ip_energy_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key,value in xbar_ip_energy_dict.items(): + xbar_ip_energy_dict[key] = xbar_ip_lat*xbar_ip_pow_dyn +else: + xbar_ip_energy_dict = digi_param.Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +print('xbar_ip_energy_dict', xbar_ip_energy_dict) + +# Chosen leak_power based on config file - only for components whose latency is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_pow_leak = 0 +else: + xbar_pow_leak = digi_param.Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] +dac_pow_leak = dac_pow_leak_dict [str(cfg.dac_res)] +adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] +xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] +xbar_outMem_pow_leak = xbar_outMem_pow_leak_dict[str(cfg.xbar_size)] +instrnMem_pow_leak = instrnMem_pow_leak_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_leak = dataMem_pow_leak_dict[str(dataMem_size_max)] 
+
+# Core Control unit (control unit and pipeline registers)
+ccu_pow = 1.25*0.2 #0.2 for activity
+ccu_area = 0.00145*2.25 #taken similar as edctrl (scaled by power)
+
+# Added here for simplicity now (***may need modification later***)
+# The latency of mem access depends on when the IMA finds the eDRAM bus non-busy
+memInterface_lat = infinity # infinite latency
+
+#############################################################################################################
+# Tile Hierarchy
+    # Number of IMAs
+    # EDRAM size
+    # Shared Bus width
+    # Instruction memory size
+    # Receive Buffer size
+#############################################################################################################
+
+# Tile component latency/pow/area
+# EDRAM value dictionary (counter storage is not counted)
+edram_lat_dict = {'8' : 2,
+                  '64' : 2, #edram access width is constant = 256 bits
+                  '128' : 2}
+
+edram_pow_dyn_dict = {'8' : 17.2/2,
+                      '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency)
+                      '128' : 25.35/2}
+
+edram_pow_leak_dict = {'8' : 0.46,
+                       '64' : 0.46,
+                       '128' : 0.77}
+
+edram_area_dict = {'8' : 0.086,
+                   '64' : 0.086,
+                   '128' : 0.121}
+
+# Tile Instruction Memory value dictionary
+tile_instrnMem_lat_dict = {'512': 1,
+                           '1024': 1,
+                           '2048': 1}
+
+tile_instrnMem_pow_dyn_dict = {'512' : 0.46,
+                               '1024': 0.53,
+                               '2048': 0.65}
+
+tile_instrnMem_pow_leak_dict = {'512' : 0.078,
+                                '1024': 0.147,
+                                '2048': 0.33}
+
+
+tile_instrnMem_area_dict = {'512' : 0.00108,
+                            '1024': 0.00192,
+                            '2048': 0.0041}
+
+
+edram_size_max = '128'
+if str(cfg.edram_size) in edram_lat_dict:
+    edram_size_max = str(cfg.edram_size)
+else:
+    print("Warning: No values for edram memory size provided. Using values for 128 instead.")
+
+tile_instrnMem_size_max = '2048'
+if str(cfg.tile_instrnMem_size) in tile_instrnMem_lat_dict:
+    tile_instrnMem_size_max = str(cfg.tile_instrnMem_size)
+else:
+    print("Warning: No values for tile instrn memory size provided. Using values for 2048 instead.")
+
+# counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons))
+# area scaling (X8)
+counter_buff_lat = 1 * math.sqrt(8)
+counter_buff_pow_dyn = 0.65/2 * math.sqrt(8)
+counter_buff_pow_leak = 0.33/2 * math.sqrt(8)
+counter_buff_area = 0.0041 * math.sqrt(8)
+
+# EDRAM to IMA bus values
+edram_bus_lat = 1
+edram_bus_pow_dyn = 6/2 #bus width = 384, same as ISAAC (over two cycles)
+edram_bus_pow_leak = 1/2 #bus width = 384, same as ISAAC
+edram_bus_area = 0.090
+
+# EDRAM controller values
+edram_ctrl_lat = 1
+edram_ctrl_pow_dyn = 0.475
+edram_ctrl_pow_leak = 0.05
+edram_ctrl_area = 0.00145
+
+# Receive buffer value dictionary - 16 entries (Need to make this a dictionary)
+# Increasing to 64 entries
+receive_buffer_lat = 1 * math.sqrt(4)
+receive_buffer_pow_dyn = 4.48 * math.sqrt(4) # (0.2*256/16)
+receive_buffer_pow_leak = 0.09 * math.sqrt(4)
+receive_buffer_area = 0.0022 *math.sqrt(4)
+
+
+# Chosen latency based on config file - only for components whose latency is parameter dependent
+edram_lat = edram_lat_dict[str(edram_size_max)]
+tile_instrnMem_lat = tile_instrnMem_lat_dict[str(tile_instrnMem_size_max)]
+
+# Chosen area based on config file - only for components whose area is parameter dependent
+edram_area = edram_area_dict[str(edram_size_max)]
+tile_instrnMem_area = tile_instrnMem_area_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction
+
+# Chosen dynamic power based on config file - only for components whose dynamic power is parameter dependent
+edram_pow_dyn = edram_pow_dyn_dict[str(edram_size_max)]
+tile_instrnMem_pow_dyn = tile_instrnMem_pow_dyn_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction
+
+# Chosen leakage power based on config file - only for components whose leakage power is parameter dependent
+edram_pow_leak = edram_pow_leak_dict[str(edram_size_max)]
+tile_instrnMem_pow_leak = tile_instrnMem_pow_leak_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction
+
+# Tile Control unit
+tcu_pow = 0.25*0.2
+tcu_area = 0.00145 #taken similar as edctrl
+
+#############################################################################################################
+# Node Hierarchy
+    # Number of Tiles
+    # NOC - Topology (Currently assumes a cmesh (c=4, same as ISAAC))
+        # n = number of dimensions
+        # k = number of tiles in each dimension
+        # c = concentration (tiles/router)
+    # average injection rate (0.25 - a tile injects a new packet for each destination in every four cycles)
+#############################################################################################################
+
+# NOC latency dictionary (in terms of flit cycle)
+# Note - if inj_rate (packet injection - 1 packet - 16 neurons) exceeds 0.025 there's a problem, the NoC needs to be redesigned else network latency will be crippling!
+# Hence, higher injection rates are not provided for
+noc_inj_rate_max = 0.025
+noc_lat_dict = {'0.001': 29,
+                '0.005': 31,
+                '0.01' : 34,
+                '0.02' : 54,
+                '0.025': 115}
+
+noc_area_dict = {'4': 0.047,
+                 '8': 0.116}
+
+# Router dynamic power - NOC will be used only if at least one of the send_queues in the node is non-empty
+noc_pow_dyn_dict = {'4': 16.13,
+                    '8': 51.48}
+
+# Router leakage power - NOC will be used only if at least one of the send_queues in the node is non-empty
+noc_pow_leak_dict = {'4': 0.41,
+                     '8': 1.04}
+
+# Enter component latency (Based on the above NOC topological parameters)
+# Inter-node NoC (router & channel)
+assert (cfg.noc_inj_rate <= noc_inj_rate_max), 'Oops: reconsider NOC design and/or DNN mapping, with this inj_rate, NOC data transfer throughput \
+will be terrible!'
+
+noc_intra_lat = noc_lat_dict[str(cfg.noc_inj_rate)]
+noc_intra_pow_dyn = noc_pow_dyn_dict[str(cfg.noc_num_port)] # per router
+noc_intra_pow_leak = noc_pow_leak_dict[str(cfg.noc_num_port)]# per router
+noc_intra_area = noc_area_dict[str(cfg.noc_num_port)] # per router
+
+# Hypertransport network (HT)
+# Note HT is external to a node, but we consider all tiles in one
+# virtual node itself for simplicity
+# HT numbers from ISAAC = 6.4GB/s = 6.4B/ns = 1 packet (16*2 Bytes) = 5ns
+ht_lat = 5 #latency per packet
+noc_inter_lat = ht_lat + noc_intra_lat #navigate to the node, then to the tile within the node
+noc_inter_pow_dyn = 10400 #10.4W
+noc_inter_pow_leak = 0
+noc_inter_area = 22.88
+
diff --git a/include/example-constants/constants-32.py b/include/example-constants/constants-32.py
new file mode 100644
index 00000000..b9158e5a
--- /dev/null
+++ b/include/example-constants/constants-32.py
@@ -0,0 +1,606 @@
+## This file contains the data structures used in the different hierarchies.
+## It also holds the power, area and latency numbers of the different components used in the DPE design
+import config as cfg
+import math
+import constants_digital as digi_param
+# Limits the number of cycles an IMA runs in case it doesn't halt
+infinity = 100000
+
+#############################################################################################################
+## Technology/Other constants for all the modules
+#############################################################################################################
+# IMA - the following parameters are not used currently; they will be used when analog functionality is implemented
+cycle_time = 1 # in nanoseconds (1ns)
+vdd = 0.9
+xbar_out_min = -10e-10
+xbar_out_max = 1 # think about this - ???
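+
+# Note: the crossbar latency/power/area dictionaries further down in this file include a '16'
+# entry so that a 16x16 crossbar can also be simulated; judging from the values, latency grows
+# linearly with xbar_size while power and area grow roughly 4x for every doubling of the size.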
+ +############################################################################################################# +## Define commonly used data structures +############################################################################################################# +# List of supported opcodes for tile +op_list_tile = ['send', 'receive', 'compute', 'halt'] + +# Instruction format for Tile +dummy_instrn_tile = {'opcode' : op_list_tile[0], + 'mem_addr': 0, # send/receive - edram_addr + 'r1': 0, # send-send_width, receive-receive_width + 'r2': 0, # send-target_addr, receive-counter + 'vtile_id': 0, # send/receive-neuron_id + 'ima_nma': '', # compute - a bit for each ima + 'vec': 0} # vector width + +# List of supported opcodes/aluops for IMA - cp will copy data (from data memory of ima to xbarInmem) +op_list = ['ld', 'cp', 'st', 'set', 'nop', 'alu', 'alui', 'mvm', 'vvo', 'hlt', 'jmp', 'beq', 'alu_int', 'crs'] +aluop_list = ['add', 'sub', 'sna', 'mul', 'sigmoid'] # sna is also used by mvm isntruction + +# Instruction format for IMA +dummy_instrn = {'opcode' : op_list[0], # instrn op + 'aluop' : aluop_list[0], # alu function + 'd1' : 0, # destination + 'r1' : 0, # operand1 (stride for mvm) + 'r2' : 0, # operand2 + 'r3' : 0, # operand3 (shift) + 'vec' : 0, # vector width + 'imm' : 0, # immediate (scalar) data + 'xb_nma' : 0 } # xbar negative-mask, a xbar evaluates if neg-mask = 1 + +# List of pipeline stages - in order for IMA +stage_list = ['fet', 'dec', 'ex'] +last_stage = 'ex' + +############################################################################################################# +# IMA Hierarchy parameters + # Number of Xbars + # Crossbar Size + # Crossbar bits + # Bit resolution of ADCs and DACs + # Number of ADCs + # Number of ALUs + # Data memory size + # Size of Xbar in/out memory (Register) is dependent on Xbar size and num_bits + # Instruction memory size +############################################################################################################# + +# IMA component latency/power/area dictionary (all values in ns, mw, mm2) +# XBAR - Models from ISAAC paper +xbar_lat_dict = {'2': {'16' : 16, + '32' : 32, # first indexed by xbar_bits then by xbar_size + '64' : 64, + '128': 128, + '256': 256}, + '4': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}, + '6': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}} + +xbar_pow_dict = {'2': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '4': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '6': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}} + +xbar_area_dict = {'2': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '4': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '6': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}} + +## New values added for xbar MVM/MTVM, OP (parallel write), serial read/write +# the following is lumped power for xbar inner/outer-product - includes peripherals +xbar_op_lat = 20.0*12.8 # with 4 VFUs +xbar_op_pow = 4.44 * 3.27 / (12.8) + +#hardcoded value +#xbar_ip_lat = 100.0 +#value depending on xb size +xbar_ip_lat = xbar_lat_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +#xbar_ip_pow = (1.37*2.0) # xbar_ip_pow 
(includes all mvmu) +#xbar_ip_pow = (1.37*2.0) - 1.04 if cfg.training else 1.37-1.04 # xbar_ip_pow (includes all mvmu except ADC - uncomment num_access for ADC object), + +#xbar inner product power dependence on xbar size +xbar_ip_pow = xbar_pow_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +# Note the read and write lat/pow are for entire xbar +xbar_rd_lat = 328.0 * 1000 * (1/32.0) +xbar_wr_lat = 351.0 * 1000 * (1/32.0) + +# the following is lumped power for xbar rd/wr (for whole array) - includes peripherals +xbar_rd_pow = 208.0 * 1000 * (1/32.0) / xbar_rd_lat +xbar_wr_pow = 676.0 * 1000 * (1/32.0) / xbar_rd_lat + +# DAC - Discuss exact values with ISSAC authors +dac_lat_dict = {'1' : 1, + '2' : 1, + '4' : 1, + '8' : 1, + '16': 1} + +dac_pow_dyn_dict = {'1' : 0.00350625, + '2' : 0.00350625, + '4' : 0.00350625, + '8' : 0.00350625, + '16': 0.00350625} + +dac_pow_leak_dict = {'1' : 0.000390625, + '2' : 0.000390625, + '4' : 0.000390625, + '8' : 0.000390625, + '16': 0.000390625} + +dac_area_dict = {'1' : 1.67 * 10**(-7), + '2' : 1.67 * 10**(-7), + '4' : 1.67 * 10**(-7), + '8' : 1.67 * 10**(-7), + '16': 1.67 * 10**(-7)} + +# ADC - Discuss exact values with ISSAC authors +# ADC Values for including sparsity +adc_lat_dict = {'1' : 12.5, + '2' : 25, + '3' : 37.5, + '4' : 50, + '5' : 62.5, + '6' : 75, + '7' : 87.5, + '8' : 100, + '9' : 112.5, + '16': 200} + +adc_pow_dyn_dict = {'1' : 0.225, + '2' : 0.45, + '3' : 0.675, + '4' : 0.9, + '5' : 1.125, + '6' : 1.35, + '7' : 1.575, + '8' : 1.8, + '9' : 2.025, + '16': 3.6} + +adc_pow_leak_dict = {'1' : 0.025, + '2' : 0.05, + '3' : 0.075, + '4' : 0.1, + '5' : 0.125, + '6' : 0.15, + '7' : 0.175, + '8' : 0.2, + '9' : 0.225, + '16': 0.4} + +adc_area_dict = {'1' : 0.0012, + '2' : 0.0012, + '3' : 0.0012, + '4' : 0.0012, + '5' : 0.00075, + '6' : 0.0009, + '7' : 0.00105, + '8' : 0.0012, + '9' : 0.0012, + '16': 0.0012} + +# SNH (MVM pipeline) +snh_lat = 1 +snh_pow_leak = 9.7 * 10**(-7) +snh_pow_dyn = 9.7 * 10**(-6) - snh_pow_leak +snh_area = 0.00004 / 8 / 128 + +# SNA (MVM pipeline) +sna_lat = 1 +sna_pow_leak = 0.005 +sna_pow_dyn = 0.05 - sna_pow_leak +sna_area = 0.00006 + +# ALU (Part of Vector Functional Unit) +alu_lat = 1 +alu_pow_dyn = 2.4 * 32/45 +alu_pow_div_dyn = 1.52 * 32/45 +alu_pow_mul_dyn = 0.795 * 32/45 +alu_pow_others_dyn = 0.373 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +alu_pow_leak = 0.27 * 32/45 +alu_area = 0.00567 * 32/45 + +# witout considering division +#alu_lat = 1 +#alu_pow_dyn = 1.15 * 32/45 +#alu_pow_mul_dyn = 0.796 * 32/45 +#alu_pow_others_dyn = 0.36 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +#alu_pow_leak = 0.05 * 32/45 +#alu_area = 0.002326 * 32/45 + +# Sigmoid/Tanh (Part of Vector Functional Unit) - Taken from ISAAC paper +act_lat = 1 # added for 4 exponential units +act_pow_leak = 0.026 +act_pow_dyn = 0.26 - act_pow_leak +act_area = 0.0003 # check this ??? 
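+
+# The ADC dictionaries above are keyed by resolution in bits. With the sparsity optimisation,
+# ima_modules.py steps the effective resolution down from cfg.adc_res by up to 7 bits, which is
+# presumably why keys '1' through '9' are populated; the hw_stats.py change later in this patch
+# guards each lookup so that a resolution of 0 or less contributes no dynamic power.
+# A minimal sketch of that lookup pattern (illustrative only; 'shift' is a hypothetical name):
+#   shift = 3                                   # 0..7, chosen from the sparsity of the input slice
+#   bits = cfg.adc_res - shift
+#   adc_energy = adc_lat_dict[str(bits)] * adc_pow_dyn_dict[str(bits)] if bits > 0 else 0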
+ +# Multiplexer - These should be analog muxes +mux_lat = 0 +mux_pow_leak = 0 +mux_pow_dyn = 0 +mux_area = 0 + +# Data Memory value dictionary +dataMem_lat_dict = {'256' : 1, + '512' : 1, + '1024': 1, + '2048': 1, + '4096':1, + '16384':1} + +dataMem_pow_dyn_dict = {'256' : 0.16, + '512' : 0.24, + '1024': 0.33, + '2048': 0.57, + '4096': 0.74, + '16384':1.6} + +dataMem_pow_leak_dict = {'256' : 0.044, + '512' : 0.078, + '1024': 0.147, + '2048': 0.33, + '4096': 0.489, + '16384':1.28} + +dataMem_area_dict = {'256' : 0.00056, + '512' : 0.00108, + '1024': 0.00192, + '2048': 0.00392, + '4096': 0.020691, + '16384':0.0666} + +# Instruction Memory value dictionary +instrnMem_lat_dict = {'512' : 1, + '1024': 1, + '2048': 1, + '4096':1, + '16384':1} + +instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65, + '4096':0.74, + '16384':1.6} + +instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33, + '4096':0.489, + '16384':1.28} + + +instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041, + '4096':0.020691, + '16384':0.0666} + + +# Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) +# for computing average power of ima - scale dyn_pow down by xbar_size +xbar_inMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_inMem_pow_dyn_read_dict = {'16' : 0.3, #doesn't change much as we move from 32 to 16, because these are very small memories + '32' : 0.3, + '64' : 0.7, + '128' : 1.7, + '256' : 4.7} + +xbar_inMem_pow_dyn_write_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_inMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_inMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +# Xbar_outMem value dictionary +xbar_outMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_outMem_pow_dyn_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_outMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_outMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +dataMem_size_max = '16384' +if str(cfg.dataMem_size) in dataMem_lat_dict: + dataMem_size_max = str(cfg.dataMem_size) +else: + print("Warning: No values for core data memory size provided. Using values for 2048 instead.") + +instrnMem_size_max = '16384' +if str(cfg.instrnMem_size) in instrnMem_lat_dict: + instrnMem_size_max = str(cfg.instrnMem_size) +else: + print("Warning: No values for core instruction memory size provided. 
Using values for 2048 instead.") + +# Chosen latency based on config file - only for components whose latency is parameter dependent +#xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +#xbar_ip_lat = xbar_ip_lat +xbar_ip_lat_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key, value in xbar_ip_lat_dict.items(): + xbar_ip_lat_dict[key] = xbar_ip_lat +else: + xbar_ip_lat_dict = digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +xbar_op_lat = xbar_op_lat +xbar_rd_lat = xbar_rd_lat +xbar_wr_lat = xbar_wr_lat +dac_lat = dac_lat_dict [str(cfg.dac_res)] +#FIXME need to review it I can remove adc_lat property +adc_lat = adc_lat_dict [str(cfg.adc_res)] +xbar_inMem_lat = xbar_inMem_lat_dict[str(cfg.xbar_size)] +xbar_outMem_lat = xbar_outMem_lat_dict[str(cfg.xbar_size)] +instrnMem_lat = instrnMem_lat_dict[str(instrnMem_size_max)] +dataMem_lat = dataMem_lat_dict[str(dataMem_size_max)] + +# Chosen area based on config file - only for components whose area is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] +else: + xbar_area = digi_param.Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +dac_area = dac_area_dict [str(cfg.dac_res)] +adc_area = adc_area_dict [str(cfg.adc_res)] +xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] +xbar_outMem_area = xbar_outMem_area_dict[str(cfg.xbar_size)] +instrnMem_area = instrnMem_area_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_area = dataMem_area_dict[str(dataMem_size_max)] + +# Chosen dyn_power based on config file - only for components whose latency is parameter dependent +#xbar_pow_dyn = xbar_pow_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +xbar_ip_pow_dyn = xbar_ip_pow +xbar_op_pow_dyn = xbar_op_pow +xbar_rd_pow_dyn = xbar_rd_pow +xbar_wr_pow_dyn = xbar_wr_pow +dac_pow_dyn = dac_pow_dyn_dict [str(cfg.dac_res)] +adc_pow_dyn = adc_pow_dyn_dict [str(cfg.adc_res)] +xbar_inMem_pow_dyn_read = xbar_inMem_pow_dyn_read_dict[str(cfg.xbar_size)] +xbar_inMem_pow_dyn_write = xbar_inMem_pow_dyn_write_dict[str(cfg.xbar_size)] +xbar_outMem_pow_dyn = xbar_outMem_pow_dyn_dict[str(cfg.xbar_size)] +instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_dyn = dataMem_pow_dyn_dict[str(dataMem_size_max)] + +# Energy +xbar_ip_energy_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key,value in xbar_ip_energy_dict.items(): + xbar_ip_energy_dict[key] = xbar_ip_lat*xbar_ip_pow_dyn +else: + xbar_ip_energy_dict = digi_param.Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +print('xbar_ip_energy_dict', xbar_ip_energy_dict) + +# Chosen leak_power based on config file - only for components whose latency is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_pow_leak = 0 +else: + xbar_pow_leak = digi_param.Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] +dac_pow_leak = dac_pow_leak_dict [str(cfg.dac_res)] +adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] +xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] +xbar_outMem_pow_leak = xbar_outMem_pow_leak_dict[str(cfg.xbar_size)] +instrnMem_pow_leak = instrnMem_pow_leak_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_leak = dataMem_pow_leak_dict[str(dataMem_size_max)] + 
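+
+# All of the "Chosen ..." assignments above follow the same pattern: the configured size from
+# config.py is used as a string key into the matching dictionary, and if that size has no entry
+# the *_size_max fallback (the largest size provisioned in this file) is used after printing a
+# warning. Equivalent restatement of the data-memory case, using the names already defined above:
+#   size_key = str(cfg.dataMem_size)
+#   if size_key not in dataMem_lat_dict:
+#       size_key = '16384'                # the dataMem_size_max default in this 32x32 variant
+#   dataMem_lat = dataMem_lat_dict[size_key]
+# Note that the warning text still says "Using values for 2048 instead." even though the fallback
+# key in this file is '16384'.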
+# Core Control unit (control unit and pipeline registers) +ccu_pow = 1.25*0.2 #0.2 for activvity +ccu_area = 0.00145*2.25 #taken similar as edctrl (scaled by power) + +# Added here for simplicity now (***can need modification later***) +# The latency of mem access is dependent on when can the ima find edram bys non-busy +memInterface_lat = infinity # infinite latency + +############################################################################################################# +# Tile Hierarchy + # Number of IMAs + # EDRAM size + # Shared Bus width + # Instruction memory size + # Receive Buffer size +############################################################################################################# + +# Tile component latency/pow/area +# EDRAM value dictionary (counter storage is not coounted) +edram_lat_dict = {'8' : 2, + '64' : 2, #edram access width is constant = 256 bits + '128' : 2} + +edram_pow_dyn_dict = {'8' : 17.2/2, + '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) + '128' : 25.35/2} + +edram_pow_leak_dict = {'8' : 0.46, + '64' : 0.46, + '128' : 0.77} + +edram_area_dict = {'8' : 0.086, + '64' : 0.086, + '128' : 0.121} + +# Tile Instruction Memory value dictionary +tile_instrnMem_lat_dict = {'512': 1, + '1024': 1, + '2048': 1} + +tile_instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65} + +tile_instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33} + + +tile_instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041} + + +edram_size_max = '128' +if str(cfg.edram_size) in edram_lat_dict: + edram_size_max = str(cfg.edram_size) +else: + print("Warning: No values for edram memory size provided. Using values for 128 instead.") + +tile_instrnMem_size_max = '2048' +if str(cfg.tile_instrnMem_size) in tile_instrnMem_lat_dict: + tile_instrnMem_size_max = str(cfg.tile_instrnMem_size) +else: + print("Warning: No values for tile instrn memory size provided. 
Using values for 2048 instead.") + +# counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons)) +# area scaling (X8) +counter_buff_lat = 1 * math.sqrt(8) +counter_buff_pow_dyn = 0.65/2 * math.sqrt(8) +counter_buff_pow_leak = 0.33/2 * math.sqrt(8) +counter_buff_area = 0.0041 * math.sqrt(8) + +# EDRAM to IMA bus values +edram_bus_lat = 1 +edram_bus_pow_dyn = 6/2 #bus width = 384, same as issac (over two cycles) +edram_bus_pow_leak = 1/2 #bus width = 384, same as issac +edram_bus_area = 0.090 + +# EDRAM controller values +edram_ctrl_lat = 1 +edram_ctrl_pow_dyn = 0.475 +edram_ctrl_pow_leak = 0.05 +edram_ctrl_area = 0.00145 + +# Receive buffer value dictionary - 16 entries (Need to make this a dictionary) +# Increasing to 64 entries +receive_buffer_lat = 1 * math.sqrt(4) +receive_buffer_pow_dyn = 4.48 * math.sqrt(4) # (0.2*256/16) +receive_buffer_pow_leak = 0.09 * math.sqrt(4) +receive_buffer_area = 0.0022 *math.sqrt(4) + + +# Chosen latency based on config file - only for components whose latency is parameter dependent +edram_lat = edram_lat_dict[str(edram_size_max)] +tile_instrnMem_lat = tile_instrnMem_lat_dict[str(tile_instrnMem_size_max)] + +# Chosen area based on config file - only for components whose area is parameter dependent +edram_area = edram_area_dict[str(edram_size_max)] +tile_instrnMem_area = tile_instrnMem_area_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Chosen dynamic power based on config file - only for components whose dynamic power is parameter dependent +edram_pow_dyn = edram_pow_dyn_dict[str(edram_size_max)] +tile_instrnMem_pow_dyn = tile_instrnMem_pow_dyn_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Chosen leakage power based on config file - only for components whose leakage power is parameter dependent +edram_pow_leak = edram_pow_leak_dict[str(edram_size_max)] +tile_instrnMem_pow_leak = tile_instrnMem_pow_leak_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Tile Control unit +tcu_pow = 0.25*0.2 +tcu_area = 0.00145 #taken similar as edctrl + +############################################################################################################# +# Node Hierarchy + # Number of Tiles + # NOC - Topology (Currently assumes a cmesh (c=4, same as ISSAC)) + # n = number of dimension\ + # k = number of tiles in each dimension + # c = concentartion (tiles/router) + # average injection rate (0.25 - a tile injects a new packet for each destination in every four cycles) +############################################################################################################# + +# NOC latency dictionary (in terms of flit cycle) +# Note - if inj_rate (packet injection -1 packet - 16 neurons) exceeds 0.025 - there's a problem, NoC needs to be redesigned else network latency will be killing! 
+# Hence, not provided for +noc_inj_rate_max = 0.025 +noc_lat_dict = {'0.001': 29, + '0.005': 31, + '0.01' : 34, + '0.02' : 54, + '0.025': 115} + +noc_area_dict = {'4': 0.047, + '8': 0.116} + +# Router dynamic power - NOC will be used only if atleast one of send_queue in node is non_empty +noc_pow_dyn_dict = {'4': 16.13, + '8': 51.48} + +# Router leakage power - NOC will be used only if atleast oen of send_queue in node is non_empty +noc_pow_leak_dict = {'4': 0.41, + '8': 1.04} + +# Enter component latency (Based on teh above NOC topological parameters) +# Inter-node Noc (router & channel) +assert (cfg.noc_inj_rate <= noc_inj_rate_max), 'Oops: reconsider NOC design and or DNN mapping, with this inj_rate, NOC data transfer throughput \ +will be terrible!' + +noc_intra_lat = noc_lat_dict[str(cfg.noc_inj_rate)] +noc_intra_pow_dyn = noc_pow_dyn_dict[str(cfg.noc_num_port)] # per router +noc_intra_pow_leak = noc_pow_leak_dict[str(cfg.noc_num_port)]# per router +noc_intra_area = noc_area_dict[str(cfg.noc_num_port)] # per router + +# Hypertransport network (HT) +# Note HT is external to a node, but we consider all tiles in one +# virtual node itself for simplicity +# HT numbers from ISAAC = 6.4GB/s = 6.4B/ ns = 1packet(16*2 Bytes) = 5ns +ht_lat = 5 #latency per packet +noc_inter_lat = ht_lat + noc_intra_lat #navigate to the node, then to tile within node +noc_inter_pow_dyn = 10400 #10.4W +noc_inter_pow_leak = 0 +noc_inter_area = 22.88 + diff --git a/include/example-constants/constants-64.py b/include/example-constants/constants-64.py new file mode 100644 index 00000000..ae589dfd --- /dev/null +++ b/include/example-constants/constants-64.py @@ -0,0 +1,596 @@ +## This file contains the data structures used in differnet hierarchies. +## It also holds power, area and latency numbers of different component used in DPE design +import config as cfg +import math +import constants_digital as digi_param +# Limits the number of cycles an IMA runs in case it doesn't halt +infinity = 100000 + +############################################################################################################# +## Technology/Other constants for all the modules +############################################################################################################# +# IMA - folliwng parameters are not used currently, will be used when analog functionality is implemented +cycle_time = 1 # in nanoseconds (1ns) +vdd = 0.9 +xbar_out_min = -10e-10 +xbar_out_max = 1 # think about this - ??? 
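+
+# This 64x64 example file largely mirrors constants-32.py; the main visible difference is in the
+# data/instruction memory dictionaries further below, which here top out at 4096 entries (and the
+# *_size_max fallbacks default to '4096' rather than '16384').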
+ +############################################################################################################# +## Define commonly used data structures +############################################################################################################# +# List of supported opcodes for tile +op_list_tile = ['send', 'receive', 'compute', 'halt'] + +# Instruction format for Tile +dummy_instrn_tile = {'opcode' : op_list_tile[0], + 'mem_addr': 0, # send/receive - edram_addr + 'r1': 0, # send-send_width, receive-receive_width + 'r2': 0, # send-target_addr, receive-counter + 'vtile_id': 0, # send/receive-neuron_id + 'ima_nma': '', # compute - a bit for each ima + 'vec': 0} # vector width + +# List of supported opcodes/aluops for IMA - cp will copy data (from data memory of ima to xbarInmem) +op_list = ['ld', 'cp', 'st', 'set', 'nop', 'alu', 'alui', 'mvm', 'vvo', 'hlt', 'jmp', 'beq', 'alu_int', 'crs'] +aluop_list = ['add', 'sub', 'sna', 'mul', 'sigmoid'] # sna is also used by mvm isntruction + +# Instruction format for IMA +dummy_instrn = {'opcode' : op_list[0], # instrn op + 'aluop' : aluop_list[0], # alu function + 'd1' : 0, # destination + 'r1' : 0, # operand1 (stride for mvm) + 'r2' : 0, # operand2 + 'r3' : 0, # operand3 (shift) + 'vec' : 0, # vector width + 'imm' : 0, # immediate (scalar) data + 'xb_nma' : 0 } # xbar negative-mask, a xbar evaluates if neg-mask = 1 + +# List of pipeline stages - in order for IMA +stage_list = ['fet', 'dec', 'ex'] +last_stage = 'ex' + +############################################################################################################# +# IMA Hierarchy parameters + # Number of Xbars + # Crossbar Size + # Crossbar bits + # Bit resolution of ADCs and DACs + # Number of ADCs + # Number of ALUs + # Data memory size + # Size of Xbar in/out memory (Register) is dependent on Xbar size and num_bits + # Instruction memory size +############################################################################################################# + +# IMA component latency/power/area dictionary (all values in ns, mw, mm2) +# XBAR - Models from ISAAC paper +xbar_lat_dict = {'2': {'16' : 16, + '32' : 32, # first indexed by xbar_bits then by xbar_size + '64' : 64, + '128': 128, + '256': 256}, + '4': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}, + '6': {'16' : 16, + '32' : 32, + '64' : 64, + '128': 128, + '256': 256}} + +xbar_pow_dict = {'2': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '4': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}, + '6': {'16' : 0.0046875, + '32' : 0.01875, + '64' : 0.075, + '128': 0.3, + '256': 1.2}} + +xbar_area_dict = {'2': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '4': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}, + '6': {'16' : 3.90625 * 10**(-7), + '32' : 1.5625 * 10**(-6), + '64' : 6.25 * 10**(-6), + '128': 2.5 * 10**(-5), + '256': 1.0 * 10**(-4)}} + +## New values added for xbar MVM/MTVM, OP (parallel write), serial read/write +# the following is lumped power for xbar inner/outer-product - includes peripherals +xbar_op_lat = 20.0*12.8 # with 4 VFUs +xbar_op_pow = 4.44 * 3.27 / (12.8) + +#hardcoded value +#xbar_ip_lat = 100.0 +#value depending on xb size +xbar_ip_lat = xbar_lat_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +#xbar_ip_pow = (1.37*2.0) # xbar_ip_pow 
(includes all mvmu) +#xbar_ip_pow = (1.37*2.0) - 1.04 if cfg.training else 1.37-1.04 # xbar_ip_pow (includes all mvmu except ADC - uncomment num_access for ADC object), +#xbar inner product power dependence on xbar size +xbar_ip_pow = xbar_pow_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] + +# Note the read and write lat/pow are for entire xbar +xbar_rd_lat = 328.0 * 1000 * (1/32.0) +xbar_wr_lat = 351.0 * 1000 * (1/32.0) + +# the following is lumped power for xbar rd/wr (for whole array) - includes peripherals +xbar_rd_pow = 208.0 * 1000 * (1/32.0) / xbar_rd_lat +xbar_wr_pow = 676.0 * 1000 * (1/32.0) / xbar_rd_lat + +# DAC - Discuss exact values with ISSAC authors +dac_lat_dict = {'1' : 1, + '2' : 1, + '4' : 1, + '8' : 1, + '16': 1} + +dac_pow_dyn_dict = {'1' : 0.00350625, + '2' : 0.00350625, + '4' : 0.00350625, + '8' : 0.00350625, + '16': 0.00350625} + +dac_pow_leak_dict = {'1' : 0.000390625, + '2' : 0.000390625, + '4' : 0.000390625, + '8' : 0.000390625, + '16': 0.000390625} + +dac_area_dict = {'1' : 1.67 * 10**(-7), + '2' : 1.67 * 10**(-7), + '4' : 1.67 * 10**(-7), + '8' : 1.67 * 10**(-7), + '16': 1.67 * 10**(-7)} + +# ADC - Discuss exact values with ISSAC authors +# ADC Values for including sparsity +adc_lat_dict = {'1' : 12.5, + '2' : 25, + '3' : 37.5, + '4' : 50, + '5' : 62.5, + '6' : 75, + '7' : 87.5, + '8' : 100, + '9' : 112.5, + '16': 200} + +adc_pow_dyn_dict = {'1' : 0.225, + '2' : 0.45, + '3' : 0.675, + '4' : 0.9, + '5' : 1.125, + '6' : 1.35, + '7' : 1.575, + '8' : 1.8, + '9' : 2.025, + '16': 3.6} + +adc_pow_leak_dict = {'1' : 0.025, + '2' : 0.05, + '3' : 0.075, + '4' : 0.1, + '5' : 0.125, + '6' : 0.15, + '7' : 0.175, + '8' : 0.2, + '9' : 0.225, + '16': 0.4} + +adc_area_dict = {'1' : 0.0012, + '2' : 0.0012, + '3' : 0.0012, + '4' : 0.0012, + '5' : 0.00075, + '6' : 0.0009, + '7' : 0.00105, + '8' : 0.0012, + '9' : 0.0012, + '16': 0.0012} + +# SNH (MVM pipeline) +snh_lat = 1 +snh_pow_leak = 9.7 * 10**(-7) +snh_pow_dyn = 9.7 * 10**(-6) - snh_pow_leak +snh_area = 0.00004 / 8 / 128 + +# SNA (MVM pipeline) +sna_lat = 1 +sna_pow_leak = 0.005 +sna_pow_dyn = 0.05 - sna_pow_leak +sna_area = 0.00006 + +# ALU (Part of Vector Functional Unit) +alu_lat = 1 +alu_pow_dyn = 2.4 * 32/45 +alu_pow_div_dyn = 1.52 * 32/45 +alu_pow_mul_dyn = 0.795 * 32/45 +alu_pow_others_dyn = 0.373 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +alu_pow_leak = 0.27 * 32/45 +alu_area = 0.00567 * 32/45 + +# witout considering division +#alu_lat = 1 +#alu_pow_dyn = 1.15 * 32/45 +#alu_pow_mul_dyn = 0.796 * 32/45 +#alu_pow_others_dyn = 0.36 * 32/45 # logical, eq, relu, add, sub, lsh, rsh +#alu_pow_leak = 0.05 * 32/45 +#alu_area = 0.002326 * 32/45 + +# Sigmoid/Tanh (Part of Vector Functional Unit) - Taken from ISAAC paper +act_lat = 1 # added for 4 exponential units +act_pow_leak = 0.026 +act_pow_dyn = 0.26 - act_pow_leak +act_area = 0.0003 # check this ??? 
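+
+# The 32/45 factor applied to the ALU numbers above looks like a linear scaling of 45 nm reference
+# values to a 32 nm technology node (an inference from the factor itself, not stated in the
+# original); the commented-out block gives the same parameters without the divider unit.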
+ +# Multiplexer - These should be analog muxes +mux_lat = 0 +mux_pow_leak = 0 +mux_pow_dyn = 0 +mux_area = 0 + +# Data Memory value dictionary +dataMem_lat_dict = {'256' : 1, + '512' : 1, + '1024': 1, + '2048': 1, + '4096':1} + +dataMem_pow_dyn_dict = {'256' : 0.16, + '512' : 0.24, + '1024': 0.33, + '2048': 0.57, + '4096': 0.74} + +dataMem_pow_leak_dict = {'256' : 0.044, + '512' : 0.078, + '1024': 0.147, + '2048': 0.33, + '4096': 0.489} + +dataMem_area_dict = {'256' : 0.00056, + '512' : 0.00108, + '1024': 0.00192, + '2048': 0.00392, + '4096': 0.020691} + +# Instruction Memory value dictionary +instrnMem_lat_dict = {'512' : 1, + '1024': 1, + '2048': 1, + '4096':1} + +instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65, + '4096':0.74} + +instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33, + '4096':0.489} + + +instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041, + '4096':0.020691} + + +# Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) +# for computing average power of ima - scale dyn_pow down by xbar_size +xbar_inMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_inMem_pow_dyn_read_dict = {'16' : 0.3, #doesn't change much as we move from 32 to 16, because these are very small memories + '32' : 0.3, + '64' : 0.7, + '128' : 1.7, + '256' : 4.7} + +xbar_inMem_pow_dyn_write_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_inMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_inMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +# Xbar_outMem value dictionary +xbar_outMem_lat_dict = {'16' : 1, + '32' : 1, # indexed with xbar size + '64' : 1, + '128' : 1, + '256' : 1} + +xbar_outMem_pow_dyn_dict = {'16' : 0.1, + '32' : 0.1, + '64' : 0.1, + '128' : 0.16, + '256' : 0.2} + +xbar_outMem_pow_leak_dict = {'16' : 0.009, + '32' : 0.009, + '64' : 0.02, + '128' : 0.04, + '256' : 0.075} + +xbar_outMem_area_dict = {'16' : 0.00015, + '32' : 0.00015, + '64' : 0.00033, + '128' : 0.00078, + '256' : 0.0019} + +dataMem_size_max = '4096' +if str(cfg.dataMem_size) in dataMem_lat_dict: + dataMem_size_max = str(cfg.dataMem_size) +else: + print("Warning: No values for core data memory size provided. Using values for 2048 instead.") + +instrnMem_size_max = '4096' +if str(cfg.instrnMem_size) in instrnMem_lat_dict: + instrnMem_size_max = str(cfg.instrnMem_size) +else: + print("Warning: No values for core instruction memory size provided. 
Using values for 2048 instead.") + +# Chosen latency based on config file - only for components whose latency is parameter dependent +#xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +xbar_ip_lat_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key, value in xbar_ip_lat_dict.items(): + xbar_ip_lat_dict[key] = xbar_ip_lat +else: + xbar_ip_lat_dict = digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +xbar_op_lat = xbar_op_lat +xbar_rd_lat = xbar_rd_lat +xbar_wr_lat = xbar_wr_lat +dac_lat = dac_lat_dict [str(cfg.dac_res)] +#FIXME need to review it I can remove adc_lat property +adc_lat = adc_lat_dict [str(cfg.adc_res)] +xbar_inMem_lat = xbar_inMem_lat_dict[str(cfg.xbar_size)] +xbar_outMem_lat = xbar_outMem_lat_dict[str(cfg.xbar_size)] +instrnMem_lat = instrnMem_lat_dict[str(instrnMem_size_max)] +dataMem_lat = dataMem_lat_dict[str(dataMem_size_max)] + +# Chosen area based on config file - only for components whose area is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] +else: + xbar_area = digi_param.Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +dac_area = dac_area_dict [str(cfg.dac_res)] +adc_area = adc_area_dict [str(cfg.adc_res)] +xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] +xbar_outMem_area = xbar_outMem_area_dict[str(cfg.xbar_size)] +instrnMem_area = instrnMem_area_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_area = dataMem_area_dict[str(dataMem_size_max)] + +# Chosen dyn_power based on config file - only for components whose latency is parameter dependent +#xbar_pow_dyn = xbar_pow_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +xbar_ip_pow_dyn = xbar_ip_pow +xbar_op_pow_dyn = xbar_op_pow +xbar_rd_pow_dyn = xbar_rd_pow +xbar_wr_pow_dyn = xbar_wr_pow +dac_pow_dyn = dac_pow_dyn_dict [str(cfg.dac_res)] +adc_pow_dyn = adc_pow_dyn_dict [str(cfg.adc_res)] +xbar_inMem_pow_dyn_read = xbar_inMem_pow_dyn_read_dict[str(cfg.xbar_size)] +xbar_inMem_pow_dyn_write = xbar_inMem_pow_dyn_write_dict[str(cfg.xbar_size)] +xbar_outMem_pow_dyn = xbar_outMem_pow_dyn_dict[str(cfg.xbar_size)] +instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_dyn = dataMem_pow_dyn_dict[str(dataMem_size_max)] + +# Energy +xbar_ip_energy_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key,value in xbar_ip_energy_dict.items(): + xbar_ip_energy_dict[key] = xbar_ip_lat*xbar_ip_pow_dyn +else: + xbar_ip_energy_dict = digi_param.Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +print('xbar_ip_energy_dict', xbar_ip_energy_dict) + +# Chosen leak_power based on config file - only for components whose latency is parameter dependent +if cfg.MVMU_ver == "Analog": + xbar_pow_leak = 0 +else: + xbar_pow_leak = digi_param.Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] +dac_pow_leak = dac_pow_leak_dict [str(cfg.dac_res)] +adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] +xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] +xbar_outMem_pow_leak = xbar_outMem_pow_leak_dict[str(cfg.xbar_size)] +instrnMem_pow_leak = instrnMem_pow_leak_dict[str(instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction +dataMem_pow_leak = dataMem_pow_leak_dict[str(dataMem_size_max)] + +# Core Control unit 
(control unit and pipeline registers) +ccu_pow = 1.25*0.2 #0.2 for activvity +ccu_area = 0.00145*2.25 #taken similar as edctrl (scaled by power) + +# Added here for simplicity now (***can need modification later***) +# The latency of mem access is dependent on when can the ima find edram bys non-busy +memInterface_lat = infinity # infinite latency + +############################################################################################################# +# Tile Hierarchy + # Number of IMAs + # EDRAM size + # Shared Bus width + # Instruction memory size + # Receive Buffer size +############################################################################################################# + +# Tile component latency/pow/area +# EDRAM value dictionary (counter storage is not coounted) +edram_lat_dict = {'8' : 2, + '64' : 2, #edram access width is constant = 256 bits + '128' : 2} + +edram_pow_dyn_dict = {'8' : 17.2/2, + '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) + '128' : 25.35/2} + +edram_pow_leak_dict = {'8' : 0.46, + '64' : 0.46, + '128' : 0.77} + +edram_area_dict = {'8' : 0.086, + '64' : 0.086, + '128' : 0.121} + +# Tile Instruction Memory value dictionary +tile_instrnMem_lat_dict = {'512': 1, + '1024': 1, + '2048': 1} + +tile_instrnMem_pow_dyn_dict = {'512' : 0.46, + '1024': 0.53, + '2048': 0.65} + +tile_instrnMem_pow_leak_dict = {'512' : 0.078, + '1024': 0.147, + '2048': 0.33} + + +tile_instrnMem_area_dict = {'512' : 0.00108, + '1024': 0.00192, + '2048': 0.0041} + + +edram_size_max = '128' +if str(cfg.edram_size) in edram_lat_dict: + edram_size_max = str(cfg.edram_size) +else: + print("Warning: No values for edram memory size provided. Using values for 128 instead.") + +tile_instrnMem_size_max = '2048' +if str(cfg.tile_instrnMem_size) in tile_instrnMem_lat_dict: + tile_instrnMem_size_max = str(cfg.tile_instrnMem_size) +else: + print("Warning: No values for tile instrn memory size provided. 
Using values for 2048 instead.") + +# counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons)) +# area scaling (X8) +counter_buff_lat = 1 * math.sqrt(8) +counter_buff_pow_dyn = 0.65/2 * math.sqrt(8) +counter_buff_pow_leak = 0.33/2 * math.sqrt(8) +counter_buff_area = 0.0041 * math.sqrt(8) + +# EDRAM to IMA bus values +edram_bus_lat = 1 +edram_bus_pow_dyn = 6/2 #bus width = 384, same as issac (over two cycles) +edram_bus_pow_leak = 1/2 #bus width = 384, same as issac +edram_bus_area = 0.090 + +# EDRAM controller values +edram_ctrl_lat = 1 +edram_ctrl_pow_dyn = 0.475 +edram_ctrl_pow_leak = 0.05 +edram_ctrl_area = 0.00145 + +# Receive buffer value dictionary - 16 entries (Need to make this a dictionary) +# Increasing to 64 entries +receive_buffer_lat = 1 * math.sqrt(4) +receive_buffer_pow_dyn = 4.48 * math.sqrt(4) # (0.2*256/16) +receive_buffer_pow_leak = 0.09 * math.sqrt(4) +receive_buffer_area = 0.0022 *math.sqrt(4) + + +# Chosen latency based on config file - only for components whose latency is parameter dependent +edram_lat = edram_lat_dict[str(edram_size_max)] +tile_instrnMem_lat = tile_instrnMem_lat_dict[str(tile_instrnMem_size_max)] + +# Chosen area based on config file - only for components whose area is parameter dependent +edram_area = edram_area_dict[str(edram_size_max)] +tile_instrnMem_area = tile_instrnMem_area_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Chosen dynamic power based on config file - only for components whose dynamic power is parameter dependent +edram_pow_dyn = edram_pow_dyn_dict[str(edram_size_max)] +tile_instrnMem_pow_dyn = tile_instrnMem_pow_dyn_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Chosen leakage power based on config file - only for components whose leakage power is parameter dependent +edram_pow_leak = edram_pow_leak_dict[str(edram_size_max)] +tile_instrnMem_pow_leak = tile_instrnMem_pow_leak_dict[str(tile_instrnMem_size_max)] * math.sqrt(8) #area scaling for 8 bytes per instruction + +# Tile Control unit +tcu_pow = 0.25*0.2 +tcu_area = 0.00145 #taken similar as edctrl + +############################################################################################################# +# Node Hierarchy + # Number of Tiles + # NOC - Topology (Currently assumes a cmesh (c=4, same as ISSAC)) + # n = number of dimension\ + # k = number of tiles in each dimension + # c = concentartion (tiles/router) + # average injection rate (0.25 - a tile injects a new packet for each destination in every four cycles) +############################################################################################################# + +# NOC latency dictionary (in terms of flit cycle) +# Note - if inj_rate (packet injection -1 packet - 16 neurons) exceeds 0.025 - there's a problem, NoC needs to be redesigned else network latency will be killing! 
+# Hence, not provided for +noc_inj_rate_max = 0.025 +noc_lat_dict = {'0.001': 29, + '0.005': 31, + '0.01' : 34, + '0.02' : 54, + '0.025': 115} + +noc_area_dict = {'4': 0.047, + '8': 0.116} + +# Router dynamic power - NOC will be used only if atleast one of send_queue in node is non_empty +noc_pow_dyn_dict = {'4': 16.13, + '8': 51.48} + +# Router leakage power - NOC will be used only if atleast oen of send_queue in node is non_empty +noc_pow_leak_dict = {'4': 0.41, + '8': 1.04} + +# Enter component latency (Based on teh above NOC topological parameters) +# Inter-node Noc (router & channel) +assert (cfg.noc_inj_rate <= noc_inj_rate_max), 'Oops: reconsider NOC design and or DNN mapping, with this inj_rate, NOC data transfer throughput \ +will be terrible!' + +noc_intra_lat = noc_lat_dict[str(cfg.noc_inj_rate)] +noc_intra_pow_dyn = noc_pow_dyn_dict[str(cfg.noc_num_port)] # per router +noc_intra_pow_leak = noc_pow_leak_dict[str(cfg.noc_num_port)]# per router +noc_intra_area = noc_area_dict[str(cfg.noc_num_port)] # per router + +# Hypertransport network (HT) +# Note HT is external to a node, but we consider all tiles in one +# virtual node itself for simplicity +# HT numbers from ISAAC = 6.4GB/s = 6.4B/ ns = 1packet(16*2 Bytes) = 5ns +ht_lat = 5 #latency per packet +noc_inter_lat = ht_lat + noc_intra_lat #navigate to the node, then to tile within node +noc_inter_pow_dyn = 10400 #10.4W +noc_inter_pow_leak = 0 +noc_inter_area = 22.88 + diff --git a/src/hw_stats.py b/src/hw_stats.py index 6970de88..f9e54419 100644 --- a/src/hw_stats.py +++ b/src/hw_stats.py @@ -30,14 +30,14 @@ 'xbar_wr':param.xbar_wr_pow_dyn*param.xbar_wr_lat, 'dac':param.dac_pow_dyn, 'snh':param.snh_pow_dyn, \ 'mux1':param.mux_pow_dyn, 'mux2':param.mux_pow_dyn, \ - 'adc':{ 'n' : param.adc_pow_dyn_dict[str(cfg.adc_res)], \ - 'n/2': param.adc_pow_dyn_dict[str(cfg.adc_res-1)], \ - 'n/4': param.adc_pow_dyn_dict[str(cfg.adc_res-2)], \ - 'n/8': param.adc_pow_dyn_dict[str(cfg.adc_res-3)], \ - 'n/16': param.adc_pow_dyn_dict[str(cfg.adc_res-4)], \ - 'n/32': param.adc_pow_dyn_dict[str(cfg.adc_res-5)], \ - 'n/64': param.adc_pow_dyn_dict[str(cfg.adc_res-6)], \ - 'n/128': param.adc_pow_dyn_dict[str(cfg.adc_res-7)]}, \ + 'adc':{ 'n' : param.adc_pow_dyn_dict[str(cfg.adc_res)] if cfg.adc_res>0 else 0, \ + 'n/2': param.adc_pow_dyn_dict[str(cfg.adc_res-1)] if cfg.adc_res-1>0 else 0, \ + 'n/4': param.adc_pow_dyn_dict[str(cfg.adc_res-2)] if cfg.adc_res-2>0 else 0, \ + 'n/8': param.adc_pow_dyn_dict[str(cfg.adc_res-3)] if cfg.adc_res-3>0 else 0, \ + 'n/16': param.adc_pow_dyn_dict[str(cfg.adc_res-4)] if cfg.adc_res-4>0 else 0, \ + 'n/32': param.adc_pow_dyn_dict[str(cfg.adc_res-5)] if cfg.adc_res-5>0 else 0, \ + 'n/64': param.adc_pow_dyn_dict[str(cfg.adc_res-6)] if cfg.adc_res-6>0 else 0, \ + 'n/128': param.adc_pow_dyn_dict[str(cfg.adc_res-7)] if cfg.adc_res-7>0 else 0}, \ 'alu_div': param.alu_pow_div_dyn, 'alu_mul':param.alu_pow_mul_dyn, \ 'alu_act': param.act_pow_dyn, 'alu_other':param.alu_pow_others_dyn, \ 'alu_sna': param.sna_pow_dyn, \ @@ -105,7 +105,6 @@ def get_hw_stats (fid, node_dut, cycle): for j in range (cfg.num_ima): sum_num_cycle_ima += node_dut.tile_list[i].ima_list[j].cycle_count # used for leakage energy of imas - mvmu_type = ['f', 'b', 'd'] for k in range (cfg.num_matrix): for mvmu_t in mvmu_type: @@ -187,8 +186,8 @@ def get_hw_stats (fid, node_dut, cycle): hw_comp_access['dmem'] += node_dut.tile_list[i].ima_list[j].dataMem.num_access # Added for core and tile control units - hw_comp_access['core_control'] = sum_num_cycle_tile - 
hw_comp_access['tile_control'] = sum_num_cycle_ima + hw_comp_access['core_control'] = sum_num_cycle_ima + hw_comp_access['tile_control'] = sum_num_cycle_tile total_energy = 0 total_adc_energy = 0 diff --git a/src/ima.py b/src/ima.py index ea54481a..b3864426 100644 --- a/src/ima.py +++ b/src/ima.py @@ -502,7 +502,12 @@ def do_execute (self, ex_op, fid): # check if data is a list if (type(data) != list): data = ['0'*cfg.data_width]*self.de_r2 + elif (len(data)= datamem_off): self.dataMem.write (dst_addr, data[i]) @@ -721,7 +726,8 @@ def outer_product (mat_id, key): if (cfg.inference): for i in xrange(cfg.num_matrix): - if self.de_xb_nma[i]: + if int(self.de_xb_nma[i]): + #if self.de_xb_nma[i]: print ("ima_id: " +str(self.ima_id) + " mat_id: " +str(i) + " MVM") inner_product(i,'f') diff --git a/src/ima_modules.py b/src/ima_modules.py index b0d5b2d0..f98e2afe 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -280,7 +280,7 @@ def propagate_dummy (self, inp, sparsity = 0): else: self.num_access['n/128'] += 1 self.adc_res = cfg.adc_res-7 - if(self.adc_res<0): + if(self.adc_res<=0): self.adc_res = 1 return inp @@ -740,4 +740,3 @@ def rdRequest (self, addr, rd_width): ## For DEBUG of IMA only #self.ramload = self.edram.memfile[addr] - diff --git a/src/tile.py b/src/tile.py index a849dd35..b895feef 100644 --- a/src/tile.py +++ b/src/tile.py @@ -109,7 +109,7 @@ def tile_init (self, instrnpath, tracepath): def tile_compute (self, cycle): ## Simulate a cycle if IMA(s) that haven't halted - if (not all(self.halt_list)): # A tile halts whwn all IMAs (within the tile) halt + if (not all(self.halt_list)): # A tile halts when all IMAs (within the tile) halt for i in range (cfg.num_ima): if ((not self.halt_list[i]) and self.ima_nma_list[i]): self.ima_list[i].pipe_run (cycle, self.fid_list[i]) diff --git a/src/tile_modules.py b/src/tile_modules.py index 417f95e2..55e9670d 100644 --- a/src/tile_modules.py +++ b/src/tile_modules.py @@ -92,8 +92,12 @@ def read (self, addr, width = 1): # read edram_buswidth/data_width of continuous # returns a list of entries (list has one entry - Typical case) assert (width < cfg.edram_buswidth/cfg.data_width+1), \ 'read edram width exceeds' - return self.memfile[(addr - self.addr_start) : \ - (addr - self.addr_start + width)][:] + data = self.memfile[(addr - self.addr_start) : \ + (addr - self.addr_start + width)][:] + assert (len(data) == width), 'data length not same as requested width' + return data +# return self.memfile[(addr - self.addr_start) : \ +# (addr - self.addr_start + width)][:] # redefine the write assertion def write (self, addr, data, width = 1): # write (edram_buswidth/data_width) to continuous writes to edram @@ -185,8 +189,10 @@ def propagate (self, ren_list, wen_list, rd_width_list, wr_width_list, ramstore_ self.counter[addr+i] = self.counter[addr+i] - 1 if (self.counter[addr+i] <= 0): #modified self.valid[addr+i] = 0 - # read the data and send to ima - if found is 0, ramload is junk - ramload = self.mem.read (addr, rd_width_list[idx]) + # read the data and send to ima - if found is 0, ramload is junk + ramload = self.mem.read (addr, rd_width_list[idx]) + else: + ramload = 0 #if found=0 implies set ramload as dummy 0 return [found, idx, ramload] else: # ST instruction diff --git a/test/cnn/conv-layer-stride.cpp b/test/cnn/conv-layer-stride.cpp index 1c13dee8..70a6e26e 100644 --- a/test/cnn/conv-layer-stride.cpp +++ b/test/cnn/conv-layer-stride.cpp @@ -19,14 +19,14 @@ int main(int argc, char** argv) { // Model model = Model::create("conv3-layer"); 
// Process parameter - unsigned int in_size_x ; - unsigned int in_size_y ; - unsigned int in_channels ; - unsigned int out_channels ; - unsigned int k_size_x ; - unsigned int k_size_y ; - unsigned int padding ; - unsigned int stride ; + unsigned int in_size_x=9 ; + unsigned int in_size_y=9 ; + unsigned int in_channels=128 ; + unsigned int out_channels=256 ; + unsigned int k_size_x=3 ; + unsigned int k_size_y=3 ; + unsigned int padding=1 ; + unsigned int stride=1 ; if(argc == 10) { in_size_x = atoi(argv[1]); @@ -35,14 +35,14 @@ int main(int argc, char** argv) { out_channels = atoi(argv[4]); k_size_x = atoi(argv[5]); k_size_y = atoi(argv[6]); - padding = atoi(argv[7]); - stride = atoi(argv[8]); + padding = atoi(argv[7]); + stride = atoi(argv[8]); } std:: string str=std::string("conv") + argv[9] + std::string("-layer"); Model model = Model::create(str); // Input stream - auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels, stride); // Output stream unsigned int out_size_x = (in_size_x - k_size_x + 2*padding)/stride + 1;
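+    // Usage sketch (inferred from the argc == 10 check above; the placeholder names below are
+    // descriptive, not part of the original source):
+    //   conv-layer-stride <in_size_x> <in_size_y> <in_channels> <out_channels>
+    //                     <k_size_x> <k_size_y> <padding> <stride> <layer_id>
+    // The first eight values override the hard-coded defaults (9x9 input, 128 input / 256 output
+    // channels, 3x3 kernel, padding 1, stride 1); argv[9] is appended to the model name, giving
+    // "conv<layer_id>-layer".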