From 73c6adfc13797d80f381ca14b143175c640b6a5f Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Tue, 1 Oct 2019 16:24:07 -0400 Subject: [PATCH 01/15] Added a testbench for MVM test Addded an API for reding weights from weight files --- include/config.py | 12 +++++-- src/dnn_wt_p.py | 63 ++++++++++++++++++++++++++++++++++ test/mvm_ip_test.py | 83 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 src/dnn_wt_p.py create mode 100644 test/mvm_ip_test.py diff --git a/include/config.py b/include/config.py index fe652f4a..ebb4f323 100644 --- a/include/config.py +++ b/include/config.py @@ -2,9 +2,11 @@ ## All user specified parameters are provided by this file only ## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating -cycles_max = 5000000 # Put both these to very large numbers (when design is bug-free)! +cycles_max = 500000 # Put both these to very large numbers (when design is bug-free)! debug = 1 xbar_record = 1 +inference =1 +# training = 0 ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 @@ -55,7 +57,11 @@ instrnMem_size = 512 #in entries # This depends on above parameters -datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) +if(not inference): + datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) +if(inference): + datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces (for f only) +# datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar #vaulue is 8 lr = 0.25 # learning rate for updates to d-xbar @@ -96,7 +102,7 @@ # (b bit of address = logN, N is the number of nodes) # Change here - Specify the Node parameters here -num_tile_compute = 23 # number of tiles mapped by dnn (leaving input and output tiles) +num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles) num_tile_max = 168.0 # maximum number of tiles per node num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle) noc_inj_rate = 0.005 diff --git a/src/dnn_wt_p.py b/src/dnn_wt_p.py new file mode 100644 index 00000000..046789e0 --- /dev/null +++ b/src/dnn_wt_p.py @@ -0,0 +1,63 @@ +from functools import partial +from multiprocessing import Pool + +#**************************************************************************************** +# Designed by - Aayush Ankit +# School of Elctrical and Computer Engineering +# Nanoelectronics Research Laboratory +# Purdue University +# (aankit at purdue dot edu) +# +# DPEsim - Dot-Product Engine Simulator +# +# Input Tile (tile_id = 0) - has instructions to send input layer data to tiles +# -> Dump the SEND instructions correponding to input data in this tile +# +# Output Tile (tile_id = num_tile) - has instructions to receive output data from tiles +# -> Dump the data in EDRAM - that's your DNN output +# +# Other tiles (0 < tile_id < num_tile) - physical tiles used in computations +#**************************************************************************************** + +import time + +import sys +import getopt +import os +import argparse + +root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +src_dir = os.path.join(root_dir, "src") +include_dir = os.path.join(root_dir, "include") +test_dir = os.path.join(root_dir, "test") + +sys.path.insert(0, include_dir) +sys.path.insert(0, src_dir) +sys.path.insert(0, root_dir) + +# Set the instruction & trace paths (create the folder hierarchy) +# Assumption: All instructions for all TILEs and IMAs have already been generated +from node_dump import * +import numpy as np + +class dnn_wt: + + def prog_dnn_wt(self, instrnpath, node_dut): + + ## Program DNN weights on the xbars + for i in range(1, cfg.num_tile): + print ('Programming weights of tile no: ', i) + for j in range(cfg.num_ima): + print ('Programming ima no: ', j) + for k in range(cfg.num_matrix): + for l in range(cfg.phy2log_ratio): + wt_filename = instrnpath + 'weights/tile' + str(i) + '/core'+str(j)+\ + '/mat'+str(k)+'-phy_xbar'+str(l)+'.npy' + if (os.path.exists(wt_filename)): # check if weights for the xbar exist + print ('wtfile exits: ' + 'tile ' + str(i) + + 'ima ' + str(j) + 'matrix ' + str(k) + 'xbar' + str(l)) + wt_temp = np.load(wt_filename) + node_dut.tile_list[i].ima_list[j].matrix_list[k]['f'][l].program(wt_temp) + node_dut.tile_list[i].ima_list[j].matrix_list[k]['b'][l].program(wt_temp) + + diff --git a/test/mvm_ip_test.py b/test/mvm_ip_test.py new file mode 100644 index 00000000..2e58c9c9 --- /dev/null +++ b/test/mvm_ip_test.py @@ -0,0 +1,83 @@ +# API for testing MVM inner product operation +import sys +import os +import numpy as np + +root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, root_dir) + +from src.data_convert import * +import src.ima as ima +from src.instrn_proto import * +import include.config as cfg + +#change the core and mvmu id'd here: +# tile_ID = 2 +# core_ID = 1 +# matrix_ID = 0 + +for tile_ID in range(2, cfg.num_tile): + for core_ID in range(cfg.num_ima): + for matrix_ID in range(cfg.num_matrix): + + path = 'testasm/mlp/' + wt_path = path +'weights/tile'+ str(tile_ID)+ '/core'+ str(core_ID)+ '/' + inst_file = path + 'tile'+ str(tile_ID)+ '/core_imem'+ str(core_ID)+ '.npy' + trace_path = 'traces/mlp/' + trace_file = trace_path + 'tile'+ str(tile_ID)+ '/ima_trace'+ str(core_ID)+ '.txt' + dump_file = trace_path + 'tile'+ str(tile_ID)+ '/memsim.txt' + + datamem_off = cfg.datamem_off # each matrix has 6 memory spaces (1 for f/b, 2 for d) + phy2log_ratio = cfg.phy2log_ratio # ratio of physical to logical xbar + + if (os.path.exists(wt_path)): # check if weights for the xbar exist + # print ('wtfile exits: ' + 'tile' + str(tile_ID) +' core ' + str(core_ID) + 'matrix ' + str(matrix_ID)) + + xbar_input = ['']*cfg.xbar_size + xbar_output = ['']*cfg.xbar_size + with open(dump_file, 'r') as file: + lines=file.readlines() + + for i in range (len(lines)): + if(lines[i] == 'Xbar Input Memory: imaId:'+ str(core_ID)+ ' matrixId:'+ str(matrix_ID)+ ' mvmu_type:f contents\n'): + ip_start=i+1 + if(lines[i] == 'Xbar Output Memory: imaId:'+ str(core_ID)+ ' matrixId:'+ str(matrix_ID)+ ' mvmu_type:f contents\n'): + op_start=i+1 + ip_end=i-1 + if(lines[i] == 'Xbar Input Memory: imaId:'+ str(core_ID)+ ' matrixId:'+ str(matrix_ID)+ ' mvmu_type:b contents\n'): + op_end=i-1 + + # print(ip_start) + # print(ip_end) + # print(op_start) + # print(op_end) + # print('Length of input=',ip_end-ip_start+1 ) + # print('Length of output=',op_end-op_start+1 ) + + for j in range (ip_end-ip_start+1): + xbar_input[j] = float(lines[ip_start+j]) + for j in range (op_end-op_start+1): + xbar_output[j] = float(lines[op_start+j]) + + # print(xbar_input) + # print(xbar_output) + + ## Testcases for Functionality Debug of MVM (1,2,3,4) + ## 1. compare golden output to ima output + wt_gold = np.load(wt_path+'log_xbar0.npy') + # print(wt_gold) + # out_gold = np.dot (ima.dataMem.memfile_float, wt_gold) + if(ip_end-ip_start+1 == 128): + + out_gold = np.dot (np.asarray(xbar_input), wt_gold) + out_exp = np.asarray(xbar_output) + + # print (out_gold) + # print (out_exp) + + err = np.tanh(out_gold) - np.tanh(out_exp) + print ("error for tile"+ str(tile_ID) +" core" + str(core_ID) + " matrix" + str(matrix_ID)+ " has mean= " + str(np.average(err)) + " and stdev= " + \ + str(np.std(err))) + + else: + print("No or less than length 128 input available for tile"+ str(tile_ID) +" core" + str(core_ID) + " matrix" + str(matrix_ID)+".") From 2014ee324f4e968c17e6453e81099600fd0f5da8 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Tue, 1 Oct 2019 16:41:47 -0400 Subject: [PATCH 02/15] Added modified node dump --- src/dpe.py | 36 +++++----- src/ima.py | 161 +++++++++++++++++++++++++++++++------------- src/ima_modules.py | 10 +-- src/instrn_proto.py | 19 ++++-- src/node_dump.py | 17 +++-- src/record_xbar.py | 8 ++- 6 files changed, 172 insertions(+), 79 deletions(-) diff --git a/src/dpe.py b/src/dpe.py index f7009603..1145ee6c 100644 --- a/src/dpe.py +++ b/src/dpe.py @@ -55,6 +55,7 @@ import ima_metrics import tile_metrics import node_metrics +import dnn_wt_p compiler_path = os.path.join(root_dir, "test/testasm/") trace_path = os.path.join(root_dir, "test/traces/") @@ -110,21 +111,23 @@ def run(self, net): node_dut.tile_list[inp_tileId].edram_controller.valid[i] = int( inp['valid'][i]) - ## Program DNN weights on the xbars - for i in range(1, cfg.num_tile): - print ('Programming weights of tile no: ', i) - for j in range(cfg.num_ima): - print ('Programming ima no: ', j) - for k in range(cfg.num_matrix): - for l in range(cfg.phy2log_ratio): - wt_filename = self.instrnpath + 'weights/tile' + str(i) + '/core'+str(j)+\ - '/mat'+str(k)+'-phy_xbar'+str(l)+'.npy' - if (os.path.exists(wt_filename)): # check if weights for the xbar exist - print ('wtfile exits: ' + 'tile ' + str(i) + - 'ima ' + str(j) + 'matrix ' + str(k) + 'xbar' + str(l)) - wt_temp = np.load(wt_filename) - node_dut.tile_list[i].ima_list[j].matrix_list[k]['f'][l].program(wt_temp) - node_dut.tile_list[i].ima_list[j].matrix_list[k]['b'][l].program(wt_temp) + dnn_wt_p.dnn_wt().prog_dnn_wt(self.instrnpath, node_dut) + + # ## Program DNN weights on the xbars + # for i in range(1, cfg.num_tile): + # print ('Programming weights of tile no: ', i) + # for j in range(cfg.num_ima): + # print ('Programming ima no: ', j) + # for k in range(cfg.num_matrix): + # for l in range(cfg.phy2log_ratio): + # wt_filename = self.instrnpath + 'weights/tile' + str(i) + '/core'+str(j)+\ + # '/mat'+str(k)+'-phy_xbar'+str(l)+'.npy' + # if (os.path.exists(wt_filename)): # check if weights for the xbar exist + # print ('wtfile exits: ' + 'tile ' + str(i) + + # 'ima ' + str(j) + 'matrix ' + str(k) + 'xbar' + str(l)) + # wt_temp = np.load(wt_filename) + # node_dut.tile_list[i].ima_list[j].matrix_list[k]['f'][l].program(wt_temp) + # node_dut.tile_list[i].ima_list[j].matrix_list[k]['b'][l].program(wt_temp) #raw_input ('Press Enter') @@ -143,6 +146,9 @@ def run(self, net): if (cfg.debug): node_dump(node_dut, self.tracepath) + if (cfg.xbar_record): + record_xbar(node_dut) + # Dump the contents of output tile (DNN output) to output file (output.txt) output_file = self.tracepath + 'output.txt' fid = open(output_file, 'w') diff --git a/src/ima.py b/src/ima.py index 152e7053..40cb8748 100644 --- a/src/ima.py +++ b/src/ima.py @@ -6,7 +6,7 @@ # import dependancy files import numpy as np import math -import config as cfg +import include.config as cfg #import include.configTest as cfg import include.constants as param import src.ima_modules as imod @@ -348,7 +348,7 @@ def do_decode (self, dec_op): assert (self.fd_instrn['r2'] >= datamem_off), 'operand2 for beq comes from data memory' self.de_val1 = self.dataMem.read(self.fd_instrn['r1']) self.de_val2 = self.dataMem.read(self.fd_instrn['r2']) - + elif (dec_op == 'alu_int'): self.de_aluop = self.fd_instrn['aluop'] self.de_d1 = self.fd_instrn['d1'] # addr for rf @@ -356,7 +356,7 @@ def do_decode (self, dec_op): assert (self.fd_instrn['r2'] >= datamem_off), 'operand2 for alu_int comes from data memory' self.de_val1 = self.dataMem.read(self.fd_instrn['r1']) self.de_val2 = self.dataMem.read(self.fd_instrn['r2']) - + # do nothing for halt/jmp in decode (just propagate to ex when applicable) @@ -417,32 +417,61 @@ def execute (self, update_ready, fid): # xbar_addr = matrix_addr % cfg.xbar_size # return [num_matrix, xbar_type, mem_addr, xbar_addr] - def getXbarAddr (data_addr): - # find i or o - if (data_addr < cfg.num_matrix*3*cfg.xbar_size): - mem_addr = 0 - else: - mem_addr = 128 - - # find xbar_addr - xbar_addr = data_addr % cfg.xbar_size - - # find matrix_addr - num_matrix = (data_addr / (3*cfg.xbar_size)) % cfg.num_matrix - - # find xbar_type - temp_val = (data_addr % (cfg.num_matrix*3*cfg.xbar_size)) - temp_val1 = temp_val % (3*cfg.xbar_size) - if (temp_val1 < cfg.xbar_size): - xbar_type = 'f' - elif (temp_val1 < 2*cfg.xbar_size): - xbar_type = 'b' - elif (temp_val1 < 3*cfg.xbar_size): - xbar_type = 'd' - else: - assert (1==0), "xbar memory addressing failed" + if(not cfg.inference): + def getXbarAddr (data_addr): + # find i or o + if (data_addr < cfg.num_matrix*3*cfg.xbar_size): + mem_addr = 0 + else: + mem_addr = 128 + + # find xbar_addr + xbar_addr = data_addr % cfg.xbar_size + + # find matrix_addr + num_matrix = (data_addr / (3*cfg.xbar_size)) % cfg.num_matrix + + # find xbar_type + temp_val = (data_addr % (cfg.num_matrix*3*cfg.xbar_size)) + temp_val1 = temp_val % (3*cfg.xbar_size) + if (temp_val1 < cfg.xbar_size): + xbar_type = 'f' + elif (temp_val1 < 2*cfg.xbar_size): + xbar_type = 'b' + elif (temp_val1 < 3*cfg.xbar_size): + xbar_type = 'd' + else: + assert (1==0), "xbar memory addressing failed" + + return [num_matrix, xbar_type, mem_addr, xbar_addr] - return [num_matrix, xbar_type, mem_addr, xbar_addr] + if(cfg.inference): + def getXbarAddr (data_addr): + # find i or o + if (data_addr < cfg.num_matrix*1*cfg.xbar_size): + mem_addr = 0 + else: + mem_addr = 128 + + # find xbar_addr + xbar_addr = data_addr % cfg.xbar_size + + # find matrix_addr + num_matrix = (data_addr / (1*cfg.xbar_size)) % cfg.num_matrix + + # find xbar_type + temp_val = (data_addr % (cfg.num_matrix*1*cfg.xbar_size)) + temp_val1 = temp_val % (1*cfg.xbar_size) + if (temp_val1 < cfg.xbar_size): + xbar_type = 'f' + # elif (temp_val1 < 2*cfg.xbar_size): + # xbar_type = 'b' + # elif (temp_val1 < 3*cfg.xbar_size): + # xbar_type = 'd' + else: + assert (1==0), "xbar memory addressing failed" + + return [num_matrix, xbar_type, mem_addr, xbar_addr] # write to the xbar memory (in/out) space depending on the address def writeToXbarMem (self, data_addr, data): @@ -465,9 +494,12 @@ def readFromXbarMem (self, data_addr): return self.xb_outMem_list[matrix_id][xbar_type].read (xbar_addr) # Define what to do in execute (done for conciseness) + + #set_trace() def do_execute (self, ex_op, fid): if (ex_op == 'ld'): + # print('In Load') self.ldAccess_done = 0 data = self.mem_interface.ramload # based on the address write to dataMem or xb_inMem @@ -477,15 +509,17 @@ def do_execute (self, ex_op, fid): data = ['0'*cfg.data_width]*self.de_r2 for i in range (self.de_r2): dst_addr = data_addr + i + # print('Destination Address in load', dst_addr) if (dst_addr >= datamem_off): - self.dataMem.write (dst_addr, data[i]) + self.dataMem.write (dst_addr, data[i]) else: - writeToXbarMem (self, dst_addr, data[i]) + writeToXbarMem (self, dst_addr, data[i]) elif (ex_op == 'st'): #nothing to be done by ima for st here return 1 elif (ex_op == 'set'): + # print('In Set') for i in range (self.de_vec): # write to dataMem - check if addr is a valid datamem address dst_addr = self.de_d1 + i @@ -495,8 +529,10 @@ def do_execute (self, ex_op, fid): writeToXbarMem (self, dst_addr, self.de_val1) elif (ex_op == 'cp'): + # print('In Copy') for i in range (self.de_vec): src_addr = self.de_r1 + i + # print('Source Address',src_addr) # based on address read from dataMem or xb_inMem if (src_addr >= datamem_off): ex_val1 = self.dataMem.read (src_addr) @@ -505,6 +541,8 @@ def do_execute (self, ex_op, fid): dst_addr = self.de_d1 + i # based on the address write to dataMem or xb_inMem + # print('Destination Address',dst_addr) + # print('data', ex_val1) if (dst_addr >= datamem_off): self.dataMem.write (dst_addr, ex_val1) else: @@ -539,6 +577,7 @@ def do_execute (self, ex_op, fid): else: writeToXbarMem (self, dst_addr, ex_val1) + elif (ex_op == 'alui'): for i in range (self.de_vec): # read val 2 either from data memory or xbar_outmem @@ -565,8 +604,16 @@ def do_execute (self, ex_op, fid): ## Define function to perform inner-product on specified mvmu # Note: Inner product with shift and add (shift-sub with last bit), works for 2s complement # representation for positive and negative numbers + #import pdb; pdb.set_trace(); + + #print('AHA Do Execute') + #for k in range(cfg.num_matrix): + # for l in range(cfg.phy2log_ratio): + # print(self.matrix_list[k]['f'][l].get_value()) + def inner_product (mat_id, key): # reset the xb out memory before starting to accumulate + #import pdb; pdb.set_trace() self.xb_outMem_list[mat_id][key].reset () ## Loop to cover all bits of inputs @@ -581,7 +628,7 @@ def inner_product (mat_id, key): #*************************************** HACK ********************************************* # convert digital values to analog - out_dac = self.dacArray_list[mat_id][key].propagate_dummy (out_xb_inMem) #pass through + out_dac = self.dacArray_list[mat_id][key].propagate_dummy(out_xb_inMem) #pass through # Do for (data_width/xbar_bits) xbars num_xb = cfg.data_width / cfg.xbar_bits @@ -589,9 +636,13 @@ def inner_product (mat_id, key): out_snh = [[] for x in range(num_xb)] for m in range (num_xb): # compute dot-product - out_xbar[m] = self.matrix_list[mat_id][key][m].propagate_dummy (out_dac) + #print('check dac/wt') + #print(out_dac) + # print(self.matrix_list[mat_id][key][m].get_value()) + # import pdb; pdb.set_trace() + out_xbar[m] = self.matrix_list[mat_id][key][m].propagate_dummy(out_dac) # do sampling and hold - out_snh[m] = self.snh_list[mat_id*num_xb+m].propagate_dummy (out_xbar[m]) + out_snh[m] = self.snh_list[mat_id*num_xb+m].propagate_dummy(out_xbar[m]) # each of the num_xb produce shifted bits of output (weight bits have been distributed) for j in xrange (cfg.xbar_size): # this 'for' across xbar outs to adc happens via mux @@ -600,9 +651,9 @@ def inner_product (mat_id, key): for m in range (num_xb): # convert from analog to digital adc_id = (mat_id*num_xb + m) % cfg.num_adc - out_mux1 = self.mux1_list[mat_id].propagate_dummy (out_snh[m][j]) # i is the ith xbar - out_mux2 = self.mux2_list[mat_id % cfg.num_adc].propagate_dummy (out_mux1) - out_adc = self.adc_list[adc_id].propagate_dummy (out_mux2) + out_mux1 = self.mux1_list[mat_id].propagate_dummy(out_snh[m][j]) # i is the ith xbar + out_mux2 = self.mux2_list[mat_id % cfg.num_adc].propagate_dummy(out_mux1) + out_adc = self.adc_list[adc_id].propagate_dummy(out_mux2) # shift and add outputs from difefrent wt_bits alu_op = 'sna' @@ -659,19 +710,26 @@ def outer_product (mat_id, key): self.matrix_list[mat_id][key][m].propagate_op_dummy (out_dac1, out_dac2, cfg.lr) ## Traverse through the matrices in a core - for i in xrange (cfg.num_matrix): + if (not cfg.inference): + for i in xrange (cfg.num_matrix): # traverse through f/b/d mvmu(s) for the matrix and execute if applicable - mask_temp = self.de_xb_nma[i] - if (mask_temp[0] == '1'): + mask_temp = self.de_xb_nma[i] + if (mask_temp[0] == '1'): # foward xbar operation - #print ("ima_id: " + str(self.ima_id) + " mat_id: " + str(i) + " MVM") - inner_product (i, 'f') - if (mask_temp[1] == '1'): + print ("ima_id: " + str(self.ima_id) + " mat_id: " + str(i) + " MVM") + inner_product (i, 'f') + if (mask_temp[1] == '1'): #print ("ima_id: " + str(self.ima_id) + " mat_id: " + str(i) + " MTVM") # backward xbar operation - inner_product (i, 'b') - if (mask_temp[2] == '1'): - outer_product (i, 'd') + inner_product (i, 'b') + if (mask_temp[2] == '1'): + outer_product (i, 'd') + + if (cfg.inference): + for i in xrange(cfg.num_matrix): + if self.de_xb_nma[i]: + print ("ima_id: " +str(self.ima_id) + " mat_id: " +str(i) + " MVM") + inner_product(i,'f') elif (ex_op == 'crs'): # read weights from delta-xbar, synchronize, write to f/b xbars @@ -861,6 +919,7 @@ def xbComputeLatency (self, mask): if (st_data_addr >= datamem_off): for num in range (cfg.edram_buswidth / cfg.data_width): # modified ex_val1[num] = self.dataMem.read (st_data_addr+num) # modified + else: for num in range (cfg.edram_buswidth / cfg.data_width): # modified ex_val1[num] = readFromXbarMem (self, st_data_addr+num) @@ -877,6 +936,7 @@ def xbComputeLatency (self, mask): (self.de_opcode == 'ld' and self.stage_cycle[sId] >= self.stage_latency[sId]-1 and self.ex_vec_count == (self.de_vec-1) and update_ready)): ex_op = self.de_opcode #print ("doing exe stage for op: " + ex_op) + #import pdb ; pdb.set_trace() do_execute (self, ex_op, fid) self.stage_done[sId] = 1 self.stage_cycle[sId] = 0 @@ -947,6 +1007,17 @@ def pipe_run (self, cycle, fid = ''): # fid is tracefile's id update_ready = self.stage_done[i+1] # run the stage based on its update_ready argument + #print('check weights in pipe_run') + #print(imod.xbar.xbar_value) + # print('AHA Pipe Run') + #for j in range(cfg.num_ima): + # for k in range(cfg.num_matrix): + # for l in range(cfg.phy2log_ratio): + # print(self.matrix_list[k]['f'][l].get_value()) + + + #if (i == 2): + # import pdb; pdb.set_trace() stage_function[i] (update_ready, fid) # If specified, print thetrace (pipeline stage information) diff --git a/src/ima_modules.py b/src/ima_modules.py index d9d263d8..fe14ba0c 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -6,8 +6,8 @@ import sys import numpy as np -import constants as param -import config as cfg +import include.constants as param +import include.config as cfg import math from data_convert import * @@ -233,14 +233,14 @@ def real2bin (self, inp, num_bits): return ('0'*(num_bits - len(bin_value)) + bin_value) def propagate (self, inp): - self.num_access += 1 + #self.num_access += 1 assert (type(inp) in [float, np.float32, np.float64]), 'adc input type mismatch (float, np.float32, np.float64 expected)' num_bits = self.adc_res return self.real2bin (inp, num_bits) # HACK - until propagate doesn't have correct analog functionality def propagate_dummy (self, inp): - self.num_access += 1 + #self.num_access += 1 return inp # Doesn't replicate the exact (sample and hold) functionality (just does hold) @@ -649,7 +649,7 @@ def __init__ (self): self.rd_width = 0 self.addr = 0 # add sent by ima to mem controller self.ramload = 0 # data (for LD) sent by edram to ima - self.ramstore = 0 # data (for ST) sent by ima to men controller + self.ramstore = 0 # data (for ST) sent by ima to mem controller ## For DEBUG of IMA only - define a memory element and preload some values #self.edram = memory (cfg.dataMem_size, 0) diff --git a/src/instrn_proto.py b/src/instrn_proto.py index 17a9cf15..2172ad0a 100644 --- a/src/instrn_proto.py +++ b/src/instrn_proto.py @@ -6,8 +6,6 @@ import include.config as cfg import include.constants as param -from src.data_convert import * - # Define nstruction prototypes # generate load prototype - load data from edram to (datamem/xbinmem) phy2log_ratio = cfg.num_bits/cfg.xbar_bits @@ -92,10 +90,23 @@ def i_alui (aluop, d1, r1, imm, vec = 1): # return i_temp # TODO: just a hack for now, but eventually opcode will be different in i_mvm and i_train -def i_mvm (xb_nma = cfg.num_matrix*['000'], r1=0, r2=0): # r1 is displacement, r2 is length of a continuum of data +# def i_mvm (xb_nma = cfg.num_matrix*['000'], r1=0, r2=0): # r1 is displacement, r2 is length of a continuum of data +# xb_nma_str = xb_nma[0] +# #xb_nma_str = xb_nma +# xb_nma_list = [xb_nma_str[i*3:(i+1)*3] for i in range(len(xb_nma_str)/3)] # split into list of 3-bit masks +# assert (len(xb_nma_list) == cfg.num_matrix) # each matrix in a core has a 3-bit mask +# i_temp = param.dummy_instrn.copy() +# i_temp['opcode'] = 'mvm' +# i_temp['r1'] = r1 +# i_temp['r2'] = r2 +# i_temp['xb_nma'] = xb_nma_list +# return i_temp + +#Defined to take the xb_nma as string instead of list of strings +def i_mvm (xb_nma = cfg.num_matrix*'0', r1=0, r2=0): # r1 is displacement, r2 is length of a continuum of data xb_nma_str = xb_nma[0] #xb_nma_str = xb_nma - xb_nma_list = [xb_nma_str[i*3:(i+1)*3] for i in range(len(xb_nma_str)/3)] # split into list of 3-bit masks + xb_nma_list = [xb_nma_str[i]+'00' for i in range(len(xb_nma_str))] # split into list of 3-bit masks assert (len(xb_nma_list) == cfg.num_matrix) # each matrix in a core has a 3-bit mask i_temp = param.dummy_instrn.copy() i_temp['opcode'] = 'mvm' diff --git a/src/node_dump.py b/src/node_dump.py index 7154cedb..3af03630 100644 --- a/src/node_dump.py +++ b/src/node_dump.py @@ -21,15 +21,18 @@ def mem_dump (fid, memfile, name, node = '', tile_id = ''): # to print in float format if (memfile[addr] != ''): temp_val = fixed2float (memfile[addr], cfg.int_bits, cfg.frac_bits) - # use this for debugging/viewing addresses - #temp_val = bin2int (memfile[addr], cfg.num_bits) - #else: # not printing zero values for ease of view - # temp_val = 0.0 if (name == 'EDRAM' and (node != '') and (tile_id != '')): # for EDRAM also show counter/valid fid.write ('valid: ' + str(node.tile_list[tile_id].edram_controller.valid[addr]) \ - + ' | counter: ' + str(node.tile_list[tile_id].edram_controller.counter[addr]) + ' | ') + + ' | counter: ' + str(node.tile_list[tile_id].edram_controller.counter[addr]) + ' | ') + fid.write(str(temp_val) + '\n') + # use this for debugging/viewing addresses + #temp_val = bin2int (memfile[addr], cfg.num_bits) + else: # not printing zero values for ease of view + temp_val = 0.0 + if (name != 'EDRAM'): fid.write(str(temp_val) + '\n') + def node_dump (node, filepath = ''): assert (filepath != ''), 'Debug flag is set, filepath cannot be nil' for i in range(len(node.tile_list)): @@ -53,10 +56,10 @@ def node_dump (node, filepath = ''): for mvmu_t in mvmu_list: # dump the xbar input memory mem_dump (fid, node.tile_list[i].ima_list[j].xb_inMem_list[k][mvmu_t].memfile, \ - 'Xbar Input Memory: matrixId: ' + str(k) + 'mvmu_type: ' + mvmu_t, 'Xbar Input Memory') + 'Xbar Input Memory: imaId:'+ str(j) +' matrixId:' + str(k) + ' mvmu_type:' + mvmu_t, 'Xbar Input Memory') # dump the xbar output memory mem_dump (fid, node.tile_list[i].ima_list[j].xb_outMem_list[k][mvmu_t].memfile, \ - 'Xbar Output Memory: matrixId: ' + str(k) + 'mvmu_type: ' + mvmu_t, 'Xbar Output Memory') + 'Xbar Output Memory: imaId:'+ str(j) +' matrixId:' + str(k) + ' mvmu_type:' + mvmu_t, 'Xbar Output Memory') fid.close() diff --git a/src/record_xbar.py b/src/record_xbar.py index 8b8025f1..4dc3448b 100644 --- a/src/record_xbar.py +++ b/src/record_xbar.py @@ -9,10 +9,12 @@ def record_xbar (node): for i in range (len(node.tile_list)): print ('Dumping xbar currents from tile num: ', i) for j in range (len(node.tile_list[0].ima_list)): - for k in range (len(node.tile_list[0].ima_list[0].xbar_list)): + for k in range (len(node.tile_list[0].ima_list[0].matrix_list)): # check for empty list - if (node.tile_list[i].ima_list[j].xbar_list[k].xb_record != []): - xbar_currents.append(node.tile_list[i].ima_list[j].xbar_list[k].xb_record) + for l in (node.tile_list[i].ima_list[j].matrix_list[k]['f']): + if (l.xb_record != []): + xbar_currents.append(l.xb_record) + #print(l) xbar_currents_arr = np.asarray (xbar_currents) From 76da76691affb71d0e79f79cad2509734fba5e35 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Mon, 23 Mar 2020 15:24:14 -0400 Subject: [PATCH 03/15] Synchronising with upstream --- include/config.py | 122 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 include/config.py diff --git a/include/config.py b/include/config.py new file mode 100644 index 00000000..7456665e --- /dev/null +++ b/include/config.py @@ -0,0 +1,122 @@ +# This file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node) +## All user specified parameters are provided by this file only + +## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating +cycles_max = 5000000 # Put both these to very large numbers (when design is bug-free)! +debug = 1 +xbar_record = 1 +inference = 1 +training = not(inference) + +## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits +num_bits = 16 +int_bits = 4 +frac_bits = num_bits - int_bits + +## IMA configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by xbar_bits, num_xbar, xbar_size. +# xbar_bits: 2, 4, 6 +# num_xbar: positive integer +# xbar_size: 32, 64, 128, 256 +# dac_res: positive integer <= num_bits +# adc_res: positive integer <= num_bits +# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar) +# num_ALU: positive integer +# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped) +# instrnMem_size: (in Bytes) - 512, 1024, 2048 + +# Fixed parameters +data_width = num_bits # (in bits) +xbdata_width = data_width # (in bits) +instrn_width = 48 # (in bits) + +# Change here - Specify the IMA parameters here +xbar_bits = 2 +num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training 8-fw, 8-bw and 16-delta physical xbars. +xbar_size = 128 +dac_res = 1 +# ADC configuration +adc_res = 8 # around 4 to 8. this value should be +num_adc_per_matrix = 2 +num_adc = num_adc_per_matrix * num_matrix + +# The idea is to have different ADC resolution value for each ADC. +# The number of ADC if defined by num_adc property. Currently it is 2 * num_matrix(2) = 4 +# NOTE: Only taking in account indexes 0 and 2, 1 and 3 are ignored, because ADCs 1 and 3 are assumed t be equal to 0 and 2. +adc_res_new = { + 'matrix_adc_0' : 8, + 'matrix_adc_1' : 4, + 'matrix_adc_2' : 8, + 'matrix_adc_3' : 4 + } + +num_ALU = num_matrix*2 +#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) +dataMem_size = 2048 # 2048 is larger than num_matrix*(6*xbar_size) +instrnMem_size = 512 #in entries + +# This depends on above parameters +if (training): + datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) + +if (inference): + datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces ( 1 input Xbar memory and 1 output Xbar memory) + +phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar #vaulue is 8 +lr = 0.25 # learning rate for updates to d-xbar + +## Tile configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_ima +# num_ima: positive integer +# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width) +# edram_size: (in KiloBytes) - 64, 128, 256, 512 +# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \ +# puts a cap on the maximum num ber of tiles that can send data to a tile in next layer +# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons) +# tile_instrnMem_size: 256, 512, 1024 (in Bytes) + +# Fixed parameters +instrn_width = 48 # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16) +edram_buswidth = 256 # in bits +#receive_buffer_depth = 16 +receive_buffer_depth = 150 #set equal to num_tile_max +receive_buffer_width = edram_buswidth / num_bits # size of receive buffeer entry (in terms of number of neurons) + +# Change here - Specify the Tile parameters here +num_ima = 8 +edram_size = 64 # in Kilobytes (64 KB - same as issac) +tile_instrnMem_size = 2048 # in entries + +## Node configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_tile +# num_tile_compute = positive integer +# inj_rate < 0.2 (depends on the mapping) +# num_port: 4, 8 + +# Fixed parameters +# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles +cmesh_c = 4 +num_bits_tileId =32 +flit_width = 32 +packet_width = edram_buswidth/data_width #in multiples of flits (data considered only - booksim consider address itself) +# (b bit of address = logN, N is the number of nodes) + +# Change here - Specify the Node parameters here +num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles) +num_tile_max = 168.0 # maximum number of tiles per node +num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle) +noc_inj_rate = 0.005 +noc_num_port = 4 + +## Node parameters - Our way of simulation just assumes all tile in one actual node +num_node = 1 + +# Do not change this - total number of tiles +num_tile = num_node * num_tile_compute + 2 # +1 for first tile (I/O tile) - dummy, others - compute + +#Security parameters - Used to verify if the model used is encryted or authenticated (set by dpe.py) +#Do not change +encrypted = False +authenticated = False +cypher_name = '' +cypher_hash = '' From ea1bbc419e687198eb5b20632bb1707dd1f7ec8d Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Wed, 27 May 2020 00:23:35 -0400 Subject: [PATCH 04/15] Mergingchanges for digital MVMU energy mnumbers --- src/hw_stats.py | 251 ++++++++++++++++++++++++++++++++++++++------- src/ima.py | 43 ++++++-- src/ima_metrics.py | 35 ++++--- src/ima_modules.py | 86 ++++++++++++++-- 4 files changed, 347 insertions(+), 68 deletions(-) diff --git a/src/hw_stats.py b/src/hw_stats.py index 8b38cb2d..6f0df0c3 100644 --- a/src/hw_stats.py +++ b/src/hw_stats.py @@ -14,11 +14,47 @@ # Copied from /include/constants.py file # Enlists components at core, tile, and node levels -hw_comp_energy = {'xbar_mvm':param.xbar_ip_pow_dyn*param.xbar_ip_lat, 'xbar_op':param.xbar_op_pow_dyn*param.xbar_op_lat, - 'xbar_mtvm':param.xbar_ip_pow_dyn*param.xbar_ip_lat, +hw_comp_energy = {'xbar_mvm':{ '100':param.xbar_ip_energy_dict['100'], \ + '90': param.xbar_ip_energy_dict['90'], \ + '80': param.xbar_ip_energy_dict['80'], \ + '70': param.xbar_ip_energy_dict['70'], \ + '60': param.xbar_ip_energy_dict['60'], \ + '50': param.xbar_ip_energy_dict['50'], \ + '40': param.xbar_ip_energy_dict['40'], \ + '30': param.xbar_ip_energy_dict['30'], \ + '20': param.xbar_ip_energy_dict['20'], \ + '10': param.xbar_ip_energy_dict['10']}, \ + 'xbar_op':{ '100': param.xbar_ip_energy_dict['100'], \ + '90': param.xbar_ip_energy_dict['90'], \ + '80': param.xbar_ip_energy_dict['80'], \ + '70': param.xbar_ip_energy_dict['70'], \ + '60': param.xbar_ip_energy_dict['60'], \ + '50': param.xbar_ip_energy_dict['50'], \ + '40': param.xbar_ip_energy_dict['40'], \ + '30': param.xbar_ip_energy_dict['30'], \ + '20': param.xbar_ip_energy_dict['20'], \ + '10': param.xbar_ip_energy_dict['10']}, \ + 'xbar_mtvm':{ '100':param.xbar_ip_energy_dict['100'], \ + '90': param.xbar_ip_energy_dict['90'], \ + '80': param.xbar_ip_energy_dict['80'], \ + '70': param.xbar_ip_energy_dict['70'], \ + '60': param.xbar_ip_energy_dict['60'], \ + '50': param.xbar_ip_energy_dict['50'], \ + '40': param.xbar_ip_energy_dict['40'], \ + '30': param.xbar_ip_energy_dict['30'], \ + '20': param.xbar_ip_energy_dict['20'], \ + '10': param.xbar_ip_energy_dict['10']}, \ 'xbar_rd':param.xbar_rd_pow_dyn*param.xbar_rd_lat, 'xbar_wr':param.xbar_wr_pow_dyn*param.xbar_wr_lat, 'dac':param.dac_pow_dyn, 'snh':param.snh_pow_dyn, \ - 'mux1':param.mux_pow_dyn, 'mux2':param.mux_pow_dyn, 'adc':param.adc_pow_dyn, \ + 'mux1':param.mux_pow_dyn, 'mux2':param.mux_pow_dyn, 'adc':{ 'n' : param.adc_pow_dyn_dict[str(cfg.adc_res)], \ + 'n/2': param.adc_pow_dyn_dict[str(cfg.adc_res-1)], \ + '3n/4': param.adc_pow_dyn_dict[str(cfg.adc_res-2)], \ + '7n/8': param.adc_pow_dyn_dict[str(cfg.adc_res-3)], \ + '15n/16': param.adc_pow_dyn_dict[str(cfg.adc_res-4)], \ + '31n/32': param.adc_pow_dyn_dict[str(cfg.adc_res-5)], \ + '63n/64': param.adc_pow_dyn_dict[str(cfg.adc_res-6)], \ + '127n/128': param.adc_pow_dyn_dict[str(cfg.adc_res-7)], \ + '255n/256': param.adc_pow_dyn_dict[str(cfg.adc_res-7)]}, \ 'alu_div': param.alu_pow_div_dyn, 'alu_mul':param.alu_pow_mul_dyn, \ 'alu_act': param.act_pow_dyn, 'alu_other':param.alu_pow_others_dyn, \ 'alu_sna': param.sna_pow_dyn, \ @@ -38,11 +74,47 @@ def get_hw_stats (fid, node_dut, cycle): # List of all components that dissipate power - hw_comp_access = {'xbar_mvm':0, 'xbar_op':0, - 'xbar_mtvm':0, - 'xbar_rd':0, 'xbar_wr':0, + hw_comp_access = {'xbar_mvm':{ '100':0, \ + '90': 0, \ + '80': 0, \ + '70': 0, \ + '60': 0, \ + '50': 0, \ + '40': 0, \ + '30': 0, \ + '20': 0, \ + '10': 0}, \ + 'xbar_op':{ '100':0, \ + '90': 0, \ + '80': 0, \ + '70': 0, \ + '60': 0, \ + '50': 0, \ + '40': 0, \ + '30': 0, \ + '20': 0, \ + '10': 0}, \ + 'xbar_mtvm':{ '100':0, \ + '90': 0, \ + '80': 0, \ + '70': 0, \ + '60': 0, \ + '50': 0, \ + '40': 0, \ + '30': 0, \ + '20': 0, \ + '10': 0}, \ + 'xbar_rd':0, 'xbar_wr':0, \ 'dac':0, 'snh':0, \ - 'mux1':0, 'mux2':0, 'adc':0, \ + 'mux1':0, 'mux2':0, 'adc':{ 'n' : 0, \ + 'n/2': 0, \ + '3n/4': 0, \ + '7n/8': 0, \ + '15n/16': 0, \ + '31n/32': 0, \ + '63n/64': 0, \ + '127n/128': 0, \ + '255n/256': 0}, \ 'alu_div':0, 'alu_mul':0, \ 'alu_act':0, 'alu_other':0, \ 'alu_sna':0, \ @@ -84,32 +156,56 @@ def get_hw_stats (fid, node_dut, cycle): for k in range (cfg.num_matrix): for mvmu_t in mvmu_type: # Xbar accesses - for m in range(cfg.phy2log_ratio): + if cfg.MVMU_ver == "Analog": + for m in range(cfg.phy2log_ratio): + if (mvmu_t == 'd'): + for key,value in hw_comp_access['xbar_op'].items(): + hw_comp_access['xbar_op'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access[key] + elif (mvmu_t == 'b'): + for key,value in hw_comp_access['xbar_mtvm'].items(): + hw_comp_access['xbar_mtvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access[key] + else: + for key,value in hw_comp_access['xbar_mvm'].items(): + hw_comp_access['xbar_mvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access[key] + hw_comp_access['xbar_rd'] += \ + node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access_rd / (cfg.xbar_size**2) + hw_comp_access['xbar_wr'] += \ + node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access_wr / (cfg.xbar_size**2) + + else: if (mvmu_t == 'd'): - hw_comp_access['xbar_op'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access + for key,value in hw_comp_access['xbar_op'].items(): + hw_comp_access['xbar_op'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access[key] elif (mvmu_t == 'b'): - hw_comp_access['xbar_mtvm'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access + for key,value in hw_comp_access['xbar_mtvm'].items(): + hw_comp_access['xbar_mtvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access[key] else: - hw_comp_access['xbar_mvm'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access + for key,value in hw_comp_access['xbar_mvm'].items(): + hw_comp_access['xbar_mvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access[key] hw_comp_access['xbar_rd'] += \ - node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access_rd / (cfg.xbar_size**2) + node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access_rd / (cfg.xbar_size**2) hw_comp_access['xbar_wr'] += \ - node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access_wr / (cfg.xbar_size**2) + node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access_wr / (cfg.xbar_size**2) + # Xb_InMem accesses - hw_comp_access['xbInmem_rd'] += node_dut.tile_list[i].ima_list[j].xb_inMem_list[k][mvmu_t].num_access_read + if cfg.MVMU_ver == "Analog": + hw_comp_access['xbInmem_rd'] += node_dut.tile_list[i].ima_list[j].xb_inMem_list[k][mvmu_t].num_access_read hw_comp_access['xbInmem_wr'] += node_dut.tile_list[i].ima_list[j].xb_inMem_list[k][mvmu_t].num_access_write # Xb_OutMem accesses - hw_comp_access['xbOutmem'] += node_dut.tile_list[i].ima_list[j].xb_outMem_list[k][mvmu_t].num_access + if cfg.MVMU_ver == "Analog": + hw_comp_access['xbOutmem'] += node_dut.tile_list[i].ima_list[j].xb_outMem_list[k][mvmu_t].num_access for k in range(cfg.num_matrix): dac_type = ['f', 'b', 'd_r', 'd_c'] for dac_t in dac_type: for l in range(cfg.xbar_size): - hw_comp_access['dac'] += node_dut.tile_list[i].ima_list[j].dacArray_list[k][dac_t].dac_list[l].num_access + if cfg.MVMU_ver == "Analog": + hw_comp_access['dac'] += node_dut.tile_list[i].ima_list[j].dacArray_list[k][dac_t].dac_list[l].num_access - for k in range (2*cfg.num_matrix*cfg.phy2log_ratio): - hw_comp_access['snh'] += (node_dut.tile_list[i].ima_list[j].snh_list[k].num_access * cfg.xbar_size) # each snh is - # basically an array of multiple snhs (individual power in constants file must be for one discerete snh) + if cfg.MVMU_ver == "Analog": + for k in range (2*cfg.num_matrix*cfg.phy2log_ratio): + hw_comp_access['snh'] += (node_dut.tile_list[i].ima_list[j].snh_list[k].num_access * cfg.xbar_size) # each snh is + # basically an array of multiple snhs (individual power in constants file must be for one discerete snh) for k in range (2*cfg.num_matrix): hw_comp_access['mux1'] += node_dut.tile_list[i].ima_list[j].mux1_list[k].num_access @@ -117,8 +213,10 @@ def get_hw_stats (fid, node_dut, cycle): for k in range (cfg.num_adc): hw_comp_access['mux2'] += node_dut.tile_list[i].ima_list[j].mux1_list[k].num_access - for k in range (cfg.num_adc): - hw_comp_access['adc'] += node_dut.tile_list[i].ima_list[j].adc_list[k].num_access + if cfg.MVMU_ver == "Analog": + for k in range (cfg.num_adc): + for key,value in hw_comp_access['adc'].items(): + hw_comp_access['adc'][key] += node_dut.tile_list[i].ima_list[j].adc_list[k].num_access[key] for k in range (cfg.num_ALU): hw_comp_access['alu_div'] += node_dut.tile_list[i].ima_list[j].alu_list[k].num_access_div + \ @@ -143,19 +241,92 @@ def get_hw_stats (fid, node_dut, cycle): hw_comp_access['tile_control'] = sum_num_cycle_ima total_energy = 0 + total_adc_energy = 0 + total_adc_access = 0 + total_mvm_energy = 0 + total_mvm_access = 0 + total_mtvm_access = 0 + total_mtvm_energy = 0 + total_op_access = 0 + total_op_energy = 0 # Compute the total dynamic energy consumption - for key, value in hw_comp_access.items(): - total_energy += value * hw_comp_energy[key] + if cfg.MVMU_ver == "Analog": + for key, value in hw_comp_access.items(): + if key == 'adc': + for key1, value1 in hw_comp_access['adc'].items(): + total_energy += value1*hw_comp_energy['adc'][key1] + total_adc_energy += value1*hw_comp_energy['adc'][key1] # Not needed for function but for output visualisation + total_adc_access += value1 + elif key == 'xbar_mvm': + for key1, value1 in hw_comp_access['xbar_mvm'].items(): + total_energy += value1*hw_comp_energy['xbar_mvm'][key1] + total_mvm_energy += value1*hw_comp_energy['xbar_mvm'][key1] # Not needed for function but for output visualisation + total_mvm_access += value1 + elif key == 'xbar_mtvm': + for key1, value1 in hw_comp_access['xbar_mtvm'].items(): + total_energy += value1*hw_comp_energy['xbar_mtvm'][key1] + total_mvm_energy += value1*hw_comp_energy['xbar_mtvm'][key1] # Not needed for function but for output visualisation + total_mvm_access += value1 + elif key == 'xbar_op': + for key1, value1 in hw_comp_access['xbar_op'].items(): + total_energy += value1*hw_comp_energy['xbar_op'][key1] + total_op_energy += value1*hw_comp_energy['xbar_op'][key1] # Not needed for function but for output visualisation + total_op_access += value1 + else: + total_energy += value * hw_comp_energy[key] + else: + for key, value in hw_comp_access.items(): + if key == 'adc': + for key1, value1 in hw_comp_access['adc'].items(): + total_energy += value1*hw_comp_energy['adc'][key1] + total_adc_energy += value1*hw_comp_energy['adc'][key1] # Not needed for function but for output visualisation + total_adc_access += value1 + elif key == 'xbar_mvm': + for key1, value1 in hw_comp_access['xbar_mvm'].items(): + total_energy += (value1/16)*hw_comp_energy['xbar_mvm'][key1] + total_mvm_energy += (value1/16)*hw_comp_energy['xbar_mvm'][key1] # Not needed for function but for output visualisation + total_mvm_access += (value1/16) + elif key == 'xbar_mtvm': + for key1, value1 in hw_comp_access['xbar_mtvm'].items(): + total_energy += (value1/16)*hw_comp_energy['xbar_mtvm'][key1] + total_mvm_energy += (value1/16)*hw_comp_energy['xbar_mtvm'][key1] # Not needed for function but for output visualisation + total_mvm_access += (value1/16) + elif key == 'xbar_op': + for key1, value1 in hw_comp_access['xbar_op'].items(): + total_energy += (value1/16)*hw_comp_energy['xbar_op'][key1] + total_op_energy += (value1/16)*hw_comp_energy['xbar_op'][key1] # Not needed for function but for output visualisation + total_op_access += (value1/16) + else: + total_energy += value * hw_comp_energy[key] # Write the dict comp_access & energy proportion to a file for visualization + fid.write ("MVMU Type : " + cfg.MVMU_ver + "\n") fid.write ('Access and energy distribution of dynamic energy: \n') fid.write ('Component num_access percent\n') for key, value in hw_comp_access.items(): # put extra spaces for better visulalization of values bl_spc1 = (28-len(key)) * ' ' - bl_spc2 = (22-len(str(value))) * ' ' - fid.write (key + bl_spc1 + str(value) + bl_spc2 +\ - (str(value*hw_comp_energy[key]/total_energy*100))[0:4] + ' %\n') + # bl_spc2 = (22-len(str(value))) * ' ' + if key == 'adc': + bl_spc2 = (22-len(str(total_adc_access))) * ' ' + fid.write (key + bl_spc1 + str(total_adc_access) + bl_spc2 +\ + (str(total_adc_energy/total_energy*100))[0:4] + ' %\n') + elif key == 'xbar_mvm': + bl_spc2 = (22-len(str(total_mvm_access))) * ' ' + fid.write (key + bl_spc1 + str(total_mvm_access) + bl_spc2 +\ + (str(total_mvm_energy/total_energy*100))[0:4] + ' %\n') + elif key == 'xbar_mtvm': + bl_spc2 = (22-len(str(total_mtvm_access))) * ' ' + fid.write (key + bl_spc1 + str(total_mtvm_access) + bl_spc2 +\ + (str(total_mtvm_energy/total_energy*100))[0:4] + ' %\n') + elif key == 'xbar_op': + bl_spc2 = (22-len(str(total_op_access))) * ' ' + fid.write (key + bl_spc1 + str(total_op_access) + bl_spc2 +\ + (str(total_op_energy/total_energy*100))[0:4] + ' %\n') + else: + bl_spc2 = (22-len(str(value))) * ' ' + fid.write (key + bl_spc1 + str(value) + bl_spc2 +\ + (str(value*hw_comp_energy[key]/total_energy*100))[0:4] + ' %\n') fid.write ('\n') @@ -168,17 +339,17 @@ def get_hw_stats (fid, node_dut, cycle): # Write the leakage energy(J), total_energy(J), average_power (mW), peak_power (mW), # area (mm2), cycles and time (seconds) to a dict & file - metric_dict = {'leakage_energy':0.0, - 'dynamic_energy':0.0, - 'total_energy':0.0, - 'average_power':0.0, - 'peak_power':0.0, - 'leakage_power':0.0, - 'node_area':0.0, - 'tile_area':0.0, - 'core_area':0.0, - 'cycles':0, - 'time':0.0} + metric_dict = { 'leakage_energy':0.0, + 'dynamic_energy':0.0, + 'total_energy':0.0, + 'average_power':0.0, + 'peak_power':0.0, + 'leakage_power':0.0, + 'node_area':0.0, + 'tile_area':0.0, + 'core_area':0.0, + 'cycles':0, + 'time':0.0} metric_dict['leakage_power'] = node_metrics.compute_pow_leak () # in mW metric_dict['peak_power'] = node_metrics.compute_pow_peak () # in mW @@ -187,10 +358,14 @@ def get_hw_stats (fid, node_dut, cycle): metric_dict['core_area'] = ima_metrics.compute_area ()# in mm2 metric_dict['cycles'] = cycle metric_dict['time'] = cycle * param.cycle_time * (10**(-9)) # in sec - metric_dict['dynamic_energy'] = total_energy * ns * mw # in joule #metric_dict['leakage_enegy'] = metric_dict['leakage_power'] * mw * metric_dict['time'] # in joule metric_dict['leakage_energy'] = leakage_energy * ns * mw # in joule + # if cfg.MVMU_ver == "Analog": + metric_dict['dynamic_energy'] = total_energy * ns * mw # in joule metric_dict['total_energy'] = metric_dict['dynamic_energy'] + metric_dict['leakage_energy'] + # else: + # metric_dict['total_energy'] = total_energy * ns * mw # in joule + # metric_dict['dynamic_energy'] = metric_dict['total_energy'] metric_dict['average_power'] = metric_dict['total_energy'] / metric_dict['time'] * (10**(3)) # in mW for key, value in metric_dict.items(): diff --git a/src/ima.py b/src/ima.py index b7f5c33e..303b73a6 100644 --- a/src/ima.py +++ b/src/ima.py @@ -39,6 +39,7 @@ def __init__ (self): self.matrix_list = [] # list of dicts of mvmu(s) self.xb_inMem_list = [] # list of dicts of xbar input memory self.xb_outMem_list = [] # list of dicts of xbar output memory + self.xbar_inMem_Sparsity_list = [] # list of sparsity od xbar in mem (may have to be removed if found redundant) for i in xrange(cfg.num_matrix): # each matrix represents three mvmus - 1 mvmu for fw, 1 mvmu for bw, 1 mvmu (2X width) for delta @@ -465,7 +466,7 @@ def getXbarAddr (data_addr): else: assert (1==0), "xbar memory addressing failed" - return [num_matrix, xbar_type, mem_addr, xbar_addr] + return [num_matrix, xbar_type, mem_addr, xbar_addr] # write to the xbar memory (in/out) space depending on the address def writeToXbarMem (self, data_addr, data): @@ -594,12 +595,24 @@ def inner_product (mat_id, key): # reset the xb out memory before starting to accumulate self.xb_outMem_list[mat_id][key].reset () + xbar_inMem = self.xb_inMem_list[mat_id][key].read_all () + # print ("xb_inMem", xbar_inMem) + # calculate sparsity of xbar_in_mem + non_0_val = 0 + for i in range(cfg.xbar_size): + if xbar_inMem[i] != '0': + non_0_val = non_0_val +1 + sparsity = (cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size + # print ("non_0_val", non_0_val) + # print ("Sparsity", sparsity) + ## Loop to cover all bits of inputs for k in xrange (cfg.xbdata_width / cfg.dac_res): #for k in xrange (1): # read the values from the xbar's input register out_xb_inMem = self.xb_inMem_list[mat_id][key].read (cfg.dac_res) - + # print("out_xb_inMem", out_xb_inMem) + #*************************************** HACK ********************************************* ###### CAUTION: Not replicated exact "functional" circuit behaviour for analog parts ###### Use propagate (not propagate_hack) for DAC, Xbar, TIA, SNH, ADC when above is done @@ -614,7 +627,7 @@ def inner_product (mat_id, key): out_snh = [[] for x in range(num_xb)] for m in range (num_xb): # compute dot-product - out_xbar[m] = self.matrix_list[mat_id][key][m].propagate_dummy(out_dac) + out_xbar[m] = self.matrix_list[mat_id][key][m].propagate_dummy(non_0_val, out_dac) # do sampling and hold out_snh[m] = self.snh_list[mat_id*num_xb+m].propagate_dummy(out_xbar[m]) @@ -627,7 +640,7 @@ def inner_product (mat_id, key): adc_id = (mat_id*num_xb + m) % cfg.num_adc out_mux1 = self.mux1_list[mat_id].propagate_dummy(out_snh[m][j]) # i is the ith xbar out_mux2 = self.mux2_list[mat_id % cfg.num_adc].propagate_dummy(out_mux1) - out_adc = self.adc_list[adc_id].propagate_dummy(out_mux2) + out_adc = self.adc_list[adc_id].propagate_dummy(out_mux2, non_0_val) # shift and add outputs from difefrent wt_bits alu_op = 'sna' @@ -781,7 +794,7 @@ def xbComputeLatency (self, mask): lat_temp = 0 # We assume all ADCs in a matrix has the same resolution adc_idx = idx*cfg.num_adc_per_matrix - lat_temp = self.adc_list[adc_idx].getLatency() + lat_temp = self.adc_list[adc_idx].getLatency(cfg.xbar_size) ''' print("adc_idx", adc_idx) print("lat_temp", lat_temp) @@ -842,7 +855,25 @@ def xbComputeLatency (self, mask): elif (ex_op == 'mvm'): mask_temp = self.de_xb_nma - self.stage_latency[sId] = xbComputeLatency (self, mask_temp) # mask tells which of ip/op or both is occurring + if (cfg.MVMU_ver == "Analog"): + self.stage_latency[sId] = xbComputeLatency (self, mask_temp) # mask tells which of ip/op or both is occurring + else: + mvm_lat_temp = 0 + if (cfg.inference): + for p in xrange(cfg.num_matrix): + if self.de_xb_nma[p]: + xbar_inMem = self.xb_inMem_list[p]['f'].read_all () + non_0_val = 0 + for i in range(cfg.xbar_size): + if xbar_inMem[i] != '0': + non_0_val = non_0_val +1 + print ("non_0_val", non_0_val) + nval_percent = int(non_0_val*100/128) + if (nval_percent%10!=0): + nval_percent = nval_percent + 10 + mvm_lat_temp += param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)][str(nval_percent)] + self.stage_latency[sId] = mvm_lat_temp + print("MVM Latency", self.stage_latency[sId]) # Needs update - use xbar serial read latency elif (ex_op == 'crs'): diff --git a/src/ima_metrics.py b/src/ima_metrics.py index e68dd112..0f3a1ea1 100644 --- a/src/ima_metrics.py +++ b/src/ima_metrics.py @@ -11,12 +11,13 @@ def compute_area (): #in mm2 area = 0.0 area += (cfg.num_matrix*3) * param.xbar_inMem_area # xbar_inMem one each for f/b/d xbars - area += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_area # 1 dac for input of f/b/d xbars, each phy xbar in d-xbar will have a dac_array, hence 8 + if cfg.MVMU_ver == "Analog": + area += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_area # 1 dac for input of f/b/d xbars, each phy xbar in d-xbar will have a dac_array, hence 8 + area += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_area # snh for f/b xbars + area += (cfg.num_matrix*2) * param.sna_area # sna for one each f/b xbars + area += cfg.num_adc * param.adc_area # adc + area += (cfg.num_matrix*3) * param.xbar_outMem_area # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) area += (cfg.num_matrix*4) * param.xbar_area # d-xbar has 2X xbars than f/b - area += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_area # snh for f/b xbars - area += cfg.num_adc * param.adc_area # adc - area += (cfg.num_matrix*2) * param.sna_area # sna for one each f/b xbars - area += (cfg.num_matrix*3) * param.xbar_outMem_area # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) area += param.instrnMem_area # instrnMem area += param.dataMem_area # dataMem area += param.alu_area # alu @@ -35,12 +36,13 @@ def compute_area (): #in mm2 def compute_pow_leak (): leak_pow = 0.0 leak_pow += (cfg.num_matrix*3) * param.xbar_inMem_pow_leak # xbar_inMem - leak_pow += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_pow_leak # dac + if cfg.MVMU_ver == "Analog": + leak_pow += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_pow_leak # dac + leak_pow += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_pow_leak # snh + leak_pow += cfg.num_adc * param.adc_pow_leak # adc + leak_pow += (cfg.num_matrix*2) * param.sna_pow_leak # sna + leak_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_leak # xbar_outMem leak_pow += (cfg.num_matrix*4) * param.xbar_pow_leak # xbar area - leak_pow += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_pow_leak # snh - leak_pow += cfg.num_adc * param.adc_pow_leak # adc - leak_pow += (cfg.num_matrix*2) * param.sna_pow_leak # sna - leak_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_leak # xbar_outMem leak_pow += param.instrnMem_pow_leak # instrnMem leak_pow += param.dataMem_pow_leak # dataMem leak_pow += param.alu_pow_leak # alu @@ -51,15 +53,16 @@ def compute_pow_leak (): # Peak dynamic power (assumes all components are being accessed in each cycle) def compute_pow_dyn (): dyn_pow = 0.0 - dyn_pow += (cfg.num_matrix*3) * (param.xbar_inMem_pow_dyn_write + param.xbar_inMem_pow_dyn_read/cfg.xbar_size) # xbar_inMem - num_xbar * dac_res bits will be + if cfg.MVMU_ver == "Analog": + dyn_pow += (cfg.num_matrix*3) * (param.xbar_inMem_pow_dyn_write + param.xbar_inMem_pow_dyn_read/cfg.xbar_size) # xbar_inMem - num_xbar * dac_res bits will be # read from xb_inMem in an interval that equals xbar_access time # dyn_pow += cfg.num_xbar/2 * 1.2 # (adding dyn pow the way issac does for comparison) - dyn_pow += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_pow_dyn # dac + dyn_pow += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_pow_dyn # dac + dyn_pow += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_pow_dyn # snh + dyn_pow += cfg.num_adc * param.adc_pow_dyn # adc + dyn_pow += (cfg.num_matrix*2) * param.sna_pow_dyn # sna + dyn_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_dyn # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) dyn_pow += (cfg.num_matrix*4) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power - dyn_pow += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_pow_dyn # snh - dyn_pow += cfg.num_adc * param.adc_pow_dyn # adc - dyn_pow += (cfg.num_matrix*2) * param.sna_pow_dyn # sna - dyn_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_dyn # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) dyn_pow += param.instrnMem_pow_dyn # instrnMem dyn_pow += param.dataMem_pow_dyn # dataMem dyn_pow += param.alu_pow_dyn # alu diff --git a/src/ima_modules.py b/src/ima_modules.py index 7a574817..1aa6f3c3 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -15,7 +15,18 @@ class xbar (object): def __init__ (self, xbar_size, xbar_value= 'nil' ): # define num_accesses for different operations - self.num_access = 0 # parallel reads (inner-product) + # parallel reads (inner-product) + self.num_access = { '100':0, \ + '90': 0, \ + '80': 0, \ + '70': 0, \ + '60': 0, \ + '50': 0, \ + '40': 0, \ + '30': 0, \ + '20': 0, \ + '10': 0} \ + self.num_access_rd = 0 # serial reads self.num_access_wr = 0 # serial writes @@ -90,11 +101,36 @@ def propagate (self, inp = 'nil'): return out # HACK - until propagate doesn't have correct analog functionality - def propagate_dummy (self, inp = 'nil'): + def propagate_dummy (self, n_val, inp = 'nil'): # data input is list of bit strings (of length dac_res) - fixed point binary assert (inp != 'nil'), 'propagate needs a non-nil input' assert (len(inp) == self.xbar_size), 'xbar input size mismatch' - self.num_access += 1 + + #Modification to accomodate sparsity and digital crossbars + if cfg.MVMU_ver == "Analog": + self.num_access['100'] += 1 + else: + if n_val>cfg.xbar_size*9/10.0: + self.num_access['100'] += 1 + elif n_val>cfg.xbar_size*8/10.0: + self.num_access['90'] += 1 + elif n_val>cfg.xbar_size*7/10.0: + self.num_access['80'] += 1 + elif n_val>cfg.xbar_size*6/10.0: + self.num_access['70'] += 1 + elif n_val>cfg.xbar_size*5/10.0: + self.num_access['60'] += 1 + elif n_val>cfg.xbar_size*4/10.0: + self.num_access['50'] += 1 + elif n_val>cfg.xbar_size*3/10.0: + self.num_access['40'] += 1 + elif n_val>cfg.xbar_size*2/10.0: + self.num_access['30'] += 1 + elif n_val>cfg.xbar_size*1/10.0: + self.num_access['20'] += 1 + else: + self.num_access['10'] += 1 + # convert input from fixed point binary (string) to float inp_float = [0.0] * self.xbar_size for i in range(len(inp)): @@ -222,14 +258,23 @@ def propagate_dummy (self, inp_list): class adc (object): def __init__ (self, adc_res): # define num_access - self.num_access = 0 + self.num_access = { 'n' : 0, + 'n/2': 0, + '3n/4': 0, + '7n/8': 0, + '15n/16': 0, + '31n/32': 0, + '63n/64': 0, + '127n/128': 0, + '255n/256': 0} # define latency - self.latency = param.adc_lat_dict[str(adc_res)] + # self.latency = param.adc_lat_dict[str(adc_res)] self.adc_res = adc_res - def getLatency (self): + def getLatency (self, n_val): + self.latency = param.adc_lat_dict[str(self.adc_res)] return self.latency def real2bin (self, inp, num_bits): @@ -246,8 +291,26 @@ def propagate (self, inp): return self.real2bin (inp, num_bits) # HACK - until propagate doesn't have correct analog functionality - def propagate_dummy (self, inp): - #self.num_access += 1 + def propagate_dummy (self, inp, n_val): + if n_val>cfg.xbar_size/2.0: + self.num_access['n'] += 1 + elif n_val>cfg.xbar_size/4.0: + self.num_access['n/2'] += 1 + elif n_val>cfg.xbar_size/8.0: + self.num_access['3n/4'] += 1 + elif n_val>cfg.xbar_size/16.0: + self.num_access['7n/8'] += 1 + elif n_val>cfg.xbar_size/32.0: + self.num_access['15n/16'] += 1 + elif n_val>cfg.xbar_size/64.0: + self.num_access['31n/32'] += 1 + elif n_val>cfg.xbar_size/128.0: + self.num_access['63n/64'] += 1 + elif n_val>cfg.xbar_size/256.0: + self.num_access['127n/128'] += 1 + else: + self.num_access['255n/256'] += 1 + return inp # Doesn't replicate the exact (sample and hold) functionality (just does hold) @@ -512,6 +575,13 @@ def read (self, num_bits): out_list.append(value[-1*num_bits:]) return out_list + def read_all (self): + out_list = [] + for i in xrange(self.xbar_size): + value = self.memfile[i] + out_list.append(value) + return out_list + def write (self, addr, data): self.num_access_write += 1 assert (type(addr) == int), 'addr type should be int' From 3645ad129395398de79d5793bb85471a8e6de26d Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Wed, 27 May 2020 00:24:41 -0400 Subject: [PATCH 05/15] Mergingchanges for digital MVMU energy mnumbers --- include/config.py | 8 +- include/constants.py | 226 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 230 insertions(+), 4 deletions(-) diff --git a/include/config.py b/include/config.py index 7456665e..80567c82 100644 --- a/include/config.py +++ b/include/config.py @@ -5,9 +5,15 @@ cycles_max = 5000000 # Put both these to very large numbers (when design is bug-free)! debug = 1 xbar_record = 1 -inference = 1 +inference = 1 # For training change this flag training = not(inference) +## Variable to define the type of MVMU +# One of "Analog", "Digital_V1" or "Digital_V2" +# Digital_V1 has compressed inputs (Data+Offset style) +# Digital_V2 has uncompressed inputs (Skips computations for 0 activation) +MVMU_ver = "Digital_V2" + ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 int_bits = 4 diff --git a/include/constants.py b/include/constants.py index 4246f10e..ee931fc0 100644 --- a/include/constants.py +++ b/include/constants.py @@ -119,6 +119,182 @@ xbar_rd_pow = 208.0 * 1000 * (1/32.0) / xbar_rd_lat xbar_wr_pow = 676.0 * 1000 * (1/32.0) / xbar_rd_lat +## Adding power area and latency for Digital MVMU V1 and V2 +Digital_xbar_lat_dict = {'Digital_V1': {'32': { '100':130, # first indexed by version then by xbar_size and then by % of non_0 values + '90': 114, # For V1 it is (4n+2)*T and for V2 it is (3n+2+xbar_size)*T + '80': 102, + '70': 90, + '60': 78, + '50': 66, + '40': 50, + '30': 38, + '20': 26, + '10': 14}, + '64': { '100':258, + '90': 230, + '80': 206, + '70': 178, + '60': 154, + '50': 130, + '40': 102, + '30': 78, + '20': 50, + '10': 26}, + '128':{ '100':514, + '90': 462, + '80': 410, + '70': 358, + '60': 306, + '50': 258, + '40': 206, + '30': 154, + '20': 102, + '10': 50}, + '256':{ '100':1026, + '90': 922, + '80': 818, + '70': 718, + '60': 614, + '50': 514, + '40': 410, + '30': 306, + '20': 206, + '10': 102}}, + 'Digital_V2': {'32' :{ '100':130, + '90': 118, + '80': 109, + '70': 100, + '60': 91, + '50': 82, + '40': 70, + '30': 61, + '20': 52, + '10': 43}, + '64' :{ '100':258, + '90': 237, + '80': 219, + '70': 198, + '60': 180, + '50': 162, + '40': 141, + '30': 123, + '20': 102, + '10': 84}, + '128':{ '100':514, + '90': 475, + '80': 436, + '70': 397, + '60': 358, + '50': 322, + '40': 283, + '30': 244, + '20': 205, + '10': 166}, + '256':{ '100':1026, + '90': 948, + '80': 870, + '70': 795, + '60': 717, + '50': 642, + '40': 564, + '30': 486, + '20': 411, + '10': 333}}} + +Digital_xbar_area_dict = {'Digital_V1': { '32' : 0.16977, # first indexed by version then by xbar_size + '64' : 0.27701, + '128': 1.74020, + '256': 7.29481}, + 'Digital_V2': { '32' : 0.16949, + '64' : 0.27645, + '128': 1.73908, + '256': 7.29257}} + +Digital_xbar_energy_dict = {'Digital_V1':{'32':{'100':5261.43744, # first indexed by version then by xbar_size and then by % of non_0 values + '90': 4613.872832, # For V1 it is (4n+2)*T and for V2 it is (3n+2+xbar_size)*T + '80': 4128.199376, # in pJ + '70': 3642.52592, + '60': 3156.852464, + '50': 2671.179008, + '40': 2023.6144, + '30': 1537.940944, + '20': 1052.267488, + '10': 566.594032}, + '64':{'100':20844.00864, + '90': 18581.86252, + '80': 16642.88014, + '70': 14380.73402, + '60': 12441.75163, + '50': 10502.76925, + '40': 8240.623131, + '30': 6301.640745, + '20': 4039.494628, + '10': 2100.512242}, + '128':{'100': 83018.14464, + '90': 74619.39346, + '80': 66220.64228, + '70': 57821.8911, + '60': 49423.13992, + '50': 41670.44653, + '40': 33271.69535, + '30': 24872.94417, + '20': 16474.19299, + '10': 8075.441812}, + '256':{'100': 331639.0958, + '90': 298022.5268, + '80': 264405.9578, + '70': 232082.3337, + '60': 198465.7647, + '50': 166142.1407, + '40': 132525.5717, + '30': 98909.00265, + '20': 66585.3786, + '10': 32968.80959}}, + 'Digital_V2':{'32':{'100':4466.744263, + '90': 4053.765767, + '80': 3744.031895, + '70': 3434.298023, + '60': 3124.564151, + '50': 2814.830279, + '40': 2401.851783, + '30': 2092.117911, + '20': 1782.384039, + '10': 1472.650167}, + '64':{'100':17654.27322, + '90': 16216.06481, + '80': 14983.31474, + '70': 13545.10633, + '60': 12312.35626, + '50': 11079.6062, + '40': 9641.397787, + '30': 8408.647721, + '20': 6970.439311, + '10': 5737.689245}, + '128':{'100': 70237.24474, + '90': 64904.19392, + '80': 59571.14309, + '70': 54238.09226, + '60': 48905.04144, + '50': 43982.22529, + '40': 38649.17446, + '30': 33316.12363, + '20': 27983.07281, + '10': 22650.02198}, + '256':{'100': 280471.5471, + '90': 259128.5, + '80': 237785.453, + '70': 217263.2925, + '60': 195920.2454, + '50': 175398.0849, + '40': 154055.0379, + '30': 132711.9909, + '20': 112189.8303, + '10': 90846.78326}}} +Digital_xbar_pow_leak_dict = { '32' :5.575928889, #in mW + '64' :12.82466678, + '128':40.24037556, + '256':120.2098611} + # DAC - Discuss exact values with ISSAC authors dac_lat_dict = {'1' : 1, '2' : 1, @@ -145,27 +321,44 @@ '16': 1.67 * 10**(-7)} # ADC - Discuss exact values with ISSAC authors +# ADC Values for including sparsity adc_lat_dict = {'1' : 12.5, '2' : 25, + '3' : 37.5, '4' : 50, + '5' : 62.5, + '6' : 75, + '7' : 87.5, '8' : 100, '16': 200} adc_pow_dyn_dict = {'1' : 0.225, '2' : 0.45, + '3' : 0.675, '4' : 0.9, + '5' : 1.125, + '6' : 1.35, + '7' : 1.575, '8' : 1.8, '16': 3.6} adc_pow_leak_dict = {'1' : 0.025, '2' : 0.05, + '3' : 0.075, '4' : 0.1, + '5' : 0.125, + '6' : 0.15, + '7' : 0.175, '8' : 0.2, '16': 0.4} adc_area_dict = {'1' : 0.0012, '2' : 0.0012, + '3' : 0.0012, '4' : 0.0012, + '5' : 0.0012, + '6' : 0.0012, + '7' : 0.0012, '8' : 0.0012, '16': 0.0012} @@ -319,7 +512,19 @@ # Chosen latency based on config file - only for components whose latency is parameter dependent #xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] -xbar_ip_lat = xbar_ip_lat +# xbar_innerp_lat_dict = {'32':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}, +# '64':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}, +# '128':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}, +# '256':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}} +xbar_ip_lat_dict = {'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key, value in xbar_ip_lat_dict.items(): + xbar_ip_lat_dict[key] = xbar_ip_lat +else: + xbar_ip_lat_dict = Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +print("xbar_ip_lat_dict",xbar_ip_lat_dict) + + xbar_op_lat = xbar_op_lat xbar_rd_lat = xbar_rd_lat xbar_wr_lat = xbar_wr_lat @@ -332,7 +537,10 @@ dataMem_lat = dataMem_lat_dict[str(cfg.dataMem_size)] # Chosen area based on config file - only for components whose latency is parameter dependent -xbar_area = xbar_area_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] +if cfg.MVMU_ver == "Analog": + xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] +else: + xbar_area = Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] dac_area = dac_area_dict [str(cfg.dac_res)] adc_area = adc_area_dict [str(cfg.adc_res)] xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] @@ -354,8 +562,20 @@ instrnMem_pow_dyn = instrnMem_pow_dyn_dict[str(cfg.instrnMem_size)] * math.sqrt(8) #area scaling for 8 bytes per instruction dataMem_pow_dyn = dataMem_pow_dyn_dict[str(cfg.dataMem_size)] +# Energy +xbar_ip_energy_dict = {'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0} +if cfg.MVMU_ver == "Analog": + for key,value in xbar_ip_energy_dict.items(): + xbar_ip_energy_dict[key] = xbar_ip_lat*xbar_ip_pow_dyn +else: + xbar_ip_energy_dict = Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] +print('xbar_ip_energy_dict', xbar_ip_energy_dict) + # Chosen leak_power based on config file - only for components whose latency is parameter dependent -xbar_pow_leak = 0 +if cfg.MVMU_ver == "Analog": + xbar_pow_leak = 0 +else: + xbar_pow_leak = Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] dac_pow_leak = dac_pow_leak_dict [str(cfg.dac_res)] adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] From b231a28f6f7b048558855572e4123ef08d672808 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Wed, 10 Jun 2020 16:51:53 -0400 Subject: [PATCH 06/15] Corrected area computation fro ima --- src/ima_metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ima_metrics.py b/src/ima_metrics.py index 0f3a1ea1..f05f6087 100644 --- a/src/ima_metrics.py +++ b/src/ima_metrics.py @@ -10,14 +10,16 @@ # Area is computed as the summation of all component area (doesn't consider physical layout) def compute_area (): #in mm2 area = 0.0 - area += (cfg.num_matrix*3) * param.xbar_inMem_area # xbar_inMem one each for f/b/d xbars if cfg.MVMU_ver == "Analog": area += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_area # 1 dac for input of f/b/d xbars, each phy xbar in d-xbar will have a dac_array, hence 8 area += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_area # snh for f/b xbars area += (cfg.num_matrix*2) * param.sna_area # sna for one each f/b xbars area += cfg.num_adc * param.adc_area # adc area += (cfg.num_matrix*3) * param.xbar_outMem_area # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) - area += (cfg.num_matrix*4) * param.xbar_area # d-xbar has 2X xbars than f/b + area += (cfg.num_matrix*4) * cfg.phy2log_ratio * param.xbar_area # d-xbar has 2X xbars than f/b + else: + area += (cfg.num_matrix*2) * param.xbar_area # d-xbar are not needed in Digital MVMUs xbars than f/b + area += (cfg.num_matrix*3) * param.xbar_inMem_area # xbar_inMem one each for f/b/d xbars area += param.instrnMem_area # instrnMem area += param.dataMem_area # dataMem area += param.alu_area # alu From b34045f8c7436c98d3223ea91d905e7977eb173a Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Thu, 11 Jun 2020 22:09:00 -0400 Subject: [PATCH 07/15] Merging changes for digital MVMUs --- include/config.py | 8 +++++--- include/example-configs/config-cnn.py | 6 ++++++ include/example-configs/config-mlp.py | 6 ++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/config.py b/include/config.py index 46226f52..086b2416 100644 --- a/include/config.py +++ b/include/config.py @@ -5,14 +5,14 @@ cycles_max = 5000000 # Put both these to very large numbers (when design is bug-free)! debug = 1 xbar_record = 1 -inference = 1 # For training change this flag +inference = 1 training = not(inference) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" # Digital_V1 has compressed inputs (Data+Offset style) # Digital_V2 has uncompressed inputs (Skips computations for 0 activation) -MVMU_ver = "Digital_V2" +MVMU_ver = "Digital_V2" ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 @@ -36,7 +36,9 @@ data_width = num_bits # (in bits) xbdata_width = data_width # (in bits) instrn_width = 48 # (in bits) - +# Input and Weight parameters +input_prec = 16 +weight_width = 16 # Change here - Specify the IMA parameters here xbar_bits = 2 num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training 8-fw, 8-bw and 16-delta physical xbars. diff --git a/include/example-configs/config-cnn.py b/include/example-configs/config-cnn.py index d60c657e..10e2e418 100644 --- a/include/example-configs/config-cnn.py +++ b/include/example-configs/config-cnn.py @@ -8,6 +8,12 @@ inference = 1 training = not(inference) +## Variable to define the type of MVMU +# One of "Analog", "Digital_V1" or "Digital_V2" +# Digital_V1 has compressed inputs (Data+Offset style) +# Digital_V2 has uncompressed inputs (Skips computations for 0 activation) +MVMU_ver = "Digital_V2" + ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 int_bits = 4 diff --git a/include/example-configs/config-mlp.py b/include/example-configs/config-mlp.py index ee3647fa..086b2416 100644 --- a/include/example-configs/config-mlp.py +++ b/include/example-configs/config-mlp.py @@ -8,6 +8,12 @@ inference = 1 training = not(inference) +## Variable to define the type of MVMU +# One of "Analog", "Digital_V1" or "Digital_V2" +# Digital_V1 has compressed inputs (Data+Offset style) +# Digital_V2 has uncompressed inputs (Skips computations for 0 activation) +MVMU_ver = "Digital_V2" + ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 int_bits = 4 From 15f1748a754aee27b496dfcb3530725116ded7c4 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Fri, 12 Jun 2020 00:08:33 -0400 Subject: [PATCH 08/15] Merging changes for digital MVMUs --- src/ima_metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ima_metrics.py b/src/ima_metrics.py index f05f6087..8cfb1c4e 100644 --- a/src/ima_metrics.py +++ b/src/ima_metrics.py @@ -37,14 +37,16 @@ def compute_area (): #in mm2 # Leakage power is computed as sum of leakage powers of all components def compute_pow_leak (): leak_pow = 0.0 - leak_pow += (cfg.num_matrix*3) * param.xbar_inMem_pow_leak # xbar_inMem if cfg.MVMU_ver == "Analog": leak_pow += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_pow_leak # dac leak_pow += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_pow_leak # snh leak_pow += cfg.num_adc * param.adc_pow_leak # adc leak_pow += (cfg.num_matrix*2) * param.sna_pow_leak # sna leak_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_leak # xbar_outMem - leak_pow += (cfg.num_matrix*4) * param.xbar_pow_leak # xbar area + leak_pow += (cfg.num_matrix*4) * param.xbar_pow_leak # xbar area + else: + leak_pow += (cfg.num_matrix*2) * param.xbar_pow_leak # xbar area + leak_pow += (cfg.num_matrix*3) * param.xbar_inMem_pow_leak # xbar_inMem leak_pow += param.instrnMem_pow_leak # instrnMem leak_pow += param.dataMem_pow_leak # dataMem leak_pow += param.alu_pow_leak # alu From 4b308d348af9e51e8258c2d04705458b1eb7b87f Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Fri, 12 Jun 2020 00:36:21 -0400 Subject: [PATCH 09/15] Merging changes for digital MVMUs --- src/ima_modules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ima_modules.py b/src/ima_modules.py index 84c16cd1..922e61af 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -6,8 +6,8 @@ import sys import numpy as np -import include.constants as param -import include.config as cfg +import constants as param +import config as cfg import math from data_convert import * From 1f053a058afcc8a1c64ee76f031c5744b685150c Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Sat, 13 Jun 2020 01:21:11 -0400 Subject: [PATCH 10/15] Commiting after suggested changes --- include/config.py | 3 +- include/constants.py | 199 ++------------------------ include/constants_digital.py | 175 ++++++++++++++++++++++ include/example-configs/config-cnn.py | 1 + include/example-configs/config-mlp.py | 1 + src/dnn_wt_p.py | 1 - src/hw_stats.py | 144 ++++--------------- src/ima.py | 72 +++++----- src/ima_metrics.py | 14 +- src/ima_modules.py | 96 +++++-------- src/node_dump.py | 4 +- test/mvm_ip_test.py | 83 ----------- 12 files changed, 299 insertions(+), 494 deletions(-) create mode 100644 include/constants_digital.py delete mode 100644 test/mvm_ip_test.py diff --git a/include/config.py b/include/config.py index 086b2416..e09ebedc 100644 --- a/include/config.py +++ b/include/config.py @@ -7,12 +7,13 @@ xbar_record = 1 inference = 1 training = not(inference) +sparse_opt = 1 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" # Digital_V1 has compressed inputs (Data+Offset style) # Digital_V2 has uncompressed inputs (Skips computations for 0 activation) -MVMU_ver = "Digital_V2" +MVMU_ver = "Analog" ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 diff --git a/include/constants.py b/include/constants.py index 55ea43f1..d16395df 100644 --- a/include/constants.py +++ b/include/constants.py @@ -2,6 +2,7 @@ ## It also holds power, area and latency numbers of different component used in DPE design import config as cfg import math +import constants_digital as digi_param # Limits the number of cycles an IMA runs in case it doesn't halt infinity = 100000 @@ -119,182 +120,6 @@ xbar_rd_pow = 208.0 * 1000 * (1/32.0) / xbar_rd_lat xbar_wr_pow = 676.0 * 1000 * (1/32.0) / xbar_rd_lat -## Adding power area and latency for Digital MVMU V1 and V2 -Digital_xbar_lat_dict = {'Digital_V1': {'32': { '100':130, # first indexed by version then by xbar_size and then by % of non_0 values - '90': 114, # For V1 it is (4n+2)*T and for V2 it is (3n+2+xbar_size)*T - '80': 102, - '70': 90, - '60': 78, - '50': 66, - '40': 50, - '30': 38, - '20': 26, - '10': 14}, - '64': { '100':258, - '90': 230, - '80': 206, - '70': 178, - '60': 154, - '50': 130, - '40': 102, - '30': 78, - '20': 50, - '10': 26}, - '128':{ '100':514, - '90': 462, - '80': 410, - '70': 358, - '60': 306, - '50': 258, - '40': 206, - '30': 154, - '20': 102, - '10': 50}, - '256':{ '100':1026, - '90': 922, - '80': 818, - '70': 718, - '60': 614, - '50': 514, - '40': 410, - '30': 306, - '20': 206, - '10': 102}}, - 'Digital_V2': {'32' :{ '100':130, - '90': 118, - '80': 109, - '70': 100, - '60': 91, - '50': 82, - '40': 70, - '30': 61, - '20': 52, - '10': 43}, - '64' :{ '100':258, - '90': 237, - '80': 219, - '70': 198, - '60': 180, - '50': 162, - '40': 141, - '30': 123, - '20': 102, - '10': 84}, - '128':{ '100':514, - '90': 475, - '80': 436, - '70': 397, - '60': 358, - '50': 322, - '40': 283, - '30': 244, - '20': 205, - '10': 166}, - '256':{ '100':1026, - '90': 948, - '80': 870, - '70': 795, - '60': 717, - '50': 642, - '40': 564, - '30': 486, - '20': 411, - '10': 333}}} - -Digital_xbar_area_dict = {'Digital_V1': { '32' : 0.16977, # first indexed by version then by xbar_size - '64' : 0.27701, - '128': 1.74020, - '256': 7.29481}, - 'Digital_V2': { '32' : 0.16949, - '64' : 0.27645, - '128': 1.73908, - '256': 7.29257}} - -Digital_xbar_energy_dict = {'Digital_V1':{'32':{'100':5261.43744, # first indexed by version then by xbar_size and then by % of non_0 values - '90': 4613.872832, # For V1 it is (4n+2)*T and for V2 it is (3n+2+xbar_size)*T - '80': 4128.199376, # in pJ - '70': 3642.52592, - '60': 3156.852464, - '50': 2671.179008, - '40': 2023.6144, - '30': 1537.940944, - '20': 1052.267488, - '10': 566.594032}, - '64':{'100':20844.00864, - '90': 18581.86252, - '80': 16642.88014, - '70': 14380.73402, - '60': 12441.75163, - '50': 10502.76925, - '40': 8240.623131, - '30': 6301.640745, - '20': 4039.494628, - '10': 2100.512242}, - '128':{'100': 83018.14464, - '90': 74619.39346, - '80': 66220.64228, - '70': 57821.8911, - '60': 49423.13992, - '50': 41670.44653, - '40': 33271.69535, - '30': 24872.94417, - '20': 16474.19299, - '10': 8075.441812}, - '256':{'100': 331639.0958, - '90': 298022.5268, - '80': 264405.9578, - '70': 232082.3337, - '60': 198465.7647, - '50': 166142.1407, - '40': 132525.5717, - '30': 98909.00265, - '20': 66585.3786, - '10': 32968.80959}}, - 'Digital_V2':{'32':{'100':4466.744263, - '90': 4053.765767, - '80': 3744.031895, - '70': 3434.298023, - '60': 3124.564151, - '50': 2814.830279, - '40': 2401.851783, - '30': 2092.117911, - '20': 1782.384039, - '10': 1472.650167}, - '64':{'100':17654.27322, - '90': 16216.06481, - '80': 14983.31474, - '70': 13545.10633, - '60': 12312.35626, - '50': 11079.6062, - '40': 9641.397787, - '30': 8408.647721, - '20': 6970.439311, - '10': 5737.689245}, - '128':{'100': 70237.24474, - '90': 64904.19392, - '80': 59571.14309, - '70': 54238.09226, - '60': 48905.04144, - '50': 43982.22529, - '40': 38649.17446, - '30': 33316.12363, - '20': 27983.07281, - '10': 22650.02198}, - '256':{'100': 280471.5471, - '90': 259128.5, - '80': 237785.453, - '70': 217263.2925, - '60': 195920.2454, - '50': 175398.0849, - '40': 154055.0379, - '30': 132711.9909, - '20': 112189.8303, - '10': 90846.78326}}} -Digital_xbar_pow_leak_dict = { '32' :5.575928889, #in mW - '64' :12.82466678, - '128':40.24037556, - '256':120.2098611} - # DAC - Discuss exact values with ISSAC authors dac_lat_dict = {'1' : 1, '2' : 1, @@ -533,20 +358,12 @@ # Chosen latency based on config file - only for components whose latency is parameter dependent -#xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] -# xbar_innerp_lat_dict = {'32':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}, -# '64':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}, -# '128':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}, -# '256':{'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0}} -xbar_ip_lat_dict = {'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0} +xbar_ip_lat_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} if cfg.MVMU_ver == "Analog": for key, value in xbar_ip_lat_dict.items(): xbar_ip_lat_dict[key] = xbar_ip_lat else: - xbar_ip_lat_dict = Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] -print("xbar_ip_lat_dict",xbar_ip_lat_dict) - - + xbar_ip_lat_dict = digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)] xbar_op_lat = xbar_op_lat xbar_rd_lat = xbar_rd_lat xbar_wr_lat = xbar_wr_lat @@ -558,11 +375,11 @@ instrnMem_lat = instrnMem_lat_dict[str(cfg.instrnMem_size)] dataMem_lat = dataMem_lat_dict[str(cfg.dataMem_size)] -# Chosen area based on config file - only for components whose latency is parameter dependent +# Chosen area based on config file - only for components whose area is parameter dependent if cfg.MVMU_ver == "Analog": xbar_area = xbar_area_dict[str(cfg.xbar_bits)][str(cfg.xbar_size)] else: - xbar_area = Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] + xbar_area = digi_param.Digital_xbar_area_dict[cfg.MVMU_ver][str(cfg.xbar_size)] dac_area = dac_area_dict [str(cfg.dac_res)] adc_area = adc_area_dict [str(cfg.adc_res)] xbar_inMem_area = xbar_inMem_area_dict[str(cfg.xbar_size)] @@ -585,19 +402,19 @@ dataMem_pow_dyn = dataMem_pow_dyn_dict[str(cfg.dataMem_size)] # Energy -xbar_ip_energy_dict = {'100':0, '90':0, '80':0, '70':0, '60':0, '70':0, '50':0, '40':0, '30':0, '20':0, '10':0} +xbar_ip_energy_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} if cfg.MVMU_ver == "Analog": for key,value in xbar_ip_energy_dict.items(): xbar_ip_energy_dict[key] = xbar_ip_lat*xbar_ip_pow_dyn else: - xbar_ip_energy_dict = Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] + xbar_ip_energy_dict = digi_param.Digital_xbar_energy_dict[cfg.MVMU_ver][str(cfg.xbar_size)] print('xbar_ip_energy_dict', xbar_ip_energy_dict) # Chosen leak_power based on config file - only for components whose latency is parameter dependent if cfg.MVMU_ver == "Analog": xbar_pow_leak = 0 else: - xbar_pow_leak = Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] + xbar_pow_leak = digi_param.Digital_xbar_pow_leak_dict[str(cfg.xbar_size)] dac_pow_leak = dac_pow_leak_dict [str(cfg.dac_res)] adc_pow_leak = adc_pow_leak_dict [str(cfg.adc_res)] xbar_inMem_pow_leak = xbar_inMem_pow_leak_dict[str(cfg.xbar_size)] diff --git a/include/constants_digital.py b/include/constants_digital.py new file mode 100644 index 00000000..abe5310b --- /dev/null +++ b/include/constants_digital.py @@ -0,0 +1,175 @@ +## This file contains the power, area and latency numbers of Digital MVMUs for two versions +Digital_xbar_lat_dict = {'Digital_V1': {'32': { '0': 130, # first indexed by version then by xbar_size and then by % sparsity + '10': 114, # For V1 it is (4n+2)*T and for V2 it is (3n+2+xbar_size)*T + '20': 102, + '30': 90, + '40': 78, + '50': 66, + '60': 50, + '70': 38, + '80': 26, + '90': 14}, + '64': { '0' : 258, + '10': 230, + '20': 206, + '30': 178, + '40': 154, + '50': 130, + '60': 102, + '70': 78, + '80': 50, + '90': 26}, + '128':{ '0' : 514, + '10': 462, + '20': 410, + '30': 358, + '40': 306, + '50': 258, + '60': 206, + '70': 154, + '80': 102, + '90': 50}, + '256':{ '0' : 1026, + '10': 922, + '20': 818, + '30': 718, + '40': 614, + '50': 514, + '60': 410, + '70': 306, + '80': 206, + '90': 102}}, + 'Digital_V2': {'32' :{ '0' : 130, + '10': 118, + '20': 109, + '30': 100, + '40': 91, + '50': 82, + '60': 70, + '70': 61, + '80': 52, + '90': 43}, + '64' :{ '0' : 258, + '10': 237, + '20': 219, + '30': 198, + '40': 180, + '50': 162, + '60': 141, + '70': 123, + '80': 102, + '90': 84}, + '128':{ '0' : 514, + '10': 475, + '20': 436, + '30': 397, + '40': 358, + '50': 322, + '60': 283, + '70': 244, + '80': 205, + '90': 166}, + '256':{ '0' : 1026, + '10': 948, + '20': 870, + '30': 795, + '40': 717, + '50': 642, + '60': 564, + '70': 486, + '80': 411, + '90': 333}}} + +Digital_xbar_area_dict = {'Digital_V1': { '32' : 0.16977, # first indexed by version then by xbar_size + '64' : 0.27701, + '128': 1.74020, + '256': 7.29481}, + 'Digital_V2': { '32' : 0.16949, + '64' : 0.27645, + '128': 1.73908, + '256': 7.29257}} + +Digital_xbar_energy_dict = {'Digital_V1':{'32':{'0' : 5261.43744, # first indexed by version then by xbar_size and then by % of non_0 values + '10': 4613.872832, # For V1 it is (4n+2)*T and for V2 it is (3n+2+xbar_size)*T + '20': 4128.199376, # in pJ + '30': 3642.52592, + '40': 3156.852464, + '50': 2671.179008, + '60': 2023.6144, + '70': 1537.940944, + '80': 1052.267488, + '90': 566.594032}, + '64':{'0' : 20844.00864, + '10': 18581.86252, + '20': 16642.88014, + '30': 14380.73402, + '40': 12441.75163, + '50': 10502.76925, + '60': 8240.623131, + '70': 6301.640745, + '80': 4039.494628, + '90': 2100.512242}, + '128':{ '0' : 83018.14464, + '10': 74619.39346, + '20': 66220.64228, + '30': 57821.8911, + '40': 49423.13992, + '50': 41670.44653, + '60': 33271.69535, + '70': 24872.94417, + '80': 16474.19299, + '90': 8075.441812}, + '256':{ '0' : 331639.0958, + '10': 298022.5268, + '20': 264405.9578, + '30': 232082.3337, + '40': 198465.7647, + '50': 166142.1407, + '60': 132525.5717, + '70': 98909.00265, + '80': 66585.3786, + '90': 32968.80959}}, + 'Digital_V2':{'32':{'0' : 4466.744263, + '10': 4053.765767, + '20': 3744.031895, + '30': 3434.298023, + '40': 3124.564151, + '50': 2814.830279, + '60': 2401.851783, + '70': 2092.117911, + '80': 1782.384039, + '90': 1472.650167}, + '64':{'0' : 17654.27322, + '10': 16216.06481, + '20': 14983.31474, + '30': 13545.10633, + '40': 12312.35626, + '50': 11079.6062, + '60': 9641.397787, + '70': 8408.647721, + '80': 6970.439311, + '90': 5737.689245}, + '128':{ '0' : 70237.24474, + '10': 64904.19392, + '20': 59571.14309, + '30': 54238.09226, + '40': 48905.04144, + '50': 43982.22529, + '60': 38649.17446, + '70': 33316.12363, + '80': 27983.07281, + '90': 22650.02198}, + '256':{ '0' : 280471.5471, + '10': 259128.5, + '20': 237785.453, + '30': 217263.2925, + '40': 195920.2454, + '50': 175398.0849, + '60': 154055.0379, + '70': 132711.9909, + '80': 112189.8303, + '90': 90846.78326}}} +Digital_xbar_pow_leak_dict = { '32' :5.575928889, #in mW + '64' :12.82466678, + '128':40.24037556, + '256':120.2098611} \ No newline at end of file diff --git a/include/example-configs/config-cnn.py b/include/example-configs/config-cnn.py index 10e2e418..4b2bdde5 100644 --- a/include/example-configs/config-cnn.py +++ b/include/example-configs/config-cnn.py @@ -7,6 +7,7 @@ xbar_record = 1 inference = 1 training = not(inference) +sparse_opt = 1 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" diff --git a/include/example-configs/config-mlp.py b/include/example-configs/config-mlp.py index 086b2416..02e78dd1 100644 --- a/include/example-configs/config-mlp.py +++ b/include/example-configs/config-mlp.py @@ -7,6 +7,7 @@ xbar_record = 1 inference = 1 training = not(inference) +sparse_opt = 1 # Flag for Sparsity optimisaton (Make it 0 for only dense computations) ## Variable to define the type of MVMU # One of "Analog", "Digital_V1" or "Digital_V2" diff --git a/src/dnn_wt_p.py b/src/dnn_wt_p.py index 3d19cb04..df1b4a08 100644 --- a/src/dnn_wt_p.py +++ b/src/dnn_wt_p.py @@ -28,4 +28,3 @@ def prog_dnn_wt(self, instrnpath, node_dut): wt_temp = np.load(wt_filename) node_dut.tile_list[i].ima_list[j].matrix_list[k]['f'][l].program(wt_temp) node_dut.tile_list[i].ima_list[j].matrix_list[k]['b'][l].program(wt_temp) - diff --git a/src/hw_stats.py b/src/hw_stats.py index 6f0df0c3..6970de88 100644 --- a/src/hw_stats.py +++ b/src/hw_stats.py @@ -14,7 +14,7 @@ # Copied from /include/constants.py file # Enlists components at core, tile, and node levels -hw_comp_energy = {'xbar_mvm':{ '100':param.xbar_ip_energy_dict['100'], \ +hw_comp_energy = {'xbar_mvm':{ '0':param.xbar_ip_energy_dict['0'], \ '90': param.xbar_ip_energy_dict['90'], \ '80': param.xbar_ip_energy_dict['80'], \ '70': param.xbar_ip_energy_dict['70'], \ @@ -24,37 +24,20 @@ '30': param.xbar_ip_energy_dict['30'], \ '20': param.xbar_ip_energy_dict['20'], \ '10': param.xbar_ip_energy_dict['10']}, \ - 'xbar_op':{ '100': param.xbar_ip_energy_dict['100'], \ - '90': param.xbar_ip_energy_dict['90'], \ - '80': param.xbar_ip_energy_dict['80'], \ - '70': param.xbar_ip_energy_dict['70'], \ - '60': param.xbar_ip_energy_dict['60'], \ - '50': param.xbar_ip_energy_dict['50'], \ - '40': param.xbar_ip_energy_dict['40'], \ - '30': param.xbar_ip_energy_dict['30'], \ - '20': param.xbar_ip_energy_dict['20'], \ - '10': param.xbar_ip_energy_dict['10']}, \ - 'xbar_mtvm':{ '100':param.xbar_ip_energy_dict['100'], \ - '90': param.xbar_ip_energy_dict['90'], \ - '80': param.xbar_ip_energy_dict['80'], \ - '70': param.xbar_ip_energy_dict['70'], \ - '60': param.xbar_ip_energy_dict['60'], \ - '50': param.xbar_ip_energy_dict['50'], \ - '40': param.xbar_ip_energy_dict['40'], \ - '30': param.xbar_ip_energy_dict['30'], \ - '20': param.xbar_ip_energy_dict['20'], \ - '10': param.xbar_ip_energy_dict['10']}, \ - 'xbar_rd':param.xbar_rd_pow_dyn*param.xbar_rd_lat, 'xbar_wr':param.xbar_wr_pow_dyn*param.xbar_wr_lat, + 'xbar_op':param.xbar_ip_energy_dict['0'], \ + 'xbar_mtvm':param.xbar_ip_energy_dict['0'], \ + 'xbar_rd':param.xbar_rd_pow_dyn*param.xbar_rd_lat, \ + 'xbar_wr':param.xbar_wr_pow_dyn*param.xbar_wr_lat, 'dac':param.dac_pow_dyn, 'snh':param.snh_pow_dyn, \ - 'mux1':param.mux_pow_dyn, 'mux2':param.mux_pow_dyn, 'adc':{ 'n' : param.adc_pow_dyn_dict[str(cfg.adc_res)], \ - 'n/2': param.adc_pow_dyn_dict[str(cfg.adc_res-1)], \ - '3n/4': param.adc_pow_dyn_dict[str(cfg.adc_res-2)], \ - '7n/8': param.adc_pow_dyn_dict[str(cfg.adc_res-3)], \ - '15n/16': param.adc_pow_dyn_dict[str(cfg.adc_res-4)], \ - '31n/32': param.adc_pow_dyn_dict[str(cfg.adc_res-5)], \ - '63n/64': param.adc_pow_dyn_dict[str(cfg.adc_res-6)], \ - '127n/128': param.adc_pow_dyn_dict[str(cfg.adc_res-7)], \ - '255n/256': param.adc_pow_dyn_dict[str(cfg.adc_res-7)]}, \ + 'mux1':param.mux_pow_dyn, 'mux2':param.mux_pow_dyn, \ + 'adc':{ 'n' : param.adc_pow_dyn_dict[str(cfg.adc_res)], \ + 'n/2': param.adc_pow_dyn_dict[str(cfg.adc_res-1)], \ + 'n/4': param.adc_pow_dyn_dict[str(cfg.adc_res-2)], \ + 'n/8': param.adc_pow_dyn_dict[str(cfg.adc_res-3)], \ + 'n/16': param.adc_pow_dyn_dict[str(cfg.adc_res-4)], \ + 'n/32': param.adc_pow_dyn_dict[str(cfg.adc_res-5)], \ + 'n/64': param.adc_pow_dyn_dict[str(cfg.adc_res-6)], \ + 'n/128': param.adc_pow_dyn_dict[str(cfg.adc_res-7)]}, \ 'alu_div': param.alu_pow_div_dyn, 'alu_mul':param.alu_pow_mul_dyn, \ 'alu_act': param.act_pow_dyn, 'alu_other':param.alu_pow_others_dyn, \ 'alu_sna': param.sna_pow_dyn, \ @@ -74,47 +57,18 @@ def get_hw_stats (fid, node_dut, cycle): # List of all components that dissipate power - hw_comp_access = {'xbar_mvm':{ '100':0, \ - '90': 0, \ - '80': 0, \ - '70': 0, \ - '60': 0, \ - '50': 0, \ - '40': 0, \ - '30': 0, \ - '20': 0, \ - '10': 0}, \ - 'xbar_op':{ '100':0, \ - '90': 0, \ - '80': 0, \ - '70': 0, \ - '60': 0, \ - '50': 0, \ - '40': 0, \ - '30': 0, \ - '20': 0, \ - '10': 0}, \ - 'xbar_mtvm':{ '100':0, \ - '90': 0, \ - '80': 0, \ - '70': 0, \ - '60': 0, \ - '50': 0, \ - '40': 0, \ - '30': 0, \ - '20': 0, \ - '10': 0}, \ + hw_comp_access = {'xbar_mvm':{ '0':0, '90': 0,'80': 0,'70': 0,'60': 0,'50': 0,'40': 0,'30': 0,'20': 0,'10': 0}, \ + 'xbar_op':0, 'xbar_mtvm':0, \ 'xbar_rd':0, 'xbar_wr':0, \ 'dac':0, 'snh':0, \ - 'mux1':0, 'mux2':0, 'adc':{ 'n' : 0, \ - 'n/2': 0, \ - '3n/4': 0, \ - '7n/8': 0, \ - '15n/16': 0, \ - '31n/32': 0, \ - '63n/64': 0, \ - '127n/128': 0, \ - '255n/256': 0}, \ + 'mux1':0, 'mux2':0, 'adc':{ 'n' : 0, \ + 'n/2': 0, \ + 'n/4': 0, \ + 'n/8': 0, \ + 'n/16': 0, \ + 'n/32': 0, \ + 'n/64': 0, \ + 'n/128': 0}, \ 'alu_div':0, 'alu_mul':0, \ 'alu_act':0, 'alu_other':0, \ 'alu_sna':0, \ @@ -159,11 +113,9 @@ def get_hw_stats (fid, node_dut, cycle): if cfg.MVMU_ver == "Analog": for m in range(cfg.phy2log_ratio): if (mvmu_t == 'd'): - for key,value in hw_comp_access['xbar_op'].items(): - hw_comp_access['xbar_op'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access[key] + hw_comp_access['xbar_op'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access['0'] elif (mvmu_t == 'b'): - for key,value in hw_comp_access['xbar_mtvm'].items(): - hw_comp_access['xbar_mtvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access[key] + hw_comp_access['xbar_mtvm'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access['0'] else: for key,value in hw_comp_access['xbar_mvm'].items(): hw_comp_access['xbar_mvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][m].num_access[key] @@ -174,11 +126,9 @@ def get_hw_stats (fid, node_dut, cycle): else: if (mvmu_t == 'd'): - for key,value in hw_comp_access['xbar_op'].items(): - hw_comp_access['xbar_op'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access[key] + hw_comp_access['xbar_op'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access['0'] elif (mvmu_t == 'b'): - for key,value in hw_comp_access['xbar_mtvm'].items(): - hw_comp_access['xbar_mtvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access[key] + hw_comp_access['xbar_mtvm'] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access['0'] else: for key,value in hw_comp_access['xbar_mvm'].items(): hw_comp_access['xbar_mvm'][key] += node_dut.tile_list[i].ima_list[j].matrix_list[k][mvmu_t][0].num_access[key] @@ -245,10 +195,6 @@ def get_hw_stats (fid, node_dut, cycle): total_adc_access = 0 total_mvm_energy = 0 total_mvm_access = 0 - total_mtvm_access = 0 - total_mtvm_energy = 0 - total_op_access = 0 - total_op_energy = 0 # Compute the total dynamic energy consumption if cfg.MVMU_ver == "Analog": for key, value in hw_comp_access.items(): @@ -262,16 +208,6 @@ def get_hw_stats (fid, node_dut, cycle): total_energy += value1*hw_comp_energy['xbar_mvm'][key1] total_mvm_energy += value1*hw_comp_energy['xbar_mvm'][key1] # Not needed for function but for output visualisation total_mvm_access += value1 - elif key == 'xbar_mtvm': - for key1, value1 in hw_comp_access['xbar_mtvm'].items(): - total_energy += value1*hw_comp_energy['xbar_mtvm'][key1] - total_mvm_energy += value1*hw_comp_energy['xbar_mtvm'][key1] # Not needed for function but for output visualisation - total_mvm_access += value1 - elif key == 'xbar_op': - for key1, value1 in hw_comp_access['xbar_op'].items(): - total_energy += value1*hw_comp_energy['xbar_op'][key1] - total_op_energy += value1*hw_comp_energy['xbar_op'][key1] # Not needed for function but for output visualisation - total_op_access += value1 else: total_energy += value * hw_comp_energy[key] else: @@ -286,16 +222,6 @@ def get_hw_stats (fid, node_dut, cycle): total_energy += (value1/16)*hw_comp_energy['xbar_mvm'][key1] total_mvm_energy += (value1/16)*hw_comp_energy['xbar_mvm'][key1] # Not needed for function but for output visualisation total_mvm_access += (value1/16) - elif key == 'xbar_mtvm': - for key1, value1 in hw_comp_access['xbar_mtvm'].items(): - total_energy += (value1/16)*hw_comp_energy['xbar_mtvm'][key1] - total_mvm_energy += (value1/16)*hw_comp_energy['xbar_mtvm'][key1] # Not needed for function but for output visualisation - total_mvm_access += (value1/16) - elif key == 'xbar_op': - for key1, value1 in hw_comp_access['xbar_op'].items(): - total_energy += (value1/16)*hw_comp_energy['xbar_op'][key1] - total_op_energy += (value1/16)*hw_comp_energy['xbar_op'][key1] # Not needed for function but for output visualisation - total_op_access += (value1/16) else: total_energy += value * hw_comp_energy[key] @@ -315,14 +241,6 @@ def get_hw_stats (fid, node_dut, cycle): bl_spc2 = (22-len(str(total_mvm_access))) * ' ' fid.write (key + bl_spc1 + str(total_mvm_access) + bl_spc2 +\ (str(total_mvm_energy/total_energy*100))[0:4] + ' %\n') - elif key == 'xbar_mtvm': - bl_spc2 = (22-len(str(total_mtvm_access))) * ' ' - fid.write (key + bl_spc1 + str(total_mtvm_access) + bl_spc2 +\ - (str(total_mtvm_energy/total_energy*100))[0:4] + ' %\n') - elif key == 'xbar_op': - bl_spc2 = (22-len(str(total_op_access))) * ' ' - fid.write (key + bl_spc1 + str(total_op_access) + bl_spc2 +\ - (str(total_op_energy/total_energy*100))[0:4] + ' %\n') else: bl_spc2 = (22-len(str(value))) * ' ' fid.write (key + bl_spc1 + str(value) + bl_spc2 +\ @@ -358,14 +276,10 @@ def get_hw_stats (fid, node_dut, cycle): metric_dict['core_area'] = ima_metrics.compute_area ()# in mm2 metric_dict['cycles'] = cycle metric_dict['time'] = cycle * param.cycle_time * (10**(-9)) # in sec + metric_dict['dynamic_energy'] = total_energy * ns * mw # in joule #metric_dict['leakage_enegy'] = metric_dict['leakage_power'] * mw * metric_dict['time'] # in joule metric_dict['leakage_energy'] = leakage_energy * ns * mw # in joule - # if cfg.MVMU_ver == "Analog": - metric_dict['dynamic_energy'] = total_energy * ns * mw # in joule metric_dict['total_energy'] = metric_dict['dynamic_energy'] + metric_dict['leakage_energy'] - # else: - # metric_dict['total_energy'] = total_energy * ns * mw # in joule - # metric_dict['dynamic_energy'] = metric_dict['total_energy'] metric_dict['average_power'] = metric_dict['total_energy'] / metric_dict['time'] * (10**(3)) # in mW for key, value in metric_dict.items(): diff --git a/src/ima.py b/src/ima.py index 99815cb6..1759ed19 100644 --- a/src/ima.py +++ b/src/ima.py @@ -9,6 +9,7 @@ import include.config as cfg #import include.configTest as cfg import include.constants as param +import constants_digital as digi_param import src.ima_modules as imod from data_convert import * @@ -39,7 +40,6 @@ def __init__ (self): self.matrix_list = [] # list of dicts of mvmu(s) self.xb_inMem_list = [] # list of dicts of xbar input memory self.xb_outMem_list = [] # list of dicts of xbar output memory - self.xbar_inMem_Sparsity_list = [] # list of sparsity od xbar in mem (may have to be removed if found redundant) for i in xrange(cfg.num_matrix): # each matrix represents three mvmus - 1 mvmu for fw, 1 mvmu for bw, 1 mvmu (2X width) for delta @@ -505,9 +505,9 @@ def do_execute (self, ex_op, fid): for i in range (self.de_r2): dst_addr = data_addr + i if (dst_addr >= datamem_off): - self.dataMem.write (dst_addr, data[i]) + self.dataMem.write (dst_addr, data[i]) else: - writeToXbarMem (self, dst_addr, data[i]) + writeToXbarMem (self, dst_addr, data[i]) elif (ex_op == 'st'): #nothing to be done by ima for st here return 1 @@ -599,22 +599,23 @@ def inner_product (mat_id, key): self.xb_outMem_list[mat_id][key].reset () xbar_inMem = self.xb_inMem_list[mat_id][key].read_all () - # print ("xb_inMem", xbar_inMem) - # calculate sparsity of xbar_in_mem non_0_val = 0 for i in range(cfg.xbar_size): - if xbar_inMem[i] != '0': + if xbar_inMem[i] != '0000000000000000': non_0_val = non_0_val +1 - sparsity = (cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size - # print ("non_0_val", non_0_val) - # print ("Sparsity", sparsity) + sparsity = int((cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size) + sparsity_adc = sparsity + if (sparsity%10!=0): + sparsity = sparsity-(sparsity%10) + else: + if (sparsity == 100): + sparsity = sparsity-10 ## Loop to cover all bits of inputs for k in xrange (int(math.ceil(cfg.input_prec / cfg.dac_res))): #quantization affects the # of streams #for k in xrange (1): # read the values from the xbar's input register out_xb_inMem = self.xb_inMem_list[mat_id][key].read (cfg.dac_res) - # print("out_xb_inMem", out_xb_inMem) #*************************************** HACK ********************************************* ###### CAUTION: Not replicated exact "functional" circuit behaviour for analog parts @@ -630,7 +631,7 @@ def inner_product (mat_id, key): out_snh = [[] for x in range(num_xb)] for m in range (num_xb): # compute dot-product - out_xbar[m] = self.matrix_list[mat_id][key][m].propagate_dummy(non_0_val, out_dac) + out_xbar[m] = self.matrix_list[mat_id][key][m].propagate_dummy(out_dac, sparsity) # do sampling and hold out_snh[m] = self.snh_list[mat_id*num_xb+m].propagate_dummy(out_xbar[m]) @@ -643,7 +644,7 @@ def inner_product (mat_id, key): adc_id = (mat_id*num_xb + m) % cfg.num_adc out_mux1 = self.mux1_list[mat_id].propagate_dummy(out_snh[m][j]) # i is the ith xbar out_mux2 = self.mux2_list[mat_id % cfg.num_adc].propagate_dummy(out_mux1) - out_adc = self.adc_list[adc_id].propagate_dummy(out_mux2, non_0_val) + out_adc = self.adc_list[adc_id].propagate_dummy(out_mux2, sparsity_adc) # shift and add outputs from difefrent wt_bits alu_op = 'sna' @@ -772,8 +773,8 @@ def outer_product (mat_id, key): # do nothing for nop instruction - # Computes the latency for mvm instruction based on DPE configuration - def xbComputeLatency (self, mask): + # Computes the latency for Analog mvm instruction based on DPE configuration + def xbComputeLatency_Analog (self, mask): latency_out_list = [] fb_found = 0 d_found = 0 @@ -797,7 +798,7 @@ def xbComputeLatency (self, mask): lat_temp = 0 # We assume all ADCs in a matrix has the same resolution adc_idx = idx*cfg.num_adc_per_matrix - lat_temp = self.adc_list[adc_idx].getLatency(cfg.xbar_size) + lat_temp = self.adc_list[adc_idx].getLatency() ''' print("adc_idx", adc_idx) print("lat_temp", lat_temp) @@ -822,6 +823,26 @@ def xbComputeLatency (self, mask): latency_out_list.append(latency_out) return max(latency_out_list) + # Computes the latency for Analog mvm instruction based on DPE configuration + def xbComputeLatency_Digital (self): + mvm_lat_temp = 0 + if (cfg.inference): + for p in xrange(cfg.num_matrix): + if self.de_xb_nma[p]: + xbar_inMem = self.xb_inMem_list[p]['f'].read_all () + non_0_val = 0 + for i in range(cfg.xbar_size): + if xbar_inMem[i] != '0000000000000000': + non_0_val = non_0_val +1 + sparsity = int((cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size) + if (sparsity%10!=0): + sparsity = sparsity-(sparsity%10) + else: + if (sparsity == 100): + sparsity = sparsity-10 + mvm_lat_temp += digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)][str(sparsity)] + return mvm_lat_temp + # State machine runs only if the stage is non-empty # Describe the functionality on a cycle basis if (self.stage_empty[sId] != 1): @@ -860,24 +881,9 @@ def xbComputeLatency (self, mask): elif (ex_op == 'mvm'): mask_temp = self.de_xb_nma if (cfg.MVMU_ver == "Analog"): - self.stage_latency[sId] = xbComputeLatency (self, mask_temp) # mask tells which of ip/op or both is occurring + self.stage_latency[sId] = xbComputeLatency_Analog (self, mask_temp) # mask tells which of ip/op or both is occurring else: - mvm_lat_temp = 0 - if (cfg.inference): - for p in xrange(cfg.num_matrix): - if self.de_xb_nma[p]: - xbar_inMem = self.xb_inMem_list[p]['f'].read_all () - non_0_val = 0 - for i in range(cfg.xbar_size): - if xbar_inMem[i] != '0': - non_0_val = non_0_val +1 - print ("non_0_val", non_0_val) - nval_percent = int(non_0_val*100/128) - if (nval_percent%10!=0): - nval_percent = nval_percent + 10 - mvm_lat_temp += param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)][str(nval_percent)] - self.stage_latency[sId] = mvm_lat_temp - print("MVM Latency", self.stage_latency[sId]) + self.stage_latency[sId] = xbComputeLatency_Digital(self) # Needs update - use xbar serial read latency elif (ex_op == 'crs'): @@ -1014,7 +1020,7 @@ def pipe_run (self, cycle, fid = ''): # fid is tracefile's id update_ready = self.stage_done[i+1] # run the stage based on its update_ready argument - + stage_function[i] (update_ready, fid) # If specified, print thetrace (pipeline stage information) diff --git a/src/ima_metrics.py b/src/ima_metrics.py index 8cfb1c4e..f7ec20ac 100644 --- a/src/ima_metrics.py +++ b/src/ima_metrics.py @@ -18,7 +18,7 @@ def compute_area (): #in mm2 area += (cfg.num_matrix*3) * param.xbar_outMem_area # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) area += (cfg.num_matrix*4) * cfg.phy2log_ratio * param.xbar_area # d-xbar has 2X xbars than f/b else: - area += (cfg.num_matrix*2) * param.xbar_area # d-xbar are not needed in Digital MVMUs xbars than f/b + area += (cfg.num_matrix*2) * param.xbar_area # d-xbar are not needed in Digital MVMUs only f and b are there area += (cfg.num_matrix*3) * param.xbar_inMem_area # xbar_inMem one each for f/b/d xbars area += param.instrnMem_area # instrnMem area += param.dataMem_area # dataMem @@ -43,9 +43,9 @@ def compute_pow_leak (): leak_pow += cfg.num_adc * param.adc_pow_leak # adc leak_pow += (cfg.num_matrix*2) * param.sna_pow_leak # sna leak_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_leak # xbar_outMem - leak_pow += (cfg.num_matrix*4) * param.xbar_pow_leak # xbar area + leak_pow += (cfg.num_matrix*4) * param.xbar_pow_leak # xbar power fr analog else: - leak_pow += (cfg.num_matrix*2) * param.xbar_pow_leak # xbar area + leak_pow += (cfg.num_matrix*2) * param.xbar_pow_leak # d-xbar are not needed in Digital MVMUs only f and b are there leak_pow += (cfg.num_matrix*3) * param.xbar_inMem_pow_leak # xbar_inMem leak_pow += param.instrnMem_pow_leak # instrnMem leak_pow += param.dataMem_pow_leak # dataMem @@ -58,15 +58,17 @@ def compute_pow_leak (): def compute_pow_dyn (): dyn_pow = 0.0 if cfg.MVMU_ver == "Analog": - dyn_pow += (cfg.num_matrix*3) * (param.xbar_inMem_pow_dyn_write + param.xbar_inMem_pow_dyn_read/cfg.xbar_size) # xbar_inMem - num_xbar * dac_res bits will be - # read from xb_inMem in an interval that equals xbar_access time # dyn_pow += cfg.num_xbar/2 * 1.2 # (adding dyn pow the way issac does for comparison) dyn_pow += (cfg.num_matrix*11) * cfg.xbar_size * param.dac_pow_dyn # dac dyn_pow += (cfg.num_matrix*2) * cfg.xbar_size * param.snh_pow_dyn # snh dyn_pow += cfg.num_adc * param.adc_pow_dyn # adc dyn_pow += (cfg.num_matrix*2) * param.sna_pow_dyn # sna dyn_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_dyn # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) - dyn_pow += (cfg.num_matrix*4) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power + dyn_pow += (cfg.num_matrix*4) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power + else: + dyn_pow += (cfg.num_matrix*2) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power + dyn_pow += (cfg.num_matrix*3) * (param.xbar_inMem_pow_dyn_write + param.xbar_inMem_pow_dyn_read/cfg.xbar_size) # xbar_inMem - num_xbar * dac_res bits will be + # read from xb_inMem in an interval that equals xbar_access time dyn_pow += param.instrnMem_pow_dyn # instrnMem dyn_pow += param.dataMem_pow_dyn # dataMem dyn_pow += param.alu_pow_dyn # alu diff --git a/src/ima_modules.py b/src/ima_modules.py index 922e61af..a7dac03f 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -16,16 +16,7 @@ class xbar (object): def __init__ (self, xbar_size, xbar_value= 'nil' ): # define num_accesses for different operations # parallel reads (inner-product) - self.num_access = { '100':0, \ - '90': 0, \ - '80': 0, \ - '70': 0, \ - '60': 0, \ - '50': 0, \ - '40': 0, \ - '30': 0, \ - '20': 0, \ - '10': 0} \ + self.num_access = { '0':0, '90': 0,'80': 0,'70': 0,'60': 0,'50': 0,'40': 0,'30': 0,'20': 0,'10': 0} self.num_access_rd = 0 # serial reads self.num_access_wr = 0 # serial writes @@ -101,35 +92,16 @@ def propagate (self, inp = 'nil'): return out # HACK - until propagate doesn't have correct analog functionality - def propagate_dummy (self, n_val, inp = 'nil'): + def propagate_dummy (self, inp = 'nil', sparsity = 0): # data input is list of bit strings (of length dac_res) - fixed point binary assert (inp != 'nil'), 'propagate needs a non-nil input' assert (len(inp) == self.xbar_size), 'xbar input size mismatch' #Modification to accomodate sparsity and digital crossbars if cfg.MVMU_ver == "Analog": - self.num_access['100'] += 1 + self.num_access['0'] += 1 else: - if n_val>cfg.xbar_size*9/10.0: - self.num_access['100'] += 1 - elif n_val>cfg.xbar_size*8/10.0: - self.num_access['90'] += 1 - elif n_val>cfg.xbar_size*7/10.0: - self.num_access['80'] += 1 - elif n_val>cfg.xbar_size*6/10.0: - self.num_access['70'] += 1 - elif n_val>cfg.xbar_size*5/10.0: - self.num_access['60'] += 1 - elif n_val>cfg.xbar_size*4/10.0: - self.num_access['50'] += 1 - elif n_val>cfg.xbar_size*3/10.0: - self.num_access['40'] += 1 - elif n_val>cfg.xbar_size*2/10.0: - self.num_access['30'] += 1 - elif n_val>cfg.xbar_size*1/10.0: - self.num_access['20'] += 1 - else: - self.num_access['10'] += 1 + self.num_access[str(sparsity)] +=1 # convert input from fixed point binary (string) to float inp_float = [0.0] * self.xbar_size @@ -159,7 +131,7 @@ class xbar_op (xbar): # add function for outer_product computation def propagate_op_dummy (self, inp1 = 'nil', inp2 = 'nil', lr=1, in1_bit=cfg.dac_res, in2_bit=cfg.xbar_bits): # inner-product and outer_product functions should have different energies (and other metrics) - NEEDS UPDATE - self.num_access += 1 + self.num_access['0'] += 1 # check both data inputs assert (inp1 != 'nil' and inp2 != 'nil'), 'propagate needs a non-nil inputs' assert ((len(inp1) == self.xbar_size) and (len(inp1[0]) == in1_bit)), 'inp1 size mismatch - should be \ @@ -258,22 +230,14 @@ def propagate_dummy (self, inp_list): class adc (object): def __init__ (self, adc_res): # define num_access - self.num_access = { 'n' : 0, - 'n/2': 0, - '3n/4': 0, - '7n/8': 0, - '15n/16': 0, - '31n/32': 0, - '63n/64': 0, - '127n/128': 0, - '255n/256': 0} - + self.num_access = { 'n':0, 'n/2': 0,'n/4': 0,'n/8': 0,'n/16': 0,'n/32': 0,'n/64': 0,'n/128': 0} + # define latency - # self.latency = param.adc_lat_dict[str(adc_res)] + self.latency = param.adc_lat_dict[str(adc_res)] self.adc_res = adc_res - def getLatency (self, n_val): + def getLatency (self): self.latency = param.adc_lat_dict[str(self.adc_res)] return self.latency @@ -285,31 +249,39 @@ def real2bin (self, inp, num_bits): return ('0'*(num_bits - len(bin_value)) + bin_value) def propagate (self, inp): - #self.num_access += 1 + self.num_access += 1 assert (type(inp) in [float, np.float32, np.float64]), 'adc input type mismatch (float, np.float32, np.float64 expected)' num_bits = self.adc_res return self.real2bin (inp, num_bits) # HACK - until propagate doesn't have correct analog functionality - def propagate_dummy (self, inp, n_val): - if n_val>cfg.xbar_size/2.0: + def propagate_dummy (self, inp, sparsity): + if sparsity>50: self.num_access['n'] += 1 - elif n_val>cfg.xbar_size/4.0: + self.adc_res = cfg.adc_res + elif sparsity>25: self.num_access['n/2'] += 1 - elif n_val>cfg.xbar_size/8.0: - self.num_access['3n/4'] += 1 - elif n_val>cfg.xbar_size/16.0: - self.num_access['7n/8'] += 1 - elif n_val>cfg.xbar_size/32.0: - self.num_access['15n/16'] += 1 - elif n_val>cfg.xbar_size/64.0: - self.num_access['31n/32'] += 1 - elif n_val>cfg.xbar_size/128.0: - self.num_access['63n/64'] += 1 - elif n_val>cfg.xbar_size/256.0: - self.num_access['127n/128'] += 1 + self.adc_res = cfg.adc_res-1 + elif sparsity>12.5: + self.num_access['n/4'] += 1 + self.adc_res = cfg.adc_res-2 + elif sparsity>6.25: + self.num_access['n/8'] += 1 + self.adc_res = cfg.adc_res-3 + elif sparsity>3.125: + self.num_access['n/16'] += 1 + self.adc_res = cfg.adc_res-4 + elif sparsity>1.5625: + self.num_access['n/32'] += 1 + self.adc_res = cfg.adc_res-5 + elif sparsity>0.78125: + self.num_access['n/64'] += 1 + self.adc_res = cfg.adc_res-6 else: - self.num_access['255n/256'] += 1 + self.num_access['n/128'] += 1 + self.adc_res = cfg.adc_res-7 + if(self.adc_res<0): + self.adc_res = 1 return inp diff --git a/src/node_dump.py b/src/node_dump.py index 0dd27441..281f1d15 100644 --- a/src/node_dump.py +++ b/src/node_dump.py @@ -25,8 +25,8 @@ def mem_dump (fid, memfile, name, node = '', tile_id = ''): #temp_val = bin2int (memfile[addr], cfg.num_bits) if (name == 'EDRAM' and (node != '') and (tile_id != '')): # for EDRAM also show counter/valid fid.write ('valid: ' + str(node.tile_list[tile_id].edram_controller.valid[addr]) \ - + ' | counter: ' + str(node.tile_list[tile_id].edram_controller.counter[addr]) + ' | ') - fid.write(str(temp_val) + '\n') + + ' | counter: ' + str(node.tile_list[tile_id].edram_controller.counter[addr]) + ' | ') + fid.write(str(temp_val) + '\n') else: # not printing zero values for ease of view temp_val = 0.0 if (name != 'EDRAM'): diff --git a/test/mvm_ip_test.py b/test/mvm_ip_test.py deleted file mode 100644 index 2e58c9c9..00000000 --- a/test/mvm_ip_test.py +++ /dev/null @@ -1,83 +0,0 @@ -# API for testing MVM inner product operation -import sys -import os -import numpy as np - -root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.insert(0, root_dir) - -from src.data_convert import * -import src.ima as ima -from src.instrn_proto import * -import include.config as cfg - -#change the core and mvmu id'd here: -# tile_ID = 2 -# core_ID = 1 -# matrix_ID = 0 - -for tile_ID in range(2, cfg.num_tile): - for core_ID in range(cfg.num_ima): - for matrix_ID in range(cfg.num_matrix): - - path = 'testasm/mlp/' - wt_path = path +'weights/tile'+ str(tile_ID)+ '/core'+ str(core_ID)+ '/' - inst_file = path + 'tile'+ str(tile_ID)+ '/core_imem'+ str(core_ID)+ '.npy' - trace_path = 'traces/mlp/' - trace_file = trace_path + 'tile'+ str(tile_ID)+ '/ima_trace'+ str(core_ID)+ '.txt' - dump_file = trace_path + 'tile'+ str(tile_ID)+ '/memsim.txt' - - datamem_off = cfg.datamem_off # each matrix has 6 memory spaces (1 for f/b, 2 for d) - phy2log_ratio = cfg.phy2log_ratio # ratio of physical to logical xbar - - if (os.path.exists(wt_path)): # check if weights for the xbar exist - # print ('wtfile exits: ' + 'tile' + str(tile_ID) +' core ' + str(core_ID) + 'matrix ' + str(matrix_ID)) - - xbar_input = ['']*cfg.xbar_size - xbar_output = ['']*cfg.xbar_size - with open(dump_file, 'r') as file: - lines=file.readlines() - - for i in range (len(lines)): - if(lines[i] == 'Xbar Input Memory: imaId:'+ str(core_ID)+ ' matrixId:'+ str(matrix_ID)+ ' mvmu_type:f contents\n'): - ip_start=i+1 - if(lines[i] == 'Xbar Output Memory: imaId:'+ str(core_ID)+ ' matrixId:'+ str(matrix_ID)+ ' mvmu_type:f contents\n'): - op_start=i+1 - ip_end=i-1 - if(lines[i] == 'Xbar Input Memory: imaId:'+ str(core_ID)+ ' matrixId:'+ str(matrix_ID)+ ' mvmu_type:b contents\n'): - op_end=i-1 - - # print(ip_start) - # print(ip_end) - # print(op_start) - # print(op_end) - # print('Length of input=',ip_end-ip_start+1 ) - # print('Length of output=',op_end-op_start+1 ) - - for j in range (ip_end-ip_start+1): - xbar_input[j] = float(lines[ip_start+j]) - for j in range (op_end-op_start+1): - xbar_output[j] = float(lines[op_start+j]) - - # print(xbar_input) - # print(xbar_output) - - ## Testcases for Functionality Debug of MVM (1,2,3,4) - ## 1. compare golden output to ima output - wt_gold = np.load(wt_path+'log_xbar0.npy') - # print(wt_gold) - # out_gold = np.dot (ima.dataMem.memfile_float, wt_gold) - if(ip_end-ip_start+1 == 128): - - out_gold = np.dot (np.asarray(xbar_input), wt_gold) - out_exp = np.asarray(xbar_output) - - # print (out_gold) - # print (out_exp) - - err = np.tanh(out_gold) - np.tanh(out_exp) - print ("error for tile"+ str(tile_ID) +" core" + str(core_ID) + " matrix" + str(matrix_ID)+ " has mean= " + str(np.average(err)) + " and stdev= " + \ - str(np.std(err))) - - else: - print("No or less than length 128 input available for tile"+ str(tile_ID) +" core" + str(core_ID) + " matrix" + str(matrix_ID)+".") From 4302e61902361809af34a85363ed33f1af9353a5 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Sat, 13 Jun 2020 03:09:50 -0400 Subject: [PATCH 11/15] Commiting after suggested changes in PR --- include/config.py | 2 +- include/constants.py | 1 + src/dnn_wt_p.py | 1 + src/ima.py | 55 ++++++++++++++++++++++++-------------------- src/ima_metrics.py | 2 +- src/ima_modules.py | 2 +- 6 files changed, 35 insertions(+), 28 deletions(-) diff --git a/include/config.py b/include/config.py index e09ebedc..02e78dd1 100644 --- a/include/config.py +++ b/include/config.py @@ -13,7 +13,7 @@ # One of "Analog", "Digital_V1" or "Digital_V2" # Digital_V1 has compressed inputs (Data+Offset style) # Digital_V2 has uncompressed inputs (Skips computations for 0 activation) -MVMU_ver = "Analog" +MVMU_ver = "Digital_V2" ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 diff --git a/include/constants.py b/include/constants.py index d16395df..a2cfe755 100644 --- a/include/constants.py +++ b/include/constants.py @@ -358,6 +358,7 @@ # Chosen latency based on config file - only for components whose latency is parameter dependent +#xbar_lat = xbar_lat_dict [str(cfg.xbar_bits)][str(cfg.xbar_size)] xbar_ip_lat_dict = {'0':0, '90':0, '80':0, '70':0, '60':0, '50':0, '40':0, '30':0, '20':0, '10':0} if cfg.MVMU_ver == "Analog": for key, value in xbar_ip_lat_dict.items(): diff --git a/src/dnn_wt_p.py b/src/dnn_wt_p.py index df1b4a08..3d19cb04 100644 --- a/src/dnn_wt_p.py +++ b/src/dnn_wt_p.py @@ -28,3 +28,4 @@ def prog_dnn_wt(self, instrnpath, node_dut): wt_temp = np.load(wt_filename) node_dut.tile_list[i].ima_list[j].matrix_list[k]['f'][l].program(wt_temp) node_dut.tile_list[i].ima_list[j].matrix_list[k]['b'][l].program(wt_temp) + diff --git a/src/ima.py b/src/ima.py index 1759ed19..ea54481a 100644 --- a/src/ima.py +++ b/src/ima.py @@ -351,7 +351,7 @@ def do_decode (self, dec_op): assert (self.fd_instrn['r2'] >= datamem_off), 'operand2 for beq comes from data memory' self.de_val1 = self.dataMem.read(self.fd_instrn['r1']) self.de_val2 = self.dataMem.read(self.fd_instrn['r2']) - + elif (dec_op == 'alu_int'): self.de_aluop = self.fd_instrn['aluop'] self.de_d1 = self.fd_instrn['d1'] # addr for rf @@ -359,7 +359,7 @@ def do_decode (self, dec_op): assert (self.fd_instrn['r2'] >= datamem_off), 'operand2 for alu_int comes from data memory' self.de_val1 = self.dataMem.read(self.fd_instrn['r1']) self.de_val2 = self.dataMem.read(self.fd_instrn['r2']) - + # do nothing for halt/jmp in decode (just propagate to ex when applicable) @@ -598,18 +598,21 @@ def inner_product (mat_id, key): # reset the xb out memory before starting to accumulate self.xb_outMem_list[mat_id][key].reset () - xbar_inMem = self.xb_inMem_list[mat_id][key].read_all () - non_0_val = 0 - for i in range(cfg.xbar_size): - if xbar_inMem[i] != '0000000000000000': - non_0_val = non_0_val +1 - sparsity = int((cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size) - sparsity_adc = sparsity - if (sparsity%10!=0): - sparsity = sparsity-(sparsity%10) - else: - if (sparsity == 100): - sparsity = sparsity-10 + sparsity=0 + sparsity_adc=0 + if cfg.sparse_opt: + xbar_inMem = self.xb_inMem_list[mat_id][key].read_all () + non_0_val = 0 + for i in range(cfg.xbar_size): + if xbar_inMem[i] != '0000000000000000': + non_0_val = non_0_val +1 + sparsity = int((cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size) + sparsity_adc = sparsity + if (sparsity%10!=0): + sparsity = sparsity-(sparsity%10) + else: + if (sparsity == 100): + sparsity = sparsity-10 ## Loop to cover all bits of inputs for k in xrange (int(math.ceil(cfg.input_prec / cfg.dac_res))): #quantization affects the # of streams @@ -829,17 +832,19 @@ def xbComputeLatency_Digital (self): if (cfg.inference): for p in xrange(cfg.num_matrix): if self.de_xb_nma[p]: - xbar_inMem = self.xb_inMem_list[p]['f'].read_all () - non_0_val = 0 - for i in range(cfg.xbar_size): - if xbar_inMem[i] != '0000000000000000': - non_0_val = non_0_val +1 - sparsity = int((cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size) - if (sparsity%10!=0): - sparsity = sparsity-(sparsity%10) - else: - if (sparsity == 100): - sparsity = sparsity-10 + sparsity=0 + if cfg.sparse_opt: + xbar_inMem = self.xb_inMem_list[p]['f'].read_all () + non_0_val = 0 + for i in range(cfg.xbar_size): + if xbar_inMem[i] != '0000000000000000': + non_0_val = non_0_val +1 + sparsity = int((cfg.xbar_size-non_0_val)*100.0/cfg.xbar_size) + if (sparsity%10!=0): + sparsity = sparsity-(sparsity%10) + else: + if (sparsity == 100): + sparsity = sparsity-10 mvm_lat_temp += digi_param.Digital_xbar_lat_dict[cfg.MVMU_ver][str(cfg.xbar_size)][str(sparsity)] return mvm_lat_temp diff --git a/src/ima_metrics.py b/src/ima_metrics.py index f7ec20ac..c0314e9d 100644 --- a/src/ima_metrics.py +++ b/src/ima_metrics.py @@ -66,7 +66,7 @@ def compute_pow_dyn (): dyn_pow += (cfg.num_matrix*3) * param.xbar_outMem_pow_dyn # xbar_outMem (1 OR for 8 xbars - 16 bit weights, 2 bit xbars) dyn_pow += (cfg.num_matrix*4) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power else: - dyn_pow += (cfg.num_matrix*2) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power + dyn_pow += (cfg.num_matrix*2) * param.xbar_ip_pow_dyn # xbar ip power considred as ip>op power # d-xbar are not needed in Digital MVMUs only f and b are there dyn_pow += (cfg.num_matrix*3) * (param.xbar_inMem_pow_dyn_write + param.xbar_inMem_pow_dyn_read/cfg.xbar_size) # xbar_inMem - num_xbar * dac_res bits will be # read from xb_inMem in an interval that equals xbar_access time dyn_pow += param.instrnMem_pow_dyn # instrnMem diff --git a/src/ima_modules.py b/src/ima_modules.py index a7dac03f..8080896a 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -255,7 +255,7 @@ def propagate (self, inp): return self.real2bin (inp, num_bits) # HACK - until propagate doesn't have correct analog functionality - def propagate_dummy (self, inp, sparsity): + def propagate_dummy (self, inp, sparsity = 0): if sparsity>50: self.num_access['n'] += 1 self.adc_res = cfg.adc_res From 9f506352b9d97791e0f26aad02d07c0794a59ba6 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Sat, 13 Jun 2020 03:15:06 -0400 Subject: [PATCH 12/15] Commiting after suggested changes in PR --- src/dnn_wt_p.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dnn_wt_p.py b/src/dnn_wt_p.py index 3d19cb04..820d5ee3 100644 --- a/src/dnn_wt_p.py +++ b/src/dnn_wt_p.py @@ -29,3 +29,4 @@ def prog_dnn_wt(self, instrnpath, node_dut): node_dut.tile_list[i].ima_list[j].matrix_list[k]['f'][l].program(wt_temp) node_dut.tile_list[i].ima_list[j].matrix_list[k]['b'][l].program(wt_temp) + From fd778f552f849bf110255345a4bd28c7ea7e2ccc Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Sat, 13 Jun 2020 03:25:23 -0400 Subject: [PATCH 13/15] Updated how_to_run.md --- how_to_run.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/how_to_run.md b/how_to_run.md index f9a06abd..f4bcce92 100644 --- a/how_to_run.md +++ b/how_to_run.md @@ -82,6 +82,9 @@ cp -R puma-simulator/test/testasm/ #### 6.1 - Setup config file : +Use the appropriate config file from ```puma-simulator/include/example-configs/(config file name)```. +For example: for mlp use ```config-mlp.py```. +Copy the file to ```puma-simulator/include/``` and rename it to ```config.py```. Config file - ```puma-simulator/include/config.py```. Update ```num_tile_compute``` in config file based on the number of tiles generated in your `````` model. From 05b48982d1588776d929bdffff93e52a53984a6a Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Sat, 13 Jun 2020 03:28:03 -0400 Subject: [PATCH 14/15] Updated how_to_run.md --- how_to_run.md | 1 - 1 file changed, 1 deletion(-) diff --git a/how_to_run.md b/how_to_run.md index f4bcce92..bf1dc791 100644 --- a/how_to_run.md +++ b/how_to_run.md @@ -85,7 +85,6 @@ cp -R puma-simulator/test/testasm/ Use the appropriate config file from ```puma-simulator/include/example-configs/(config file name)```. For example: for mlp use ```config-mlp.py```. Copy the file to ```puma-simulator/include/``` and rename it to ```config.py```. -Config file - ```puma-simulator/include/config.py```. Update ```num_tile_compute``` in config file based on the number of tiles generated in your `````` model. From af3516a8d307919931532a3598820004025c49e8 Mon Sep 17 00:00:00 2001 From: Deepika Sharma Date: Sat, 13 Jun 2020 16:08:57 -0400 Subject: [PATCH 15/15] corrrected sparsity comparison function in adc module --- include/config.py | 2 +- src/ima_modules.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/config.py b/include/config.py index 02e78dd1..e09ebedc 100644 --- a/include/config.py +++ b/include/config.py @@ -13,7 +13,7 @@ # One of "Analog", "Digital_V1" or "Digital_V2" # Digital_V1 has compressed inputs (Data+Offset style) # Digital_V2 has uncompressed inputs (Skips computations for 0 activation) -MVMU_ver = "Digital_V2" +MVMU_ver = "Analog" ## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits num_bits = 16 diff --git a/src/ima_modules.py b/src/ima_modules.py index 8080896a..b0d5b2d0 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -256,25 +256,25 @@ def propagate (self, inp): # HACK - until propagate doesn't have correct analog functionality def propagate_dummy (self, inp, sparsity = 0): - if sparsity>50: + if sparsity<50: self.num_access['n'] += 1 self.adc_res = cfg.adc_res - elif sparsity>25: + elif sparsity<75: self.num_access['n/2'] += 1 self.adc_res = cfg.adc_res-1 - elif sparsity>12.5: + elif sparsity<87.5: self.num_access['n/4'] += 1 self.adc_res = cfg.adc_res-2 - elif sparsity>6.25: + elif sparsity<93.75: self.num_access['n/8'] += 1 self.adc_res = cfg.adc_res-3 - elif sparsity>3.125: + elif sparsity<96.875: self.num_access['n/16'] += 1 self.adc_res = cfg.adc_res-4 - elif sparsity>1.5625: + elif sparsity<98.4375: self.num_access['n/32'] += 1 self.adc_res = cfg.adc_res-5 - elif sparsity>0.78125: + elif sparsity<99.21875: self.num_access['n/64'] += 1 self.adc_res = cfg.adc_res-6 else: