Skip to content

Commit

Permalink
Merge pull request #26 from negishubham/training
Browse files Browse the repository at this point in the history
Bug-fixes for CNN and simulator enhancements.
  • Loading branch information
Aayush-Ankit authored Apr 15, 2020
2 parents f4f8f41 + 994016b commit 090feed
Show file tree
Hide file tree
Showing 19 changed files with 726 additions and 44 deletions.
1 change: 1 addition & 0 deletions include/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
# instrnMem_size: (in Bytes) - 512, 1024, 2048

# Fixed parameters
addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse)
data_width = num_bits # (in bits)
xbdata_width = data_width # (in bits)
instrn_width = 48 # (in bits)
Expand Down
92 changes: 60 additions & 32 deletions include/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,59 +214,77 @@
dataMem_lat_dict = {'256' : 1,
'512' : 1,
'1024': 1,
'2048': 1}
'2048': 1,
'4096':1}

dataMem_pow_dyn_dict = {'256' : 0.16,
'512' : 0.24,
'1024': 0.33,
'2048': 0.57}
'2048': 0.57,
'4096': 0.57}

dataMem_pow_leak_dict = {'256' : 0.044,
'512' : 0.078,
'1024': 0.147,
'2048': 0.33}
'2048': 0.33,
'4096': 0.33}

dataMem_area_dict = {'256' : 0.00056,
'512' : 0.00108,
'1024': 0.00192,
'2048': 0.00392}
'2048': 0.00392,
'4096': 0.00392}

dataMem_lat_dict = {'256' : 1,
'512' : 1,
'1024': 1,
'2048': 1}
'2048': 1,
'4096':1}

dataMem_pow_dyn_dict = {'256' : 0.16,
'512' : 0.24,
'1024': 0.33,
'2048': 0.57}
'2048': 0.57,
'4096': 0.57}

dataMem_pow_leak_dict = {'256' : 0.044,
'512' : 0.078,
'1024': 0.147,
'2048': 0.33}
'2048': 0.33,
'4096': 0.33}

dataMem_area_dict = {'256' : 0.00056,
'512' : 0.00108,
'1024': 0.00192,
'2048': 0.00392}
'2048': 0.00392,
'4096': 0.00392}

# Instruction Memory value dictionary
instrnMem_lat_dict = {'512' : 1,
'1024': 1,
'2048': 1}
'2048': 1,
'4096': 1,
'8192': 1}

instrnMem_pow_dyn_dict = {'512' : 0.46,
'1024': 0.53,
'2048': 0.65}
'2048': 0.65,
'4096': 0.65,
'8192': 0.65}

instrnMem_pow_leak_dict = {'512' : 0.078,
'1024': 0.147,
'2048': 0.33}
'2048': 0.33,
'4096': 0.33,
'8192': 0.33}


instrnMem_area_dict = {'512' : 0.00108,
'1024': 0.00192,
'2048': 0.0041}
'2048': 0.0041,
'4096': 0.0041,
'8192': 0.0041}


# Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row)
# for computing average power of ima - scale dyn_pow down by xbar_size
Expand Down Expand Up @@ -382,38 +400,48 @@

# Tile component latency/pow/area
# EDRAM value dictionary (counter storage is not counted)
edram_lat_dict = {'8' :2,
'64' : 2, #edram access width is constant = 256 bits
'128': 2}

edram_pow_dyn_dict = {'8' : 17.2/2,
'64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency)
'128': 25.35/2}

edram_pow_leak_dict = {'8' : 0.46,
'64' : 0.46,
'128': 0.77}

edram_area_dict = {'8' : 0.086,
'64' : 0.086,
'128': 0.121}
edram_lat_dict = {'8' : 2,
'64' : 2, #edram access width is constant = 256 bits
'128' : 2,
'2048': 2}

edram_pow_dyn_dict = {'8' : 17.2/2,
'64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency)
'128' : 25.35/2,
'2048': 25.35/2}

edram_pow_leak_dict = {'8' : 0.46,
'64' : 0.46,
'128' : 0.77,
'2048': 0.77}

edram_area_dict = {'8' : 0.086,
'64' : 0.086,
'128' : 0.121,
'2048': 0.121}

# Tile Instruction Memory value dictionary
tile_instrnMem_lat_dict = {'512' : 1,
tile_instrnMem_lat_dict = {'512': 1,
'1024': 1,
'2048': 1}
'2048': 1,
'4096': 1}

tile_instrnMem_pow_dyn_dict = {'512' : 0.46,
'1024': 0.53,
'2048': 0.65}
'2048': 0.65,
'4096': 0.65}

tile_instrnMem_pow_leak_dict = {'512' : 0.078,
'1024': 0.147,
'2048': 0.33}
'2048': 0.33,
'4096': 0.33}


tile_instrnMem_area_dict = {'512' : 0.00108,
'1024': 0.00192,
'2048': 0.0041}
'2048': 0.0041,
'4096': 0.0041}


# counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons))
# area scaling (X8)
Expand Down
123 changes: 123 additions & 0 deletions include/example-configs/config-cnn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# This file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node)
## All user specified parameters are provided by this file only
#
# Layout: simulation switches -> operand precision -> IMA parameters ->
# Tile parameters -> Node parameters -> security flags.
# Derived values (num_adc, datamem_off, phy2log_ratio, receive_buffer_width,
# packet_width, num_tile, ...) are computed from the user-set values above them,
# so keep the statement order intact when editing.

## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating
cycles_max = 5000000  # Put both these to very large numbers (when design is bug-free)!
debug = 1
xbar_record = 1
inference = 1
training = not inference  # exactly one of inference/training is active

## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits
num_bits = 16
int_bits = 4
frac_bits = num_bits - int_bits

## IMA configurable parameters (permissible values for each parameter provided here)
## Instruction generation - affected by xbar_bits, num_xbar, xbar_size.
# xbar_bits: 2, 4, 6
# num_xbar: positive integer
# xbar_size: 32, 64, 128, 256
# dac_res: positive integer <= num_bits
# adc_res: positive integer <= num_bits
# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar)
# num_ALU: positive integer
# dataMem_size: 256, 512, 1024, 2048, 4096 (affects instrn width, hence capped)
# instrnMem_size: 512, 1024, 2048, 4096, 8192
# NOTE(review): the size lists above mirror the keys of the latency/power/area
# dictionaries in include/constants.py - confirm when adding a new size.

# Fixed parameters
addr_width = 22  # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse)
data_width = num_bits  # (in bits)
xbdata_width = data_width  # (in bits)
instrn_width = 48  # (in bits)

# Change here - Specify the IMA parameters here
xbar_bits = 2
# Each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta
# logical xbar for training. Each logical xbar for inference is 8-fw physical
# xbar and for training 8-fw, 8-bw and 16-delta physical xbars.
num_matrix = 2
xbar_size = 128
dac_res = 1
# ADC configuration
adc_res = 8  # around 4 to 8
num_adc_per_matrix = 2
num_adc = num_adc_per_matrix * num_matrix

# The idea is to have a different ADC resolution value for each ADC.
# The number of ADCs is defined by the num_adc property. Currently it is 2 * num_matrix(2) = 4
# NOTE: Only indexes 0 and 2 are taken into account; 1 and 3 are ignored, because
# ADCs 1 and 3 are assumed to be equal to 0 and 2.
adc_res_new = {
    'matrix_adc_0': 8,
    'matrix_adc_1': 4,
    'matrix_adc_2': 8,
    'matrix_adc_3': 4,
}

num_ALU = num_matrix * 2
#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d)
dataMem_size = 4096  # larger than num_matrix*(6*xbar_size) (= 1536 with the values above)
instrnMem_size = 8192  # in entries

# This depends on above parameters
if training:
    datamem_off = xbar_size * (num_matrix * 6)  # each matrix has 6 memory spaces (1 for f/b, 2 for d)

if inference:
    datamem_off = xbar_size * (num_matrix * 2)  # each matrix has 2 memory spaces (1 input Xbar memory and 1 output Xbar memory)

# Floor division keeps this ratio an integer under both Python 2 and Python 3.
phy2log_ratio = num_bits // xbar_bits  # ratio of physical to logical xbar (value is 8)
lr = 0.25  # learning rate for updates to d-xbar

## Tile configurable parameters (permissible values for each parameter provided here)
## Instruction generation - affected by num_ima
# num_ima: positive integer
# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width)
# edram_size: (in KiloBytes) - 8, 64, 128, 2048
# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \
#        puts a cap on the maximum number of tiles that can send data to a tile in next layer
# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons)
# tile_instrnMem_size: 512, 1024, 2048, 4096
# NOTE(review): edram_size / tile_instrnMem_size lists mirror the dictionary keys
# in include/constants.py - confirm when adding a new size.

# Fixed parameters
# NOTE: instrn_width is assigned a second time here (same value as in the IMA section).
instrn_width = 48  # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16)
edram_buswidth = 256  # in bits
#receive_buffer_depth = 16
# NOTE(review): the original note says "set equal to num_tile_max", but
# num_tile_max is 168.0 below - confirm which value is intended.
receive_buffer_depth = 150
receive_buffer_width = edram_buswidth // num_bits  # size of receive buffer entry (in terms of number of neurons)

# Change here - Specify the Tile parameters here
num_ima = 8
edram_size = 2048  # in Kilobytes (original note: 64 KB - same as issac)
tile_instrnMem_size = 4096  # in entries

## Node configurable parameters (permissible values for each parameter provided here)
## Instruction generation - affected by num_tile
# num_tile_compute = positive integer
# inj_rate < 0.2 (depends on the mapping)
# num_port: 4, 8

# Fixed parameters
# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles
cmesh_c = 4
num_bits_tileId = 32
flit_width = 32
# In multiples of flits (data considered only - booksim considers the address itself)
# (b bit of address = logN, N is the number of nodes)
packet_width = edram_buswidth // data_width

# Change here - Specify the Node parameters here
num_tile_compute = 7  # number of tiles mapped by dnn (leaving input and output tiles)
num_tile_max = 168.0  # maximum number of tiles per node
num_inj_max = num_tile_max  # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle)
noc_inj_rate = 0.005
noc_num_port = 4

## Node parameters - Our way of simulation just assumes all tile in one actual node
num_node = 1

# Do not change this - total number of tiles
# +2 on top of the compute tiles: num_tile_compute leaves out the input and output tiles.
num_tile = num_node * num_tile_compute + 2

# Security parameters - Used to verify if the model used is encrypted or authenticated (set by dpe.py)
# Do not change
encrypted = False
authenticated = False
cypher_name = ''
cypher_hash = ''
Loading

0 comments on commit 090feed

Please sign in to comment.