diff --git a/include/config.py b/include/config.py index 7456665e..8cd88e99 100644 --- a/include/config.py +++ b/include/config.py @@ -26,6 +26,7 @@ # instrnMem_size: (in Bytes) - 512, 1024, 2048 # Fixed parameters +addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) data_width = num_bits # (in bits) xbdata_width = data_width # (in bits) instrn_width = 48 # (in bits) diff --git a/include/constants.py b/include/constants.py index 4246f10e..85889215 100644 --- a/include/constants.py +++ b/include/constants.py @@ -214,59 +214,77 @@ dataMem_lat_dict = {'256' : 1, '512' : 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096':1} dataMem_pow_dyn_dict = {'256' : 0.16, '512' : 0.24, '1024': 0.33, - '2048': 0.57} + '2048': 0.57, + '4096': 0.57} dataMem_pow_leak_dict = {'256' : 0.044, '512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33} dataMem_area_dict = {'256' : 0.00056, '512' : 0.00108, '1024': 0.00192, - '2048': 0.00392} + '2048': 0.00392, + '4096': 0.00392} dataMem_lat_dict = {'256' : 1, '512' : 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096':1} dataMem_pow_dyn_dict = {'256' : 0.16, '512' : 0.24, '1024': 0.33, - '2048': 0.57} + '2048': 0.57, + '4096': 0.57} dataMem_pow_leak_dict = {'256' : 0.044, '512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33} dataMem_area_dict = {'256' : 0.00056, '512' : 0.00108, '1024': 0.00192, - '2048': 0.00392} + '2048': 0.00392, + '4096': 0.00392} # Instruction Memory value dictionary instrnMem_lat_dict = {'512' : 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096': 1, + '8192': 1} instrnMem_pow_dyn_dict = {'512' : 0.46, '1024': 0.53, - '2048': 0.65} + '2048': 0.65, + '4096': 0.65, + '8192': 0.65} instrnMem_pow_leak_dict = {'512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33, + '8192': 0.33} + instrnMem_area_dict = {'512' : 0.00108, '1024': 0.00192, - '2048': 0.0041} + '2048': 0.0041, + '4096': 0.0041, + '8192': 0.0041} + # Xbar_inMem value dictionary (1 access means reading (dac_res) bits for each xbar row) # for computing average power of ima - scale dyn_pow down by xbar_size @@ -382,38 +400,48 @@ # Tile component latency/pow/area # EDRAM value dictionary (counter storage is not coounted) -edram_lat_dict = {'8' :2, - '64' : 2, #edram access width is constant = 256 bits - '128': 2} - -edram_pow_dyn_dict = {'8' : 17.2/2, - '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) - '128': 25.35/2} - -edram_pow_leak_dict = {'8' : 0.46, - '64' : 0.46, - '128': 0.77} - -edram_area_dict = {'8' : 0.086, - '64' : 0.086, - '128': 0.121} +edram_lat_dict = {'8' : 2, + '64' : 2, #edram access width is constant = 256 bits + '128' : 2, + '2048': 2} + +edram_pow_dyn_dict = {'8' : 17.2/2, + '64' : 17.2/2, # (0.0172 nJ with 2 cycles access latency) + '128' : 25.35/2, + '2048': 25.35/2} + +edram_pow_leak_dict = {'8' : 0.46, + '64' : 0.46, + '128' : 0.77, + '2048': 0.77} + +edram_area_dict = {'8' : 0.086, + '64' : 0.086, + '128' : 0.121, + '2048': 0.121} # Tile Instruction Memory value dictionary -tile_instrnMem_lat_dict = {'512' : 1, +tile_instrnMem_lat_dict = {'512': 1, '1024': 1, - '2048': 1} + '2048': 1, + '4096': 1} tile_instrnMem_pow_dyn_dict = {'512' : 0.46, '1024': 0.53, - '2048': 0.65} + '2048': 0.65, + '4096': 0.65} tile_instrnMem_pow_leak_dict = {'512' : 0.078, '1024': 0.147, - '2048': 0.33} + '2048': 0.33, + '4096': 0.33} + tile_instrnMem_area_dict = {'512' : 0.00108, '1024': 0.00192, - '2048': 0.0041} + '2048': 0.0041, + '4096': 
0.0041} + # counter storage (2048 Byte Scratch RAM - 1 counter entry shared by 256 bits of data (16 neurons)) # area scaling (X8) diff --git a/include/example-configs/config-cnn.py b/include/example-configs/config-cnn.py new file mode 100644 index 00000000..af9f7668 --- /dev/null +++ b/include/example-configs/config-cnn.py @@ -0,0 +1,123 @@ +# This file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node) +## All user specified parameters are provided by this file only + +## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating +cycles_max = 5000000 # Set both of these to very large numbers (when the design is bug-free)! +debug = 1 +xbar_record = 1 +inference = 1 +training = not(inference) + +## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits +num_bits = 16 +int_bits = 4 +frac_bits = num_bits - int_bits + +## IMA configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by xbar_bits, num_xbar, xbar_size. +# xbar_bits: 2, 4, 6 +# num_xbar: positive integer +# xbar_size: 32, 64, 128, 256 +# dac_res: positive integer <= num_bits +# adc_res: positive integer <= num_bits +# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar) +# num_ALU: positive integer +# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped) +# instrnMem_size: (in Bytes) - 512, 1024, 2048 + +# Fixed parameters +addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) +data_width = num_bits # (in bits) +xbdata_width = data_width # (in bits) +instrn_width = 48 # (in bits) + +# Change here - Specify the IMA parameters here +xbar_bits = 2 +num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training 8-fw, 8-bw and 16-delta physical xbars. +xbar_size = 128 +dac_res = 1 +# ADC configuration +adc_res = 8 # around 4 to 8. this value should be +num_adc_per_matrix = 2 +num_adc = num_adc_per_matrix * num_matrix + +# The idea is to have a different ADC resolution value for each ADC. +# The number of ADCs is defined by the num_adc property. Currently it is 2 * num_matrix(2) = 4 +# NOTE: Only taking into account indexes 0 and 2; 1 and 3 are ignored, because ADCs 1 and 3 are assumed to be equal to 0 and 2.
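Aside (editorial illustration, not part of the diff): the adc_res_new dictionary added just below gives one resolution entry per ADC, but per the NOTE above only the even-indexed entries are honored. A minimal Python sketch of that lookup rule, using a hypothetical helper name (the actual accessor in the simulator is not shown in this diff):

adc_res_new = {'matrix_adc_0': 8, 'matrix_adc_1': 4,
               'matrix_adc_2': 8, 'matrix_adc_3': 4}

def effective_adc_res(adc_idx):
    # Odd-indexed ADCs mirror the preceding even-indexed ADC, so their own
    # entries (4 here) are effectively ignored.
    if adc_idx % 2 == 1:
        adc_idx -= 1
    return adc_res_new['matrix_adc_' + str(adc_idx)]

print([effective_adc_res(i) for i in range(4)])  # [8, 8, 8, 8]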
+adc_res_new = { + 'matrix_adc_0' : 8, + 'matrix_adc_1' : 4, + 'matrix_adc_2' : 8, + 'matrix_adc_3' : 4 + } + +num_ALU = num_matrix*2 +#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) +dataMem_size = 4096 # larger than num_matrix*(6*xbar_size) +instrnMem_size = 8192 #in entries + +# This depends on above parameters +if (training): + datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) + +if (inference): + datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces ( 1 input Xbar memory and 1 output Xbar memory) + +phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar #value is 8 +lr = 0.25 # learning rate for updates to d-xbar + +## Tile configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_ima +# num_ima: positive integer +# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width) +# edram_size: (in KiloBytes) - 64, 128, 256, 512 +# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \ +# puts a cap on the maximum number of tiles that can send data to a tile in next layer +# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons) +# tile_instrnMem_size: 256, 512, 1024 (in Bytes) + +# Fixed parameters +instrn_width = 48 # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16) +edram_buswidth = 256 # in bits +#receive_buffer_depth = 16 +receive_buffer_depth = 150 #set equal to num_tile_max +receive_buffer_width = edram_buswidth / num_bits # size of receive buffer entry (in terms of number of neurons) + +# Change here - Specify the Tile parameters here +num_ima = 8 +edram_size = 2048 # in Kilobytes (ISAAC uses 64 KB) +tile_instrnMem_size = 4096 # in entries + +## Node configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_tile +# num_tile_compute = positive integer +# inj_rate < 0.2 (depends on the mapping) +# num_port: 4, 8 + +# Fixed parameters +# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles +cmesh_c = 4 +num_bits_tileId =32 +flit_width = 32 +packet_width = edram_buswidth/data_width #in multiples of flits (data considered only - booksim considers address itself) +# (b bit of address = logN, N is the number of nodes) + +# Change here - Specify the Node parameters here +num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles) +num_tile_max = 168.0 # maximum number of tiles per node +num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle) +noc_inj_rate = 0.005 +noc_num_port = 4 + +## Node parameters - Our way of simulation just assumes all tiles in one actual node +num_node = 1 + +# Do not change this - total number of tiles +num_tile = num_node * num_tile_compute + 2 # +2 for the input and output (I/O) tiles - dummy, others - compute + +#Security parameters - Used to verify if the model used is encrypted or authenticated (set by dpe.py) +#Do not change +encrypted = False +authenticated = False +cypher_name = '' +cypher_hash = '' diff --git a/include/example-configs/config-mlp.py b/include/example-configs/config-mlp.py new file mode 100644 index 00000000..8cd88e99 --- /dev/null +++ b/include/example-configs/config-mlp.py @@ -0,0 +1,123 @@ +# This
file contains the configurable parameters in DPE (all hierarchies - IMA, Tile, Node) +## All user specified parameters are provided by this file only + +## Debug - 0 (1): dpe simulation will (won't) produce ima/tile traces while simulating +cycles_max = 5000000 # Set both of these to very large numbers (when the design is bug-free)! +debug = 1 +xbar_record = 1 +inference = 1 +training = not(inference) + +## Operand precision (fixed point allowed only): num_bits = int_bits + frac_bits +num_bits = 16 +int_bits = 4 +frac_bits = num_bits - int_bits + +## IMA configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by xbar_bits, num_xbar, xbar_size. +# xbar_bits: 2, 4, 6 +# num_xbar: positive integer +# xbar_size: 32, 64, 128, 256 +# dac_res: positive integer <= num_bits +# adc_res: positive integer <= num_bits +# num_adc: positive integer <= num_xbar (doesn't allow more than one ADC per xbar) +# num_ALU: positive integer +# dataMem_size: (in Bytes) - 256, 512, 1024, 2048 (affects instrn width, hence capped) +# instrnMem_size: (in Bytes) - 512, 1024, 2048 + +# Fixed parameters +addr_width = 22 # Added to address larger address space for conv layers (#TODO: Compiler needs to fix shared memory reuse) +data_width = num_bits # (in bits) +xbdata_width = data_width # (in bits) +instrn_width = 48 # (in bits) + +# Change here - Specify the IMA parameters here +xbar_bits = 2 +num_matrix = 2 # each matrix is 1-fw logical xbar for inference and 1-fw, 1-bw, and 1 delta logical xbar for training. Each logical xbar for inference is 8-fw physical xbar and for training 8-fw, 8-bw and 16-delta physical xbars. +xbar_size = 128 +dac_res = 1 +# ADC configuration +adc_res = 8 # around 4 to 8. this value should be +num_adc_per_matrix = 2 +num_adc = num_adc_per_matrix * num_matrix + +# The idea is to have a different ADC resolution value for each ADC. +# The number of ADCs is defined by the num_adc property. Currently it is 2 * num_matrix(2) = 4 +# NOTE: Only taking into account indexes 0 and 2; 1 and 3 are ignored, because ADCs 1 and 3 are assumed to be equal to 0 and 2.
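Aside (editorial illustration, not part of the diff): the addr_width = 22 fixed parameter above can be sanity-checked against the EDRAM capacities used by the two example configs, using the word-count expression edram_size*1024*8/data_width that appears in the src/tile_modules.py hunk later in this diff. The snippet below is plain Python for the arithmetic only; nothing in it comes from the source tree:

# Number of data_width-bit EDRAM words a load/store pointer must be able to address.
data_width = 16                              # num_bits in both example configs
words_mlp = 64 * 1024 * 8 // data_width      # config-mlp: 64 KB   -> 32768   = 2**15
words_cnn = 2048 * 1024 * 8 // data_width    # config-cnn: 2048 KB -> 1048576 = 2**20

# The old 16-bit SET immediate (int2bin(imm, 16) in src/instrn_proto.py) covers the
# MLP config but not the CNN config, which is why the immediate is now generated
# with cfg.addr_width; 22 bits cover 2**22 = 4194304 words.
print(words_mlp <= 2**16, words_cnn <= 2**16, words_cnn <= 2**22)   # True False True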
+adc_res_new = { + 'matrix_adc_0' : 8, + 'matrix_adc_1' : 4, + 'matrix_adc_2' : 8, + 'matrix_adc_3' : 4 + } + +num_ALU = num_matrix*2 +#dataMem_size = num_matrix*(6*xbar_size) # 4 for 4 input spaces within matrix (1 for f/b each, 2 for d) +dataMem_size = 2048 # 2048 is larger than num_matrix*(6*xbar_size) +instrnMem_size = 512 #in entries + +# This depends on above parameters +if (training): + datamem_off = xbar_size * (num_matrix*6) # each matrix has 6 memory spaces (1 for f/b, 2 for d) + +if (inference): + datamem_off = xbar_size * (num_matrix*2) # each matrix has 2 memory spaces ( 1 input Xbar memory and 1 output Xbar memory) + +phy2log_ratio = num_bits / xbar_bits # ratio of physical to logical xbar #value is 8 +lr = 0.25 # learning rate for updates to d-xbar + +## Tile configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_ima +# num_ima: positive integer +# edram buswidth: positive integer <= 16 (actual buswidth - this integer*data_width) +# edram_size: (in KiloBytes) - 64, 128, 256, 512 +# receive_buffer_depth: 4, 8, 12, 16, 32 (number of edram buffer entries (each entry maps to a virtual tile)) \ +# puts a cap on the maximum number of tiles that can send data to a tile in next layer +# receive_buffer_width: edram_buswidth/data_width (Fixed - in terms of number of neurons) +# tile_instrnMem_size: 256, 512, 1024 (in Bytes) + +# Fixed parameters +instrn_width = 48 # bits (op-2, vtile_id-6, send/receive_width-8, target_addr/counter-16, vw-8, mem_addr-16) +edram_buswidth = 256 # in bits +#receive_buffer_depth = 16 +receive_buffer_depth = 150 #set equal to num_tile_max +receive_buffer_width = edram_buswidth / num_bits # size of receive buffer entry (in terms of number of neurons) + +# Change here - Specify the Tile parameters here +num_ima = 8 +edram_size = 64 # in Kilobytes (64 KB - same as ISAAC) +tile_instrnMem_size = 2048 # in entries + +## Node configurable parameters (permissible values for each parameter provided here) +## Instruction generation - affected by num_tile +# num_tile_compute = positive integer +# inj_rate < 0.2 (depends on the mapping) +# num_port: 4, 8 + +# Fixed parameters +# NOC topology: cmesh (n=2, k=4, c=4) - can fit k*n*c tiles +cmesh_c = 4 +num_bits_tileId =32 +flit_width = 32 +packet_width = edram_buswidth/data_width #in multiples of flits (data considered only - booksim considers address itself) +# (b bit of address = logN, N is the number of nodes) + +# Change here - Specify the Node parameters here +num_tile_compute = 7 # number of tiles mapped by dnn (leaving input and output tiles) +num_tile_max = 168.0 # maximum number of tiles per node +num_inj_max = num_tile_max # [conservative] max number of packet injections that can occur in a cycle (each tile injects a packet into NOC each cycle) +noc_inj_rate = 0.005 +noc_num_port = 4 + +## Node parameters - Our way of simulation just assumes all tiles in one actual node +num_node = 1 + +# Do not change this - total number of tiles +num_tile = num_node * num_tile_compute + 2 # +2 for the input and output (I/O) tiles - dummy, others - compute + +#Security parameters - Used to verify if the model used is encrypted or authenticated (set by dpe.py) +#Do not change +encrypted = False +authenticated = False +cypher_name = '' +cypher_hash = '' diff --git a/src/dpe.py b/src/dpe.py index 8cc08a7a..d4124c4c 100644 --- a/src/dpe.py +++ b/src/dpe.py @@ -168,8 +168,8 @@ def run(self, net): if (cfg.debug): node_dump(node_dut, self.tracepath) - if (cfg.xbar_record): -
record_xbar(node_dut) + #if (cfg.xbar_record): + # record_xbar(node_dut) # Dump the contents of output tile (DNN output) to output file (output.txt) output_file = self.tracepath + 'output.txt' diff --git a/src/ima.py b/src/ima.py index 80b669ef..ddf31af1 100644 --- a/src/ima.py +++ b/src/ima.py @@ -284,7 +284,8 @@ def do_decode (self, dec_op): # instruction specific (for eg: ld_dec - load's decode stage) if (dec_op == 'ld'): assert (self.fd_instrn['r1'] >= datamem_off), 'load address for tile memory comes from data memory' - self.de_r1 = bin2int(self.dataMem.read(self.fd_instrn['r1']), cfg.num_bits) # absolute mem addr + self.de_r1 = bin2int(self.dataMem.read(self.fd_instrn['r1']), cfg.addr_width) # absolute mem addr + assert (self.de_r1 >=0) # mem addr for load should be non negative self.de_d1 = self.fd_instrn['d1'] self.de_r2 = self.fd_instrn['imm'] # used for incrementing/decrementing counter for edram entries self.de_vec = self.fd_instrn['vec'] @@ -298,7 +299,8 @@ def do_decode (self, dec_op): elif (dec_op == 'st'): assert (self.fd_instrn['d1'] >= datamem_off), 'store address for tile memory comes from data memory' - self.de_d1 = bin2int(self.dataMem.read(self.fd_instrn['d1']), cfg.num_bits) #absolute mem addr + self.de_d1 = bin2int(self.dataMem.read(self.fd_instrn['d1']), cfg.addr_width) #absolute mem addr + assert (self.de_d1 >=0) # mem addr for store should be non negative self.de_r1 = self.fd_instrn['r1'] # reg addr self.de_vec = self.fd_instrn['vec'] # source value will be read in execute stage @@ -514,8 +516,9 @@ def do_execute (self, ex_op, fid): # write to dataMem - check if addr is a valid datamem address dst_addr = self.de_d1 + i if (dst_addr >= datamem_off): - self.dataMem.write (dst_addr, self.de_val1) + self.dataMem.write(addr=dst_addr, data=self.de_val1, type_t='addr') #Updated for separate data_width and addr_width else: + assert (1==0) # Set instructions cannot write to MVMU storage writeToXbarMem (self, dst_addr, self.de_val1) elif (ex_op == 'cp'): diff --git a/src/ima_modules.py b/src/ima_modules.py index 46328b54..790bff46 100644 --- a/src/ima_modules.py +++ b/src/ima_modules.py @@ -466,12 +466,22 @@ def read (self, addr): return self.memfile[addr - self.addr_start] - def write (self, addr, data): + def write (self, addr, data, type_t='data'): self.num_access += 1 assert (type(addr) == int), 'addr type should be int' assert (self.addr_start <= addr <= self.addr_end), 'addr exceeds the memory bounds' #print 'length of data ' + str(len(data)) - assert ((type(data) == str) and (len(data) == cfg.data_width)), 'data should be a string with mem_width bits' + #assert ((type(data) == str) and (len(data) == cfg.data_width)), 'data should be a string with mem_width bits' + assert ((type(data) == str) and ((type_t == 'data')) or (type_t == 'addr')) # UPDATE - Pointer/address for LD/ST written by previous SET instrn. 
can be larger than data_width + if (type_t == 'data'): + try: + assert (len(data) == cfg.data_width) + #print("I am here!!") + except AssertionError: + print("Warning: Data width received is not-coherent, NEEDS DEBUGGING") + data = data[0:16] + else: + assert (len(data) == cfg.addr_width) # Specification for pointer (or addres type data) self.memfile[addr - self.addr_start] = data def reset (self): diff --git a/src/instrn_proto.py b/src/instrn_proto.py index ccad494e..4992bb25 100644 --- a/src/instrn_proto.py +++ b/src/instrn_proto.py @@ -52,7 +52,7 @@ def i_set (d1, imm, vec = 1): i_temp = param.dummy_instrn.copy () i_temp['opcode'] = 'set' i_temp['d1'] = d1 - i_temp['imm'] = imm if (type(imm) == str) else int2bin(imm, 16) + i_temp['imm'] = imm if (type(imm) == str) else int2bin(imm, cfg.addr_width) i_temp['vec'] = vec return i_temp diff --git a/src/tile.py b/src/tile.py index b42d49ca..a849dd35 100644 --- a/src/tile.py +++ b/src/tile.py @@ -89,7 +89,7 @@ def tile_init (self, instrnpath, tracepath): self.ima_list[i].pipe_init (instrnfile, self.fid_list[i]) # Initialize the EDRAM - invalidate all entries (valid_list) - self.edram_controller.valid = [0] * (cfg.edram_size*1024/(cfg.data_width/8)) + self.edram_controller.valid = [0] * (cfg.edram_size*1024*8/(cfg.data_width)) # Intiialize the receive buffer - invalidate self.receive_buffer.inv () diff --git a/src/tile_modules.py b/src/tile_modules.py index 40a04b7a..417f95e2 100644 --- a/src/tile_modules.py +++ b/src/tile_modules.py @@ -119,9 +119,9 @@ def __init__ (self): self.num_access_counter = 0 # Instantiate EDRAM, valid and counter fields - self.mem = edram (cfg.edram_size*1024/(cfg.data_width/8)) #edram_size is in KB - self.valid = [0] * (cfg.edram_size*1024/(cfg.data_width/8)) #edram_size is in KB - self.counter = [0] * (cfg.edram_size*1024/(cfg.data_width/8)) #edram_size is in KB + self.mem = edram (cfg.edram_size*1024*8/(cfg.data_width)) #edram_size is in KB + self.valid = [0] * (cfg.edram_size*1024*8/(cfg.data_width)) #edram_size is in KB + self.counter = [0] * (cfg.edram_size*1024*8/(cfg.data_width)) #edram_size is in KB # Define latency self.latency = param.edram_lat diff --git a/test/cnn/conv-layer-benchmark1.cpp b/test/cnn/conv-layer-benchmark1.cpp new file mode 100644 index 00000000..4dabac9a --- /dev/null +++ b/test/cnn/conv-layer-benchmark1.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. 
+ * + */ + +#include +#include +#include + +#include "puma.h" +#include "conv-layer.h" + +int main(int argc, char** argv) { + + Model model = Model::create("conv1-layer"); + + // Process parameters + unsigned int in_size_x = 9; + unsigned int in_size_y = 9; + unsigned int in_channels = 64; + unsigned int out_channels = 64; + unsigned int k_size_x = 3; + unsigned int k_size_y = 3; + if(argc == 7) { + in_size_x = atoi(argv[1]); + in_size_y = atoi(argv[2]); + in_channels = atoi(argv[3]); + out_channels = atoi(argv[4]); + k_size_x = atoi(argv[5]); + k_size_y = atoi(argv[6]); + } + + // Input stream + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + + // Output stream + unsigned int out_size_x = in_size_x; + unsigned int out_size_y = in_size_y; + auto out_stream = OutputImagePixelStream::create(model, "out_stream", out_size_x, out_size_y, out_channels); + + // Layer + out_stream = conv_layer(model, "", k_size_x, k_size_y, in_size_x, in_size_y, in_channels, out_channels, in_stream); + + // Compile + model.compile(); + + // Destroy model + model.destroy(); + + return 0; + +} + diff --git a/test/cnn/conv-layer-benchmark2.cpp b/test/cnn/conv-layer-benchmark2.cpp new file mode 100644 index 00000000..b6f08f0f --- /dev/null +++ b/test/cnn/conv-layer-benchmark2.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. + * + */ + +#include +#include +#include + +#include "puma.h" +#include "conv-layer.h" + +int main(int argc, char** argv) { + + Model model = Model::create("conv2-layer"); + + // Process parameters + unsigned int in_size_x = 5; + unsigned int in_size_y = 5; + unsigned int in_channels = 256; + unsigned int out_channels = 256; + unsigned int k_size_x = 3; + unsigned int k_size_y = 3; + if(argc == 7) { + in_size_x = atoi(argv[1]); + in_size_y = atoi(argv[2]); + in_channels = atoi(argv[3]); + out_channels = atoi(argv[4]); + k_size_x = atoi(argv[5]); + k_size_y = atoi(argv[6]); + } + + // Input stream + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + + // Output stream + unsigned int out_size_x = in_size_x; + unsigned int out_size_y = in_size_y; + auto out_stream = OutputImagePixelStream::create(model, "out_stream", out_size_x, out_size_y, out_channels); + + // Layer + out_stream = conv_layer(model, "", k_size_x, k_size_y, in_size_x, in_size_y, in_channels, out_channels, in_stream); + + // Compile + model.compile(); + + // Destroy model + model.destroy(); + + return 0; + +} + diff --git a/test/cnn/conv-layer-stride.cpp b/test/cnn/conv-layer-stride.cpp new file mode 100644 index 00000000..1c13dee8 --- /dev/null +++ b/test/cnn/conv-layer-stride.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. 
+ * + */ + +#include +#include +#include +#include + +#include "puma.h" +#include "conv-layer.h" +using namespace std; +int main(int argc, char** argv) { + +// Model model = Model::create("conv3-layer"); + + // Process parameter + unsigned int in_size_x ; + unsigned int in_size_y ; + unsigned int in_channels ; + unsigned int out_channels ; + unsigned int k_size_x ; + unsigned int k_size_y ; + unsigned int padding ; + unsigned int stride ; + + if(argc == 10) { + in_size_x = atoi(argv[1]); + in_size_y = atoi(argv[2]); + in_channels = atoi(argv[3]); + out_channels = atoi(argv[4]); + k_size_x = atoi(argv[5]); + k_size_y = atoi(argv[6]); + padding = atoi(argv[7]); + stride = atoi(argv[8]); + } + std:: string str=std::string("conv") + argv[9] + std::string("-layer"); + Model model = Model::create(str); + + // Input stream + auto in_stream = InputImagePixelStream::create(model, "in_stream", in_size_x, in_size_y, in_channels); + + // Output stream + unsigned int out_size_x = (in_size_x - k_size_x + 2*padding)/stride + 1; + unsigned int out_size_y = (in_size_y - k_size_y + 2*padding)/stride + 1; + + assert((in_size_x - k_size_x + 2*padding)%stride==0); //input image size should result in integer out image size + auto out_stream = OutputImagePixelStream::create(model, "out_stream", out_size_x, out_size_y, out_channels); + + // Layer + out_stream = conv_layer(model, "", k_size_x, k_size_y, in_size_x, in_size_y, in_channels, out_channels, stride, out_size_x, out_size_y, in_stream); + // Compile + model.compile(); + + // Destroy model + model.destroy(); + + return 0; + +} + diff --git a/test/cnn/conv-layer-stride.h b/test/cnn/conv-layer-stride.h new file mode 100644 index 00000000..a7eb88c7 --- /dev/null +++ b/test/cnn/conv-layer-stride.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. + * + * This file is covered by the LICENSE.txt license file in the root directory. + * + */ + +#ifndef _PUMA_TEST_CONV_LAYER_ +#define _PUMA_TEST_CONV_LAYER_ + +#include "puma.h" + +static ImagePixelStream conv_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, unsigned int stride, unsigned int out_size_x, unsigned int out_size_y, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels, stride, out_size_x, out_size_y); + + return sig(mat*in_stream); + +} + +static ImagePixelStream convmax_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, unsigned int stride, unsigned int out_size_x, unsigned int max_pool_size_x, unsigned int max_pool_size_y, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels, stride, out_size_x, out_size_x); + + return maxpool(sig(mat*in_stream), max_pool_size_y, max_pool_size_x); + +} + +#endif + diff --git a/test/cnn/conv-layer.h b/test/cnn/conv-layer.h new file mode 100644 index 00000000..ec23ac57 --- /dev/null +++ b/test/cnn/conv-layer.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 IMPACT Research Group, University of Illinois. + * All rights reserved. 
+ * + * This file is covered by the LICENSE.txt license file in the root directory. + * + */ + +#ifndef _PUMA_TEST_CONV_LAYER_ +#define _PUMA_TEST_CONV_LAYER_ + +#include "puma.h" + +static ImagePixelStream conv_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels); + + return sig(mat*in_stream); + +} + +static ImagePixelStream convmax_layer(Model model, std::string layerName, unsigned int k_size_x, unsigned int k_size_y, unsigned int in_size_x, unsigned int in_size_y, unsigned int in_channels, unsigned int out_channels, unsigned int max_pool_size_x, unsigned int max_pool_size_y, ImagePixelStream in_stream) { + + ConvolutionalConstantMatrix mat = ConvolutionalConstantMatrix::create(model, layerName + "conv_mat", k_size_x, k_size_y, in_channels, out_channels); + + return maxpool(sig(mat*in_stream), max_pool_size_y, max_pool_size_x); + +} + +#endif + diff --git a/test/utils/run-cnn-benchmark.sh b/test/utils/run-cnn-benchmark.sh new file mode 100755 index 00000000..5168c1e4 --- /dev/null +++ b/test/utils/run-cnn-benchmark.sh @@ -0,0 +1,49 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=conv-layer #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +name=conv #name for the folder generated by compiler +pumaenv=pumaenv #name for the environment +fileno=0 #variable so that conv folder generated by compilers do not overlap (u might want to change this variable to different int values for different layers) +name=$name$fileno +#layer parameters +inx=9 +iny=9 +inC=64 +outC=64 +kx=3 +ky=3 +p=1 +s=1 +#copying cnn config file +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp ${path}/puma-simulator/include/example-configs/config-cnn.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include +#copying model file +rm ${path}/puma-compiler/test/${cppfile}.cpp ${path}/puma-compiler/test/${cppfile}.h +cp ${path}/puma-simulator/test/cnn/conv-layer-stride.cpp ${path}/puma-compiler/test/${cppfile}.cpp #copy the mlp config file to include +cp ${path}/puma-simulator/test/cnn/conv-layer-stride.h ${path}/puma-compiler/test/${cppfile}.h #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test ${inx} ${iny} ${inC} ${outC} ${kx} ${ky} ${p} ${s} ${fileno} +echo $cppfile +./generate-py.sh +cp -r ${name} ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n ${name} + + + diff --git a/test/utils/run-cnn-benchmark1.sh b/test/utils/run-cnn-benchmark1.sh new file mode 100755 index 00000000..e62c464b --- /dev/null +++ b/test/utils/run-cnn-benchmark1.sh @@ -0,0 +1,36 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=conv-layer #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +pumaenv=pumaenv #name for the environment +#copying cnn config file +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp 
${path}/puma-simulator/include/example-configs/config-cnn.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include +#copying model file +rm ${path}/puma-compiler/test/conv-layer.cpp +cp ${path}/puma-simulator/test/cnn/conv-layer-benchmark1.cpp ${path}/puma-compiler/test/${cppfile}.cpp #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test +echo $cppfile +./generate-py.sh +cp -r conv1 ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n conv1 + + + diff --git a/test/utils/run-cnn-benchmark2.sh b/test/utils/run-cnn-benchmark2.sh new file mode 100755 index 00000000..ed918825 --- /dev/null +++ b/test/utils/run-cnn-benchmark2.sh @@ -0,0 +1,36 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=conv-layer #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +pumaenv=pumaenv #name for the environment +#copying cnn config file +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp ${path}/puma-simulator/include/example-configs/config-cnn.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include +#copying model file +rm ${path}/puma-compiler/test/conv-layer.cpp +cp ${path}/puma-simulator/test/cnn/conv-layer-benchmark2.cpp ${path}/puma-compiler/test/${cppfile}.cpp #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test +echo $cppfile +./generate-py.sh +cp -r conv2 ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n conv2 + + + diff --git a/test/utils/run-mlp-benchmark.sh b/test/utils/run-mlp-benchmark.sh new file mode 100755 index 00000000..c7ad4480 --- /dev/null +++ b/test/utils/run-mlp-benchmark.sh @@ -0,0 +1,34 @@ +set -v +set -e +path=`pwd` #path to your puma directory +echo $path +cppfile=mlp_l4_mnist #name for cpp file that you want to compile ex- mlp_l4_mnist.cpp, conv-layer.cpp, convmax-layer.cpp +name=mlp #name for the folder generated by compiler +pumaenv=pumaenv #name for the environment + +rm ${path}/puma-simulator/include/config.py #remove existing config file +cp ${path}/puma-simulator/include/example-configs/config-mlp.py ${path}/puma-simulator/include/config.py #copy the mlp config file to include + +cd ${path}/puma-compiler/src +source ~/.bashrc +conda activate ${pumaenv} + +make clean +make + +cd ${path}/puma-compiler/test +make clean +make ${cppfile}.test +export LD_LIBRARY_PATH=`pwd`/../src:$LD_LIBRARY_PATH +./${cppfile}.test +echo $cppfile +./generate-py.sh +cp -r ${name} ../../puma-simulator/test/testasm + +cd ${path}/puma-simulator/src + + +python dpe.py -n ${name} + + +
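Closing aside (editorial illustration, not part of the diff): run-cnn-benchmark.sh hard-codes the conv-layer-stride parameters inx=iny=9, kx=ky=3, p=1, s=1, and conv-layer-stride.cpp asserts that these produce an integer output size. A quick Python check of that arithmetic:

# Output spatial size used by conv-layer-stride.cpp:
#   out = (in - k + 2*padding)/stride + 1, with (in - k + 2*padding) % stride == 0.
in_size, k_size, padding, stride = 9, 3, 1, 1   # values from run-cnn-benchmark.sh
assert (in_size - k_size + 2 * padding) % stride == 0
out_size = (in_size - k_size + 2 * padding) // stride + 1
print(out_size)  # 9 -> the 9x9x64 input stream maps to a 9x9x64 output stream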