Hello, I'm using the PYNQ 2.5.1 image and I have created a 2-layer fully connected NN. The host-side class looks like this:

import struct
import numpy as np
from pynq import Overlay, allocate

class nn_hw_accel:
    def __init__(self, weights):
        self.nn_overlay = Overlay("/home/xilinx/jupyter_notebooks/design_nn_full.bit")
        print("Translating weight list to numpy array...")
        self.w_state = 0
        w = np.array(weights, dtype=np.float32)
        layers_count, neurons_count, weights_count = w.shape
        self.layers_count = layers_count
        self.neurons_count = neurons_count
        self.weights_count = weights_count
        print("Allocating dma...")
        self.target = allocate(shape=(768,), dtype=np.float32)
        #self.out_nn = allocate(shape=(768,), dtype=np.float32)
        self.calc_error = allocate(shape=(768,), dtype=np.float32)
        print("Allocating weights...")
        self.weights_new = allocate((2, 768, 769), dtype=np.float32)
        self.weights = allocate((2, 768, 769), dtype=np.float32)
        for x in range(w.shape[0]):
            for y in range(w.shape[1]):
                for z in range(w.shape[2]):
                    self.weights[x][y][z] = weights[x][y][z]
                    self.weights_new[x][y][z] = weights[x][y][z]
        print("Allocating neural network outputs...")
        self.outputs = allocate(shape=(3, 768), dtype=np.float32)
print("Setting physical adresses for IP's")
print("Forward propagation IP")
self.nn_overlay.forward_propagate_L2_0.register_map.leak = 1
self.nn_overlay.forward_propagate_L2_0.register_map.weight = self.weights.physical_address
self.nn_overlay.forward_propagate_L2_0.register_map.output_r = self.outputs.physical_address
self.nn_overlay.forward_propagate_L2_0.register_map.m_lay = layers_count
self.nn_overlay.forward_propagate_L2_0.register_map.m_neu = neurons_count
print("Back propagation IP")
self.nn_overlay.back_propagate_L2_N_0.register_map.m_lay = layers_count
self.nn_overlay.back_propagate_L2_N_0.register_map.m_neu = neurons_count
self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights_new.physical_address
self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights.physical_address
self.nn_overlay.back_propagate_L2_N_0.register_map.output_offset = self.outputs.physical_address
self.nn_overlay.back_propagate_L2_N_0.register_map.en_offset = self.calc_error.physical_address
print("Finished")

    def calculate_error(self, target):
        target = np.array(target, dtype=np.float32)
        target_size = target.shape[0]
        buf = 0
        for i in range(target_size):
            buf = buf + ((self.outputs[2][i] - target[i])**2) / 2
        return buf

    def output_error(self):
        self.nn_overlay.axi_dma_target.sendchannel.transfer(self.target)
        self.nn_overlay.axi_dma_nn_out.sendchannel.transfer(self.outputs[2])
        self.nn_overlay.axi_dma_out_r.recvchannel.transfer(self.calc_error)
        self.nn_overlay.axi_dma_nn_out.sendchannel.wait()
        self.nn_overlay.axi_dma_target.sendchannel.wait()
        self.nn_overlay.axi_dma_out_r.recvchannel.wait()
        print(f"Self.target {self.target}")
        print(f"self.outputs[2] {self.outputs[2]}")
        print(f"self.calc_error {self.calc_error}")
        #free running

    def float_to_uint(self, f):
        return int(struct.unpack('<I', struct.pack('<f', f))[0])

    def uint_to_float(self, f):
        return float(struct.unpack('<f', struct.pack('<I', f))[0])

    def backpropagationG(self, targ, lr):
        for i in range(targ.shape[0]):
            self.target[i] = targ[i]
        #krnl_dma
        self.output_error()
        #krnl_dma
        #krnl
        le_lr = self.float_to_uint(lr)
        self.nn_overlay.back_propagate_L2_N_0.register_map.learning_rate = le_lr
        self.nn_overlay.back_propagate_L2_N_0.register_map.CTRL.AP_START = 1
        #krnl
        while True:
            done_idle = self.nn_overlay.back_propagate_L2_N_0.register_map.CTRL.AP_IDLE
            done_start = self.nn_overlay.back_propagate_L2_N_0.register_map.CTRL.AP_START
            if (done_idle == 1 and done_start == 0):
                break
            print(f"Waiting...")
        """
        if (self.w_state == 0):
            self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights_new.physical_address
            self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights.physical_address
            self.w_state = 1
        else:
            self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights.physical_address
            self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights_new.physical_address
            self.w_state = 0
        """
        for x in range(self.layers_count):
            for y in range(self.neurons_count):
                for z in range(self.weights_count-1):
                    self.weights[x][y][z] = self.weights_new[x][y][z]

    def forward_propagate(self, inputs):
        inputs = np.array(inputs, dtype=np.float32)
        print(inputs.shape)
        for x in range(inputs.shape[0]):
            self.outputs[0][x] = inputs[x]
        #krnl
        self.nn_overlay.forward_propagate_L2_0.register_map.CTRL.AP_START = 1
        #krnl

It works fine, but it is slow because of this copy loop at the end of backpropagationG:
for x in range(self.layers_count):
    for y in range(self.neurons_count):
        for z in range(self.weights_count-1):
            self.weights[x][y][z] = self.weights_new[x][y][z]
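(I realize the Python loop itself could probably be collapsed into one sliced NumPy assignment, since the PYNQ buffers behave like numpy arrays, but what I really want is to avoid the copy altogether. Roughly what I mean:)

# rough idea only: copy everything except the bias column in one vectorized assignment
self.weights[:self.layers_count, :self.neurons_count, :self.weights_count - 1] = \
    self.weights_new[:self.layers_count, :self.neurons_count, :self.weights_count - 1]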
The next thing I thought of was: why copy all the weights at all, when I could just ping-pong between the two buffers? So instead of the for loop I'm doing this after each back-propagation run:
if (self.w_state == 0):
    self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights_new.physical_address
    self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights.physical_address
    self.w_state = 1
else:
    self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights.physical_address
    self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights_new.physical_address
    self.w_state = 0
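For completeness, what I have in mind is that the forward-propagation IP would also have to follow the same swap (otherwise it keeps reading whichever buffer it was pointed at in __init__), so the full replacement for the copy loop would be roughly this untested sketch:

# untested sketch, meant to replace the copy loop at the end of backpropagationG:
# after the kernel finishes, the buffer it just wrote holds the newest weights,
# so both IPs should read from it next time and back-prop should write into the other one
if self.w_state == 0:
    newest, scratch = self.weights_new, self.weights   # kernel just wrote weights_new
    self.w_state = 1
else:
    newest, scratch = self.weights, self.weights_new   # kernel just wrote weights
    self.w_state = 0
self.nn_overlay.back_propagate_L2_N_0.register_map.weight = newest.physical_address
self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = scratch.physical_address
self.nn_overlay.forward_propagate_L2_0.register_map.weight = newest.physical_address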
But changing the physical addresses like this breaks the IP core: after 3-4 iterations every output goes to infinity.

Does rapidly swapping the physical_address of the buffers somehow break the accelerator? (Writing the same physical_address many times does not seem to break anything.)
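One way I can think of to narrow it down is to read the address registers back right after the swap (and before the next AP_START), to check that the IP actually latched the addresses I think it did. A rough, untested snippet:

# hypothetical debugging snippet: dump what the host thinks vs. what the IP registers hold
bp_regs = self.nn_overlay.back_propagate_L2_N_0.register_map
print("host buffers:", hex(self.weights.physical_address), hex(self.weights_new.physical_address))
print("IP registers:", bp_regs.weight, bp_regs.new_weight)  # reading a field shows its current value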

For reference, this is the HLS code for the forward-propagation IP:

void forward_propagate_L2_N(int m_lay, int m_neu, float leak, float weight[MAX_LAYERS][MAX_NEURONS][MAX_NEURONS+1], float output[MAX_LAYERS+1][MAX_NEURONS]){
#pragma HLS INTERFACE s_axilite port=return bundle=ctrl
#pragma HLS INTERFACE ap_ctrl_hs port=return bundle=ctrl
#pragma HLS INTERFACE m_axi port=weight offset=slave bundle=weights
#pragma HLS INTERFACE m_axi port=output offset=slave bundle=outputs
#pragma HLS INTERFACE s_axilite port=m_lay bundle=ctrl
#pragma HLS INTERFACE s_axilite port=m_neu bundle=ctrl
#pragma HLS INTERFACE s_axilite port=leak bundle=ctrl
#pragma HLS INTERFACE s_axilite port=weight bundle=ctrl
#pragma HLS INTERFACE s_axilite port=output bundle=ctrl
    float leak_loc = leak;
    float activation_value;
    for (unsigned int l = 0; l < m_lay; l++){
        for (unsigned int n = 0; n < m_neu; n++){
            activation_value = 0;
            for (int w = 0; w < m_neu; w++){
#pragma HLS PIPELINE
                //std::cout << weight[l][n][w] << "*" << output[l][w] << std::endl;
                activation_value = activation_value + weight[l][n][w] * output[l][w];
            }
            activation_value = activation_value + weight[l][n][m_neu];  // bias weight
            output[l+1][n] = activation_function(activation_value, leak_loc);
        }
    }
}

And this is the back-propagation IP:

void back_propagate_L2_N(int m_lay, int m_neu, float learning_rate, float weight[MAX_LAYERS][MAX_NEURONS][MAX_NEURONS+1],
                         float new_weight[MAX_LAYERS][MAX_NEURONS][MAX_NEURONS+1], float output[MAX_LAYERS][MAX_NEURONS], float en[MAX_NEURONS]){
#pragma HLS INTERFACE s_axilite port=return bundle=ctrl
#pragma HLS INTERFACE m_axi port=weight offset=slave bundle=weights
#pragma HLS INTERFACE m_axi port=output offset=slave bundle=output
#pragma HLS INTERFACE s_axilite port=weight bundle=ctrl
#pragma HLS INTERFACE s_axilite port=output bundle=ctrl
#pragma HLS INTERFACE m_axi port=new_weight offset=slave bundle=new_weights
#pragma HLS INTERFACE s_axilite port=new_weight bundle=ctrl
#pragma HLS INTERFACE m_axi port=en offset=slave bundle=en
#pragma HLS INTERFACE s_axilite port=en bundle=ctrl
#pragma HLS INTERFACE s_axilite port=m_lay bundle=ctrl
#pragma HLS INTERFACE s_axilite port=m_neu bundle=ctrl
#pragma HLS INTERFACE s_axilite port=learning_rate bundle=ctrl
    // output-layer weight update
    for (unsigned int n = 0; n < m_neu; n++){
        for (unsigned int w = 0; w < m_neu; w++){
#pragma HLS PIPELINE
            //std::cout << weight[m_lay-1][n][w] << "-" << learning_rate << "*" << en[n] << "*" << transfer_derivative(output[m_lay-1][n]) << "*" << output[m_lay-1][w] << std::endl;
            new_weight[m_lay-1][n][w] = weight[m_lay-1][n][w] - learning_rate * en[n] * transfer_derivative(output[m_lay-1][n]) * output[m_lay-1][w];
        }
    }
    // hidden-layer weight update
    for (unsigned int n = 0; n < m_neu; n++){
        float wyn = 0;
        for (unsigned int w = 0; w < m_neu; w++){
#pragma HLS PIPELINE
            wyn = wyn + en[w]*transfer_derivative(output[m_lay-1][w])*weight[m_lay-1][w][n];
        }
        //std::cout << "Wyn: " << wyn << std::endl;
        for (unsigned int w = 0; w < m_neu; w++){
#pragma HLS PIPELINE
            //std::cout << weight[m_lay-2][n][w] << "-" << learning_rate << "*" << wyn << "*" << transfer_derivative(output[m_lay-2][n]) << "*" << output[m_lay-2][w] << std::endl;
            new_weight[m_lay-2][n][w] = weight[m_lay-2][n][w] - learning_rate * wyn * transfer_derivative(output[m_lay-2][n]) * output[m_lay-2][w];
        }
    }
}