PYNQ: PYTHON PRODUCTIVITY

M_axi changing buffer physical address

Hello, I’m using the PYNQ 2.5.1 image and I have created a 2-layer fully connected NN.

class nn_hw_accel:

def __init__(self, weights):
    """Load the overlay, allocate the shared buffers and program both IPs.

    Args:
        weights: Nested sequence (or array) convertible to a 3-D float32
            array of shape (layers, neurons, weights_per_neuron). It is
            copied into the top-left corner of two (2, 768, 769) buffers,
            so it must not exceed that shape.

    Raises:
        ValueError: If ``weights`` is not 3-D, or is larger than the
            (2, 768, 769) weight buffers.
    """
    self.nn_overlay = Overlay("/home/xilinx/jupyter_notebooks/design_nn_full.bit")

    print("Translating weight list to numpy array...")
    self.w_state = 0
    w = np.array(weights, dtype=np.float32)
    layers_count, neurons_count, weights_count = w.shape
    self.layers_count = layers_count
    self.neurons_count = neurons_count
    self.weights_count = weights_count

    print("Allocating dma...")
    self.target = allocate(shape=(768), dtype=np.float32)
    self.calc_error = allocate(shape=(768), dtype=np.float32)

    print("Allocating weights...")
    # Buffer shapes are fixed; presumably they mirror the MAX_LAYERS /
    # MAX_NEURONS array bounds of the HLS kernels -- TODO confirm.
    self.weights_new = allocate((2, 768, 769), dtype=np.float32)
    self.weights = allocate((2, 768, 769), dtype=np.float32)
    # One vectorised copy per buffer instead of a Python-level loop over
    # every element. Both buffers start out with the same weights.
    self.weights[:layers_count, :neurons_count, :weights_count] = w
    self.weights_new[:layers_count, :neurons_count, :weights_count] = w

    print("Allocating neural network outputs...")
    self.outputs = allocate(shape=(3, 768), dtype=np.float32)
    print("Setting physical adresses for IP's")

    print("Forward propagation IP")
    fwd_regs = self.nn_overlay.forward_propagate_L2_0.register_map
    # NOTE(review): the integer 1 is written as-is, whereas learning_rate
    # in backpropagationG is bit-cast with float_to_uint first. If the
    # kernel's `leak` port is a float, this is not 1.0 -- confirm intent.
    fwd_regs.leak = 1
    fwd_regs.weight = self.weights.physical_address
    fwd_regs.output_r = self.outputs.physical_address
    fwd_regs.m_lay = layers_count
    fwd_regs.m_neu = neurons_count

    print("Back propagation IP")
    bp_regs = self.nn_overlay.back_propagate_L2_N_0.register_map
    bp_regs.m_lay = layers_count
    bp_regs.m_neu = neurons_count
    # The back-propagation IP reads `weight` and writes `new_weight`.
    bp_regs.new_weight = self.weights_new.physical_address
    bp_regs.weight = self.weights.physical_address
    bp_regs.output_offset = self.outputs.physical_address
    bp_regs.en_offset = self.calc_error.physical_address
    print("Finished")
    
def calculate_error(self, target):
    """Return half the sum of squared differences between row 2 of
    ``self.outputs`` and ``target``.

    ``target`` is converted to a float32 array; only its first
    ``target.shape[0]`` entries of the output row are compared.
    """
    expected = np.array(target, dtype=np.float32)
    # sum() starts from 0 and adds the terms left to right, one at a time.
    return sum(
        ((self.outputs[2][idx] - expected[idx]) ** 2) / 2
        for idx in range(expected.shape[0])
    )

def output_error(self):
    """Run the three DMA transfers around the error stage and print the buffers.

    Sends ``self.target`` and row 2 of ``self.outputs`` through their send
    channels, receives into ``self.calc_error``, then blocks until all three
    channels report completion. Returns nothing.
    """
    overlay = self.nn_overlay

    # Start every transfer before waiting on any of them.
    overlay.axi_dma_target.sendchannel.transfer(self.target)
    overlay.axi_dma_nn_out.sendchannel.transfer(self.outputs[2])
    overlay.axi_dma_out_r.recvchannel.transfer(self.calc_error)

    pending = (
        overlay.axi_dma_nn_out.sendchannel,
        overlay.axi_dma_target.sendchannel,
        overlay.axi_dma_out_r.recvchannel,
    )
    for channel in pending:
        channel.wait()

    print(f"Self.target {self.target}")
    print(f"self.outputs[2] {self.outputs[2]}")
    print(f"self.calc_error {self.calc_error}")
    # Original author's note: "free running" (meaning not evident from this code).

def float_to_uint(self, f):
    """Return the little-endian IEEE-754 single-precision bit pattern of ``f`` as an int."""
    packed = struct.pack('<f', f)
    (bits,) = struct.unpack('<I', packed)
    return int(bits)

def uint_to_float(self, f):
    """Reinterpret the unsigned 32-bit integer ``f`` as a little-endian single-precision float."""
    packed = struct.pack('<I', f)
    (value,) = struct.unpack('<f', packed)
    return float(value)

def backpropagationG(self, targ, lr):
    """Run one back-propagation step on the accelerator.

    Copies ``targ`` into the target DMA buffer, runs ``output_error()``,
    starts the back-propagation IP with the given learning rate, polls
    until it is idle, and finally copies the freshly written weights from
    ``self.weights_new`` back into ``self.weights``.

    Args:
        targ: 1-D array of target values (must expose ``.shape``); at most
            as long as ``self.target``.
        lr: Learning rate as a Python float. It is bit-cast to a 32-bit
            unsigned integer before being written to the register.
    """
    # Stage the targets with a single slice assignment rather than an
    # element-by-element Python loop.
    target_len = targ.shape[0]
    self.target[:target_len] = targ

    # DMA stage.
    self.output_error()

    # Kernel stage: program the learning rate and start the IP.
    bp_regs = self.nn_overlay.back_propagate_L2_N_0.register_map
    bp_regs.learning_rate = self.float_to_uint(lr)
    bp_regs.CTRL.AP_START = 1

    # Poll until the IP is idle and AP_START reads back as 0.
    while True:
        done_idle = bp_regs.CTRL.AP_IDLE
        done_start = bp_regs.CTRL.AP_START
        if done_idle == 1 and done_start == 0:
            break
        print("Waiting...")

    # NOTE(review): a disabled alternative used to live here. It toggled
    # self.w_state and swapped the `weight` / `new_weight` addresses of the
    # back-propagation IP between self.weights and self.weights_new instead
    # of copying. It only reprogrammed back_propagate_L2_N_0; in the code
    # visible here, forward_propagate_L2_0.weight is written once in
    # __init__ -- check whether it would need swapping too.

    # Copy the updated weights back. The last column of each row is left
    # out, exactly as the original loop did (range(weights_count - 1)).
    layers = self.layers_count
    neurons = self.neurons_count
    # max() keeps a zero weights_count from turning into the slice ":-1".
    columns = max(self.weights_count - 1, 0)
    self.weights[:layers, :neurons, :columns] = \
        self.weights_new[:layers, :neurons, :columns]
    
def forward_propagate(self, inputs):
    """Load ``inputs`` into row 0 of ``self.outputs`` and start the forward IP.

    Args:
        inputs: 1-D sequence of numbers, at most as long as one row of
            ``self.outputs``. It is converted to float32.

    The method only sets AP_START; it does not wait for the IP to finish.
    """
    inputs = np.array(inputs, dtype=np.float32)
    print(inputs.shape)
    # Single slice assignment instead of a per-element Python loop; entries
    # of row 0 beyond len(inputs) keep their previous contents.
    self.outputs[0][:inputs.shape[0]] = inputs

    # Kernel start.
    self.nn_overlay.forward_propagate_L2_0.register_map.CTRL.AP_START = 1

It works fine, but it is slow because of this loop:

# Element-by-element copy of self.weights_new into self.weights.
# The last column (index weights_count - 1) of every row is not copied.
for x in range(self.layers_count):
        for y in range(self.neurons_count):
            for z in range(self.weights_count-1):
                self.weights[x][y][z]=self.weights_new[x][y][z]

The next thing I thought of was: why should I copy all the weights if I could just use the second buffer for the new weights? Instead of the for loop I’m using this:

# Ping-pong between the two weight buffers: on every call, swap which buffer
# the back-propagation IP reads (`weight`) and which it writes (`new_weight`).
# self.w_state records which arrangement to program next.
# Only back_propagate_L2_N_0 registers are written in this snippet.
if (self.w_state == 0):
    self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights_new.physical_address
    self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights.physical_address
    self.w_state = 1
else:
    self.nn_overlay.back_propagate_L2_N_0.register_map.new_weight = self.weights.physical_address
    self.nn_overlay.back_propagate_L2_N_0.register_map.weight = self.weights_new.physical_address
    self.w_state = 0

But changing the physical address breaks the IP core, and after 3-4 iterations every output goes to infinity.
Is rapidly changing the buffers' physical_address somehow breaking the accelerator? (Writing the same physical_address many times does not seem to break anything.)

// Forward pass over m_lay layers of m_neu neurons each.
//
// For every layer l and neuron n, output[l+1][n] is the activation of
//   sum_{w < m_neu} weight[l][n][w] * output[l][w]  +  weight[l][n][m_neu]
// i.e. column m_neu of each weight row is added as a bias term.
// output[0] must hold the network inputs before the kernel is started.
//
// m_lay, m_neu and leak are AXI-Lite registers; weight and output are AXI
// masters whose base addresses are set through AXI-Lite (offset=slave).
void forward_propagate_L2_N(int m_lay, int m_neu, float leak, float weight[MAX_LAYERS][MAX_NEURONS][MAX_NEURONS+1], float output[MAX_LAYERS+1][MAX_NEURONS]){
#pragma HLS INTERFACE s_axilite port=return bundle=ctrl
#pragma HLS INTERFACE ap_ctrl_hs port=return bundle=ctrl

#pragma HLS INTERFACE m_axi port=weight offset=slave bundle=weights
#pragma HLS INTERFACE m_axi port=output offset=slave bundle=outputs
#pragma HLS INTERFACE s_axilite port=m_lay bundle=ctrl
#pragma HLS INTERFACE s_axilite port=m_neu bundle=ctrl
#pragma HLS INTERFACE s_axilite port=leak bundle=ctrl
// The port name must match the argument name `weight`; the original pragma
// named a non-existent port `weights`.
#pragma HLS INTERFACE s_axilite port=weight bundle=ctrl
#pragma HLS INTERFACE s_axilite port=output bundle=ctrl


	float leak_loc = leak;
	float activation_value;

	for (unsigned int l = 0; l < m_lay; l++){
		for (unsigned int n = 0; n < m_neu; n++){
			activation_value = 0;
			// Weighted sum of the previous layer's outputs.
			for (int w = 0; w < m_neu; w++){
				#pragma HLS PIPELINE
				activation_value = activation_value + weight[l][n][w] * output[l][w];
			}
			// Bias term stored after the m_neu input weights.
			activation_value = activation_value + weight[l][n][m_neu];
			output[l+1][n] = activation_function(activation_value, leak_loc);
		}
	}
}

// Back-propagation weight update for the last two layers of the network.
//
// Reads the current weights from `weight` and writes the updated ones to
// `new_weight`; `weight` itself is never written. `en` holds one error value
// per neuron and `output` holds the per-layer values indexed as
// output[layer][neuron].
//
// Only layers m_lay-1 and m_lay-2 are updated, so m_lay must be at least 2.
// Only columns 0..m_neu-1 of each row are written; column m_neu of
// new_weight is left untouched by this kernel.
//
// NOTE(review): the derivative for layer m_lay-1 is taken at
// output[m_lay-1][n] -- confirm this is the intended row of `output`.
void back_propagate_L2_N(int m_lay, int m_neu, float learning_rate, float weight[MAX_LAYERS][MAX_NEURONS][MAX_NEURONS+1],
float new_weight[MAX_LAYERS][MAX_NEURONS][MAX_NEURONS+1], float output[MAX_LAYERS][MAX_NEURONS], float en[MAX_NEURONS]){
#pragma HLS INTERFACE s_axilite port=return bundle=ctrl

#pragma HLS INTERFACE m_axi port=weight offset=slave bundle=weights
#pragma HLS INTERFACE m_axi port=output offset=slave bundle=output
// The port name must match the argument name `weight`; the original pragma
// named a non-existent port `weights`.
#pragma HLS INTERFACE s_axilite port=weight bundle=ctrl
#pragma HLS INTERFACE s_axilite port=output bundle=ctrl

#pragma HLS INTERFACE m_axi port=new_weight offset=slave bundle=new_weights
#pragma HLS INTERFACE s_axilite port=new_weight bundle=ctrl

#pragma HLS INTERFACE m_axi port=en offset=slave bundle=en
#pragma HLS INTERFACE s_axilite port=en bundle=ctrl

#pragma HLS INTERFACE s_axilite port=m_lay bundle=ctrl
#pragma HLS INTERFACE s_axilite port=m_neu bundle=ctrl
#pragma HLS INTERFACE s_axilite port=learning_rate bundle=ctrl

	// Layer m_lay-1: step each weight using the error en[n] directly.
	for (unsigned int n = 0; n < m_neu; n++){
		for (unsigned int w = 0; w < m_neu; w++){
			#pragma HLS PIPELINE
			new_weight[m_lay-1][n][w] = weight[m_lay-1][n][w] - learning_rate * en[n] * transfer_derivative(output[m_lay-1][n]) * output[m_lay-1][w];
		}
	}

	// Layer m_lay-2: first accumulate the error propagated back to neuron n
	// through the (old) layer m_lay-1 weights, then step the weights.
	for (unsigned int n = 0; n < m_neu; n++){
		float wyn = 0;
		for (unsigned int w = 0; w < m_neu; w++){
			#pragma HLS PIPELINE
			wyn = wyn + en[w]*transfer_derivative(output[m_lay-1][w])*weight[m_lay-1][w][n];
		}
		for (unsigned int w = 0; w < m_neu; w++){
			#pragma HLS PIPELINE
			new_weight[m_lay-2][n][w] = weight[m_lay-2][n][w] - learning_rate * wyn * transfer_derivative(output[m_lay-2][n])* output[m_lay-2][w];
		}
	}

}

design_nn_full_hwh.log (840.5 KB)

So the problem was worked around by updating the weights at the IP-core level, though I’m still curious why the method of changing the physical address doesn’t work :slight_smile: