#include #define BATCH_SIZE 32 #define M_outputs 2 #define N_inputs 7 #define RET_NUM M_outputs * N_inputs + M_outputs #define LIM1 BATCH_SIZE * M_outputs #define LIM2 BATCH_SIZE * (2 * M_outputs) #define LIM3 BATCH_SIZE * (2 * M_outputs + N_inputs) struct datatype { float data; bool last; }; typedef ap_fixed<32,16> stream_fixed_type; void backward_lite(datatype in_stream[(2*BATCH_SIZE)+(BATCH_SIZE*DEGREE)], float& bias1, float& w1_1, float& w1_2, float& w1_3, float& w1_4, float& w1_5, float& w1_6, float& w1_7, float& bias2, float& w2_1, float& w2_2, float& w2_3, float& w2_4, float& w2_5, float& w2_6, float& w2_7) { #pragma HLS INTERFACE ap_ctrl_none port=return #pragma HLS INTERFACE axis port=in_stream #pragma HLS INTERFACE axis port=out_stream #pragma HLS INTERFACE s_axilite port=bias1 #pragma HLS INTERFACE s_axilite port=bias2 #pragma HLS INTERFACE s_axilite port=w1_1 #pragma HLS INTERFACE s_axilite port=w1_2 #pragma HLS INTERFACE s_axilite port=w1_3 #pragma HLS INTERFACE s_axilite port=w1_4 #pragma HLS INTERFACE s_axilite port=w1_5 #pragma HLS INTERFACE s_axilite port=w1_6 #pragma HLS INTERFACE s_axilite port=w1_7 #pragma HLS INTERFACE s_axilite port=w2_1 #pragma HLS INTERFACE s_axilite port=w2_2 #pragma HLS INTERFACE s_axilite port=w2_3 #pragma HLS INTERFACE s_axilite port=w2_4 #pragma HLS INTERFACE s_axilite port=w2_5 #pragma HLS INTERFACE s_axilite port=w2_6 #pragma HLS INTERFACE s_axilite port=w2_7 stream_fixed_type dif, abs_dif, smooth_grad[M_outputs * BATCH_SIZE], batch_x_mat[BATCH_SIZE * N_inputs], nn_out_mat[BATCH_SIZE * M_outputs], batch_y_mat[BATCH_SIZE]; stream_fixed_type bias1_temp = 0, w1_1_temp = 0, w1_2_temp = 0, w1_3_temp = 0, w1_4_temp = 0, w1_5_temp = 0, w1_6_temp = 0, w1_7_temp = 0; stream_fixed_type bias2_temp = 0, w2_1_temp = 0, w2_2_temp = 0, w2_3_temp = 0, w2_4_temp = 0, w2_5_temp = 0, w2_6_temp = 0, w2_7_temp = 0; int i; int start_iter; int stop_iter; for (i = 0; i < LIM1; i++) { #pragma HLS PIPELINE nn_out_mat[i] = in_stream[i].data; // NN output[M_outputs * BATCH_SIZE] } for (i = LIM1; i < LIM2; i++) { #pragma HLS PIPELINE batch_y_mat[i-LIM1] = in_stream[i].data; // Y output data[M_outputs * BATCH_SIZE] } for (i = LIM2; i < LIM3; i++) { #pragma HLS PIPELINE batch_x_mat[i-LIM2] = in_stream[i].data; // X Input data[N_inputs * BATCH_SIZE] } for (i = 0; i < M_outputs * BATCH_SIZE; i++) { // (a1 - y1), (a2 - y2) #pragma HLS PIPELINE dif = nn_out_mat[i] - batch_y_mat[i]; if (dif < 0) { abs_dif = -dif; } else { abs_dif = dif; } if (abs_dif < 1) { smooth_grad[i] = (stream_fixed_type)(1.0 / BATCH_SIZE) * dif; } else { // To prevent vanishing gradient? smooth_grad[i] = (stream_fixed_type)(1.0 / BATCH_SIZE) * ( (dif) / (abs_dif) ); } } for (i = 0; i < BATCH_SIZE; i++) { bias1_temp = bias1_temp + (stream_fixed_type)smooth_grad[i]; bias2_temp = bias2_temp + (stream_fixed_type)smooth_grad[i + BATCH_SIZE]; } bias1 = bias1_temp; bias2 = bias2_temp; for (i = 0; i < BATCH_SIZE; i++) { // smooth_grad[0,31] = (a1 - y1), update w1_n w1_1_temp = w1_1_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i]); // 0-31 w1_2_temp = w1_2_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i+BATCH_SIZE]); // 32-63 w1_3_temp = w1_3_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i+2*BATCH_SIZE]); // 64-95 w1_4_temp = w1_4_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i+3*BATCH_SIZE]); // 96-127 w1_5_temp = w1_5_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i+4*BATCH_SIZE]); // 128-159 w1_6_temp = w1_6_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i+5*BATCH_SIZE]); // 96-127 w1_7_temp = w1_7_temp + (stream_fixed_type)(smooth_grad[i] * batch_x_mat[i+6*BATCH_SIZE]); // 128-159 // smooth_grad[32,63] = (a2 - y2), update w2_n w2_1_temp = w2_1_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i]); // 0-31 w2_2_temp = w2_2_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i+BATCH_SIZE]); // 32-63 w2_3_temp = w2_3_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i+2*BATCH_SIZE]); // 64-95 w2_4_temp = w2_4_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i+3*BATCH_SIZE]); // 96-127 w2_5_temp = w2_5_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i+4*BATCH_SIZE]); // 128-159 w2_6_temp = w2_6_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i+5*BATCH_SIZE]); // 96-127 w2_7_temp = w2_7_temp + (stream_fixed_type)(smooth_grad[BATCH_SIZE + i] * batch_x_mat[i+6*BATCH_SIZE]); // 128-159 } w1_1 = w1_1_temp; w1_2 = w1_2_temp; w1_3 = w1_3_temp; w1_4 = w1_4_temp; w1_5 = w1_5_temp; w1_6 = w1_6_temp; w1_7 = w1_7_temp; w2_1 = w2_1_temp; w2_2 = w2_2_temp; w2_3 = w2_3_temp; w2_4 = w2_4_temp; w2_5 = w2_5_temp; w2_6 = w2_6_temp; w2_7 = w2_7_temp; }