#include <ap_axi_sdata.h>

// Problem dimensions: samples per batch, number of network outputs, and
// number of network inputs.
#define BATCH_SIZE 32
#define M_outputs 2
#define N_inputs 7

// Derived sizes. Each expansion is fully parenthesized so the macro keeps its
// value when used inside a larger expression (e.g. `2 * RET_NUM`, `x / LIM1`).

// M_outputs * N_inputs weights plus M_outputs biases.
#define RET_NUM (M_outputs * N_inputs + M_outputs)
// End offset (exclusive) of the network-output section of the input stream.
#define LIM1 (BATCH_SIZE * M_outputs)
// End offset (exclusive) of the target section of the input stream.
#define LIM2 (BATCH_SIZE * (2 * M_outputs))
// End offset (exclusive) of the input-feature section, i.e. the total number
// of stream elements consumed per call.
#define LIM3 (BATCH_SIZE * (2 * M_outputs + N_inputs))

// One element of the input stream consumed by backward_lite.
struct datatype {
  // Sample value; backward_lite converts it to stream_fixed_type when reading.
  float data;
  // Not read by backward_lite. Presumably the AXI-Stream end-of-packet (TLAST)
  // marker -- TODO confirm against the producer of the stream.
  bool last;
};

// Fixed-point type used for all internal arithmetic: 32 bits in total, 16 of
// which are integer bits (sign included), leaving 16 fractional bits.
typedef ap_fixed<32,16> stream_fixed_type;

/*
 * backward_lite - computes bias and weight gradients for one batch.
 *
 * Input stream layout (LIM3 elements in total, only `.data` is read):
 *   [0,    LIM1)  network outputs, output 1 for all samples, then output 2
 *   [LIM1, LIM2)  targets, same ordering as the network outputs
 *   [LIM2, LIM3)  inputs, feature-major: feature k of sample i is stored at
 *                 offset k * BATCH_SIZE + i
 *
 * For every output/sample pair the error (output - target) is turned into a
 * clipped gradient: error / BATCH_SIZE while |error| < 1, otherwise
 * +/- 1 / BATCH_SIZE (this looks like the derivative of a smooth-L1 / Huber
 * loss -- TODO confirm with the training code).
 *
 * Outputs (all overwritten on every call):
 *   bias1, bias2   sum over the batch of the clipped gradient of output 1 / 2
 *   w1_1 .. w1_7   sum over the batch of gradient(output 1) * input feature k
 *   w2_1 .. w2_7   sum over the batch of gradient(output 2) * input feature k
 *
 * All arithmetic is done in stream_fixed_type; products are truncated back to
 * that type before being accumulated.
 */
void backward_lite(datatype in_stream[LIM3], float& bias1, float& w1_1, float& w1_2, float& w1_3, float& w1_4, float& w1_5, float& w1_6, float& w1_7, float& bias2, float& w2_1, float& w2_2, float& w2_3, float& w2_4, float& w2_5, float& w2_6, float& w2_7) {

	#pragma HLS INTERFACE ap_ctrl_none port=return
	#pragma HLS INTERFACE axis port=in_stream

	#pragma HLS INTERFACE s_axilite  port=bias1
	#pragma HLS INTERFACE s_axilite  port=bias2
	#pragma HLS INTERFACE s_axilite  port=w1_1
	#pragma HLS INTERFACE s_axilite  port=w1_2
	#pragma HLS INTERFACE s_axilite  port=w1_3
	#pragma HLS INTERFACE s_axilite  port=w1_4
	#pragma HLS INTERFACE s_axilite  port=w1_5
	#pragma HLS INTERFACE s_axilite  port=w1_6
	#pragma HLS INTERFACE s_axilite  port=w1_7
	#pragma HLS INTERFACE s_axilite  port=w2_1
	#pragma HLS INTERFACE s_axilite  port=w2_2
	#pragma HLS INTERFACE s_axilite  port=w2_3
	#pragma HLS INTERFACE s_axilite  port=w2_4
	#pragma HLS INTERFACE s_axilite  port=w2_5
	#pragma HLS INTERFACE s_axilite  port=w2_6
	#pragma HLS INTERFACE s_axilite  port=w2_7

	// 1 / BATCH_SIZE; exactly representable with 16 fractional bits.
	const stream_fixed_type inv_batch = static_cast<stream_fixed_type>(1.0 / BATCH_SIZE);

	stream_fixed_type dif;
	stream_fixed_type smooth_grad[M_outputs * BATCH_SIZE];
	stream_fixed_type batch_x_mat[BATCH_SIZE * N_inputs];
	stream_fixed_type nn_out_mat[BATCH_SIZE * M_outputs];
	// One target per network output and sample, i.e. the same size as
	// nn_out_mat (LIM2 - LIM1 elements are written below).
	stream_fixed_type batch_y_mat[BATCH_SIZE * M_outputs];

	stream_fixed_type bias1_temp = 0, w1_1_temp = 0, w1_2_temp = 0, w1_3_temp = 0, w1_4_temp = 0, w1_5_temp = 0, w1_6_temp = 0, w1_7_temp = 0;
	stream_fixed_type bias2_temp = 0, w2_1_temp = 0, w2_2_temp = 0, w2_3_temp = 0, w2_4_temp = 0, w2_5_temp = 0, w2_6_temp = 0, w2_7_temp = 0;
	int i;

	// Stream section 1: network outputs [M_outputs * BATCH_SIZE].
	for (i = 0; i < (LIM1); i++) {
		#pragma HLS PIPELINE
		nn_out_mat[i] = in_stream[i].data;
	}

	// Stream section 2: targets [M_outputs * BATCH_SIZE].
	for (i = (LIM1); i < (LIM2); i++) {
		#pragma HLS PIPELINE
		batch_y_mat[i - (LIM1)] = in_stream[i].data;
	}

	// Stream section 3: input features [N_inputs * BATCH_SIZE].
	for (i = (LIM2); i < (LIM3); i++) {
		#pragma HLS PIPELINE
		batch_x_mat[i - (LIM2)] = in_stream[i].data;
	}

	// Clipped error gradient for (a1 - y1) in [0, BATCH_SIZE) and (a2 - y2)
	// in [BATCH_SIZE, 2 * BATCH_SIZE).
	for (i = 0; i < M_outputs * BATCH_SIZE; i++) {
		#pragma HLS PIPELINE

		dif = nn_out_mat[i] - batch_y_mat[i];
		// When |dif| >= 1 the gradient is just the sign of dif scaled by
		// 1 / BATCH_SIZE, so select the sign directly instead of dividing
		// dif by its absolute value (no divider needed, and no overflow when
		// negating the most negative representable value).
		if (dif >= 1) {
			smooth_grad[i] = inv_batch;
		} else if (dif <= -1) {
			smooth_grad[i] = -inv_batch;
		} else {
			smooth_grad[i] = inv_batch * dif;
		}
	}

	// Accumulate bias and weight gradients over the batch.
	for (i = 0; i < BATCH_SIZE; i++) {
		// Gradients of output 1 and output 2 for sample i.
		const stream_fixed_type g1 = smooth_grad[i];
		const stream_fixed_type g2 = smooth_grad[BATCH_SIZE + i];

		// Input features of sample i; each one feeds both outputs, so read
		// it once.
		const stream_fixed_type x1 = batch_x_mat[i];                  // 0-31
		const stream_fixed_type x2 = batch_x_mat[i + BATCH_SIZE];     // 32-63
		const stream_fixed_type x3 = batch_x_mat[i + 2 * BATCH_SIZE]; // 64-95
		const stream_fixed_type x4 = batch_x_mat[i + 3 * BATCH_SIZE]; // 96-127
		const stream_fixed_type x5 = batch_x_mat[i + 4 * BATCH_SIZE]; // 128-159
		const stream_fixed_type x6 = batch_x_mat[i + 5 * BATCH_SIZE]; // 160-191
		const stream_fixed_type x7 = batch_x_mat[i + 6 * BATCH_SIZE]; // 192-223

		bias1_temp += g1;
		bias2_temp += g2;

		// Each product is truncated to stream_fixed_type before it is added.
		w1_1_temp += static_cast<stream_fixed_type>(g1 * x1);
		w1_2_temp += static_cast<stream_fixed_type>(g1 * x2);
		w1_3_temp += static_cast<stream_fixed_type>(g1 * x3);
		w1_4_temp += static_cast<stream_fixed_type>(g1 * x4);
		w1_5_temp += static_cast<stream_fixed_type>(g1 * x5);
		w1_6_temp += static_cast<stream_fixed_type>(g1 * x6);
		w1_7_temp += static_cast<stream_fixed_type>(g1 * x7);

		w2_1_temp += static_cast<stream_fixed_type>(g2 * x1);
		w2_2_temp += static_cast<stream_fixed_type>(g2 * x2);
		w2_3_temp += static_cast<stream_fixed_type>(g2 * x3);
		w2_4_temp += static_cast<stream_fixed_type>(g2 * x4);
		w2_5_temp += static_cast<stream_fixed_type>(g2 * x5);
		w2_6_temp += static_cast<stream_fixed_type>(g2 * x6);
		w2_7_temp += static_cast<stream_fixed_type>(g2 * x7);
	}

	// Publish the results on the AXI-Lite outputs (fixed point -> float).
	bias1 = bias1_temp;
	bias2 = bias2_temp;

	w1_1 = w1_1_temp;
	w1_2 = w1_2_temp;
	w1_3 = w1_3_temp;
	w1_4 = w1_4_temp;
	w1_5 = w1_5_temp;
	w1_6 = w1_6_temp;
	w1_7 = w1_7_temp;

	w2_1 = w2_1_temp;
	w2_2 = w2_2_temp;
	w2_3 = w2_3_temp;
	w2_4 = w2_4_temp;
	w2_5 = w2_5_temp;
	w2_6 = w2_6_temp;
	w2_7 = w2_7_temp;
}