Hello, I'm trying to use DMA with Vitis HLS and the PYNQ OS to transfer data and perform a simple addition.
My board is a ZCU104.
When I allocate the input buffer in PYNQ (Jupyter notebook) with allocate() using a size smaller than 256, there is no problem.
However, when I try a larger buffer, the notebook gets stuck at the dma.sendchannel.wait() line.
I allocate the buffer like this:
"allocate(shape=256, dtype=np.float32)"
Then, when I move on to:
print(cc.sendchannel.idle)
cc.sendchannel.transfer(c)
print(cc.sendchannel.idle)
cc.sendchannel.wait()
print(cc.sendchannel.idle)
the print statements always show False, and the notebook just hangs at wait().
I don't understand why it hangs when the allocated shape increases.
I use two DMAs with the same settings in Vivado.
This is my header file in HLS.
/**
 * poly.h - declarations shared by the `poly` HLS kernel and its testbench.
 *
 * The include guard wraps the whole header, and every preprocessor
 * directive sits on its own line (a directive must be the only thing on
 * its line to be valid).
 */
#ifndef POLY_H
#define POLY_H

#include <stdint.h>  // uint32_t, used by fp_data below
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <ap_fixed.h>
#include "hls_stream.h"
#include "ap_axi_sdata.h"

// Number of 32-bit words the kernel reads per stream pass.
#define VECTOR_LEN 30000
// Number of words per pass generated by the testbench.
#define VECTOR_LEN_tb 30000

// Scalar type used for all data handled by the kernel.
typedef float f_data_type;

// One AXI4-Stream beat with a 32-bit data field and zero-width
// user/id/dest side channels.
typedef ap_axis<32, 0, 0, 0> stream_val;
typedef hls::stream<stream_val> stream_val_data;

// Reinterprets the same 32 bits either as an integer (the form carried in
// the stream's data field) or as a float.
typedef union {
    uint32_t iint;
    float ffloat;
} fp_data;

/**
 * Kernel entry point.
 *
 * @param co     coefficient input stream
 * @param i      data input stream
 * @param ii_t   memory buffer receiving a copy of the words read from `i`
 * @param cc1_t  memory buffer receiving the first pass read from `co`
 * @param cc2_t  memory buffer receiving the second pass read from `co`
 * @param oo1_t  receives the intermediate sum in element 0
 * @param oo2_t  receives the final sum in element 0
 */
void poly(
    stream_val_data &co,
    stream_val_data &i,
    volatile float *ii_t,
    volatile float *cc1_t,
    volatile float *cc2_t,
    volatile float *oo1_t,
    volatile float *oo2_t);

#endif  // POLY_H
This is the testbench in HLS.
#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include
#include <math.h>
#include “poly.h”unsigned globalSeed;
int main(void){
bool correct = true;f_data_type *ttco = (f_data_type *) malloc(sizeof(f_data_type) * 2*VECTOR_LEN_tb); f_data_type *tti = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *tto = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *o_t = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *i_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *c1_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *c2_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *o1_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); f_data_type *o2_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb); memset(ttco, 0, sizeof(f_data_type) * 2*VECTOR_LEN_tb); memset(tti, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(tto, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(o_t, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(i_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(c1_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(c2_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(o1_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb); memset(o2_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb); for (int cc=0; cc<2*VECTOR_LEN_tb;cc++){ ttco[cc] = (float) (rand() % 1024 - 512) / 512; } for (int cc=0; cc<VECTOR_LEN_tb;cc++){ tti[cc] = (float) (rand() % 1024 - 512) / 512; } tto[0] = 0; o_t[0] = 0; printf("============TESTBENCH============\n"); printf("a: %f, ", ttco[0]); printf("b: %f, ", ttco[1]); printf("c: %f, ", ttco[VECTOR_LEN_tb]); printf("d: %f, ", ttco[VECTOR_LEN_tb+1]); printf("i: %f, ", tti[0]); printf("o: %f\n", tto[0]); stream_val_data a_stream("coeff input"); stream_val_data i_stream("input"); stream_val_data c_stream("output"); stream_val tmp_input; fp_data tmp_input_f; for (int i = 0; i < 2*VECTOR_LEN_tb; i++){ tmp_input_f.ffloat = f_data_type(ttco[i]); tmp_input.data = tmp_input_f.iint; tmp_input.keep = 1; 
if ((i + 1) % VECTOR_LEN_tb == 0) { tmp_input.last = 1; } else { tmp_input.last = 0; } a_stream.write(tmp_input); } stream_val tmp_input2; fp_data tmp_input2_f; for (int j = 0; j < VECTOR_LEN_tb; j++){ tmp_input2_f.ffloat = f_data_type (tti[j]); tmp_input2.data = tmp_input2_f.iint; tmp_input2.keep = 1; if(j==VECTOR_LEN_tb-1){ tmp_input2.last = 1; } else{ tmp_input2.last = 0; } i_stream.write(tmp_input2); } poly(a_stream, i_stream, i_test, c1_test, c2_test, o1_test, o2_test); o_t[0] = tti[0] + ttco[0] + ttco[1] + ttco[VECTOR_LEN_tb] + ttco[VECTOR_LEN_tb+1]; printf("o_t: %f\n", o_t[0]); printf("o: %f\n", tto[0]); printf("i_test %f\n", i_test[0]); printf("c1_test %f %f \n", c1_test[0], c1_test[1]); printf("c2_test %f %f \n", c2_test[0], c2_test[1]); printf("o1_test %f \n", o1_test[0]); printf("o2_test %f \n", o2_test[0]); if (o2_test[0] != o_t[0]){ correct = false; } free(ttco); free(tti); free(tto); free(o_t); free(i_test); free(c1_test); free(c2_test); free(o1_test); free(o2_test); if (correct){ printf("Test successful\n"); return 0; } else{ printf("Test unsuccessful\n"); return 0; }
}
This is my kernel code in HLS.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include “./poly.h”void poly(
stream_val_data &co,
stream_val_data &i,
volatile float *ii_t,
volatile float *cc1_t,
volatile float *cc2_t,
volatile float *oo1_t,
volatile float *oo2_t
){
#pragma HLS INTERFACE axis register both port=co
#pragma HLS INTERFACE axis register both port=i
#pragma HLS INTERFACE m_axi port = ii_t depth = 1024 offset = slave bundle = ii_t_port
#pragma HLS INTERFACE m_axi port = cc1_t depth = 1024 offset = slave bundle = cc1_t_port
#pragma HLS INTERFACE m_axi port = cc2_t depth = 1024 offset = slave bundle = cc2_t_port
#pragma HLS INTERFACE m_axi port = oo1_t depth = 1024 offset = slave bundle = oo1_t_port
#pragma HLS INTERFACE m_axi port = oo2_t depth = 1024 offset = slave bundle = oo2_t_port#pragma HLS INTERFACE s_axilite port = ii_t bundle = CONTROL_BUS #pragma HLS INTERFACE s_axilite port = cc1_t bundle = CONTROL_BUS #pragma HLS INTERFACE s_axilite port = cc2_t bundle = CONTROL_BUS #pragma HLS INTERFACE s_axilite port = oo1_t bundle = CONTROL_BUS #pragma HLS INTERFACE s_axilite port = oo2_t bundle = CONTROL_BUS #pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS f_data_type cco[30000]; f_data_type ii[30000]; f_data_type oo[1]; f_data_type oo2[1]; stream_val tmp_co; stream_val tmp_co2; stream_val tmp_in; stream_val tmp_out; fp_data tmp_coo; fp_data tmp_co2o; fp_data tmp_ino; fp_data tmp_outo; for(int j = 0; j < VECTOR_LEN; j++){ co.read(tmp_co); tmp_coo.iint = tmp_co.data; cco[j] = tmp_coo.ffloat; cc1_t[j] = cco[j]; } for(int j = 0; j < VECTOR_LEN; j++){ i.read(tmp_in); tmp_ino.iint = tmp_in.data; ii[j] = tmp_ino.ffloat; ii_t[j] = ii[j]; // printf("%f\n", cco[j]); // printf("a: %s\n", cco[j].to_string(10).c_str()); // tmp_in.last = (j == VECTOR_LEN-1) ? 1 : 0; } oo[0] = ii[0] + cco[0] + cco[1]; oo1_t[0] = oo[0]; for(int j = 0; j < VECTOR_LEN; j++){ co.read(tmp_co); tmp_coo.iint = tmp_co.data; cco[j] = tmp_coo.ffloat; cc2_t[j] = cco[j]; } oo2[0] = oo[0] + cco[0] + cco[1]; oo2_t[0] = oo2[0];
}
I want to be able to allocate buffers with a shape larger than 30000.