DMA sendchannel.transfer() and wait() stuck with larger buffer

Hello, I’m trying to use DMA with Vitis HLS and PYNQ to transfer data to an IP that performs a simple addition.

My board is ZCU104 board.

When I allocate the input buffer in PYNQ (Jupyter notebook) with allocate() using a size smaller than 256, there is no problem.
However, when I try a larger buffer, the notebook gets stuck at the dma.sendchannel.wait() line.

I allocate the buffer like this:
allocate(shape=256, dtype=np.float32)

And when I move on to

# Print the send channel's idle flag before starting the transfer.
print(cc.sendchannel.idle)
# Start sending buffer `c` through the DMA send channel.
cc.sendchannel.transfer(c)
# Print the idle flag again right after the transfer has been started.
print(cc.sendchannel.idle)
# Wait on the send channel; per the report, this call never returns for larger buffers.
cc.sendchannel.wait()
print(cc.sendchannel.idle)
The print statements always show False, and the notebook gets stuck at wait().

I don’t understand why it gets stuck when the allocated shape increases.

I use 2 dma with same setting in vivado.

This is my header file in hls.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ap_fixed.h>
#include "hls_stream.h"
#include "ap_axi_sdata.h"

// Shared declarations for the poly() HLS kernel and its testbench.
#ifndef POLY_H
#define POLY_H

// Number of 32-bit words the kernel reads from each stream packet; used as the
// loop bound of every read loop in poly().
#define VECTOR_LEN 30000
// Number of words per packet generated by the testbench; also sizes its buffers.
// NOTE(review): presumably must equal VECTOR_LEN, otherwise the kernel's reads
// would not match the data the testbench provides - confirm.
#define VECTOR_LEN_tb 30000

// Floating-point element type used by the kernel and the testbench.
typedef float f_data_type;

// One AXI4-Stream beat with a 32-bit data field (the three remaining template
// arguments are 0), and the stream type carrying such beats.
typedef ap_axis<32, 0, 0, 0> stream_val;
typedef hls::stream<stream_val> stream_val_data;

// Views the same 32 bits either as an unsigned integer (iint) or as a float
// (ffloat). Used to move float values through the integer data field of a
// stream beat without a numeric conversion.
typedef union {
uint32_t iint;
float ffloat;
} fp_data;

// Kernel entry point.
//   co    : input stream; read twice, VECTOR_LEN words each time.
//   i     : input stream; read once, VECTOR_LEN words.
//   ii_t  : receives a copy of the VECTOR_LEN words read from `i`.
//   cc1_t : receives a copy of the first VECTOR_LEN words read from `co`.
//   cc2_t : receives a copy of the second VECTOR_LEN words read from `co`.
//   oo1_t : element 0 receives i[0] + co[0] + co[1] (first `co` packet).
//   oo2_t : element 0 receives oo1_t[0] plus words 0 and 1 of the second `co` packet.
void poly(
stream_val_data &co,
stream_val_data &i,
volatile float *ii_t,
volatile float *cc1_t,
volatile float *cc2_t,
volatile float *oo1_t,
volatile float *oo2_t);

#endif

This is testbench in hls.

#include <stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include
#include <math.h>
#include "poly.h"

unsigned globalSeed;

int main(void){
bool correct = true;

f_data_type *ttco = (f_data_type *) malloc(sizeof(f_data_type) * 2*VECTOR_LEN_tb);
f_data_type *tti = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *tto = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *o_t = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *i_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *c1_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *c2_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *o1_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);
f_data_type *o2_test = (f_data_type *) malloc(sizeof(f_data_type) * VECTOR_LEN_tb);

memset(ttco, 0, sizeof(f_data_type) * 2*VECTOR_LEN_tb);
memset(tti, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(tto, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(o_t, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(i_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(c1_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(c2_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(o1_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb);
memset(o2_test, 0, sizeof(f_data_type) * VECTOR_LEN_tb);

for (int cc=0; cc<2*VECTOR_LEN_tb;cc++){
    ttco[cc] = (float) (rand() % 1024 - 512) / 512;
}
for (int cc=0; cc<VECTOR_LEN_tb;cc++){
    tti[cc] = (float) (rand() % 1024 - 512) / 512;
}
tto[0] = 0;
o_t[0] = 0;

printf("============TESTBENCH============\n");
printf("a: %f, ", ttco[0]);
printf("b: %f, ", ttco[1]);
printf("c: %f, ", ttco[VECTOR_LEN_tb]);
printf("d: %f, ", ttco[VECTOR_LEN_tb+1]);
printf("i: %f, ", tti[0]);
printf("o: %f\n", tto[0]);

stream_val_data a_stream("coeff input");
stream_val_data i_stream("input");
stream_val_data c_stream("output");

stream_val tmp_input;
fp_data tmp_input_f;

for (int i = 0; i < 2*VECTOR_LEN_tb; i++){
    tmp_input_f.ffloat = f_data_type(ttco[i]);
    tmp_input.data = tmp_input_f.iint;
    tmp_input.keep = 1;
    if ((i + 1) % VECTOR_LEN_tb == 0) {
        tmp_input.last = 1;
    } else {
        tmp_input.last = 0;
    }
    a_stream.write(tmp_input);
}

stream_val tmp_input2;
fp_data tmp_input2_f;

for (int j = 0; j < VECTOR_LEN_tb; j++){
    tmp_input2_f.ffloat = f_data_type (tti[j]);
    tmp_input2.data = tmp_input2_f.iint;
    tmp_input2.keep = 1;
    if(j==VECTOR_LEN_tb-1){
        tmp_input2.last = 1;
    }
    else{
        tmp_input2.last = 0;
    }
    i_stream.write(tmp_input2);
}


poly(a_stream, i_stream, i_test, c1_test, c2_test, o1_test, o2_test);

o_t[0] = tti[0] + ttco[0] + ttco[1] + ttco[VECTOR_LEN_tb] + ttco[VECTOR_LEN_tb+1];
printf("o_t: %f\n", o_t[0]);
printf("o: %f\n", tto[0]);

printf("i_test %f\n", i_test[0]);
printf("c1_test %f %f \n", c1_test[0], c1_test[1]);
printf("c2_test %f %f \n", c2_test[0], c2_test[1]);
printf("o1_test %f \n", o1_test[0]);
printf("o2_test %f \n", o2_test[0]);

if (o2_test[0] != o_t[0]){
    correct = false;
}

free(ttco);
free(tti);
free(tto);
free(o_t);
free(i_test);
free(c1_test);
free(c2_test);
free(o1_test);
free(o2_test);

if (correct){
    printf("Test successful\n");
    return 0;
} else{
    printf("Test unsuccessful\n");
    return 0;
}

}

This is my code in hls.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "./poly.h"

/**
 * Streaming kernel: consumes two coefficient packets and one input packet,
 * mirrors every received word into memory, and writes two running sums.
 *
 * Read order is fixed and blocking: VECTOR_LEN words from `co`, then
 * VECTOR_LEN words from `i`, then another VECTOR_LEN words from `co`.
 * Exactly that many words must arrive on each stream; TLAST is not inspected.
 *
 * @param co     coefficient stream (two packets of VECTOR_LEN words)
 * @param i      input stream (one packet of VECTOR_LEN words)
 * @param ii_t   receives a copy of the words read from `i`
 * @param cc1_t  receives a copy of the first `co` packet
 * @param cc2_t  receives a copy of the second `co` packet
 * @param oo1_t  oo1_t[0] = i[0] + co[0] + co[1] (first packet)
 * @param oo2_t  oo2_t[0] = oo1_t[0] + co[0] + co[1] (second packet)
 */
void poly(
stream_val_data &co,
stream_val_data &i,
volatile float *ii_t,
volatile float *cc1_t,
volatile float *cc2_t,
volatile float *oo1_t,
volatile float *oo2_t
){
#pragma HLS INTERFACE axis register both port=co
#pragma HLS INTERFACE axis register both port=i
// depth must cover every element the kernel touches, otherwise the
// co-simulation adapter is too small. Keep the literal equal to VECTOR_LEN.
#pragma HLS INTERFACE m_axi port = ii_t depth = 30000 offset = slave bundle = ii_t_port
#pragma HLS INTERFACE m_axi port = cc1_t depth = 30000 offset = slave bundle = cc1_t_port
#pragma HLS INTERFACE m_axi port = cc2_t depth = 30000 offset = slave bundle = cc2_t_port
#pragma HLS INTERFACE m_axi port = oo1_t depth = 1024 offset = slave bundle = oo1_t_port
#pragma HLS INTERFACE m_axi port = oo2_t depth = 1024 offset = slave bundle = oo2_t_port

#pragma HLS INTERFACE s_axilite port = ii_t bundle = CONTROL_BUS
#pragma HLS INTERFACE s_axilite port = cc1_t bundle = CONTROL_BUS
#pragma HLS INTERFACE s_axilite port = cc2_t bundle = CONTROL_BUS
#pragma HLS INTERFACE s_axilite port = oo1_t bundle = CONTROL_BUS
#pragma HLS INTERFACE s_axilite port = oo2_t bundle = CONTROL_BUS

#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS

    // Local copies of the most recent packet from each stream, sized from the
    // shared constant rather than a repeated literal.
    f_data_type cco[VECTOR_LEN];
    f_data_type ii[VECTOR_LEN];

    stream_val beat;   // beat currently being unpacked
    fp_data bits;      // reinterprets the 32-bit payload as a float

    // First coefficient packet.
    for (int j = 0; j < VECTOR_LEN; j++){
        co.read(beat);
        bits.iint = beat.data;
        cco[j] = bits.ffloat;
        cc1_t[j] = cco[j];
    }

    // Input packet.
    for (int j = 0; j < VECTOR_LEN; j++){
        i.read(beat);
        bits.iint = beat.data;
        ii[j] = bits.ffloat;
        ii_t[j] = ii[j];
    }

    const f_data_type first_sum = ii[0] + cco[0] + cco[1];
    oo1_t[0] = first_sum;

    // Second coefficient packet; overwrites cco.
    for (int j = 0; j < VECTOR_LEN; j++){
        co.read(beat);
        bits.iint = beat.data;
        cco[j] = bits.ffloat;
        cc2_t[j] = cco[j];
    }

    const f_data_type second_sum = first_sum + cco[0] + cco[1];
    oo2_t[0] = second_sum;

}

I want to set allocate shape larger than 30000.

Hi,
Did you check this post?

Debugging Common DMA Issues [Part 3]

1 Like

Yes, but it didn’t solve my problem.

Can you please send a picture of your block design, to see what is connected to the DMA block?


Here’s the block design

Hi @ja2021

PYNQ’s allocate() function requires the shape parameter to be a tuple (e.g. allocate(shape=(256,), dtype=np.float32) not allocate(shape=256, dtype=np.float32) as you have in your example). The PynqBuffer class inherits from numpy.ndarray, so its documentation might be useful in explaining why this is the case.

In terms of debugging beyond that, have you tried simulating the behaviour of different-sized buffers in HLS? If so, I would also advise adding an ILA to your Vivado design and checking the signals coming in and out of your poly IP are as you expect them to be. The tlast signal is a very common problem with DMAs, so make sure it is asserting when you expect it to.

Thanks,

Josh

1 Like