Always stuck at dma.sendchannel.wait()

Hi guys! I am new to PYNQ and I’m trying to modify UltraNet, the 2020 DAC SDC champion design. I designed my own IP with Vivado HLS. My board is an Ultra96-V2 and my PYNQ version is 2.6. This is the block design in Vivado.


When running in a Jupyter notebook, the program gets stuck at dma.sendchannel.wait() forever, and after it stops running, the message shown below appears.

Here is part of the Python code:

# Number of images handled per call; taken from the `team` object.
BATCH_SIZE = team.batch_size

# Raw frame size; the DMA input buffers are allocated with these dimensions.
IMAGE_RAW_ROW = 360
IMAGE_RAW_COL = 640

# Reduced size handed to cfuns.load_image (presumably the network input
# size -- TODO confirm against the IP).
IMAGE_ROW = 160
IMAGE_COL = 320

# Output grid dimensions. GRID_ROW is the correctly cased name; the original
# misspelling GRID_ROw is kept as an alias so existing references keep working.
GRID_ROW = 10
GRID_ROw = GRID_ROW
GRID_COL = 20

# Ratio between raw and reduced sizes; yolo() multiplies box coordinates
# by these factors.
X_SCALE = IMAGE_RAW_COL / IMAGE_COL
Y_SCALE = IMAGE_RAW_ROW / IMAGE_ROW


# Input and output buffers are allocated in pairs: net() starts a DMA
# transfer on one pair while the other pair is being loaded / decoded.
_in_shape = (BATCH_SIZE, IMAGE_RAW_ROW, IMAGE_RAW_COL, 3)
_out_shape = (BATCH_SIZE, GRID_ROw, GRID_COL, 6, 6)

in_buffers = [
    xlnk.cma_array(shape=_in_shape, dtype=np.uint8, cacheable=1)
    for _ in range(2)
]
in_buffer0, in_buffer1 = in_buffers

out_buffers = [
    xlnk.cma_array(shape=_out_shape, dtype=np.int32, cacheable=1)
    for _ in range(2)
]
out_buffer0, out_buffer1 = out_buffers


# Use the C helper library to load images.
def load_image(image_paths, buff):
    """Load the images at *image_paths* into *buff* via ``cfuns.load_image``.

    Args:
        image_paths: Iterable of paths (anything ``str()`` can convert).
        buff: Destination buffer; its memory is handed to the C function
            as a raw pointer, so it is filled in place.

    NOTE(review): the buffers passed in by net() are allocated with
    IMAGE_RAW_ROW x IMAGE_RAW_COL, while IMAGE_ROW x IMAGE_COL is passed to
    the C function here -- confirm this matches what the C helper and the
    hardware IP expect.
    """
    paths = [str(path) for path in image_paths]

    # Raw pointer to the buffer memory for the C side to write into.
    dataptr = np.asarray(buff).ctypes.data_as(ctypes.c_char_p)

    # Build a C array of char* holding the UTF-8 encoded paths.
    encoded = [ctypes.c_char_p(p.encode('utf-8')) for p in paths]
    paths_c = (ctypes.c_char_p * len(encoded))(*encoded)

    cfuns.load_image(paths_c, dataptr, len(paths), IMAGE_ROW, IMAGE_COL, 3)

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of *x*.

    Works element-wise on NumPy arrays as well as on scalars.
    """
    return 1 / (1 + np.exp(-x))

def yolo(out_buffer, batch_n, div, last_bais=None, result=None):
    """Decode one bounding box per image from the raw network output.

    For each of the first *batch_n* images, the grid cell with the highest
    summed confidence (channel 4 over the 6 entries of the cell) is picked,
    its 6 box predictions are averaged, and the box is scaled to raw-image
    coordinates.

    Args:
        out_buffer: Network output; the first *batch_n* entries are reshaped
            to ``(batch_n, -1, 6, 6)``.
        batch_n: Number of images to decode.
        div: Divisor applied to the raw box values.
        last_bais: Bias added to the box values after division. ``None``
            means no bias.
        result: List to which ``[xmin, xmax, ymin, ymax]`` (ints) is
            appended for every image. ``None`` creates a new list.

    Returns:
        The *result* list (the one passed in, or the newly created one).
    """
    if result is None:
        result = []
    if last_bais is None:
        last_bais = 0
    if batch_n <= 0:
        # Nothing to decode; reshaping an empty slice with -1 would raise.
        return result

    res_np = np.array(out_buffer[:batch_n]).reshape(batch_n, -1, 6, 6)
    conf = res_np[..., 4].sum(axis=2)
    max_index = conf.argmax(1)

    # Flat cell index -> (column, row) in the output grid.
    grid_x = max_index % GRID_COL
    grid_y = max_index // GRID_COL

    boxs = np.zeros((batch_n, 6, 4))
    for i, cell in enumerate(max_index):
        boxs[i, :, :] = res_np[i, cell, :, :4] / div + last_bais

    # Average the 6 predictions of the selected cell.
    xy = sigmoid(boxs[..., :2]).mean(axis=1)
    wh = np.exp(boxs[..., 2:4]).mean(axis=1)

    xy[:, 0] += grid_x
    xy[:, 1] += grid_y

    xy *= 16
    wh *= 20

    # Scale from the reduced image size to the raw image size.
    xy[:, 0] *= X_SCALE
    xy[:, 1] *= Y_SCALE
    wh[:, 0] *= X_SCALE
    wh[:, 1] *= Y_SCALE

    half_w = wh[:, 0] / 2
    half_h = wh[:, 1] / 2
    xmin = xy[:, 0] - half_w
    xmax = xy[:, 0] + half_w
    ymin = xy[:, 1] - half_h
    ymax = xy[:, 1] + half_h

    result.extend(
        [int(x0), int(x1), int(y0), int(y1)]
        for x0, x1, y0, y1 in zip(xmin, xmax, ymin, ymax)
    )
    return result

# Module-level state shared between successive net() calls.
first_batch = True            # True until net() has pre-loaded the first batch
which_buffer = net_cnt = 0    # active buffer index / number of DMA runs started
last_batch_size = BATCH_SIZE  # overwritten by net() when a batch is not full-sized

def net(img_paths, result):
    """Run one step of the double-buffered inference pipeline.

    The first call only loads *img_paths* into input buffer 0 and returns.
    Every later call starts a DMA transfer for the batch loaded by the
    previous call, and while that runs loads the next batch and decodes the
    output of the previously completed run.

    Args:
        img_paths: Paths of the next batch to pre-load, or ``None`` when
            there are no more images (the final call).
        result: List that yolo() appends the decoded boxes to.

    Reads ``last_bais``, ``nn_ctrl`` and ``dma`` from module scope; they are
    not defined in this excerpt.

    NOTE(review): the send transfer is given the whole input buffer
    (allocated with IMAGE_RAW_ROW x IMAGE_RAW_COL). If the IP consumes fewer
    bytes than that, or is never started by the register writes below,
    ``dma.sendchannel.wait()`` would presumably never return -- confirm the
    buffer size and control-register map against the HLS IP.
    """
    global first_batch, which_buffer, net_cnt, last_batch_size

    # First call: just buffer the first batch.
    if first_batch:
        first_batch = False
        which_buffer = 0
        load_image(img_paths, in_buffers[which_buffer])
        return

    net_cnt += 1
    nn_ctrl.write(0x0, 0)  # Reset
    nn_ctrl.write(0x10, in_buffers[which_buffer].shape[0])
    nn_ctrl.write(0x0, 1)  # Deassert reset
    dma.recvchannel.transfer(out_buffers[which_buffer])
    dma.sendchannel.transfer(in_buffers[which_buffer])

    # Switch to the other buffer pair while the DMA transfer is in flight.
    which_buffer = 1 if which_buffer == 0 else 0

    # Buffer the next batch.
    if img_paths is not None:
        load_image(img_paths, in_buffers[which_buffer])

    # Decode the output of the run before this one (none exists on the
    # first DMA run).
    if net_cnt > 1:
        yolo(out_buffers[which_buffer], BATCH_SIZE, 7 * 15, last_bais, result)

    # Remember the size of a batch that is not full-sized.
    if img_paths is not None and len(img_paths) != BATCH_SIZE:
        last_batch_size = len(img_paths)

    dma.sendchannel.wait()
    dma.recvchannel.wait()

    # Final call: also decode the run that has just finished.
    if img_paths is None:
        yolo(out_buffers[(which_buffer + 1) % 2], last_batch_size, 7 * 15,
             last_bais, result)

I’ve been struggling with this for a few days. Any suggestions would be appreciated.