AXI stream with multi-DMA addition example never stops

Hello, I tried to replicate the same design as in this tutorial, with one small change: using floats instead of integers. This is my HLS code to generate the IP:
streamADD.h file

#ifndef _STREAMADD_H_
#define _STREAMADD_H_

#include "ap_int.h"

/* One beat of the stream: a float payload plus a 1-bit `last` flag. */
struct axis_t {
	float 		data;	/* sample value carried by this beat */
	ap_int<1> 	last;	/* 1-bit flag travelling alongside the data */
};

/*
 * Reads `size` beats from IN_1 and IN_2 and writes `size` beats to OUT.
 * See streamADD.cpp for the interface pragmas.
 */
void sadd(axis_t *IN_1, axis_t *IN_2, unsigned int size, axis_t *OUT);

#endif /* _STREAMADD_H_ */


streamADD.cpp file

#include "streamADD.h"

/*
 * Element-wise addition of two input streams.
 *
 * IN_1, IN_2 : input streams (axis interface), one beat consumed from each
 *              per iteration.
 * size       : number of beats to process, set through the s_axilite
 *              CTRL bundle.
 * OUT        : output stream (axis interface), one beat produced per
 *              iteration.
 */
void sadd(axis_t *IN_1, axis_t *IN_2, unsigned int size, axis_t *OUT)
{
#pragma HLS INTERFACE s_axilite port=size bundle=CTRL
#pragma HLS INTERFACE axis port=IN_1 depth=50
#pragma HLS INTERFACE axis port=IN_2 depth=50
#pragma HLS INTERFACE axis port=OUT  depth=50

	/* Index is unsigned to match `size` and avoid a signed/unsigned compare. */
	for (unsigned int i = 0; i < size; i++) {
		axis_t	curr1 = *IN_1++;
		axis_t	curr2 = *IN_2++;

		/* Sum the payloads; the result reuses curr1, so the output
		 * beat carries IN_1's `last` flag unchanged. */
		curr1.data = curr1.data + curr2.data;

		*OUT++ =  curr1;
	}
}

Then I followed the exact steps to generate the block design and got the attached block designs
design_sadd.pdf (94.4 KB)
stream_add.pdf (40.1 KB)
Finally, I uploaded the bitstream and .hwh files to the PYNQ-Z1 board and wrote the following code:

import time
from pynq import Overlay
import pynq.lib.dma
from pynq import Xlnk
import numpy as np
from pynq import MMIO
import random

# Download the bitstream into the FPGA (check this path).
ol = Overlay('/home/xilinx/jupyter_notebooks/add_tuto/sadd.bit')
# The DMAs sit inside the stream_add hierarchy, so access goes through it.
dma1 = ol.stream_add.sadd_dma1  # first DMA
dma2 = ol.stream_add.sadd_dma2  # second DMA
# Register window of the sadd IP; base address and range come from the
# Vivado address map of the design.
sadd_ip = MMIO(0x40000000, 0x10000)
xlnk = Xlnk()

length = 8

# Physically contiguous buffers that the DMA engines can read and write.
in_buffer1 = xlnk.cma_array(shape=(length,), dtype=np.float32)  # input buffer 1
in_buffer2 = xlnk.cma_array(shape=(length,), dtype=np.float32)  # input buffer 2
out_buffer = xlnk.cma_array(shape=(length,), dtype=np.float32)  # output buffer

# Fill both inputs with a random permutation of 0..length-1.
samples = random.sample(range(0, length), length)
np.copyto(in_buffer1, samples)
samples = random.sample(range(0, length), length)
np.copyto(in_buffer2, samples)

# Offset 0x10 is the `size` register (address taken from the Vivado/HLS
# generated sources). Since we didn't do port=return, and ap_start is tied to
# a constant, writing the length is the only control access needed.
sadd_ip.write(0x10, length)

t_start = time.time()
print('sending ....')
# Start all three transfers before waiting on any of them: the IP consumes
# both inputs and produces output concurrently.
# NOTE(review): assumes the OUT stream is wired to dma1's receive (S2MM)
# channel, as in the tutorial -- confirm against the block design.
dma1.sendchannel.transfer(in_buffer1)
dma2.sendchannel.transfer(in_buffer2)
dma1.recvchannel.transfer(out_buffer)
print('wait receive ....')
dma1.sendchannel.wait()
dma2.sendchannel.wait()
dma1.recvchannel.wait()
t_stop = time.time()
print('Hardware execution time: ', t_stop-t_start)

for i in range(0, length):
    print('{}+{} = {}'.format(in_buffer1[i], in_buffer2[i], out_buffer[i]))

What did I do wrong? Any help?
(Lab: Axistream Multiple DMAs (axis) — pp4fpgas 0.0.1 documentation)

Can you try reading back the status register of the DMA? It may help point to the issue.