Why I cant get correct result from my IP

Hello, I tried to create a Vitis HLS IP as follows:
kernel.hpp

#ifndef _Z_KERNEL_H_
#define _Z_KERNEL_H_

#include "ap_axi_sdata.h"
#include "ap_int.h"
#include "hls_stream.h"

#define M 64
#define N 512

typedef ap_axiu<32, 0, 0, 0> axis_t;

void z_kernel(hls::stream<axis_t> *A, hls::stream<axis_t> *X, hls::stream<axis_t> *Y, hls::stream<axis_t> *Z);

#endif

kernel.cpp

#include "z_kernel.hpp"
#include "hls_stream.h"

void z_kernel(hls::stream<axis_t> &A, hls::stream<axis_t> &X, hls::stream<axis_t> &Y, hls::stream<axis_t> &Z)
{
#pragma HLS INTERFACE s_axilite port=return bundle=control
#pragma HLS INTERFACE axis port=A
#pragma HLS INTERFACE axis port=X
#pragma HLS INTERFACE axis port=Y
#pragma HLS INTERFACE axis port=Z

	float	x[N];
#pragma HLS ARRAY_PARTITION variable=x dim=1 complete

	ld_X: for(int n=0;n<N;n++)
	{
		axis_t tx = X.read();
		x[n] = tx.data;
	}

	l1: for(int m=0;m<M;m++)
	{
		float sum = 0;
		l2: for(int n=0;n<N;n++)
		{
			axis_t ta = A.read();
			sum += ta.data * x[n];
		}
		axis_t ty = Y.read();
		ty.data = sum - ty.data;
		Z.write(ty);
	}
}

Then I created the following BD on Vivado and generated the bitstream with any critical warning:

and on the pynq board I made the following:

import time
from pynq import Overlay
import pynq.lib.dma
from pynq import Xlnk
import numpy as np
from pynq import MMIO
import random

ol = Overlay('z_accel.bit')

dma_A = ol.dma_A
dma_X = ol.dma_X
dma_YZ = ol.dma_YZ
z_ip = ol.z_kernel_0
xlnk = Xlnk()

# Define dimensions
M = 64
N = 512
# Allocate memory for DMA transfers
A_buffer = xlnk.cma_array(shape=(M,N), dtype=np.float32, cacheable=False)
X_buffer = xlnk.cma_array(shape=(N), dtype=np.float32, cacheable=False)
Y_buffer = xlnk.cma_array(shape=(M), dtype=np.float32, cacheable=False)
Z_buffer = xlnk.cma_array(shape=(M), dtype=np.float32, cacheable=False)

def run_kernel():
    dma_A.sendchannel.transfer(A_buffer)
    dma_X.sendchannel.transfer(X_buffer)
    dma_YZ.sendchannel.transfer(Y_buffer)
    dma_YZ.recvchannel.transfer(Z_buffer)
    
    z_ip.write(0x00, ((1<<0) | (1<<7)))  # initialize the module
    
    dma_A.sendchannel.wait()
    dma_X.sendchannel.wait()
    dma_YZ.sendchannel.wait()
    dma_YZ.recvchannel.wait()

A = np.random.rand(M, N).astype(dtype=np.float32)
X = np.random.rand(N).astype(dtype=np.float32)
Y = np.random.rand(M).astype(dtype=np.float32)

A_buffer[:] = A
X_buffer[:] = X
Y_buffer[:] = Y

run_kernel()

but the output is zeros on the Z_buffer. What could be the problem? Any help here please?

For me, it is working for Integer (Changing back to int in both HLS and python code). If you or someone could have some fix for the float type, please share the info.