I am implementing a simple vector addition function:
Z = X + Y
where X, Y and Z are integer vectors (arrays).
In HLS, I was able to confirm through simulation that array Z contains the correct values. However, when implemented on PYNQ, the correct answer appears only in every other element.
For example, if array X contains all 1s and array Y contains all 2s, the results in array Z show up only in every other element (i.e., z[0]=3, z[1]=0, z[2]=3, z[3]=0, z[4]=3, z[5]=0, etc.).
Any suggestions will be appreciated.
***PYNQ code as follows:
# Host-side driver for the vecadd_int3 HLS accelerator (Z = X + Y).
# NOTE(review): `N`, `vecadd` and `vec_z` are used below but are never assigned
# in this snippet; they must be defined before this code runs.
import numpy as np
from pynq import Overlay
from pynq import Xlnk

# Load the overlay
overlay = Overlay('/home/xilinx/pynq/overlays/vecadd_int/vecadd_int3.bit')
# IP alias
# NOTE(review): the assignment of `vecadd` is missing here -- presumably it is
# the accelerator IP exposed by `overlay`; confirm the attribute name.

# Allocate contiguous buffers for memory transfer.
# The HLS kernel works on 32-bit `int` elements (typedef int data_t), so the
# buffers must use an explicit 32-bit dtype. `np.int` follows the platform C
# long, which is 64-bit on aarch64 boards: each value then occupies two 32-bit
# words, the kernel sees every value followed by a zero word, and only every
# other 32-bit result is meaningful. `np.int` is also deprecated in NumPy.
xlnk = Xlnk()
x_buffer = xlnk.cma_array(shape=(N,), dtype=np.int32)
y_buffer = xlnk.cma_array(shape=(N,), dtype=np.int32)
z_buffer = xlnk.cma_array(shape=(N,), dtype=np.int32)

# NOTE(review): x_buffer and y_buffer must be filled with the input vectors
# before the accelerator is started; that step is not shown in this snippet.
# check buffer status

# Initialize the AXI-Lite registers with the buffer addresses and the length.
vecadd.write(0x10, z_buffer.physical_address)  # vector Z is AXI so need to initialize address
vecadd.write(0x18, y_buffer.physical_address)  # vector Y is AXI so need to initialize address
vecadd.write(0x20, x_buffer.physical_address)  # vector X is AXI so need to initialize address
vecadd.write(0x28, N)  # initialize N

vecadd.write(0x00, 0x01)  # start
# Wait until bit 2 (0x4, ap_idle in the HLS-generated control register) is set.
# The mask has to be applied to the value that was read, not to the register
# offset: `0x00 & 0x4` is just 0.
while (vecadd.read(0x00) & 0x4) != 0x04:
    pass
vecadd.write(0x00, 0x00)  # stop

np.copyto(vec_z, z_buffer)
***HLS code as follows:
#include <string.h>
typedef int data_t ;
/*
 * vecadd_int3: element-wise vector addition, z[i] = x[i] + y[i] for i in [0, N).
 *
 * z, y and x are each mapped to their own AXI master bundle (out_z, out_y,
 * in_x). y, x, N and the function return are exposed on the "cntl" AXI-Lite
 * bundle.
 *
 * NOTE(review): no s_axilite pragma for port z is visible in this listing,
 * although the host script writes the z address through the same register
 * map -- confirm whether a `port=z bundle=cntl` pragma was dropped.
 * NOTE(review): N is not checked against the 32767-element local buffers.
 * NOTE(review): the closing brace of this function is not visible here.
 */
void vecadd_int3(volatile data_t *z, volatile const data_t *y, volatile const data_t *x, unsigned int N) {
// AXI master interfaces for the three vectors.
#pragma HLS INTERFACE m_axi port=z offset=slave depth=32767 bundle=out_z
#pragma HLS INTERFACE m_axi port=y offset=slave depth=32767 bundle=out_y
#pragma HLS INTERFACE m_axi port=x offset=slave depth=32767 bundle=in_x
// AXI-Lite "cntl" bundle: y, x, N and the function return.
#pragma HLS INTERFACE s_axilite port=y bundle=cntl
#pragma HLS INTERFACE s_axilite port=x bundle=cntl
#pragma HLS INTERFACE s_axilite port=N bundle=cntl
#pragma HLS INTERFACE s_axilite port=return bundle=cntl
// Local buffers, each sized for at most 32767 elements.
data_t x_buff[32767];
data_t y_buff[32767];
data_t z_buff[32767];
// Copy the first N elements of both input vectors into the local buffers.
memcpy (y_buff, (const data_t*) y, N*sizeof(data_t));
memcpy (x_buff, (const data_t*) x, N*sizeof(data_t));
unsigned int i;
// Element-wise add of the buffered inputs into z_buff.
VECLOOP: for (i=0;i<N;i++)
#pragma HLS LOOP_TRIPCOUNT min=1024 max=32767
z_buff[i] = x_buff[i] + y_buff[i];
// Copy the N results from the local buffer out through z.
memcpy ((data_t*) z, z_buff, N*sizeof(data_t));