My goal here is basically to put the following function
import numpy
def vectorExponentiation(x):
    """Return the element-wise exponential ``e**x`` of ``x``.

    ``x`` is passed straight to ``numpy.exp``, so anything NumPy accepts as
    array-like (scalar, list, ndarray) works; the result is whatever
    ``numpy.exp`` returns for that input.
    """
    return numpy.exp(x)
Onto hardware.
I have the following Vitis HLS files:
vexp.hpp
:
#ifndef VEXP_HPP
#define VEXP_HPP

#include "ap_axi_sdata.h"
#include "hls_stream.h"

// Number of elements processed per kernel invocation; the stream wrapper
// reads exactly N input beats and writes exactly N output beats.
#define N 16
// Controls how many series terms exponential() evaluates. The spelling is
// kept as-is because vexp.cpp refers to this macro by name.
#define PRESCISION 32

// One AXI4-Stream beat: 32-bit data plus 1-bit user/id/dest side channels.
typedef ap_axiu<32, 1, 1, 1> pkt;
// Stream of AXI beats used for the kernel's input and output ports.
typedef hls::stream< pkt > strm;
// Element type of the computation. The stream wrapper moves one element per
// 32-bit beat, so this must stay a 32-bit type.
typedef float DataType;

// Applies exponential() to each of the N elements of `in`, writing to `out`.
template <typename T> void test_kernel(T in[N], T out[N]);
// Series approximation of e^x.
DataType exponential(DataType x);
// Top-level HLS function: reads N elements from INPUT, writes N to OUTPUT.
void vexp(strm &INPUT, strm &OUTPUT);

#endif // VEXP_HPP
vexp.cpp
:
#include "vexp.hpp"
// Approximates e^x by evaluating a truncated Taylor series in nested form:
//   1 + x/1 * (1 + x/2 * (1 + ... (1 + x/(PRESCISION-1)) ... ))
// working from the innermost factor outwards.
DataType exponential(DataType x)
{
    const DataType one = DataType(1);
    DataType acc = one;
    int k = PRESCISION - 1;
    while (k > 0) {
        // Same operation order as the series recurrence: (x * acc) / k, then + 1.
        acc = one + x * acc / DataType(k);
        k -= 1;
    }
    return acc;
}
template <typename T> void test_kernel(T in[N], T out[N]) {
for(int i = 0; i < N; i++){
out[i] = exponential(in[i]);
}
}
void vexp
(
strm &INPUT,
strm &OUTPUT
)
{
#pragma HLS INTERFACE axis port=INPUT
#pragma HLS INTERFACE axis port=OUTPUT
DataType l_0[N];
DataType out[N];
load_A:
for (int i = 0; i < N; i++) {
pkt temp = INPUT.read();
l_0[i] = temp.data;
}
test_kernel<DataType>(l_0, out);
write_C:
for (int i = 0; i < N; i++) {
pkt temp;
temp.data = out[i];
ap_uint<1> last = 0;
if (i == N - 1) {
last = 1;
}
temp.last = last;
temp.keep = -1;
OUTPUT.write(temp);
}
}
vexp_tb.cpp
:
#include "vexp.hpp"
// Software reference model: fills out[k] with exponential(in[k]) for all N
// elements.
void vexp_sw(DataType in[N], DataType out[N]) {
    int k = 0;
    while (k < N) {
        out[k] = exponential(in[k]);
        ++k;
    }
}
int main(void) {
int i, j, err;
DataType in[N];
DataType out_sw[N];
DataType out_hw[N];
/* initiation */
for (i = 0; i < N; i++) {
in[i] = DataType(i);
out_sw[i] = DataType(0);
out_hw[i] = DataType(0);
}
/* hardware execute */
test_kernel<DataType>(in, out_hw);
printf("\nHardware kernel complete\n");
/* software execute */
vexp_sw(in, out_sw);
printf("\nSoftware kernel complete\n");
err = 1;
for(int i = 0; i < N; i++) {
err = out_sw[i] == out_hw[i];
printf("%f, %f \n", out_sw[i], out_hw[i]);
}
printf("\n");
if (err == 1) {
printf("\nTest successful!\r\n");
return 0;
}
printf("\nTest failed!\r\n");
return 1;
}
These all compile properly with Vitis.
However, when I run the bitstream from the notebook vexp.ipynb
on the PYNQ:
from pynq import Overlay, allocate
import pynq.lib.dma
import numpy as np
# Number of elements per transfer; the HLS kernel consumes and produces
# exactly N = 16 stream beats per run, so this has to match.
DIM = 16
DATA_TYPE = np.float32

# Non-cacheable buffers the DMA reads from / writes to.
in_buffer_A = allocate(shape=(DIM,), dtype=DATA_TYPE, cacheable=False)
out_buffer_C = allocate(shape=(DIM,), dtype=DATA_TYPE, cacheable=False)

# Host-side input (0, 1, ..., DIM-1) and zeroed output.
A = np.arange(DIM, dtype=DATA_TYPE)
C = np.zeros(DIM, dtype=DATA_TYPE)

# Load the bitstream and grab the DMA engine from the overlay.
ol = Overlay('./design_1.bit')
ol.download()
dma0 = ol.axi_dma_0

# Stage the input and clear the output buffer before any transfer.
in_buffer_A[:] = A
out_buffer_C[:] = C
def silicon():
    """Run one DMA round trip through the overlay and return the output buffer.

    Starts the send transfer of ``in_buffer_A`` and the receive transfer into
    ``out_buffer_C``, then blocks until both channels report completion.
    """
    tx = dma0.sendchannel
    rx = dma0.recvchannel
    tx.transfer(in_buffer_A)
    rx.transfer(out_buffer_C)
    tx.wait()
    rx.wait()
    return out_buffer_C
def native():
    """Compute the reference result for ``A`` with NumPy on the CPU."""
    reference = np.exp(A)
    return reference
# Time the NumPy reference computation and keep its result for comparison.
%time golden = native()
# Time the DMA round trip through the FPGA kernel.
%time FPGA = silicon()
# Show what came back from the hardware.
print(FPGA)
The printed output is mainly zeros.
Also, this is the Vivado layout:
design_1-1.pdf (16.4 KB)
I am using Vitis/Vivado 2022.1 … Does anyone know what I have done wrong?