Hello all,
I’m new to FPGAs and currently trying to implement the following top function:
int fit(float I[N][d], float b[N], float x[N], float p[N], float q[N], float r[N], float lambda, float gamma) {
#pragma HLS INTERFACE m_axi bundle=inI port=I
#pragma HLS INTERFACE m_axi bundle=inb port=b
#pragma HLS INTERFACE m_axi bundle=inx port=x
#pragma HLS INTERFACE m_axi bundle=inp port=p
#pragma HLS INTERFACE m_axi bundle=inr port=r
#pragma HLS INTERFACE m_axi bundle=inq port=q
#pragma HLS INTERFACE s_axilite port=lambda
#pragma HLS INTERFACE s_axilite port=gamma
#pragma HLS INTERFACE s_axilite port=return
I implemented the hardware as shown in the attached PDF, and it works functionally, but it is very slow.
design_1_fit.pdf (79.6 KB)
In Vitis HLS, the latency was estimated at about 1 second at 100 MHz;
in reality, it takes 6.5 seconds at 100 MHz.
I think the issue is that I didn’t use a DMA, but instead used the Xlnk() buffers as I saw in one tutorial.
from pynq import Overlay
from pynq import Xlnk #linux buffer to read/write axi
import numpy as np
import time
# Host-side driver script for the HLS `fit` IP: program the bitstream, place
# the input/output arrays in physically contiguous memory, hand their physical
# addresses to the IP over AXI4-Lite, start it, and time it until it is done.
ol = Overlay('design_1_fit.bit')
ol.download()
fit_ip = ol.fit

# Problem dimensions. Presumably these must match the N and d the HLS kernel
# was synthesized with -- TODO confirm against the HLS source.
N = 1024
d = 16

# AXI4-Lite register offsets of the fit IP (taken from the address map used
# by this script).
# NOTE(review): the 0xC spacing between pointer registers suggests 64-bit
# address registers; only one 32-bit word is written per pointer here --
# confirm against the generated driver header for your platform.
CTRL_REG = 0x00
RETURN_REG = 0x10
I_ADDR_REG = 0x18
B_ADDR_REG = 0x24
X_ADDR_REG = 0x30
P_ADDR_REG = 0x3c
Q_ADDR_REG = 0x48
R_ADDR_REG = 0x54
LAMBDA_REG = 0x60
GAMMA_REG = 0x68

# Control-register bits as used below: bit 0 starts the IP, bit 1 is polled
# for completion (the usual ap_start / ap_done layout of an HLS s_axilite
# control register).
AP_START = 0x01
AP_DONE = 0b10

# Raw IEEE-754 single-precision bit patterns, because the registers are
# written as 32-bit integers.
LAMBDA_BITS = 0x3f000000  # 0.5f
GAMMA_BITS = 0x3dcccccd   # ~0.1f

# create variables buffers
# One allocator instance is enough for all buffers.
xlnk = Xlnk()
I = xlnk.cma_array(shape=(N, d), dtype=np.float32)
b = xlnk.cma_array(shape=(N,), dtype=np.float32)
x = xlnk.cma_array(shape=(N,), dtype=np.float32)
r = xlnk.cma_array(shape=(N,), dtype=np.float32)
p = xlnk.cma_array(shape=(N,), dtype=np.float32)
q = xlnk.cma_array(shape=(N,), dtype=np.float32)

# read data
z = np.zeros(N)
with open("d1024_16f.csv") as file_name:
    array = np.loadtxt(file_name, delimiter=",")

# assign data to buffers
# The first d columns of the CSV are the matrix I, column d is the vector b.
np.copyto(I, array[:, 0:d])
np.copyto(b, array[:, d])
# x, r, p and q all start out zeroed.
for buf in (x, r, p, q):
    np.copyto(buf, z)

# Make sure the start bit is cleared before the arguments are programmed.
fit_ip.write(CTRL_REG, 0x00)

# Pass the physical address of each buffer to the IP.
fit_ip.write(I_ADDR_REG, I.physical_address)
fit_ip.write(B_ADDR_REG, b.physical_address)
fit_ip.write(X_ADDR_REG, x.physical_address)
fit_ip.write(P_ADDR_REG, p.physical_address)
fit_ip.write(Q_ADDR_REG, q.physical_address)
fit_ip.write(R_ADDR_REG, r.physical_address)

# Scalar arguments.
fit_ip.write(LAMBDA_REG, LAMBDA_BITS)
fit_ip.write(GAMMA_REG, GAMMA_BITS)

# start the ip and measure the time
# The timer starts *before* the start write so the kick-off is included, and
# perf_counter() is used because it is a monotonic, high-resolution clock
# meant for measuring intervals.
t_start = time.perf_counter()
fit_ip.write(CTRL_REG, AP_START)
while fit_ip.read(CTRL_REG) & AP_DONE != AP_DONE:
    pass
t_stop = time.perf_counter()

print(t_stop - t_start)
print("Number of iterations " ,fit_ip.read(RETURN_REG)) #return of fit function

# output results
for value in x:
    print(value)
Can anyone please help me understand which IP blocks I need in order to store all of these vectors and the matrix in DDR memory, and then make the IP talk directly to DDR (maybe using a DMA)?
I tried looking for tutorials online, but all I could find were related to DMA with AXIS (streams). My design doesn’t use streams, and I’m still very confused about what all of the IP blocks mean.