I am in the process of creating a DMA-based co-processor that computes the Euclidean distance transform of an input binary integer array and outputs a float array. I have gotten it working in the case where the array is a fixed size, as in the code backup I uploaded.
#include “matmult.h”
#include “hls_stream.h”
#include “hls_math.h”void euc_trans_mmult(int in[N2], float out[N2]) {
int i, j;
L_set_max:for(int i = 0; i <N2; ++i){
out[i] = 10.0f;
}
l2:for (int i = 0; i < N1; i++) {
#pragma HLS LOOP_FLATTEN
l3:for (int j = 0; j < N1; j++) {
if (in[i * N1 + j] == 1) {
out[i * N1 + j] = 0; // Distance to itself is 0
} else {
// Compute the Euclidean distance to all 1’s in the binary map
l4:for (int k = 0; k < N1; k++) {
#pragma HLS LOOP_FLATTEN
l5:for (int l = 0; l < N1; l++) {
if (in[k * N1 + l] == 1) {
float distance = hls::sqrtf((i - k) * (i - k) + (j - l) * (j - l));
if (distance < out[i * N1 + j]) {
out[i * N1 + j] = distance;
}
}
}
}
}
}
}
}void matmult_accel(hls::stream<axis_t> &in, hls::stream<axis_t> &out) {
#pragma HLS INTERFACE s_axilite port = return bundle = control
#pragma HLS INTERFACE axis port = in
#pragma HLS INTERFACE axis port = outint l_A[N2];
float l_C[N2];#pragma HLS ARRAY_PARTITION variable = l_A factor = 16 dim = 1 cyclic
#pragma HLS ARRAY_PARTITION variable = l_C factor = 16 dim = 1 cyclicconverter_t converter;
load_A:
for (int i = 0; i < N2; i++) {
axis_t temp;
in.read(temp);
l_A[i] = temp.data;
}euc_trans_mmult(l_A, l_C);
writeC:
for (int i = 0; i < N2; i++) {
axis_t temp;
ap_uint<1> last = 0;
if (i == N2 - 1) {
last = 1;
}
converter.d = l_C[i];
temp.data = converter.i;
temp.last = last;
temp.keep = -1; // enabling all bytes
out.write(temp);
}
}
However, when I modify the co-processor to take into account variable heights and widths, the code stops working, and when I run it in PYNQ it freezes the system for a few minutes before crashing.
void euc_trans_mmult(int in, float out, int height, int width) {
// Initialize all distances to a large value
for (int i = 0; i < height * width; ++i) {
out[i] = 10.0f;
}// Iterate over each point in the input matrix for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { if (in[i * width + j] == 1) { out[i * width + j] = 0; // Distance to itself is 0 } else { // Compute the Euclidean distance to all 1's in the binary map for (int k = 0; k < height; k++) { for (int l = 0; l < width; l++) {
#pragma HLS UNROLL
if (in[k * width + l] == 1) {
float distance = hls::sqrtf((i - k) * (i - k) + (j - l) * (j - l));
if (distance < out[i * width + j]) {
out[i * width + j] = distance;
}
}
}
}
}
}
}
}void matmult_accel(hls::stream<axis_t> &in, hls::stream<axis_t> &out, int height, int width) {
#pragma HLS INTERFACE s_axilite port = return bundle = control
#pragma HLS INTERFACE s_axilite port = height
#pragma HLS INTERFACE s_axilite port = width
#pragma HLS INTERFACE axis port = in
#pragma HLS INTERFACE axis port = out
int total_size = height*width;
int l_A[N2];
float l_C[N2];#pragma HLS ARRAY_PARTITION variable = l_A factor = 16 dim = 1 cyclic
#pragma HLS ARRAY_PARTITION variable = l_C factor = 16 dim = 1 cyclicconverter_t converter;
load_A:
for (int i = 0; i < total_size; i++) {
axis_t temp;
in.read(temp);
l_A[i] = temp.data;
}euc_trans_mmult(l_A, l_C,height,width);
writeC:
for (int i = 0; i < total_size; i++) {
axis_t temp;
ap_uint<1> last = 0;
if (i == total_size - 1) {
last = 1;
}
converter.d = l_C[i];
temp.data = converter.i;
temp.last = last;
temp.keep = -1; // enabling all bytes
out.write(temp);
}
}
Can someone help me figure out why my code keeps crashing when I change it like this, even though the Vivado block design remains the same?