Variable Sized DMA Output

I am in the process of creating a DMA-based co-processor that computes the Euclidean distance transform of an input binary integer array and outputs a float array. I have gotten it working in the case where the array is a fixed size, as shown in the code backup I uploaded:

#include “matmult.h”
#include “hls_stream.h”
#include “hls_math.h”

/**
 * Brute-force Euclidean distance transform of an N1 x N1 binary map.
 *
 * For every cell (i, j) the output is 0 when in[i * N1 + j] == 1; otherwise it
 * is the smallest Euclidean distance from (i, j) to any cell whose value is 1.
 * Results are capped at 10.0f: a cell keeps that value when the map contains
 * no 1 or when every 1 is at distance >= 10.
 *
 * @param in   Row-major binary map; only the first N1 * N1 entries are read.
 * @param out  Receives the distances. All N2 entries are first set to 10.0f,
 *             then the first N1 * N1 entries are overwritten with the result.
 */
void euc_trans_mmult(int in[N2], float out[N2]) {
    // Upper bound used as the "nothing found yet" distance.
    const float kMaxDistance = 10.0f;

    L_set_max:
    for (int idx = 0; idx < N2; ++idx) {
        out[idx] = kMaxDistance;
    }

    l2:
    for (int i = 0; i < N1; i++) {
#pragma HLS LOOP_FLATTEN
        l3:
        for (int j = 0; j < N1; j++) {
            const int cell = i * N1 + j;
            if (in[cell] == 1) {
                out[cell] = 0; // Distance to itself is 0
            } else {
                // Track the running minimum in a local so the inner loops do
                // not read-modify-write out[] on every candidate; out[cell] is
                // stored exactly once after the scan.
                float nearest = kMaxDistance;

                // Compute the Euclidean distance to all 1's in the binary map
                l4:
                for (int k = 0; k < N1; k++) {
#pragma HLS LOOP_FLATTEN
                    l5:
                    for (int l = 0; l < N1; l++) {
                        if (in[k * N1 + l] == 1) {
                            const int d_row = i - k;
                            const int d_col = j - l;
                            const float distance =
                                hls::sqrtf(d_row * d_row + d_col * d_col);
                            if (distance < nearest) {
                                nearest = distance;
                            }
                        }
                    }
                }
                out[cell] = nearest;
            }
        }
    }
}

/**
 * Fixed-size AXI4-Stream wrapper around euc_trans_mmult.
 *
 * Reads N2 words from `in` into a local buffer, runs the transform, then
 * writes N2 words to `out`. Each float result is passed through converter_t
 * (written via `.d`, read back via `.i`) before being placed in the stream
 * word's data field. `last` is asserted only on the final output word.
 *
 * @param in   Input stream; the `data` field of each word is stored as int.
 * @param out  Output stream carrying the converted results.
 */
void matmult_accel(hls::stream<axis_t> &in, hls::stream<axis_t> &out) {
#pragma HLS INTERFACE s_axilite port = return bundle = control
#pragma HLS INTERFACE axis port = in
#pragma HLS INTERFACE axis port = out

    int l_A[N2];
    float l_C[N2];

#pragma HLS ARRAY_PARTITION variable = l_A factor = 16 dim = 1 cyclic
#pragma HLS ARRAY_PARTITION variable = l_C factor = 16 dim = 1 cyclic

    converter_t bits;

    // Buffer the whole input frame before computing.
    load_A:
    for (int idx = 0; idx < N2; idx++) {
        axis_t in_word;
        in.read(in_word);
        l_A[idx] = in_word.data;
    }

    euc_trans_mmult(l_A, l_C);

    // Stream the results back out, flagging the final word.
    writeC:
    for (int idx = 0; idx < N2; idx++) {
        axis_t out_word;
        const ap_uint<1> is_last = (idx == N2 - 1) ? 1 : 0;
        bits.d = l_C[idx];
        out_word.data = bits.i;
        out_word.last = is_last;
        out_word.keep = -1; // enabling all bytes
        out.write(out_word);
    }
}

However, when I modify the co-processor to take into account variable heights and widths, the code stops working, and when I run it in PYNQ it freezes the system for a few minutes before crashing.

void euc_trans_mmult(int in, float out, int height, int width) {
// Initialize all distances to a large value
for (int i = 0; i < height * width; ++i) {
out[i] = 10.0f;
}

// Iterate over each point in the input matrix
for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
        if (in[i * width + j] == 1) {
            out[i * width + j] = 0; // Distance to itself is 0
        } else {
            // Compute the Euclidean distance to all 1's in the binary map
            for (int k = 0; k < height; k++) {
                for (int l = 0; l < width; l++) {

#pragma HLS UNROLL
if (in[k * width + l] == 1) {
float distance = hls::sqrtf((i - k) * (i - k) + (j - l) * (j - l));
if (distance < out[i * width + j]) {
out[i * width + j] = distance;
}
}
}
}
}
}
}
}

/**
 * Variable-size AXI4-Stream wrapper around euc_trans_mmult.
 *
 * Reads height * width words from `in`, runs the transform, then writes
 * height * width words to `out`, asserting `last` on the final word. Each
 * float result is passed through converter_t (written via `.d`, read back via
 * `.i`) before being placed in the stream word's data field.
 *
 * @param in      Input stream; the `data` field of each word is stored as int.
 * @param out     Output stream carrying the converted results.
 * @param height  Number of rows, supplied through the AXI4-Lite register map.
 * @param width   Number of columns, supplied through the AXI4-Lite register map.
 *
 * Host-side contract: `height` and `width` must be written before the core is
 * started, and both DMA transfers must be exactly height * width words long.
 * If the dimensions are non-positive or exceed the N2-element local buffers,
 * the function returns without reading or writing either stream (the local
 * buffers would otherwise be overrun, and a zero-length frame would never
 * emit `last`).
 */
void matmult_accel(hls::stream<axis_t> &in, hls::stream<axis_t> &out, int height, int width) {
#pragma HLS INTERFACE s_axilite port = return bundle = control
// Keep the size registers in the same AXI4-Lite bundle as the block-level
// control registers so they share the one `control` slave interface; without an
// explicit bundle they may be placed on a separate interface, depending on the
// tool version.
#pragma HLS INTERFACE s_axilite port = height bundle = control
#pragma HLS INTERFACE s_axilite port = width bundle = control
#pragma HLS INTERFACE axis port = in
#pragma HLS INTERFACE axis port = out

    // Reject dimensions that do not fit the fixed-size buffers. The division
    // form avoids overflowing height * width on garbage register contents.
    if (height <= 0 || width <= 0 || height > N2 / width) {
        return;
    }
    const int total_size = height * width;

    int l_A[N2];
    float l_C[N2];

#pragma HLS ARRAY_PARTITION variable = l_A factor = 16 dim = 1 cyclic
#pragma HLS ARRAY_PARTITION variable = l_C factor = 16 dim = 1 cyclic

    converter_t converter;

    // Buffer the whole input frame before computing.
    load_A:
    for (int i = 0; i < total_size; i++) {
        axis_t temp;
        in.read(temp);
        l_A[i] = temp.data;
    }

    euc_trans_mmult(l_A, l_C, height, width);

    // Stream the results back out, flagging the final word.
    writeC:
    for (int i = 0; i < total_size; i++) {
        axis_t temp;
        const ap_uint<1> last = (i == total_size - 1) ? 1 : 0;
        converter.d = l_C[i];
        temp.data = converter.i;
        temp.last = last;
        temp.keep = -1; // enabling all bytes
        out.write(temp);
    }
}

Can someone help me figure out why my code keeps crashing when I change the code like this, even when the Vivado block design remains the same?