Hi all, I am new to FPGA design and am currently designing my own IP for implementation on PYNQ Z1, V3.0.1. However, in Jupyter notebook, when I try to run my kernel, it is stuck on dma.recvchannel.wait(). I know this is a common issue and several posts have been made about it, but I have tried some of the common solutions such as restarting the board, ensuring TLAST is set in HLS implementation and ensuring Buffer Length Register Width is set adequately to no avail. I am unsure if it is something I missed or did wrong. Any help will be much appreciated.

Edit: In the Jupyter notebook implementation, when I try to run run_kernel() it gets stuck on "waiting for dma receive channel".

**– HLS Implementation –**

```
#include "hls_stream.h"
#include "gaussianelim_header.h"
#include "hls_math.h"
// Solves A*x = b (A is an N x N row-major matrix stored in A[N2]) using
// Gaussian elimination with partial pivoting.  Row swaps are done virtually
// through the `perm` index array, so matrix rows are never physically moved.
// A and b are modified in place; the solution is written to `out`.
//
// NOTE(review): if a pivot column is all zeros (singular matrix), max_val
// stays 0 and `factor` divides by zero -- callers must guarantee A is
// non-singular, or a guard should be added.
template <typename T> void gaussian_elimination(T A[N2], T b[N], T out[N]) {
int perm[N];
for (int i = 0; i < N; i++) {
#pragma HLS PIPELINE II = 1
perm[i] = i;
}
// Forward Elimination with Partial Pivoting
for (int k = 0; k < N - 1; k++) {
// Fix: the original placed PIPELINE II=1 on this outer loop.  Pipelining a
// loop forces HLS to completely unroll every loop nested inside it, and the
// inner loops here have variable (k-dependent) trip counts, so synthesis
// either fails or produces an enormous design.  Pipeline the innermost
// loops instead (pragmas below).
T max_val = 0;
int pivot_row = k;
// Pivot search: find the row with the largest magnitude in column k.
for (int i = k; i < N; i++) {
#pragma HLS PIPELINE II = 1
// Fix: use a type-generic magnitude instead of fabs().  Unqualified
// fabs(float) promotes to double, and fabs() does not work for
// fixed-point element types (ap_fixed).
T v = A[perm[i] * N + k];
T mag = (v < 0) ? (T)(-v) : v;
if (mag > max_val) {
max_val = mag;
pivot_row = i;
}
}
// Virtual row swap via the permutation array.
if (pivot_row != k) {
int temp = perm[k];
perm[k] = perm[pivot_row];
perm[pivot_row] = temp;
}
// Eliminate column k from all rows below the pivot row.
for (int i = k + 1; i < N; i++) {
T factor = A[perm[i] * N + k] / A[perm[k] * N + k];
for (int j = k; j < N; j++) {
#pragma HLS PIPELINE II = 1
A[perm[i] * N + j] -= factor * A[perm[k] * N + j];
}
b[perm[i]] -= factor * b[perm[k]];
}
}
// Back Substitution
for (int i = N - 1; i >= 0; i--) {
out[i] = b[perm[i]];
for (int j = i + 1; j < N; j++) {
#pragma HLS PIPELINE II = 1
out[i] -= A[perm[i] * N + j] * out[j];
}
out[i] = out[i] / A[perm[i] * N + i];
}
}
extern "C" {
// Top-level HLS kernel.  Protocol on the `in` stream: first N*N beats carry
// the row-major matrix A, then N beats carry the vector b.  The solution X
// (N beats) is written to `out`, with TLAST asserted on the final beat so
// the AXI DMA S2MM channel can terminate the transfer.
//
// NOTE(review): each beat's payload is reinterpreted through a uint32_t
// union member, i.e. only 32 bits of `temp.data` are ever used.  If the
// header's DWIDTH is not 32 (it is currently 512), every beat still consumes
// a full DWIDTH-wide word from the stream, so the kernel waits for far more
// bytes than the host-side DMA transfer supplies -- a likely cause of
// dma.recvchannel.wait() hanging.  Confirm DWIDTH matches the 32-bit element.
void gaussian_elim_accel(hls::stream<axis_t> &in, hls::stream<axis_t> &out) {
#pragma HLS INTERFACE axis port=in
#pragma HLS INTERFACE axis port=out
#pragma HLS INTERFACE s_axilite port=return bundle=CTRL_BUS
DataType A[N2]; // Declaration of A
DataType B[N]; // Declaration of B
DataType X[N]; // Declaration of X
// Read input data from the 'in' stream and populate A and B
converter_t converter;
for (int i = 0; i < N*N; ++i) { // Correctly iterating over N*N elements for A
axis_t temp = in.read();
// Reinterpret the raw 32-bit payload as a float via the union.
converter.i = temp.data;
A[i] = converter.d; // Populate A matrix
}
// Then, read input data for B
for (int i = 0; i < N; ++i) { // Correctly iterating over N elements for B
axis_t temp = in.read();
converter.i = temp.data;
B[i] = converter.d; // Populate B vector
}
// Solve A*X = B in place.
gaussian_elimination<DataType>(A, B, X);
// Stream the solution back out, one element per beat.
for (int i = 0; i < N; ++i) {
axis_t temp;
converter.d = X[i]; // Use X vector for the result
temp.data = converter.i;
// Set the last signal for the last data word
temp.last = (i == N - 1) ? 1 : 0;
temp.keep = -1; // all bytes valid (-1 sets every TKEEP bit)
out.write(temp);
}
}
}
```

**– Header File –**

```
#ifndef _GAUSSIAN_ELIM_
#define _GAUSSIAN_ELIM_
#include "ap_axi_sdata.h"
#include "ap_int.h"
#include <inttypes.h>

// Problem size: N x N linear system.
#define N 128
#define N2 128*128

// Width in bits of one AXI4-Stream beat.
// Fix: this was 512, but the kernel packs exactly one 32-bit float per beat
// (the converter union reinterprets a single uint32_t).  With a 512-bit
// stream every element costs a 64-byte beat, so the kernel waits for 16x
// more data than the host-side DMA transfer actually sends -- the design
// stalls and dma.recvchannel.wait() never returns.  The beat width must
// match the 32-bit element that is really placed in `data`.
#define DWIDTH 32

typedef ap_axiu<DWIDTH, 0, 0, 0> axis_t;
typedef ap_uint<512> uint512_t; // retained for compatibility; unused by this kernel
typedef float DataType; // Data type for matrix elements
const int DataTypeSize = sizeof(DataType) * 8; // 32 bits for float
typedef ap_uint<DataTypeSize> DataTypeInt;

// Reinterprets the raw 32-bit stream payload as a float (and back).
typedef union converter {
DataType d;
uint32_t i;
} converter_t;

// Solves matrix * result = vector (N x N, row-major) in place.
template <typename T> void gaussian_elimination(T matrix[N2],T vector[N] ,T result[N]);
#endif // _GAUSSIAN_ELIM_
```