Runtime error: DMA channel not started on Ultra96

Hi all, I’m using PYNQ for the first time. When I run my Python script against the bitstream, the line dma_send.transfer(input_buffer) raises RuntimeError(“DMA channel not started”).

Here is my register map before and after the error:
Register Map Before Execution:

RegisterMap {
  MM2S_DMACR = Register(RS=1, Reset=0, Keyhole=0, Cyclic_BD_Enable=0, IOC_IrqEn=0, Dly_IrqEn=0, Err_IrqEn=0, IRQThreshold=1, IRQDelay=0),
  MM2S_DMASR = Register(Halted=0, Idle=0, SGIncld=0, DMAIntErr=0, DMASlvErr=0, DMADecErr=0, SGIntErr=0, SGSlvErr=0, SGDecErr=0, IOC_Irq=0, Dly_Irq=0, Err_Irq=0, IRQThresholdSts=0, IRQDelaySts=0),
  MM2S_CURDESC = Register(Current_Descriptor_Pointer=0),
  MM2S_CURDESC_MSB = Register(Current_Descriptor_Pointer=0),
  MM2S_TAILDESC = Register(Tail_Descriptor_Pointer=0),
  MM2S_TAILDESC_MSB = Register(Tail_Descriptor_Pointer=0),
  MM2S_SA = Register(Source_Address=0),
  MM2S_SA_MSB = Register(Source_Address=0),
  MM2S_LENGTH = Register(Length=0),
  SG_CTL = Register(SG_CACHE=0, SG_USER=0),
  S2MM_DMACR = Register(RS=1, Reset=0, Keyhole=0, Cyclic_BD_Enable=0, IOC_IrqEn=0, Dly_IrqEn=0, Err_IrqEn=0, IRQThreshold=1, IRQDelay=0),
  S2MM_DMASR = Register(Halted=0, Idle=0, SGIncld=0, DMAIntErr=0, DMASlvErr=0, DMADecErr=0, SGIntErr=0, SGSlvErr=0, SGDecErr=0, IOC_Irq=0, Dly_Irq=0, Err_Irq=0, IRQThresholdSts=0, IRQDelaySts=0),
  S2MM_CURDESC = Register(Current_Descriptor_Pointer=0),
  S2MM_CURDESC_MSB = Register(Current_Descriptor_Pointer=0),
  S2MM_TAILDESC = Register(Tail_Descriptor_Pointer=0),
  S2MM_TAILDESC_MSB = Register(Tail_Descriptor_Pointer=0),
  S2MM_DA = Register(Destination_Address=0),
  S2MM_DA_MSB = Register(Destination_Address=0),
  S2MM_LENGTH = Register(Length=0)
}

After:

RegisterMap {
  MM2S_DMACR = Register(RS=0, Reset=0, Keyhole=0, Cyclic_BD_Enable=0, IOC_IrqEn=0, Dly_IrqEn=0, Err_IrqEn=0, IRQThreshold=1, IRQDelay=0),
  MM2S_DMASR = Register(Halted=1, Idle=0, SGIncld=0, DMAIntErr=0, DMASlvErr=0, DMADecErr=0, SGIntErr=0, SGSlvErr=0, SGDecErr=0, IOC_Irq=1, Dly_Irq=0, Err_Irq=0, IRQThresholdSts=0, IRQDelaySts=0),
  MM2S_CURDESC = Register(Current_Descriptor_Pointer=0),
  MM2S_CURDESC_MSB = Register(Current_Descriptor_Pointer=0),
  MM2S_TAILDESC = Register(Tail_Descriptor_Pointer=0),
  MM2S_TAILDESC_MSB = Register(Tail_Descriptor_Pointer=0),
  MM2S_SA = Register(Source_Address=838860800),
  MM2S_SA_MSB = Register(Source_Address=0),
  MM2S_LENGTH = Register(Length=720),
  SG_CTL = Register(SG_CACHE=0, SG_USER=0),
  S2MM_DMACR = Register(RS=0, Reset=0, Keyhole=0, Cyclic_BD_Enable=0, IOC_IrqEn=0, Dly_IrqEn=0, Err_IrqEn=0, IRQThreshold=1, IRQDelay=0),
  S2MM_DMASR = Register(Halted=1, Idle=0, SGIncld=0, DMAIntErr=1, DMASlvErr=0, DMADecErr=0, SGIntErr=0, SGSlvErr=0, SGDecErr=0, IOC_Irq=1, Dly_Irq=0, Err_Irq=1, IRQThresholdSts=0, IRQDelaySts=0),
  S2MM_CURDESC = Register(Current_Descriptor_Pointer=0),
  S2MM_CURDESC_MSB = Register(Current_Descriptor_Pointer=0),
  S2MM_TAILDESC = Register(Tail_Descriptor_Pointer=0),
  S2MM_TAILDESC_MSB = Register(Tail_Descriptor_Pointer=0),
  S2MM_DA = Register(Destination_Address=761257984),
  S2MM_DA_MSB = Register(Destination_Address=0),
  S2MM_LENGTH = Register(Length=0)
}

The python script:

import numpy as np
import pandas as pd
import time
from pynq import Overlay, allocate

# Reproduction script: loads the overlay, writes the HLS IP control register,
# then streams the first two test samples through the AXI DMA and prints the
# DMA register map before and after the transfers.
print("Starting...")

overlay = Overlay("/home/xilinx/AI/design_1_wrapper.bit")

# Handles to the DMA IP and its send/receive channels.
# NOTE(review): presumably sendchannel is the MM2S side and recvchannel the
# S2MM side (the "After" dump shows MM2S_LENGTH=720 = 180 * 4 bytes) - confirm.
dma = overlay.axi_dma_0 
dma_send = dma.sendchannel
dma_rcv = dma.recvchannel

print("Loading HLS IP")
hls_ip = overlay.predict_0

print("Register Map Before Execution:")
print(dma.register_map)  # Print initial register state

# Write 0x81 to offset 0x0 of the HLS IP, then read the register back.
# NOTE(review): presumably bit 0 is ap_start and bit 7 is auto_restart of the
# s_axilite control register - confirm against the generated driver header.
CONTROL_REGISTER = 0x0
hls_ip.write(CONTROL_REGISTER, 0x81)
print(f"HLS IP Control Register: {hls_ip.read(CONTROL_REGISTER)}")

print("Loading test data")
test_data = pd.read_csv("/home/xilinx/AI/test_data.csv")
print("Test data loaded")

# Features are every column except "label", cast to float32; labels are int32.
# NOTE(review): the HLS kernel stores the stream data in ap_fixed<32, 16>
# variables, while float32 bit patterns are sent from here - verify that the
# host and kernel agree on the number format.
X_test = test_data.drop(columns=["label"]).values.astype(np.float32)
Y_test = test_data["label"].values.astype(np.int32)

print(f"Number of samples: {len(X_test)}")

# DMA buffers sized for one sample: 180 float32 in (720 bytes) and
# 8 float32 out (32 bytes).
# NOTE(review): these sizes must match INPUT_SIZE / OUTPUT_SIZE in the HLS
# code, whose definitions are not shown - confirm.
INPUT_SIZE = 180  
OUTPUT_SIZE = 8  
input_buffer = allocate(shape=(INPUT_SIZE,), dtype=np.float32)
output_buffer = allocate(shape=(OUTPUT_SIZE,), dtype=np.float32)

# Running totals; they are accumulated in the loop but never printed in the
# script as shown.
correct = 0
total_time = 0

try:
    # Only the first two samples are processed.
    for i in range(2):
        # Copy the sample into the DMA-visible input buffer.
        np.copyto(input_buffer, X_test[i])

        print(f"\nTest Sample {i+1}: Transferring data...")
        start_time = time.time()
        # The send transfer is started before the receive transfer, and both
        # are then waited on. The reported RuntimeError is raised by
        # dma_send.transfer().
        dma_send.transfer(input_buffer)
        dma_rcv.transfer(output_buffer)
        dma_send.wait()
        dma_rcv.wait()

        end_time = time.time()
        # The predicted class is the index of the largest output value.
        predicted_class = int(np.argmax(output_buffer))
        true_class = Y_test[i]
        print(f"Test Sample {i+1}: Predicted={predicted_class}, Actual={true_class}")
        if predicted_class == true_class:
            correct += 1
        total_time += (end_time - start_time)

# Any exception is caught here and only its message is printed, so no
# traceback is shown.
except Exception as e:
    print(f"Error during DMA transfer: {e}")

finally:
    # Print the register map after the transfers, whether or not they failed.
    print("Register Map After DMA Transfer (or Error):")
    print(dma.register_map)

Vitis HLS C++ code:

// Define fixed-point types
// fixed_point: 32-bit ap_fixed declared with 16 integer bits.
typedef ap_fixed<32, 16> fixed_point; 
// axis_t: AXI4-Stream beat with 32-bit data.
// NOTE(review): presumably the remaining template arguments are the TUSER (2),
// TID (5) and TDEST (6) widths - confirm against the ap_axis documentation.
typedef ap_axis<32, 2, 5, 6> axis_t; 

// Top-level HLS function: reads INPUT_SIZE beats from input_stream, runs the
// network layers (elided in this excerpt), and writes OUTPUT_SIZE beats to
// output_stream with `last` set on the final beat.
void predict(hls::stream<axis_t> &input_stream, hls::stream<axis_t> &output_stream) {

    // Both streams are AXI4-Stream ports; the function's return (block-level
    // control) is exposed over AXI4-Lite.
    #pragma HLS INTERFACE mode=axis port=input_stream
    #pragma HLS INTERFACE mode=axis port=output_stream
    #pragma HLS INTERFACE mode=s_axilite port=return

    // Local storage for the input sample and each layer's activations.
    fixed_point input_values[INPUT_SIZE];
    fixed_point layer1_output[LAYER1_SIZE];
    fixed_point layer2_output[LAYER2_SIZE];
    fixed_point output_layer_output[OUTPUT_SIZE];

    // Read input from AXI stream
    // Exactly INPUT_SIZE beats are consumed; temp.last is not checked.
    for (int i = 0; i < INPUT_SIZE; i++) {
        #pragma HLS PIPELINE
        axis_t temp = input_stream.read();
        // NOTE(review): temp.data is assigned directly to a fixed_point.
        // Presumably this converts the integer value rather than
        // reinterpreting the 32 raw bits - confirm, and confirm the format
        // the host actually sends.
        input_values[i] = temp.data;
    }

    // some more code for matrix multiplication for each layer

    // Write the OUTPUT_SIZE results to the output stream, one per beat.
    // Only data, keep, strb and last are assigned; the other side-channel
    // fields of axis_t are left unset in this excerpt.
    for (int i = 0; i < OUTPUT_SIZE; i++) {
        axis_t output_data;
        output_data.data = output_layer_output[i];  
        // NOTE(review): with 32-bit data, keep/strb are presumably 4 bits
        // wide, so the value 1 marks only the lowest byte lane as valid.
        // The AXI DMA presumably expects all lanes asserted (e.g. -1) -
        // confirm against the AXI DMA product guide.
        output_data.keep = 1;
        output_data.strb = 1;
        output_data.last = (i == OUTPUT_SIZE - 1);  // Mark last element in stream
        output_stream.write(output_data);
    }
}

I noticed that in the register map printed after the error, DMAIntErr = 1 in S2MM_DMASR. I also read the documentation and other relevant posts about this, but could not find an exact solution for the issue. In the HLS code, I have set the TLAST bit to 1 on the last output element, and I’m sending the bitstream to the board via SSH.
I would appreciate any advice on how to debug the issue. Thanks!

Also sharing the DMA ip configuration:

Hi @cool,

Welcome to the PYNQ community.

This is likely due to TLAST and the input and output buffer sizes. I wrote extensively about debugging this in: Debugging Common DMA Issues [Part 3]

Mario

Thanks for the reply.
I see that in my hls code I have set the last bit to 1. Also, I am currently unable to connect to the device via cable to send the bitstream, and can only access via ssh so I can’t view the waveform on Vivado. Is there any other way to debug?

> I see that in my hls code I have set the last bit to 1.

Yes, I see this, but something is not working correctly.

> can only access via ssh so I can’t view the waveform on Vivado. Is there any other way to debug?

Yes, see part 4 of that tutorial.

Mario