Composable pipeline needs to wait before readframe() from VDMA

Thank you for the suggestions, @marioruiz, I believe I already covered them and it is likely something a bit more abstract. I added an ILA to the AXI4-Stream from the switch to the VDMA and all the AXI4-Stream Video protocol signals seem about right. The simple [ps_in, ps_out] configuration also fails on the first iteration without time.sleep(0.01) but passes all iterations with time.sleep(0.01).

I do not think my issue is related to the configuration of the HLS IPs or the pipeline: each HLS IP was developed with self-checking test benches, and I have a test suite using PYNQ as well. Here is an example test setup that fails on the first frame if I remove time.sleep(0.01). The bypass test setup is the same as this one without the colour conversion.

import os
import gc
import cv2
import time
import pytest
import numpy as np
import pynq_composable
from pynq import Overlay, DefaultIP
from pynq.lib.video import VideoMode
from mlvap.data import VOCLoader
from mlvap.utils import compare_results

class BGR2RGB(DefaultIP):
    """PYNQ driver for the HLS ``bgr2rgb_accel`` IP core.

    The driver is bound to the core through ``bindto`` and exposes three
    operations: setting the frame size, starting the core and stopping it.
    All of them are plain writes to the IP's register map.
    """

    bindto = ['xilinx.com:hls:bgr2rgb_accel:1.0']

    # Values written to the CTRL register by start() and stop().
    # NOTE(review): 0x81 presumably sets ap_start (bit 0) together with
    # auto_restart (bit 7) of the HLS block-level control register -- confirm
    # against the generated driver header for this core.
    START = 0x81
    STOP = 0x00

    def __init__(self, description):
        """Create the driver; ``description`` is forwarded to ``DefaultIP``."""
        super().__init__(description)

    def config(self, rows: int, cols: int) -> None:
        """Write the frame size into the ``rows`` and ``cols`` registers.

        Args:
            rows: Value stored in the ``rows`` register.
            cols: Value stored in the ``cols`` register.
        """
        regs = self.register_map
        regs.rows = rows
        regs.cols = cols

    def start(self) -> None:
        """Write ``START`` to the CTRL register."""
        self.register_map.CTRL = self.START

    def stop(self) -> None:
        """Write ``STOP`` to the CTRL register."""
        self.register_map.CTRL = self.STOP

# Directory holding this test module; the bitstream path is resolved
# relative to it.
ROOT = os.path.split(os.path.realpath(__file__))[0]
OVERLAY_PATH = os.path.join(ROOT, '../../overlays/mlvap_test/mlvap_test.bit')

# Number of frames requested for every frame size below.
ITERATIONS = 100

# Test cases as (rows, cols, iterations) tuples.
BGR2RGB_DIMENSIONS = [
    (rows, cols, ITERATIONS)
    for rows, cols in ((160, 320), (320, 320), (416, 416), (480, 640))
]

@pytest.mark.parametrize('rows, cols, N', BGR2RGB_DIMENSIONS)
def test_mlvap_bgr2rgb(rows, cols, N):
    """Stream random frames through the BGR->RGB IP and compare with OpenCV.

    For each of the ``N`` iterations a random ``rows`` x ``cols`` 3-channel
    8-bit frame is written to the VDMA write channel, one frame is read back
    from the VDMA read channel, and the result is compared against
    ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)`` using ``compare_results``.

    On the first mismatch the reference and hardware frames are saved next to
    this file as ``sw.jpg`` and ``hw.jpg`` and the test fails.  The VDMA
    channels and the IP are stopped and the overlay is released whether the
    test passes, fails, or raises.

    Args:
        rows: Frame height in pixels.
        cols: Frame width in pixels.
        N: Number of random frames to process.
    """
    ol = Overlay(OVERLAY_PATH)

    pipeline = ol.pipeline
    vdma = pipeline.vdma
    bgr2rgb = pipeline.bgr2rgb

    # Configure IP and pipeline
    bgr2rgb.config(rows, cols)
    pipeline.compose([pipeline.ps_in, bgr2rgb, pipeline.ps_out])

    # Configure VDMA: both directions carry 24-bit frames of the same size
    vdma_in = vdma.writechannel
    vdma_in.mode = VideoMode(cols, rows, 24)
    vdma_out = vdma.readchannel
    vdma_out.mode = VideoMode(cols, rows, 24)

    # Start IP and VDMA
    vdma_in.start()
    vdma_out.start()
    bgr2rgb.start()

    try:
        for i in range(N):
            # Generate random input covering the full 8-bit range (0..255)
            input_data = np.random.randint(
                0, 256, size=(rows, cols, 3), dtype=np.uint8)

            # Process test input using the hardware
            in_frame = vdma_in.newframe()
            in_frame[...] = input_data
            vdma_in.writeframe(in_frame)

            # Without this pause the frame returned by readframe() fails the
            # comparison (observed on the first iteration).
            # NOTE(review): presumably readframe() can hand back a frame that
            # was captured before the new input reached the output -- confirm.
            # If the test fails, try increasing the wait time.
            time.sleep(0.01)

            out_data = vdma_out.readframe()
            try:
                # Generate reference output
                reference_output = cv2.cvtColor(input_data, cv2.COLOR_BGR2RGB)
                err_per = compare_results(out_data, reference_output, 0.005)

                if err_per != 0.0:
                    # Keep both images for offline inspection of the mismatch
                    cv2.imwrite(os.path.join(ROOT, 'sw.jpg'), reference_output)
                    cv2.imwrite(os.path.join(ROOT, 'hw.jpg'), out_data)

                assert err_per == 0.0, f'Failed at {i+1}/{N}'
            finally:
                # Always hand the frame buffer back, even when the
                # comparison fails or raises.
                out_data.freebuffer()
    finally:
        # Single teardown path shared by success, failure and errors
        vdma_out.stop()
        vdma_in.stop()
        bgr2rgb.stop()
        del ol
        gc.collect()

Everything works as far as the behaviour of the pipeline is concerned; I just need to wait for some time before reading the output from the VDMA, which is not very elegant. I suspect it has something to do with interrupts, but I am out of ideas about what to check there.

Bonus question: My current issue is likely unrelated to my older question on the forum, "Clarification for using VDMA". If you have time, could you please take a look? I am still not clear on some of the issues I raised there.

Thank you,
Mario