PYNQ DMA call overhead

Hi,

I am using a PYNQ-Z2 FPGA board running PYNQ framework v2.5. I have run into some performance issues after implementing an HLS IP with two AXI-Stream interfaces: one for receiving input from the PS and the other for returning the output to the PS after processing. I believe there is a large overhead associated with the DMA calls from the Python script, which eliminates all of the acceleration advantage I gain by using the PL. I would appreciate your insight regarding this issue.

I am attaching the HLS IP code (top function only) as well as the Python code.

Thanks in advance

HLS IP code:-

// Top-level HLS function with two AXI-Stream ports and no block-level control
// handshake (ap_ctrl_none).
//
// Flow, as written below:
//   1. Read 32 stream beats from in_stream_a and store each beat's data_single
//      field in input_hash[0..31].
//   2. Repeatedly run sha256_init / sha256_update / string_gen / sha256_final
//      and compare data_hashed[] with input_hash[] element by element.
//   3. On the first full 32-element match, write the characters of data[] (up
//      to its '\0' terminator) to out_stream and leave the loop.
//
// NOTE(review): input_hash, data_hashed, data, size, a and call are not declared
// in this function; presumably they are file-scope variables shared with the
// helper functions -- confirm against the rest of the source.
void sha256(hls::stream<AXI_VAL> &in_stream_a, hls::stream<AXI_VAL> &out_stream){
#pragma HLS INTERFACE axis port=in_stream_a
#pragma HLS INTERFACE axis port=out_stream
#pragma HLS INTERFACE ap_ctrl_none port=return
AXI_VAL valIn_a, valOut;
int i;

//input section
// Consumes exactly 32 beats from the input stream, one per iteration.
// NOTE(review): the loop is marked for unrolling, but every iteration reads the
// same stream; check the synthesis report to see whether the unroll actually
// yields any parallelism here.
for(i=0;i<32;++i)
{
   #pragma HLS unroll
   valIn_a = in_stream_a.read();
   input_hash[i]= valIn_a.data_single;
}
i=0;

//processing part
// i doubles as the exit flag: the loop ends only once i has been set to 32,
// which happens below after a full match has been streamed out.
// NOTE(review): II=1 is requested on a loop whose body calls four functions and
// whose exit condition depends on the comparison result; verify the achieved II
// in the synthesis report.
while(i!=32)
{
  #pragma HLS pipeline II=1
  // Reset a, call and i before every attempt.
  a=call=i=0;
  sha256_init();
  sha256_update();
  string_gen();
  sha256_final();
  // Compare the freshly computed hash with the received one; i stops at the
  // first mismatching index, or reaches 32 when all elements match.
  for(i=0;i<32;i++)
  {
      #pragma HLS unroll
	  if(data_hashed[i]!=input_hash[i])
	  {
		break;
	  }
     }

     //output section
     // Only entered when all 32 elements matched.
     if(i==32)
     {
       // Stream data[] out one element per beat until the '\0' terminator.
       for(i=0;data[i]!='\0';++i)
       {
      valOut.data_single = data[i];
      // Assert TLAST on the beat whose index is size-1.
      // NOTE(review): the loop ends on '\0' while TLAST is keyed on size; if
      // size is not the length of the string in data[], TLAST may be missing or
      // early -- confirm how size is maintained.
      if(i==size-1)
    	valOut.last = 1;
      else
    	valOut.last = 0;
      out_stream.write(valOut);
       }
       // i was reused as the output index above; restore the exit value so the
       // while loop terminates.
       i=32;
     }
   }
 }

Python code:-

from pynq import Overlay
from pynq import GPIO
from pynq import Xlnk
import time
import numpy
import binascii
# Host-side driver: load the bitstream, push the 32-value target hash to the
# HLS core through the AXI DMA send channel, receive the core's answer on the
# receive channel, and report how long the round trip took.

# NOTE(review): the path contains a space after "./" -- this looks like a paste
# artifact; confirm the actual bitstream file name before running.
overlay = Overlay( "./ fes256.bit" )
dma_ip = overlay.axi_dma
xlnk = Xlnk()
output_size = 32

# One value per input stream beat; the HLS core reads exactly 32 beats.
input_data = [
    0xc6, 0x1a, 0x40, 0x87, 0x6a, 0x99, 0x21, 0xa2,
    0xb9, 0x66, 0xd7, 0x1d, 0x4b, 0x39, 0x7d, 0xc2,
    0xa9, 0x5f, 0xe3, 0xe7, 0xa4, 0x5c, 0x14, 0x89,
    0x4a, 0x50, 0xb8, 0xeb, 0xf9, 0x1d, 0x4f, 0x4c,
]

# Physically contiguous buffers for the DMA engine. The shapes are written as
# real 1-tuples: "(32)" is just the integer 32 in parentheses.
input_buffer = xlnk.cma_array(shape=(32,), dtype=numpy.int32)
output_buffer = xlnk.cma_array(shape=(10000,), dtype=numpy.int32)

numpy.copyto(input_buffer, input_data)
print(input_buffer)

# perf_counter() is monotonic and has the highest available resolution, so it
# is the appropriate clock for measuring a short interval; time.time() is a
# wall clock that can be adjusted while the measurement is running.
start_time = time.perf_counter()
dma_ip.sendchannel.transfer(input_buffer)
dma_ip.recvchannel.transfer(output_buffer)
dma_ip.sendchannel.wait()
dma_ip.recvchannel.wait()
end_time = time.perf_counter()

print(output_buffer)
print(end_time - start_time)