I suspected that when I read it, but I thought it might be my mistake. It seemed odd that Xilinx implemented this function for a single channel only, when the original OpenCV function supports three channels.
Taking advantage of this thread: do you think it is possible to synthesize the function cv::pyrDown(InputArray _src, OutputArray _dst, const Size& _dsz, int borderType) by modifying only part of the code, or is that infeasible? pyrDown() is just an example; I would need to do the same with other functions too. Below is part of the pyrDown code:
// Public entry point for pyramid down-sampling.
//
// _src       input image
// _dst       output image; created here with the source type and the chosen size
// _dsz       requested output size; when empty, ((cols + 1)/2, (rows + 1)/2) is used
// borderType border extrapolation mode; BORDER_CONSTANT is rejected by assertion
//
// The accelerated back-ends (OpenCL, OpenVX, HAL, Tegra, IPP) are tried in that
// order through their respective macros before the generic templated
// implementation is selected by element depth. The macros are presumably
// expected to return from this function when their back-end succeeds.
void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_INSTRUMENT_REGION();

    CV_Assert(borderType != BORDER_CONSTANT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrDown(_src, _dst, _dsz, borderType))

    CV_OVX_RUN(_src.dims() <= 2,
               openvx_pyrDown(_src, _dst, _dsz, borderType))

    Mat srcMat = _src.getMat();

    // Fall back to the rounded-up half size when the caller gave no size.
    Size dstSize = _dsz.empty()
                 ? Size((srcMat.cols + 1)/2, (srcMat.rows + 1)/2)
                 : _dsz;

    _dst.create( dstSize, srcMat.type() );
    Mat dstMat = _dst.getMat();
    int depth = srcMat.depth();

    CALL_HAL(pyrDown, cv_hal_pyrdown, srcMat.data, srcMat.step, srcMat.cols, srcMat.rows, dstMat.data, dstMat.step, dstMat.cols, dstMat.rows, depth, srcMat.channels(), borderType);

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrDown(srcMat, dstMat))
        return;
#endif

#ifdef HAVE_IPP
    // Split the BORDER_ISOLATED flag from the actual border mode for the IPP check.
    bool isolated = (borderType & BORDER_ISOLATED) != 0;
    int borderTypeNI = borderType & ~BORDER_ISOLATED;
#endif
    CV_IPP_RUN(borderTypeNI == BORDER_DEFAULT && (!_src.isSubmatrix() || isolated) && dstSize == Size((_src.cols() + 1)/2, (_src.rows() + 1)/2),
               ipp_pyrdown( _src, _dst, _dsz, borderType));

    // Pick the generic implementation that matches the element depth.
    PyrFunc func = 0;
    switch( depth )
    {
    case CV_8U:
        func = pyrDown_< FixPtCast<uchar, 8> >;
        break;
    case CV_16S:
        func = pyrDown_< FixPtCast<short, 8> >;
        break;
    case CV_16U:
        func = pyrDown_< FixPtCast<ushort, 8> >;
        break;
    case CV_32F:
        func = pyrDown_< FltCast<float, 8> >;
        break;
    case CV_64F:
        func = pyrDown_< FltCast<double, 8> >;
        break;
    default:
        CV_Error( CV_StsUnsupportedFormat, "" );
    }

    func( srcMat, dstMat, borderType );
}
// Generic pyramid down-sampling kernel: filters _src with the separable
// 5-tap kernel (1, 4, 6, 4, 1) in both directions, keeps every second
// row/column, and writes the result into the already-allocated _dst.
//
// NOTE(review): this excerpt starts at the function name; the return type and
// the template header that declares CastOp are not shown here - presumably
// `template<class CastOp> void`, confirm against the full source.
//
// _src       source image (must not be empty)
// _dst       destination image; each dimension doubled must be within 2 of the source
// borderType border mode forwarded to borderInterpolate()
pyrDown_( const Mat& _src, Mat& _dst, int borderType )
{
// Number of filter taps, and of rows kept in the ring buffer.
const int PD_SZ = 5;
// WT: element type of the intermediate (horizontally filtered) rows.
typedef typename CastOp::type1 WT;
// T: element type used to read the source and write the destination.
typedef typename CastOp::rtype T;
CV_Assert( !_src.empty() );
Size ssize = _src.size(), dsize = _dst.size();
int cn = _src.channels();
// Stride, in WT elements, of one ring-buffer row (rounded up to a multiple of 16).
int bufstep = (int)alignSize(dsize.width*cn, 16);
// PD_SZ rows plus slack so the start pointer can be aligned below.
AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
WT* buf = alignPtr((WT*)_buf.data(), 16);
// Source element offsets for the taps at the left (tabL) and right (tabR) borders.
int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
// tabM[x]: source element offset of the centre tap for destination element x.
AutoBuffer<int> _tabM(dsize.width*cn);
int* tabM = _tabM.data();
WT* rows[PD_SZ];
CastOp castOp;
CV_Assert( ssize.width > 0 && ssize.height > 0 &&
std::abs(dsize.width*2 - ssize.width) <= 2 &&
std::abs(dsize.height*2 - ssize.height) <= 2 );
// sy0 is the first (virtual) source row needed; sy tracks the next source row
// to filter. width0 is the destination column at which the direct-indexing
// section of the horizontal pass stops and the tabR lookup takes over.
int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);
// Precompute border-interpolated source columns (as element offsets) for the
// taps around the left edge and around column width0*2.
for( x = 0; x <= PD_SZ+1; x++ )
{
int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
for( k = 0; k < cn; k++ )
{
tabL[x*cn + k] = sx0 + k;
tabR[x*cn + k] = sx1 + k;
}
}
// From here on widths are counted in elements (pixels * channels).
ssize.width *= cn;
dsize.width *= cn;
width0 *= cn;
// Destination pixel p, channel c  ->  source pixel 2*p, channel c.
for( x = 0; x < dsize.width; x++ )
tabM[x] = (x/cn)*2*cn + x % cn;
for( int y = 0; y < dsize.height; y++ )
{
T* dst = _dst.ptr<T>(y);
WT *row0, *row1, *row2, *row3, *row4;
// fill the ring buffer (horizontal convolution and decimation)
// Source rows up to y*2 + 2 are filtered; rows done for a previous y are reused.
for( ; sy <= y*2 + 2; sy++ )
{
// Ring-buffer slot for source row sy.
WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
// Rows outside the image are mapped back inside according to borderType.
int _sy = borderInterpolate(sy, ssize.height, borderType);
const T* src = _src.ptr<T>(_sy);
// First pass of the loop below: the first destination pixel (cn elements), via tabL.
int limit = cn;
const int* tab = tabL;
// Runs the table-driven section twice: once for the left border, then
// (after the middle section) once for the right border; exits when the
// whole row has been produced.
for( x = 0;;)
{
for( ; x < limit; x++ )
{
row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
src[tab[x]] + src[tab[x+cn*4]];
}
if( x == dsize.width )
break;
// Middle section: all taps lie inside the row, so the source is indexed
// directly. PyrDownVecH is defined elsewhere; its return value is added to x,
// and the scalar loop after it handles whatever remains up to width0.
if( cn == 1 )
{
x += PyrDownVecH<T, WT, 1>(src + x * 2 - 2, row + x, width0 - x);
for( ; x < width0; x++ )
row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
src[x*2 - 2] + src[x*2 + 2];
}
else if( cn == 2 )
{
x += PyrDownVecH<T, WT, 2>(src + x * 2 - 4, row + x, width0 - x);
for( ; x < width0; x += 2 )
{
const T* s = src + x*2;
WT t0 = s[0] * 6 + (s[-2] + s[2]) * 4 + s[-4] + s[4];
WT t1 = s[1] * 6 + (s[-1] + s[3]) * 4 + s[-3] + s[5];
row[x] = t0; row[x + 1] = t1;
}
}
else if( cn == 3 )
{
x += PyrDownVecH<T, WT, 3>(src + x * 2 - 6, row + x, width0 - x);
for( ; x < width0; x += 3 )
{
const T* s = src + x*2;
WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
row[x] = t0; row[x+1] = t1; row[x+2] = t2;
}
}
else if( cn == 4 )
{
x += PyrDownVecH<T, WT, 4>(src + x * 2 - 8, row + x, width0 - x);
for( ; x < width0; x += 4 )
{
const T* s = src + x*2;
WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
row[x] = t0; row[x+1] = t1;
t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
row[x+2] = t0; row[x+3] = t1;
}
}
else
{
// Any other channel count: generic path using the tabM centre offsets.
for( ; x < width0; x++ )
{
int sx = tabM[x];
row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
src[sx - cn*2] + src[sx + cn*2];
}
}
// Second pass: finish the row through tabR. The pointer is rebased so that
// tab[x] refers to tabR[0] at the current x.
// NOTE(review): tab advances one table entry per destination element here -
// confirm this is intended when more than one destination pixel lies
// beyond width0.
limit = dsize.width;
tab = tabR - x;
}
}
// do vertical convolution and decimation and write the result to the destination image
// rows[0..4] point at the ring-buffer slots of source rows y*2 - 2 .. y*2 + 2.
for( k = 0; k < PD_SZ; k++ )
rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];
// PyrDownVecV is defined elsewhere; its return value is the index at which the
// scalar loop below resumes.
x = PyrDownVecV<WT, T>(rows, dst, dsize.width);
// castOp converts the weighted WT sum to the destination type T
// (presumably including the normalisation - confirm in the CastOp definition).
for( ; x < dsize.width; x++ )
dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
}
}
As far as I can tell, it is not practical to copy this code (it has so many references to other libraries) and modify parts of it so that Vivado HLS can synthesize it. Would you suggest writing my own code instead?
Thank you so much for your reply.