neon加速图像转置
一般的矩阵转置操作都是对矩阵中的元素逐个操作,假设矩阵大小为m*n,那么时间复杂度就是O(mn)。如果使用了ARM公司提供的NEON加速技术,则可以并行地读取多个元素,并对多个元素同时进行操作。虽然时间复杂度还是O(mn),但是常数因子会变小,并且在寄存器里的操作比在内存中的操作要快一些,所以会带来一定的性能提升。本文针对灰度图进行图像转置,图像数据类型为uint8_t。
c语言实现:
/* Scalar reference: the pixel at row y, column x of the w-wide source
 * becomes the pixel at row x, column y of the h-wide destination. */
for (y = 0; y < h; y++) {
    for (x = 0; x < w; x++) {
        dst_gray[y + x * h] = src_gray[x + y * w];
    }
}
neon加速思路:
考虑将一个矩阵划分成若干子矩阵,例如:一个128×256大小的矩阵可以划分为16×32个8×8大小的子矩阵。分别对每个8×8的子矩阵进行转置,再将其复制到输出矩阵中对应的位置(源矩阵中左上角位于第y行、第x列的子矩阵,转置后放到输出矩阵的第x行、第y列)即可。可以总结为2步:
循环执行以下步骤,直到所有子矩阵均被处理过
1.转置当前子矩阵
2.将转置后的子矩阵复制到输出矩阵中
Neon指令vtrn是解决转置问题的核心,其相当于对2x2矩阵进行转置。故对于8x8矩阵,可以先以uint8x8_t对相邻的两行进行vtrn转置(第0、1行,第2、3行,……),然后转化为uint16x4_t,对相隔一行的两个向量进行转置(第0、2个,第1、3个,……),再转化为uint32x2_t,对相隔三行的两个向量进行转置(第0、4个,第1、5个,……),最后转化为uint8x8_t输出到目标矩阵的对应位置。同时要注意剩余数据的处理:因为是基于8x8矩阵进行处理的,所以当剩下的数据不足以构成8x8矩阵时,需要对剩余数据单独处理,直接逐个元素操作即可。
neon优化代码:
/*
 * Transpose an 8-bit single-channel image.
 *
 * src : input image, h rows of w bytes each, stored row by row.
 * dst : output image, w rows of h bytes each; dst[x*h + y] receives
 *       src[y*w + x]. Must not overlap src.
 * w,h : width and height of the input in pixels.
 *
 * Returns 0. A non-positive width or height is treated as an empty image
 * and nothing is written.
 *
 * When the compiler targets NEON (the file must then include <arm_neon.h>),
 * the part of the image covered by whole 8x8 tiles is transposed in
 * registers with vtrn; the leftover bottom rows and right-hand columns are
 * copied one pixel at a time. Without NEON the whole image goes through the
 * scalar path, so results are identical on every target.
 */
int transposition_neon(uint8_t* src,uint8_t* dst,int w,int h)
{
    int x, y;
    int sw = 0; /* number of columns covered by whole 8x8 tiles */
    int sh = 0; /* number of rows covered by whole 8x8 tiles */

    if (w <= 0 || h <= 0)
        return 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    sw = w & ~7;
    sh = h & ~7;

    for (y = 0; y < sh; y += 8) {
        for (x = 0; x < sw; x += 8) {
            const uint8_t *in = src + y * w + x; /* top-left of source tile */
            uint8_t *out = dst + x * h + y;      /* top-left of target tile */

            /* Load the eight rows of the tile. */
            uint8x8_t r0 = vld1_u8(in);
            uint8x8_t r1 = vld1_u8(in + w);
            uint8x8_t r2 = vld1_u8(in + 2 * w);
            uint8x8_t r3 = vld1_u8(in + 3 * w);
            uint8x8_t r4 = vld1_u8(in + 4 * w);
            uint8x8_t r5 = vld1_u8(in + 5 * w);
            uint8x8_t r6 = vld1_u8(in + 6 * w);
            uint8x8_t r7 = vld1_u8(in + 7 * w);

            /* Stage 1: interleave bytes of adjacent rows.
             * val[0] holds the even columns, val[1] the odd columns. */
            uint8x8x2_t b01 = vtrn_u8(r0, r1);
            uint8x8x2_t b23 = vtrn_u8(r2, r3);
            uint8x8x2_t b45 = vtrn_u8(r4, r5);
            uint8x8x2_t b67 = vtrn_u8(r6, r7);

            /* Stage 2: interleave 16-bit pairs.
             * t02: val[0] = columns 0|4, val[1] = columns 2|6 (rows 0-3)
             * t13: val[0] = columns 1|5, val[1] = columns 3|7 (rows 0-3)
             * u02 / u13: the same columns for rows 4-7. */
            uint16x4x2_t t02 = vtrn_u16(vreinterpret_u16_u8(b01.val[0]),
                                        vreinterpret_u16_u8(b23.val[0]));
            uint16x4x2_t t13 = vtrn_u16(vreinterpret_u16_u8(b01.val[1]),
                                        vreinterpret_u16_u8(b23.val[1]));
            uint16x4x2_t u02 = vtrn_u16(vreinterpret_u16_u8(b45.val[0]),
                                        vreinterpret_u16_u8(b67.val[0]));
            uint16x4x2_t u13 = vtrn_u16(vreinterpret_u16_u8(b45.val[1]),
                                        vreinterpret_u16_u8(b67.val[1]));

            /* Stage 3: interleave 32-bit halves so that each vector is one
             * complete source column; cNM.val[0] is column N and
             * cNM.val[1] is column M. */
            uint32x2x2_t c04 = vtrn_u32(vreinterpret_u32_u16(t02.val[0]),
                                        vreinterpret_u32_u16(u02.val[0]));
            uint32x2x2_t c26 = vtrn_u32(vreinterpret_u32_u16(t02.val[1]),
                                        vreinterpret_u32_u16(u02.val[1]));
            uint32x2x2_t c15 = vtrn_u32(vreinterpret_u32_u16(t13.val[0]),
                                        vreinterpret_u32_u16(u13.val[0]));
            uint32x2x2_t c37 = vtrn_u32(vreinterpret_u32_u16(t13.val[1]),
                                        vreinterpret_u32_u16(u13.val[1]));

            /* Store source column k as output row x+k. The vectors come out
             * of stage 3 in the order 0,2,1,3 / 4,6,5,7, so they are picked
             * here by the column they hold, not by creation order. */
            vst1_u8(out,         vreinterpret_u8_u32(c04.val[0]));
            vst1_u8(out + h,     vreinterpret_u8_u32(c15.val[0]));
            vst1_u8(out + 2 * h, vreinterpret_u8_u32(c26.val[0]));
            vst1_u8(out + 3 * h, vreinterpret_u8_u32(c37.val[0]));
            vst1_u8(out + 4 * h, vreinterpret_u8_u32(c04.val[1]));
            vst1_u8(out + 5 * h, vreinterpret_u8_u32(c15.val[1]));
            vst1_u8(out + 6 * h, vreinterpret_u8_u32(c26.val[1]));
            vst1_u8(out + 7 * h, vreinterpret_u8_u32(c37.val[1]));
        }
    }
#endif

    /* Bottom strip: rows sh..h-1 across the full width. Starting at sh
     * (not sh-1) keeps the index non-negative when h < 8. */
    for (y = sh; y < h; y++) {
        for (x = 0; x < w; x++)
            dst[x * h + y] = src[y * w + x];
    }

    /* Right strip: columns sw..w-1 for the rows handled by the tiles. */
    for (x = sw; x < w; x++) {
        for (y = 0; y < sh; y++)
            dst[x * h + y] = src[y * w + x];
    }

    return 0;
}
测试图像分辨率:1680*1050
测试平台:海思3559
测试结果:O3级优化编译下,约提速2.5倍(O3优化效果似乎跟平台有关,没研究过),默认编译下,约提速1.5倍,网上说能提速10倍,暂时不知道如何实现
参考资料:
http://blog.csdn.net/jxt1234and2010/article/details/50437884
http://book.51cto.com/art/201506/481001.htm
http://www.cnblogs.com/hrlnw/p/3723072.html
http://www.cnblogs.com/hrlnw/p/3767853.html