项目记录:SSE/NEON快速指令集优化像素拷贝
程序员文章站
2022-04-19 13:53:35
...
记录一下在实验室做了的一些项目与demo
记录其中的知识点与技术要点
SSE/NEON 快速指令集优化像素拷贝
项目需求:
将 360 全景图划分成 42 个 TILE。高纬度按 4:1 下采样,中纬度按 2:1 下采样,低纬度按 1:1(不下采样)。
渲染时,高纬度按 1:4 还原,中纬度按 1:2 还原。
参考资料:
具体实现:
// 宏定义: 以空间换时间.. 小心使用..
// ---------PC端:SSE指令集----------------
// High-latitude 1:4 expansion (SSE2): reads 4 source bytes and writes 16
// bytes, each source byte repeated four times in order
// (a0 a0 a0 a0 a1 a1 a1 a1 a2 a2 a2 a2 a3 a3 a3 a3).
//   output_buf_y: destination, 16 bytes are written; any alignment is accepted.
//   input_buf_y : source, exactly 4 bytes are read; any alignment is accepted.
// Each argument is evaluated once and captured in a set4_-prefixed local so
// that caller variables such as `dst` cannot collide with the temporaries.
// The 4 source bytes are assembled byte-wise (little-endian lane order) so the
// macro never reads past the 4-byte source span.
#define set4(output_buf_y,input_buf_y) {\
    unsigned char *set4_out_ = (unsigned char *)(output_buf_y);\
    const unsigned char *set4_in_ = (const unsigned char *)(input_buf_y);\
    unsigned int set4_px_ = (unsigned int)set4_in_[0]\
                          | ((unsigned int)set4_in_[1] << 8)\
                          | ((unsigned int)set4_in_[2] << 16)\
                          | ((unsigned int)set4_in_[3] << 24);\
    __m128i set4_v_ = _mm_cvtsi32_si128((int)set4_px_);\
    /* First unpack doubles every byte, second unpack doubles them again. */\
    set4_v_ = _mm_unpacklo_epi8(set4_v_, set4_v_);\
    set4_v_ = _mm_unpacklo_epi8(set4_v_, set4_v_);\
    /* Unaligned store: the destination need not be 16-byte aligned. */\
    _mm_storeu_si128((__m128i *)(void *)set4_out_, set4_v_);\
}
// Mid-latitude 1:2 expansion (SSE2): reads 8 source bytes and writes 16
// bytes, each source byte repeated twice in order (a0 a0 a1 a1 ... a7 a7).
//   output_buf_y: destination, 16 bytes are written; any alignment is accepted.
//   input_buf_y : source, 8 bytes are read; any alignment is accepted.
// Each argument is evaluated once and captured in a set2_-prefixed local so
// that caller variable names cannot collide with the temporaries.
#define set2(output_buf_y,input_buf_y) {\
    unsigned char *set2_out_ = (unsigned char *)(output_buf_y);\
    const unsigned char *set2_in_ = (const unsigned char *)(input_buf_y);\
    __m128i set2_v_ = _mm_loadl_epi64((const __m128i *)(const void *)set2_in_);\
    /* Unaligned store: the destination need not be 16-byte aligned. */\
    _mm_storeu_si128((__m128i *)(void *)set2_out_,\
                     _mm_unpacklo_epi8(set2_v_, set2_v_));\
}
// ---------Android端:NEON指令集----------------
#define NEON_INTRINSICS 1 // 1: use the NEON intrinsics versions, 0: use the inline-assembly versions
#if NEON_INTRINSICS
// 1:4 expansion (NEON intrinsics): loads 8 source bytes with vld1_u8 and
// stores 32 bytes with vst4_u8. All four interleaved lanes hold the same
// vector, so every source byte appears four times in a row in the output.
//   output_buf_y: destination pointer handed to vst4_u8 (32 bytes written).
//   input_buf_y : source pointer handed to vld1_u8 (8 bytes read).
#define set4(output_buf_y,input_buf_y) {\
    const uint8x8_t lane = vld1_u8(input_buf_y);\
    const uint8x8x4_t quad = {{ lane, lane, lane, lane }};\
    vst4_u8(output_buf_y, quad);\
}
// 1:2 expansion (NEON intrinsics): loads 8 source bytes with vld1_u8 and
// stores 16 bytes with vst2_u8. Both interleaved lanes hold the same vector,
// so every source byte appears twice in a row in the output.
//   output_buf_y: destination pointer handed to vst2_u8 (16 bytes written).
//   input_buf_y : source pointer handed to vld1_u8 (8 bytes read).
#define set2(output_buf_y,input_buf_y) {\
    const uint8x8_t lane = vld1_u8(input_buf_y);\
    const uint8x8x2_t pair = {{ lane, lane }};\
    vst2_u8(output_buf_y, pair);\
}
#else
// 汇编版本
void set4(unsigned char * output_buf_y,unsigned char * input_buf_y) {\
__asm__(\
"VLD1.8 {d0},[r1] \t\n"\
"VMOV d1,d0 \t\n"\
"VMOV d2,d0 \t\n"\
"VMOV d3,d0 \t\n"\
"VST4.8 {d0, d1, d2, d3}, [r0]"\
);\
}
void set2(unsigned char *output_buf_y,unsigned char *input_buf_y) {\
__asm__(\
"VLD1.8 {d0},[r1] \t\n"\
"VMOV d1,d0 \t\n"\
"VST2.8 {d0, d1}, [r0]"\
);\
}
#endif
下一篇: java 线程池