奇妙之旅：SIMD加速矩阵运算

程序员文章站 2022-03-17 14:13:57

...

奇妙之旅：SIMD加速矩阵运算

1.前言
2.预备知识
3.计算逻辑
4.代码实战

1.前言

游戏会涉及到大量4x4的矩阵乘法运算，而乘法最简单直观的实现就是循环4×4×4次乘法，以及若干次加法，得到结果。
在计算量较少时，cpu并不是很紧张。然而游戏通常每秒伴随着大量运算，此时，计算的效率就显得尤为重要。
通过查阅文献，我发现了一种SIMD（Single Instruction Multiple Data，单指令多数据流）技术，可以同时操作多个数据，极大的提高了cpu的运算能力。

2.预备知识

原理：

以加法指令为例，单指令单数据（SISD）的CPU对加法指令译码后，执行部件先访问内存，取得第一个操作数；之后再一次访问内存，取得第二个操作数；随后才能进行求和运算。而在SIMD型的CPU中，指令译码后几个执行部件同时访问内存，一次性获得所有操作数进行运算。这个特点使SIMD特别适合于多媒体应用等数据密集型运算。

下列是一些AVX，AVX2指令集上的函数原型:
手册地址:https://software.intel.com/sites/landingpage/IntrinsicsGuide/

__m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
- 概要
  __m256 _mm256_i32gather_ps (float const* base_addr, __m256i vindex, const int scale)
  #include <immintrin.h>
  Instruction: vgatherdps ymm, vm32x, ymm
  CPUID Flags: AVX2
- 描述
  Gather single-precision (32-bit) floating-point elements from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at base_addr and offset by each 32-bit element in vindex (each index is scaled by the factor in scale). Gathered elements are merged into dst. scale should be 1, 2, 4 or 8.
- 伪代码
```
FOR j := 0 to 7
		i := j*32
		m := j*32
		addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
		dst[i+31:i] := MEM[addr+31:addr]
ENDFOR
dst[MAX:256] := 0
```
- 口胡解释：用于将base_addr指向的内存中的数据按vindex中放置的索引序号并乘以scale表示的字节大小收集数据。
__m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
- 概要
  __m256 _mm256_dp_ps (__m256 a, __m256 b, const int imm8)
  #include <immintrin.h>
  Instruction: vdpps ymm, ymm, ymm, imm
  CPUID Flags: AVX
- 描述
  Conditionally multiply the packed single-precision (32-bit) floating-point elements in a and b using the high 4 bits in imm8, sum the four products, and conditionally store the sum in dst using the low 4 bits of imm8.
- 伪代码
```
DEFINE DP(a[127:0], b[127:0], imm8[7:0]) {
	FOR j := 0 to 3
		i := j*32
		IF imm8[(4+j)%8]
			temp[i+31:i] := a[i+31:i] * b[i+31:i]
		ELSE
			temp[i+31:i] := 0
		FI
	ENDFOR
	
	sum[31:0] := (temp[127:96] + temp[95:64]) + (temp[63:32] + temp[31:0])
	
	FOR j := 0 to 3
		i := j*32
		IF imm8[j%8]
			tmpdst[i+31:i] := sum[31:0]
		ELSE
			tmpdst[i+31:i] := 0
		FI
	ENDFOR
	RETURN tmpdst[127:0]
}
dst[127:0] := DP(a[127:0], b[127:0], imm8[7:0])
dst[255:128] := DP(a[255:128], b[255:128], imm8[7:0])
dst[MAX:256] := 0
```
- 口胡解释：将a，b的8个float值，分别相乘（如果imm8第4-7位为1），然后前4个求和，后4个求和，如果imm8的低0-3位为1就分配这个和值到目标区域。

好了，有了以上两个函数，我们就可以将4x4矩阵乘法的乘法次数降到8次了。
什么？你问怎么做？请往下看。

3.计算逻辑

因为笔者的CPU只支持到AVX2，所以不能用512位的寄存器（哭～），这里就用256位寄存器做示范。
考虑原来的乘法需要64次的float乘法，如果用256位寄存器（8个float）打包就可以降到 $64\div8 =8$ 次，太惊人了。

下面就是计算思路：
$1.记X_{ab}为矩阵X的第a行和第b行构成的一个8个float值的寄存器\\ 2.记X^{uv}为矩阵X的第u列和第v列构成的一个8个float值的寄存器\\$
matrix4 c;
__mm256 temp;
int mask = 0b11110001;

一次性计算两次点积
temp =_mm256_dp_ps $( A_{12} ,B^{11},mask)$
c.data[0] = temp[0];
c.data[1] = temp[4];

temp =_mm256_dp_ps $( A_{34} ,B^{11},mask)$
c.data[2] = temp[0];
c.data[3] = temp[4];

剩下的类似…
temp =_mm256_dp_ps $( A_{12} ,B^{22},mask)$
c.data[4] = temp[0];
c.data[5] = temp[4];

temp =_mm256_dp_ps $( A_{34} ,B^{22},mask)$
c.data[6] = temp[0];
c.data[7] = temp[4];

temp =_mm256_dp_ps $( A_{12} ,B^{33},mask)$
c.data[8] = temp[0];
c.data[9] = temp[4];

temp =_mm256_dp_ps $( A_{34} ,B^{33},mask)$
c.data[10] = temp[0];
c.data[11] = temp[4];

temp =_mm256_dp_ps $( A_{12} ,B^{44},mask)$
c.data[12] = temp[0];
c.data[13] = temp[4];

temp =_mm256_dp_ps $( A_{34} ,B^{44},mask)$
c.data[14] = temp[0];
c.data[15] = temp[4];

至此，一次矩阵乘法结束。
关于如何取 $A_{12}，A_{34}，B_{11}...$ 等值：

__declspec(align(16)) __m256i gatherA12 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0);
__declspec(align(16)) __m256i gatherA34 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2);

__declspec(align(16)) __m256i gatherB11 = _mm256_set_epi32(3, 2, 1, 0, 3, 2, 1, 0);
__declspec(align(16)) __m256i gatherB22 = _mm256_set_epi32(7, 6, 5, 4, 7, 6, 5, 4);
__declspec(align(16)) __m256i gatherB33 = _mm256_set_epi32(11, 10, 9, 8, 11, 10, 9, 8);
__declspec(align(16)) __m256i gatherB44 = _mm256_set_epi32(15, 14, 13, 12, 15, 14, 13, 12);

	a12 = _mm256_i32gather_ps(_Left.data, gatherA12, sizeof(float));
	a34 = _mm256_i32gather_ps(_Left.data, gatherA34, sizeof(float));

	b11 = _mm256_i32gather_ps(_Right.data, gatherB11, sizeof(float));
	b22 = _mm256_i32gather_ps(_Right.data, gatherB22, sizeof(float));
	b33 = _mm256_i32gather_ps(_Right.data, gatherB33, sizeof(float));
	b44 = _mm256_i32gather_ps(_Right.data, gatherB44, sizeof(float));

__declspec(align(16)) 意味着要求编译器按16字节对齐，注意！！！SIMD要求字节对齐（当然偏要不对齐也行。。。）

_mm256_set_epi32 语义很简单，就是按照参数来向256位寄存器中放入32位的整数，但是需要注意的是，第一个参数是放到寄存器最末尾的。

4.代码实战

先放出运行结果：
奇妙之旅：SIMD加速矩阵运算
第一个计时是循环1000万次的SIMD，仅用了不到1ms。
第二个计时是普通的循环1000万次。

以下是全部代码：



#include <iostream>
#include <windows.h>
#include <xmmintrin.h>
#include <immintrin.h>


// 计时结构
LARGE_INTEGER t1, t2, tc;

void time_begin()
{
	QueryPerformanceFrequency(&tc);
	QueryPerformanceCounter(&t1);
}

float time_end()
{
	QueryPerformanceCounter(&t2);
	return ((t2.QuadPart - t1.QuadPart)*1.0 / tc.QuadPart) * 1000;
}


class matrix4
{
public:
	matrix4()
		:data{ 0 }
	{

	}

	matrix4(float v)
		:data{ v,0,0,0,0,v,0,0,0,0,v,0,0,0,0,v }
	{

	}

public:

public:
	union
	{
		float data[16];
		float ptr[4][4];
	};
};

// 列主序按行计算
matrix4 mul_1(const matrix4& m1, const matrix4& m2)
{
	matrix4 ret;
	for (int row = 0; row < 4; ++row) 
	{
		for (int col = 0; col < 4; ++col) 
		{
			for (int a = 0; a < 4; ++a) 
			{
				ret.ptr[col][row] += m1.ptr[a][row] * m2.ptr[col][a];
			}
		}
	}

	return ret;
}

__declspec(align(16)) matrix4 A(1.f);
__declspec(align(16)) matrix4 B(2.f);
__declspec(align(16)) float v[16]
= {0, 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};



__declspec(align(16)) __m256i gatherA12 = _mm256_set_epi32(13, 9, 5, 1, 12, 8, 4, 0);
__declspec(align(16)) __m256i gatherA34 = _mm256_set_epi32(15, 11, 7, 3, 14, 10, 6, 2);

__declspec(align(16)) __m256i gatherB11 = _mm256_set_epi32(3, 2, 1, 0, 3, 2, 1, 0);
__declspec(align(16)) __m256i gatherB22 = _mm256_set_epi32(7, 6, 5, 4, 7, 6, 5, 4);
__declspec(align(16)) __m256i gatherB33 = _mm256_set_epi32(11, 10, 9, 8, 11, 10, 9, 8);
__declspec(align(16)) __m256i gatherB44 = _mm256_set_epi32(15, 14, 13, 12, 15, 14, 13, 12);

auto mm_mul_mat(matrix4 const& _Left, matrix4 const& _Right)
{
	matrix4 ret;
	__declspec(align(16)) __m256 temp;
	__declspec(align(16)) __m256 a12, a34;
	__declspec(align(16)) __m256 b11, b22, b33, b44;
	a12 = _mm256_i32gather_ps(_Left.data, gatherA12, sizeof(float));
	a34 = _mm256_i32gather_ps(_Left.data, gatherA34, sizeof(float));

	b11 = _mm256_i32gather_ps(_Right.data, gatherB11, sizeof(float));
	b22 = _mm256_i32gather_ps(_Right.data, gatherB22, sizeof(float));
	b33 = _mm256_i32gather_ps(_Right.data, gatherB33, sizeof(float));
	b44 = _mm256_i32gather_ps(_Right.data, gatherB44, sizeof(float));

	temp = _mm256_dp_ps(a12, b11, 0b11110001);
	ret.data[0] = temp.m256_f32[0];
	ret.data[1] = temp.m256_f32[4];
	temp = _mm256_dp_ps(a34, b11, 0b11110001);
	ret.data[2] = temp.m256_f32[0];
	ret.data[3] = temp.m256_f32[4];

	temp = _mm256_dp_ps(a12, b22, 0b11110001);
	ret.data[4] = temp.m256_f32[0];
	ret.data[5] = temp.m256_f32[4];
	temp = _mm256_dp_ps(a34, b22, 0b11110001);
	ret.data[6] = temp.m256_f32[0];
	ret.data[7] = temp.m256_f32[4];

	temp = _mm256_dp_ps(a12, b33, 0b11110001);
	ret.data[8] = temp.m256_f32[0];
	ret.data[9] = temp.m256_f32[4];
	temp = _mm256_dp_ps(a34, b33, 0b11110001);
	ret.data[10] = temp.m256_f32[0];
	ret.data[11] = temp.m256_f32[4];

	temp = _mm256_dp_ps(a12, b44, 0b11110001);
	ret.data[12] = temp.m256_f32[0];
	ret.data[13] = temp.m256_f32[4];
	temp = _mm256_dp_ps(a34, b44, 0b11110001);
	ret.data[14] = temp.m256_f32[0];
	ret.data[15] = temp.m256_f32[4];

	return ret;
}

int main()
{

	time_begin();
	for (int i = 0; i < 10000000; ++i)
	{
		volatile auto res = mm_mul_mat(A, B);
	}
	::std::cout << "SIMD-AVX \ttime: " << time_end() << ::std::endl;

	time_begin();
	for (int i = 0; i < 10000000; ++i)
	{
		volatile auto m = mul_1(A, B);
	}
	::std::cout << "SISD TRIVIAL \ttime: " << time_end() << ::std::endl;



	return 0;
}

以上。

PS：真是一次奇妙的体验呢～

奇妙之旅：SIMD加速矩阵运算