前言
winograd通过用加法代替乘法来加速运算,其需要预设部分矩阵来帮助计算,但矩阵的选择也是各有千秋,具体要如何选择大家可以网上搜索。算法对于不同的矩阵有着不同的速度和效果。而现在也有许多不同的winograd算法实现,我这里主要基于Fast Algorithms for Convolutional Neural Networks
的实现。
我这里只是粗略的实现了一下效果,代码就写的很简陋( ̄▽ ̄)。
代码示例
这里我设置,,。以的输入,的卷积核为例。(注意输入是要加padding的)
这一步对于同一个卷积核的值来说是固定,因此是可以提前计算好的,是offline的。
float **U_2x2_3x3(float *kernel)
{
float G[4][3] = { { 1, 0, 0},
{0.5, 0.5, 0.5},
{0.5, -0.5, 0.5},
{ 0, 0, 1} };
float GT[3][4] = { { 1, 0.5, 0.5, 0},
{ 0, 0.5, -0.5, 0},
{ 0, 0.5, 0.5, 1} };
float G_g[4][3] = {0};
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 3; ++j) {
float temp = 0;
for (int k = 0; k < 3; ++k) {
temp += G[i][k] * kernel[k * 3 + j];
}
G_g[i][j] = temp;
}
}
float **G_g_GT;
G_g_GT = (float**)malloc(4 * sizeof(float*));
//float G_g_GT[4][4] = {0};
for (int i = 0; i < 4; ++i) {
G_g_GT[i] = (float*)malloc(4 * sizeof(float));
memset(G_g_GT[i], 0, 4 * sizeof(float));
for (int j = 0; j < 4; ++j) {
float temp = 0;
for (int k = 0; k < 3; ++k) {
temp += G_g[i][k] * GT[k][j];
}
G_g_GT[i][j] = temp;
}
}
return G_g_GT;
}
因为我预设的输入是用一维数组来表示二维的矩阵,所以需要一个矩阵起始位置来推断分割的矩阵。
float **V_2x2_3x3(float *input, int start)
{
int BT[4][4] = { {1, 0, -1, 0},
{0, 1, 1, 0},
{0, -1, 1, 0},
{0, 1, 0, -1}};
int B[4][4] = { { 1, 0, 0, 0},
{ 0, 1, -1, 1},
{-1, 1, 1, 0},
{ 0, 0, 0, -1}};
float BT_d[4][4] = {0};
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
float temp = 0;
for (int k = 0; k < 4; ++k) {
temp += BT[i][k] * input[k * 6 + j + start];
}
BT_d[i][j] = temp;
}
}
float **BT_d_B;
BT_d_B = (float**)malloc(4 * sizeof(float*));
//float BT_d_B[4][4] = {0};
for (int i = 0; i < 4; ++i) {
BT_d_B[i] = (float*)malloc(4 * sizeof(float));
memset(BT_d_B[i], 0, 4 * sizeof(float));
for (int j = 0; j < 4; ++j) {
float temp = 0;
for (int k = 0; k < 4; ++k) {
temp += BT_d[i][k] * B[k][j];
}
BT_d_B[i][j] = temp;
}
}
return BT_d_B;
}
这里输出的就是局部的最终结果。
float **M_2x2_3x3(float **U, float **V)
{
int AT[2][4] = { {1, 1, 1, 0},
{0, 1, -1, -1} };
int A[4][2] = { {1, 0},
{1, 1},
{1, -1},
{0, -1} };
float M[4][4] = {0};
for (int i = 0; i < 4; ++i) {
for (int j = 0; j < 4; ++j) {
M[i][j] = U[i][j] * V[i][j];
}
}
for (int l = 0; l < 4; ++l) {
free(V[l]);
}
free(V);
float AT_M[2][4] = {0};
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 4; ++j) {
float temp = 0;
for (int k = 0; k < 4; ++k) {
temp += AT[i][k] * M[k][j];
}
AT_M[i][j] = temp;
}
}
float **AT_M_A;
AT_M_A = (float**)malloc(2 * sizeof(float*));
for (int i = 0; i < 2; ++i) {
AT_M_A[i] = (float*)malloc(2 * sizeof(float));
memset(AT_M_A[i], 0, 2 * sizeof(float));
for (int j = 0; j < 2; ++j) {
float temp = 0;
for (int k = 0; k < 4; ++k) {
temp += AT_M[i][k] * A[k][j];
}
AT_M_A[i][j] = temp;
}
}
return AT_M_A;
}
Winograd
因为我这里举的输入是,卷积为的例子,所以局部结果可以刚好拼成最终结果,而且padding也只要上下左右各加1就可以了。但如果是的输入,那么padding在下面和右边都要加2才行,而且在局部结果拼最终结果时要舍去1行和1列。
void winograd(float* input, float* kernel, float* output, int input_size)
{
float **U;
U = U_2x2_3x3(kernel);
for (int l = 0; l < 2; ++l) { // H/m向上取整
for (int n = 0; n < 2; ++n) {
float **V, **Y;
V = V_2x2_3x3(input, l * input_size * 2 + n * 2);
Y = M_2x2_3x3(U, V);
int row_col = l * 8 + n * 2;
memcpy(output + row_col, Y[0], 2 * sizeof(float));
memcpy(output + row_col + 4, Y[1], 2 * sizeof(float));
}
}
for (int i = 0; i < 4; ++i) {
free(U[i]);
}
free(U);
}
剩余代码
全部代码都在这里了。
void mm(float* input, float* kernel, float* output, int input_size, int kernel_size)
{
int output_size = input_size - kernel_size + 1;
for (int i = 0; i < output_size; ++i) {
for (int j = 0; j < output_size; ++j) {
float temp = 0;
for (int k = 0; k < kernel_size; ++k) {
for (int l = 0; l < kernel_size; ++l) {
temp += kernel[k*kernel_size+l] * input[i*input_size+j+k*input_size+l];
}
}
output[i*output_size + j] = temp;
}
}
}
int main() {
int input_size, kernel_size;
input_size = 4;
kernel_size = 3;
float *input, *kernel, *output, *winograd_out;
input = (float*)malloc(input_size * input_size * sizeof(float));
for (int i = 0; i < input_size*input_size; ++i) {
input[i] = i;
}
input = add_padding(input,input_size);
input_size += 2;
kernel = (float*)malloc(kernel_size * kernel_size * sizeof(float));
for (int i = 0; i < kernel_size*kernel_size; ++i) {
kernel[i] = 1;
}
int output_size = input_size;
output = (float*)malloc(output_size * output_size * sizeof(float));
memset(output, 0, output_size * output_size * sizeof(float));
winograd_out = (float*)malloc(output_size * output_size * sizeof(float));
memset(winograd_out, 0, output_size * output_size * sizeof(float));
clock_t start, end1, end2;
start = clock();
mm(input, kernel, output, input_size, kernel_size);
end1 = clock();
winograd(input, kernel, winograd_out, input_size);
end2 = clock();
printf("mm cost %f , winograd cost %f", double(end1-start)/CLOCKS_PER_SEC, double(end2-end1)/CLOCKS_PER_SEC);
return 0;
}
最后
虽然结果是一样的,但是我实现的winograd比寻常卷积要慢好多╮( ̄▽ ̄””)╭。具体原因应该还要我接下来摸索,应该可以考虑内存池优化、汇编优化、代码消冗余等。