SHL

得陇复望蜀
在学习DL性能优化的学生
© 2018. All rights reserved.

Winograd for CNN(二)

前言

winograd通过用加法代替乘法来加速运算,其需要预设部分矩阵来帮助计算,但矩阵的选择也是各有千秋,具体要如何选择大家可以网上搜索。算法对于不同的矩阵有着不同的速度和效果。而现在也有许多不同的winograd算法实现,我这里主要基于Fast Algorithms for Convolutional Neural Networks实现。

我这里只是粗略的实现了一下效果,代码就写的很简陋( ̄▽ ̄)。

代码示例

这里我设置,,。以的输入,的卷积核为例。(注意输入是要加padding的)

这一步对于同一个卷积核的值来说是固定,因此是可以提前计算好的,是offline的。

float **U_2x2_3x3(float *kernel)
{
    float G[4][3] = { { 1,    0,    0},
                      {0.5,  0.5,  0.5},
                      {0.5, -0.5,  0.5},
                      { 0,    0,    1} };
    float GT[3][4] = { { 1,  0.5,  0.5,  0},
                       { 0,  0.5, -0.5,  0},
                       { 0,  0.5,  0.5,  1} };
    
    float G_g[4][3] = {0};
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 3; ++j) {
            float temp = 0;
            for (int k = 0; k < 3; ++k) {
                temp += G[i][k] * kernel[k * 3 + j];
            }
            G_g[i][j] = temp;
        }
    }

    float **G_g_GT;
    G_g_GT = (float**)malloc(4 * sizeof(float*));
    //float G_g_GT[4][4] = {0};
    for (int i = 0; i < 4; ++i) {
        G_g_GT[i] = (float*)malloc(4 * sizeof(float));
        memset(G_g_GT[i], 0, 4 * sizeof(float));
        for (int j = 0; j < 4; ++j) {
            float temp = 0;
            for (int k = 0; k < 3; ++k) {
                temp += G_g[i][k] * GT[k][j];
            }
            G_g_GT[i][j] = temp;
        }
    }

    return G_g_GT;
}


因为我预设的输入是用一维数组来表示二维的矩阵,所以需要一个矩阵起始位置来推断分割的矩阵。

float **V_2x2_3x3(float *input, int start)
{
    int BT[4][4] = { {1,  0, -1,  0},
                     {0,  1,  1,  0},
                     {0, -1,  1,  0},
                     {0,  1,  0, -1}};
    int B[4][4] =  { { 1,  0,  0,  0},
                     { 0,  1, -1,  1},
                     {-1,  1,  1,  0},
                     { 0,  0,  0, -1}};

    float BT_d[4][4] = {0};
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            float temp = 0;
            for (int k = 0; k < 4; ++k) {
                temp += BT[i][k] * input[k * 6 + j + start];
            }
            BT_d[i][j] = temp;
        }
    }

    float **BT_d_B;
    BT_d_B = (float**)malloc(4 * sizeof(float*));
    //float BT_d_B[4][4] = {0};
    for (int i = 0; i < 4; ++i) {
        BT_d_B[i] = (float*)malloc(4 * sizeof(float));
        memset(BT_d_B[i], 0, 4 * sizeof(float));
        for (int j = 0; j < 4; ++j) {
            float temp = 0;
            for (int k = 0; k < 4; ++k) {
                temp += BT_d[i][k] * B[k][j];
            }
            BT_d_B[i][j] = temp;
        }
    }

    return BT_d_B;
}


这里输出的就是局部的最终结果。

float **M_2x2_3x3(float **U, float **V)
{
    int AT[2][4] = { {1, 1,  1,  0},
                     {0, 1, -1, -1} };

    int A[4][2] = { {1,  0},
                    {1,  1},
                    {1, -1},
                    {0, -1} };

    float M[4][4] = {0};
    for (int i = 0; i < 4; ++i) {
        for (int j = 0; j < 4; ++j) {
            M[i][j] = U[i][j] * V[i][j];
        }
    }

    for (int l = 0; l < 4; ++l) {
        free(V[l]);
    }
    free(V);

    float AT_M[2][4] = {0};
    for (int i = 0; i < 2; ++i) {
        for (int j = 0; j < 4; ++j) {
            float temp = 0;
            for (int k = 0; k < 4; ++k) {
                temp += AT[i][k] * M[k][j];
            }
            AT_M[i][j] = temp;
        }
    }

    float **AT_M_A;
    AT_M_A = (float**)malloc(2 * sizeof(float*));
    for (int i = 0; i < 2; ++i) {
        AT_M_A[i] = (float*)malloc(2 * sizeof(float));
        memset(AT_M_A[i], 0, 2 * sizeof(float));
        for (int j = 0; j < 2; ++j) {
            float temp = 0;
            for (int k = 0; k < 4; ++k) {
                temp += AT_M[i][k] * A[k][j];
            }
            AT_M_A[i][j] = temp;
        }
    }

    return AT_M_A;
}


Winograd

因为我这里举的输入是,卷积为的例子,所以局部结果可以刚好拼成最终结果,而且padding也只要上下左右各加1就可以了。但如果是的输入,那么padding在下面和右边都要加2才行,而且在局部结果拼最终结果时要舍去1行和1列。

void winograd(float* input, float* kernel, float* output, int input_size)
{
    float **U;
    U = U_2x2_3x3(kernel);
    for (int l = 0; l < 2; ++l) {      //  H/m向上取整
        for (int n = 0; n < 2; ++n) {
            float **V, **Y;
            V = V_2x2_3x3(input, l * input_size * 2 + n * 2);
            Y = M_2x2_3x3(U, V);
            int row_col = l * 8 + n * 2;
            memcpy(output + row_col, Y[0], 2 * sizeof(float));
            memcpy(output + row_col + 4, Y[1], 2 * sizeof(float));

        }
    }
    for (int i = 0; i < 4; ++i) {
        free(U[i]);
    }
    free(U);
}


剩余代码

全部代码都在这里了。

void mm(float* input, float* kernel, float* output, int input_size, int kernel_size)
{
    int output_size = input_size - kernel_size + 1;
    for (int i = 0; i < output_size; ++i) {
        for (int j = 0; j < output_size; ++j) {
            float temp = 0;
            for (int k = 0; k < kernel_size; ++k) {
                for (int l = 0; l < kernel_size; ++l) {
                    temp += kernel[k*kernel_size+l] * input[i*input_size+j+k*input_size+l];
                }
            }
            output[i*output_size + j] = temp;
        }
    }
}

int main() {
    int input_size, kernel_size;
    input_size = 4;
    kernel_size = 3;
    float *input, *kernel, *output, *winograd_out;
    input = (float*)malloc(input_size * input_size * sizeof(float));
    for (int i = 0; i < input_size*input_size; ++i) {
        input[i] = i;
    }

    input = add_padding(input,input_size);
    input_size += 2;

    kernel = (float*)malloc(kernel_size * kernel_size * sizeof(float));
    for (int i = 0; i < kernel_size*kernel_size; ++i) {
        kernel[i] = 1;
    }

    int output_size = input_size;
    output = (float*)malloc(output_size * output_size * sizeof(float));
    memset(output, 0, output_size * output_size * sizeof(float));
    winograd_out = (float*)malloc(output_size * output_size * sizeof(float));
    memset(winograd_out, 0, output_size * output_size * sizeof(float));
    clock_t start, end1, end2;

    start = clock();
    mm(input, kernel, output, input_size, kernel_size);

    end1 = clock();
    winograd(input, kernel, winograd_out, input_size);
    end2 = clock();

    printf("mm cost %f , winograd cost %f", double(end1-start)/CLOCKS_PER_SEC, double(end2-end1)/CLOCKS_PER_SEC);

    return 0;
}

最后

虽然结果是一样的,但是我实现的winograd比寻常卷积要慢好多╮( ̄▽ ̄””)╭。具体原因应该还要我接下来摸索,应该可以考虑内存池优化、汇编优化、代码消冗余等。