Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

请教关于1D向量批量卷积如何加速的问题 #11

Open
YangZ2020 opened this issue Oct 23, 2024 · 1 comment
Open

请教关于1D向量批量卷积如何加速的问题 #11

YangZ2020 opened this issue Oct 23, 2024 · 1 comment

Comments

@YangZ2020
Copy link

我手写了一个计算1D卷积的程序,但是发现执行起来很慢。问题描述和示例代码如下:
WechatIMG367

#include <random>
// #include <utils/chronoMarco.h>
#include <vector>

int main() {

  // Random source: a Mersenne Twister seeded from std::random_device,
  // producing floats from a uniform distribution over [1, 10).
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> dis(1.0, 10.0);

  // Input batch: size1 signals of size2 samples each, stored back to back
  // in one flat buffer (signal idx occupies [idx * size2, (idx + 1) * size2)).
  const size_t size1 = 10000;
  const size_t size2 = 126;
  std::vector<float> vec(size1 * size2, 0);
  for (auto &x : vec) {
    x = dis(gen);
  }

  // Kernel with sizeb taps.
  const size_t sizeb = 64;
  std::vector<float> b(sizeb);
  for (auto &x : b) {
    x = dis(gen);
  }

  // Scratch buffer holding one signal with zero padding on both sides.
  // It is value-initialized to zero and only the middle size2 entries are
  // ever overwritten, so the padding stays zero across iterations.
  std::vector<float> padZeroData(size2 + sizeb);

  // Output buffer, same layout as vec.
  std::vector<float> result(size1 * size2, 0);

  // TICK(conv); // timing macro (start).
  // Process every signal of the batch.
  for (size_t idx = 0; idx < size1; idx++) {
    const size_t rowBase = idx * size2;

    // Copy the signal into the middle of the padded scratch buffer.
    for (size_t idy = 0; idy < size2; idy++) {
      padZeroData[idy + sizeb / 2] = vec[rowBase + idy];
    }

    // Slide the kernel over the padded signal; outputs go into result.
    // The sum is built in a local and stored once per output sample.
    for (size_t idy = 0; idy < size2; idy++) {
      float acc = 0;
      for (size_t idz = 0; idz < sizeb; idz++) {
        acc += padZeroData[idy + idz] * b[idz];
      }
      result[rowBase + idy] = acc;
    }
  }
  // TOCK(conv, end, ""); // timing macro (stop).
  // The issue author reported roughly 35 ms between the two timing points
  // for the original version of this loop.
}

谢谢小彭老师。

@YangZ2020 YangZ2020 changed the title 请教关于1D向量卷积的问题 请教关于1D向量卷积如何加速的问题 Oct 23, 2024
@YangZ2020 YangZ2020 changed the title 请教关于1D向量卷积如何加速的问题 请教关于1D向量批量卷积如何加速的问题 Oct 23, 2024
@archibate
Copy link
Contributor

archibate commented Oct 26, 2024

            // Baseline inner loop: adds each of the sizeb products for output
            // sample idy straight into result[idx * size2 + idy], so it relies
            // on that element being zero before the loop starts.
            for (int idz = 0; idz < sizeb; idz++) {
                result[idx * size2 + idy] += padZeroData[idy + idz] * b[idz];
            }

5079us

            // Same sum as the baseline, but built in a local variable and
            // written to result once; this assigns the element instead of
            // adding to it, so the previous contents of result do not matter.
            float tmp = 0;
            for (int idz = 0; idz < sizeb; idz++) {
                tmp += padZeroData[idy + idz] * b[idz];
            }
            result[idx * size2 + idy] = tmp;

5058us

                        // Slide the kernel b over padZeroData; the outputs are stored in result.
            // AVX variant: each step multiplies 8 consecutive samples by 8
            // consecutive taps and accumulates them with one fused multiply-add.
            // Every load reads 8 floats starting at b[idz], so sizeb must be a
            // multiple of 8 or the last load runs past the end of b.
            for (int idy = 0; idy < size2; idy++) {
                __m256 tmp = _mm256_setzero_ps();
                for (int idz = 0; idz < sizeb; idz += 8) {
                    tmp = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz]), _mm256_loadu_ps(&b[idz]), tmp);
                }
                // Horizontal sum of the 8 lanes, added onto result (which
                // therefore has to be zero beforehand).
                // NOTE(review): subscripting a __m256 with [] looks like the
                // GCC/Clang vector extension; confirm before building elsewhere.
                for (int i = 0; i < 8; ++i) {
                    result[idx * size2 + idy] += tmp[i];
                }
            }

3351us

            // AVX variant with two accumulators: each step handles 16 taps,
            // the first 8 into tmp1 and the next 8 into tmp2, so the two
            // multiply-add chains do not depend on each other.
            // The loop advances by 16 and loads from b[idz] and b[idz + 8], so
            // sizeb must be a multiple of 16.
            for (int idy = 0; idy < size2; idy++) {
                __m256 tmp1 = _mm256_setzero_ps();
                __m256 tmp2 = _mm256_setzero_ps();
                for (int idz = 0; idz < sizeb; idz += 16) {
                    tmp1 = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz]), _mm256_loadu_ps(&b[idz]), tmp1);
                    tmp2 = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + idz + 8]), _mm256_loadu_ps(&b[idz + 8]), tmp2);
                }
                // Merge the two partial sums lane by lane, then add the 8 lanes
                // onto result (which therefore has to be zero beforehand).
                // NOTE(review): subscripting a __m256 with [] looks like the
                // GCC/Clang vector extension; confirm before building elsewhere.
                tmp1 = _mm256_add_ps(tmp1, tmp2);
                for (int i = 0; i < 8; ++i) {
                    result[idx * size2 + idy] += tmp1[i];
                }
            }

2624us

            for (int idy = 0; idy < size2; idy += 8) {
                __m256 tmp[8]{};
                for (int idz = 0; idz < sizeb; idz += 8) {
                    __m256 btmp = _mm256_loadu_ps(&b[idz]);
                    for (int offy = 0; offy < 8; ++offy) {
                        tmp[offy] = _mm256_fmadd_ps(_mm256_loadu_ps(&padZeroData[idy + offy + idz]), btmp, tmp[offy]);
                    }
                }
                for (int offy = 0; offy < 8; ++offy) {
                    __m256 res = tmp[offy];
                    res = _mm256_hadd_ps(res, res);
                    res = _mm256_hadd_ps(res, res);
                    result[idx * size2 + idy + offy] = _mm_cvtss_f32(
                        _mm_add_ss(_mm256_extractf128_ps(res, 1), _mm256_castps256_ps128(res)));
                }
            }

2293us

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants