-
Notifications
You must be signed in to change notification settings - Fork 10
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
小彭老师,麻烦使用SSE优化一下下面的代码 #5
Comments
原版 418.72ms #include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)
struct templateFeat
{
__m128i x;
__m128i y;
__m64 dx;
__m64 dy;
__m128 mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
static std::vector<matchResult> results0Deg;
std::atomic<size_t> results0DegSize{0};
results0Deg.resize(1920 * 1200);
auto $anMinScore = _mm_set1_ps(0.4f - 1);
auto $NormMinScore = _mm_set1_ps(0.4f / template_feat_size);
auto $NormGreediness = _mm_set1_ps(((1 - 0.8f * 0.4f) / (1 - 0.8f)) / template_feat_size);
#pragma omp parallel for collapse(2)
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
auto $PartialSum = _mm_setzero_ps();
auto $i = _mm_set1_epi32(i);
auto $j = _mm_set1_epi32(j);
auto $m1 = _mm_setr_ps(1, 2, 3, 4);
auto $0 = _mm_setzero_si128();
auto $4 = _mm_set1_ps(4);
auto $230 = _mm_set1_epi32(230);
auto $350 = _mm_set1_epi32(350);
for(int m = 0; m < template_feat_size / 4 * 4; m += 4)
{
auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
auto $curY = _mm_add_epi32($j, template_point[m / 4].y);
auto $ok = _mm_or_si128(
_mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0)),
_mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));
auto $iTx = _mm_cvtpi16_ps(template_point[m / 4].dx);
auto $iTy = _mm_cvtpi16_ps(template_point[m / 4].dy);
auto $iTm = template_point[m / 4].mag;
auto $offSet = _mm_and_si128($curY * $350 + $curX, $ok);
auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
// isx 0 isx 0 isx 0 isx 0
// isy 0 isy 0 isy 0 isy 0
auto $iSx = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16($0, $iSxy), 16));
auto $iSy = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16($0, $iSxy), 16));
auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
$PartialSum = _mm_add_ps($PartialSum,
_mm_and_ps(_mm_castsi128_ps($ok),
_mm_mul_ps(
_mm_mul_ps($iSm, $iTm),
_mm_add_ps(
_mm_mul_ps($iSx, $iTx),
_mm_mul_ps($iSy, $iTy))
)));
auto $cond = _mm_mul_ps($m1,
_mm_min_ps(
_mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)),
_mm_mul_ps($NormMinScore, $m1)));
/* for (int _ = 0; _ < 4; _++) */
/* printf("%f %f %f %f %f %f %f %f\n", $iSx[_], $iSy[_], $iTx[_], $iTy[_], $iSm[_], $PartialSum[_], $m1[_], $cond[_]); */
auto $cmp = _mm_cmplt_ps($PartialSum, $cond);
int cmpmask = _mm_movemask_ps($cmp);
if (cmpmask) [[unlikely]] {
$PartialSum = _mm_setzero_ps();
break;
}
$m1 = _mm_add_ps($m1, $4);
}
$PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
$PartialSum = _mm_hadd_ps($PartialSum, $PartialSum);
auto PartialScore = _mm_cvtss_f32($PartialSum) / (template_feat_size + 1);
if(PartialScore > 0.4f)
{
results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
}
}
}
results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen;
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215 / 4);
for (auto &feat : template_point)
{
feat.x = _mm_setr_epi32 REP4(intXDist(gen));
feat.y = _mm_setr_epi32 REP4(intYDist(gen));
feat.dx = _mm_setr_pi16 REP4(shortDXDist(gen));
feat.dy = _mm_setr_pi16 REP4(shortDYDist(gen));
feat.mag = _mm_setr_ps REP4(floatDist(gen));
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = (shortDxDist(gen));
feat.dy = (shortDyDist(gen));
feat.mag = (float_Dist(gen));
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("/tmp/match_result.txt",results0Deg);
return 0;
} |
for循环内部计算:非SSE : |
修复后的SSE版本 #include <iostream>
#include <immintrin.h>
#include <atomic>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#define REP4(x) (x, x, x, x)
struct templateFeat
{
__m128i x;
__m128i y;
__m64 dx;
__m64 dy;
__m128 mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to " << txt_path << "!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,size_t template_feat_size,const std::vector<searchFeat> &search_point,int search_feat_size)
{
static std::vector<matchResult> results0Deg;
std::atomic<size_t> results0DegSize{0};
results0Deg.resize(1920 * 1200);
auto anMinScore = (0.4 - 1);
auto NormMinScore = (0.4 / template_feat_size);
auto NormGreediness = (((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size);
static std::vector<float> thresholds;
thresholds.resize(((template_feat_size + 3) & ~3));
for (int m = 0; m < ((template_feat_size + 3) & ~3); m++) {
auto SumOfCoords = m + 1;
thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
}
/* #pragma omp parallel for collapse(2) */
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
float PartialScore = -999999;
auto $PartialScore = _mm_setzero_ps();
auto $PartialSum = _mm_setzero_ps();
auto $SumOfCoords = _mm_setr_ps(-3, -2, -1, 0);
auto $4 = _mm_set1_ps(4);
auto $template_feat_size = _mm_set1_ps(template_feat_size);
auto $i = _mm_set1_epi32(i);
auto $j = _mm_set1_epi32(j);
auto $0 = _mm_setzero_si128();
auto $230 = _mm_set1_epi32(230);
auto $350 = _mm_set1_epi32(350);
for(int m = 0; m < ((template_feat_size + 3) & ~3); m += 4)
{
$SumOfCoords = _mm_add_ps($SumOfCoords, $4);
auto $curX = _mm_add_epi32($i, template_point[m / 4].x);
auto $curY = _mm_add_epi32($j, template_point[m / 4].y);
auto $notok = _mm_or_si128(_mm_or_si128(_mm_castps_si128(_mm_cmpge_ps($SumOfCoords, $template_feat_size)),
_mm_or_si128(_mm_cmplt_epi32($curX, $0), _mm_cmplt_epi32($curY, $0))),
_mm_or_si128(_mm_cmpgt_epi32($curX, $230), _mm_cmpgt_epi32($curY, $350)));
/* if (0xffff == _mm_movemask_epi8($notok)) { */
/* continue; */
/* } */
auto $iTx = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dx));
auto $iTy = _mm_cvtepi16_epi32(_mm_set1_epi64(template_point[m / 4].dy));
auto $iTm = template_point[m / 4].mag;
auto $offSet = _mm_andnot_si128($notok, _mm_add_epi32(_mm_mullo_epi32($curY, $350), $curX));
auto $iSxy = _mm_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm_srai_epi32(_mm_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm_srai_epi32($iSxy, 16);
auto $iSm = _mm_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm_andnot_ps(_mm_castsi128_ps($notok),
_mm_mul_ps(
_mm_mul_ps($iSm, $iTm),
_mm_cvtepi32_ps(
_mm_add_epi32(
_mm_mullo_epi32($iSx, $iTx),
_mm_mullo_epi32($iSy, $iTy)))));
/* auto $cond = _mm_mul_ps($m1, */
/* _mm_min_ps( */
/* _mm_add_ps($anMinScore, _mm_mul_ps($NormGreediness, $m1)), */
/* _mm_mul_ps($NormMinScore, $m1))); */
/* for (int _ = 0; _ < 4; _++) */
/* printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm_cvtepi32_ps($offSet)[_], _mm_cvtepi32_ps($curX)[_], _mm_cvtepi32_ps($curY)[_], _mm_cvtepi32_ps($iSx)[_], _mm_cvtepi32_ps($iSy)[_], _mm_cvtepi32_ps($iTx)[_], _mm_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm_cvtepi32_ps($notok)[_]); */
/* exit(1); */
// 0 1 2 3 (HI)
$accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 4)));
// 0 1+0 2+1 3+2
$accum = _mm_add_ps($accum, _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128($accum), 8)));
// 0 1+0 2+1+0 3+2+1+0
$PartialScore = _mm_add_ps($PartialSum, $accum);
/* auto $cond = _mm_mul_ps($SumOfCoords, _mm_min_ps( */
/* _mm_add_ps(_mm_set1_ps(anMinScore), _mm_mul_ps(_mm_set1_ps(NormGreediness), $SumOfCoords)), */
/* _mm_mul_ps(_mm_set1_ps(NormMinScore), $SumOfCoords))); */
auto $cond = _mm_load_ps(thresholds.data() + m);
auto $cmp = _mm_andnot_ps(_mm_castsi128_ps($notok), _mm_cmplt_ps($PartialScore, $cond));
int cmpmask = _mm_movemask_ps($cmp);
if (cmpmask) {
int bit = __builtin_ctz(cmpmask);
PartialScore = $PartialScore[bit] / (m + bit + 1);
goto out;
}
$PartialSum = _mm_add_ps($PartialSum, _mm_shuffle_ps($accum, $accum, 0xff));
}
$PartialScore = _mm_div_ps($PartialScore, $SumOfCoords);
PartialScore = $PartialScore[(template_feat_size - 1) & 3];
out:
if(PartialScore > 0.4f)
{
results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j, 0, PartialScore};
}
}
}
results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point((215 + 3) / 4);
for (auto &feat : template_point)
{
feat.x = _mm_set_epi32 REP4(intXDist(gen1));
feat.y = _mm_set_epi32 REP4(intYDist(gen2));
feat.dx = _mm_set_pi16 REP4(shortDXDist(gen3));
feat.dy = _mm_set_pi16 REP4(shortDYDist(gen4));
feat.mag = _mm_set_ps REP4(floatDist(gen5));
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = (shortDxDist(gen));
/* static int once = printf("%d\n", feat.dx); */
feat.dy = (shortDyDist(gen));
feat.mag = (float_Dist(gen));
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> const &results0Deg = calSimilarity(template_point,215,search_point,2304000);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("/tmp/match_result.txt",results0Deg);
return 0;
} 旧版本 //********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
struct templateFeat
{
int x;
int y;
short dx;
short dy;
float mag;
};
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> calSimilarity(const std::vector<templateFeat> template_point,const std::vector<searchFeat> search_point)
{
int template_feat_size = template_point.size();
int search_feat_size = search_point.size();
std::vector<matchResult> results0Deg;
float anMinScore = 0.4 - 1;
float NormMinScore = 0.4 / template_feat_size;
float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j++)
{
float PartialScore = 0;
float PartialSum = 0;
int SumOfCoords = 0;
for(int m = 0; m < template_feat_size; m++)
{
int curX = i + template_point[m].x;
int curY = j + template_point[m].y;
if(curX < 0 || curY < 0 || curX > 230 || curY > 350)
{
continue;
}
int iTx = template_point[m].dx;
int iTy = template_point[m].dy;
float iTm = template_point[m].mag;
int offSet = curY * 350 + curX;
int iSx = search_point[offSet].dx;
int iSy = search_point[offSet].dy;
float iSm = search_point[offSet].mag;
if((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
{
PartialSum += ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm);
}
SumOfCoords = m + 1;
PartialScore = PartialSum / SumOfCoords;
/* printf("%d %f %d %d %d %d %d %d %d %f %f %f\n", m, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm), offSet, curX, curY, iSx, iSy, iTx, iTy, iSm, iTm, PartialSum); */
/* exit(1); */
/* */
/* printf("%d %f %f\n", m, PartialSum, ((iSx * iTx) + (iSy * iTy)) * (iSm * iTm)); */
if(PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords)))
{
break;
}
}
/* printf("%f %d\n", PartialScore, SumOfCoords); */
if(PartialScore > 0.4)
{
/* printf("%d %d %f\n", i, j, PartialScore); */
/* exit(1); */
results0Deg.push_back({i, j, 0, PartialScore});
}
}
}
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215);
for (auto &feat : template_point)
{
feat.x = intXDist(gen1);
feat.y = intYDist(gen2);
feat.dx = shortDXDist(gen3);
feat.dy = shortDYDist(gen4);
feat.mag = floatDist(gen5);
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = shortDxDist(gen);
/* static int once = printf("%d\n", feat.dx); */
feat.dy = shortDyDist(gen);
feat.mag = float_Dist(gen);
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> results0Deg = calSimilarity(template_point,search_point);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("./match_result.txt",results0Deg);
return 0;
} |
最新版本的测试结果未有加速效果,测试记录:
$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 311.048 milliseconds.
Match result datas saved to match_result.txt! 耗时:311.048毫秒
$ g++ normal_calSimilaritySSE.cpp -O3 -mavx2 -o normal_calSimilarity_gccSSE
$ ./normal_calSimilarity_gccSSE
Function took 817.517 milliseconds.
Match result datas saved to /tmp/match_result.txt! 耗时:817.517毫秒
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
struct templateFeat
{
int x;
int y;
float dx;
float dy;
};
struct matchResult
{
int i;
int j;
float angle;
float score;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
std::vector<matchResult> calSimilarity(const std::vector<float> search_grad_x,const std::vector<float> search_grad_y,const std::vector<templateFeat> template_grad,int width,int height)
{
std::vector<matchResult> results0Deg;
float anMinScore = 0.5 - 1;
float NormMinScore = 0.5 / template_grad.size();
float NormGreediness = ((1 - 0.9 * 0.5) / (1 - 0.9))/template_grad.size();
for (int i = 12; i < 117; i++)
{
for (int j = 13; j < 123; j++)
{
// 去掉条件[1]
std::vector<templateFeat> template_grad_copy = template_grad;
for(auto it = template_grad_copy.begin(); it != template_grad_copy.end();)
{
int curX = i + it->x;
int curY = j + it->y;
if(curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1)
{
it = template_grad_copy.erase(it);
}
else
{
++it;
}
}
float PartialScore = 0;
float PartialSum = 0;
int SumOfCoords = 0;
for (int m = 0; m < template_grad.size(); m++)
{
int curX = i + template_grad[m].x;
int curY = j + template_grad[m].y;
// 条件[1]
// if (curX < 0 || curY < 0 || curX > width - 1 || curY > height - 1) {
// continue;
// }
int offSet = curY * width + curX;
float iTx = template_grad[m].dx;
float iTy = template_grad[m].dy;
float iSx = search_grad_x[offSet];
float iSy = search_grad_y[offSet];
if ((iSx != 0 || iSy != 0) && (iTx != 0 || iTy != 0))
{
PartialSum += (iSx * iTx) + (iSy * iTy);
}
SumOfCoords = m + 1;
PartialScore = PartialSum / SumOfCoords;
if (PartialScore < (std::min(anMinScore + NormGreediness * SumOfCoords,NormMinScore * SumOfCoords)))
break;
}
if(PartialScore > 0.5)
{
results0Deg.push_back({i, j, 0, PartialScore});
}
}
}
return results0Deg;
}
int main()
{
int width = 1273;
int height = 1231;
int buffer_size = width * height;
std::vector<float> search_grad_x(buffer_size);
std::vector<float> search_grad_y(buffer_size);
for (int i = 0; i < buffer_size; ++i) {
search_grad_x[i] = static_cast<float>(i) * 0.5f;
}
for (int i = 0; i < buffer_size; ++i) {
search_grad_y[i] = static_cast<float>(i) * 1.5f;
}
std::vector<templateFeat> template_grad;
for (int i = 0; i < 125; ++i) {
templateFeat feat;
feat.x = i * 2.0;
feat.y = i * 2.0 + 1.0;
feat.dx = static_cast<float>(i) * 0.25f;
feat.dy = static_cast<float>(i) * 1.5f;
template_grad.push_back(feat);
}
//运行算法,计算耗时:std::milli 就是毫秒,std::micro 就是微秒seconds
auto start = std::chrono::high_resolution_clock::now();
std::vector<matchResult> results0Deg = calSimilarity(search_grad_x,search_grad_y,template_grad,width,height);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " ms." << std::endl;
//保存匹配结果
saveDataToTxt("./match_result.txt",results0Deg);
return 1;
}
$ g++ normal_calSimilarity.cpp -O3 -o normal_calSimilarity_gcc -lstdc++ -lm
$ ./normal_calSimilarity_gcc
Function took 10.2154 ms.
Match result datas saved to match_result.txt! 新版本的耗时:10.2154 ms 请在上面新的基准代码上进行SSE修改吧!谢谢~ 对比结果文件 match_result.txt 可以使用Meld工具www.meldmerge.org/ 进行对比测试~~~ |
//********************************************************************************************
//***********************************normal_calSimilarity.cpp********************************
//********************************************************************************************
//#define USE_MULTISCORE//causing wrong result
#define USE_FIRSTUNROLL
#define USE_OPENMP
#define USE_UNROLLM4
#include <iostream>
#include <vector>
#include <random>
#include <fstream>
#include <stdio.h>
#include <chrono>
#include <immintrin.h>
#ifdef USE_OPENMP
#include <atomic>
#endif
struct templateFeat
{
int x;
int y;
short dx;
short dy;
float mag;
};
#ifdef USE_MULTISCORE
struct templateFeatSimd
{
__m256i x;
__m256i y;
__m128i dx;
__m128i dy;
__m256 mag;
};
#endif
struct matchResult
{
int i;
int j;
int angle;
float score;
};
struct searchFeat
{
short dx;
short dy;
float mag;
};
bool saveDataToTxt(const std::string txt_path,const std::vector<matchResult> match_result) {
std::ofstream file(txt_path);
if (file.is_open()) {
for (int i = 0; i < match_result.size(); ++i) {
file << match_result[i].i << " " << match_result[i].j << " " << match_result[i].angle << " " << match_result[i].score << std::endl;
}
file.close();
std::cout << "Match result datas saved to match_result.txt!" << std::endl;
} else {
std::cout << "Unable to open file" << std::endl;
}
return true;
}
#ifdef USE_MULTISCORE
static float multiScore(templateFeatSimd const *template_point_simd,
searchFeat const *search_point,
float const *thresholds,
int template_feat_size,
int search_feat_size,
int i, int j, int mbase,
float PartialSumBase) {
auto $PartialScore = _mm256_setzero_ps();
auto $PartialSum = _mm256_set1_ps(PartialSumBase);
auto $SumOfCoords = _mm256_add_ps(_mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8), _mm256_set1_ps(mbase));
auto $8 = _mm256_set1_ps(8);
auto $template_feat_size_plus1 = _mm256_set1_ps(template_feat_size + 1);
auto $i = _mm256_set1_epi32(i);
auto $j = _mm256_set1_epi32(j);
auto $0 = _mm256_setzero_si256();
auto $230 = _mm256_set1_epi32(230);
auto $350 = _mm256_set1_epi32(350);
/* printf("%d\n", mbase); */
for(int m = mbase; m < ((template_feat_size + 7) & ~7); m += 8, $SumOfCoords = _mm256_add_ps($SumOfCoords, $8))
{
auto $curX = _mm256_add_epi32($i, template_point_simd[m / 8].x);
auto $curY = _mm256_add_epi32($j, template_point_simd[m / 8].y);
auto $notok = _mm256_or_si256(_mm256_or_si256(_mm256_castps_si256(_mm256_cmp_ps($SumOfCoords, $template_feat_size_plus1, _CMP_GE_OQ)),
_mm256_or_si256(_mm256_srai_epi32($curX, 31), _mm256_srai_epi32($curY, 31))),
_mm256_or_si256(_mm256_cmpgt_epi32($curX, $230), _mm256_cmpgt_epi32($curY, $350)));
/* if (0xffff == _mm256_movemask_epi8($notok)) { */
/* continue; */
/* } */
auto $iTx = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dx);
auto $iTy = _mm256_cvtepi16_epi32(template_point_simd[m / 8].dy);
auto $iTm = template_point_simd[m / 8].mag;
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), $curX));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point, $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point, $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, $iTm),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, $iTx),
_mm256_mullo_epi32($iSy, $iTy)))));
/* auto $cond = _mm256_mul_ps($m1, */
/* _mm256_min_ps( */
/* _mm256_add_ps($anMinScore, _mm256_mul_ps($NormGreediness, $m1)), */
/* _mm256_mul_ps($NormMinScore, $m1))); */
/* for (int _ = 0; _ < 4; _++) */
/* printf("%d %f %f %f %f %f %f %f %f %f %f %f\n", m + _, $accum[_], _mm256_cvtepi32_ps($offSet)[_], _mm256_cvtepi32_ps($curX)[_], _mm256_cvtepi32_ps($curY)[_], _mm256_cvtepi32_ps($iSx)[_], _mm256_cvtepi32_ps($iSy)[_], _mm256_cvtepi32_ps($iTx)[_], _mm256_cvtepi32_ps($iTy)[_], $iSm[_], $iTm[_], _mm256_cvtepi32_ps($notok)[_]); */
/* exit(1); */
/* for (int _ = 0; _ < 8; _++) */
/* printf("%d %f\n", _, $accum[_]); */
// 0 1 2 3 4 5 6 7
$accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 4)));
// 0 1+0 2+1 3+2 4 5+4 6+5 7+6
$accum = _mm256_add_ps($accum, _mm256_castsi256_ps(_mm256_slli_si256(_mm256_castps_si256($accum), 8)));
// 0 1+0 2+1+0 3+2+1+0 4 5+4 6+5+4 7+6+5+4
$accum = _mm256_add_ps($accum, _mm256_set_m128(_mm_permute_ps(_mm256_castps256_ps128($accum), 0xff), _mm_setzero_ps()));
// 0 1+0 2+1+0 3+2+1+0 3+2+1+0+4 3+2+1+0+5+4 3+2+1+0+6+5+4 3+2+1+0+7+6+5+4
/* for (int _ = 0; _ < 8; _++) */
/* printf("%d %f\n", _, $accum[_]); */
/* exit(1); */
$PartialScore = _mm256_add_ps($PartialSum, $accum);
/* auto $cond = _mm256_mul_ps($SumOfCoords, _mm256_min_ps( */
/* _mm256_add_ps(_mm256_set1_ps(anMinScore), _mm256_mul_ps(_mm256_set1_ps(NormGreediness), $SumOfCoords)), */
/* _mm256_mul_ps(_mm256_set1_ps(NormMinScore), $SumOfCoords))); */
auto $cond = _mm256_load_ps(thresholds + m);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialScore, $cond, _CMP_LT_OQ));
int cmpmask = _mm256_movemask_ps($cmp);
if (cmpmask) [[unlikely]] {
int bit = __builtin_ctz(cmpmask);
return $PartialScore[bit] / (m + bit + 1);
}
$PartialSum = _mm256_add_ps($PartialSum, _mm256_broadcastss_ps(_mm256_castps256_ps128(_mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(_mm256_permute_ps($accum, 0xff)), 0xf)))));
}
/* for (int _ = 0; _ < 8; _++) */
/* printf("%d %f %f\n", _, $PartialSum[_], $SumOfCoords[_]); */
/* exit(1); */
$PartialScore = _mm256_div_ps($PartialScore, $SumOfCoords);
return $PartialScore[(template_feat_size - 1) & 7];
}
#endif
std::vector<matchResult> const &calSimilarity(const std::vector<templateFeat> &template_point,const std::vector<searchFeat> &search_point)
{
int template_feat_size = template_point.size();
int search_feat_size = search_point.size();
#ifdef USE_MULTISCORE
static std::vector<templateFeatSimd> template_point_simd;
template_point_simd.resize((template_feat_size + 7) & ~7);
for (int m = 0; m < template_point_simd.size() * 8; m += 8) {
template_point_simd[m / 8].x = _mm256_setr_epi32(template_point[(m) % template_feat_size].x, template_point[(m + 1) % template_feat_size].x, template_point[(m + 2) % template_feat_size].x, template_point[(m + 3) % template_feat_size].x, template_point[(m + 4) % template_feat_size].x, template_point[(m + 4 + 1) % template_feat_size].x, template_point[(m + 4 + 2) % template_feat_size].x, template_point[(m + 4 + 3) % template_feat_size].x);
template_point_simd[m / 8].y = _mm256_setr_epi32(template_point[(m) % template_feat_size].y, template_point[(m + 1) % template_feat_size].y, template_point[(m + 2) % template_feat_size].y, template_point[(m + 3) % template_feat_size].y, template_point[(m + 4) % template_feat_size].y, template_point[(m + 4 + 1) % template_feat_size].y, template_point[(m + 4 + 2) % template_feat_size].y, template_point[(m + 4 + 3) % template_feat_size].y);
template_point_simd[m / 8].dx = _mm_setr_epi16(template_point[(m) % template_feat_size].dx, template_point[(m + 1) % template_feat_size].dx, template_point[(m + 2) % template_feat_size].dx, template_point[(m + 3) % template_feat_size].dx, template_point[(m + 4) % template_feat_size].dx, template_point[(m + 4 + 1) % template_feat_size].dx, template_point[(m + 4 + 2) % template_feat_size].dx, template_point[(m + 4 + 3) % template_feat_size].dx);
template_point_simd[m / 8].dy = _mm_setr_epi16(template_point[(m) % template_feat_size].dy, template_point[(m + 1) % template_feat_size].dy, template_point[(m + 2) % template_feat_size].dy, template_point[(m + 3) % template_feat_size].dy, template_point[(m + 4) % template_feat_size].dy, template_point[(m + 4 + 1) % template_feat_size].dy, template_point[(m + 4 + 2) % template_feat_size].dy, template_point[(m + 4 + 3) % template_feat_size].dy);
template_point_simd[m / 8].mag = _mm256_setr_ps(template_point[(m) % template_feat_size].mag, template_point[(m + 1) % template_feat_size].mag, template_point[(m + 2) % template_feat_size].mag, template_point[(m + 3) % template_feat_size].mag, template_point[(m + 4) % template_feat_size].mag, template_point[(m + 4 + 1) % template_feat_size].mag, template_point[(m + 4 + 2) % template_feat_size].mag, template_point[(m + 4 + 3) % template_feat_size].mag);
}
#endif
static std::vector<matchResult> results0Deg;
results0Deg.resize(1920 * 1200);
#ifdef USE_OPENMP
std::atomic<size_t> results0DegSize = 0;
#else
size_t results0DegSize = 0;
#endif
float anMinScore = 0.4 - 1;
float NormMinScore = 0.4 / template_feat_size;
float NormGreediness = ((1 - 0.8 * 0.4) / (1 - 0.8)) / template_feat_size;
static std::vector<float> thresholds;
thresholds.resize(((template_feat_size + 7) & ~7));
for (int m = 0; m < ((template_feat_size + 7) & ~7); m++) {
auto SumOfCoords = m + 1;
thresholds[m] = SumOfCoords * std::min(anMinScore + NormGreediness * SumOfCoords, NormMinScore * SumOfCoords);
}
#ifdef USE_OPENMP
#pragma omp parallel for schedule(dynamic, 8)
#endif
for(int i = 0; i < 1920; i++)
{
for(int j = 0; j < 1200; j += 8)
{
int m;
auto $PartialSum = _mm256_setzero_ps();
#ifdef USE_FIRSTUNROLL
auto $SumOfCoords = _mm256_set1_epi32(1);
#else
auto $SumOfCoords = _mm256_set1_epi32(1);
#endif
auto $j = _mm256_add_epi32(_mm256_set1_epi32(j), _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7));
auto $350 = _mm256_set1_epi32(350);
auto $loopterm = _mm256_setzero_si256();
#ifdef USE_FIRSTUNROLL
{
int curX = i + template_point[0].x;
if (!(curX < 0 || curX > 230)) [[likely]] {
auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[0].y));
auto $notok = _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350));
int iTx = template_point[0].dx;
int iTy = template_point[0].dy;
float iTm = template_point[0].mag;
/* int offSet = curY * 350 + curX; */
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
_mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
$PartialSum = _mm256_add_ps($PartialSum, $accum);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[0]), _CMP_LT_OQ));
$loopterm = _mm256_castps_si256($cmp);
$SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
}
}
if (_mm256_movemask_epi8($loopterm) != -1) {
int curX = i + template_point[1].x;
if (!(curX < 0 || curX > 230)) [[likely]] {
auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[1].y));
auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));
int iTx = template_point[1].dx;
int iTy = template_point[1].dy;
float iTm = template_point[1].mag;
/* int offSet = curY * 350 + curX; */
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
_mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
$PartialSum = _mm256_add_ps($PartialSum, $accum);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[1]), _CMP_LT_OQ));
$loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
$SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)));
}
if (_mm256_movemask_epi8($loopterm) != -1) [[unlikely]] {
m = 2;
#else
m = 0;
#endif
#ifdef USE_UNROLLM4
#pragma GCC unroll 4
#endif
for(;
m < template_feat_size;
m++, $SumOfCoords = _mm256_add_epi32($SumOfCoords, _mm256_andnot_si256($loopterm, _mm256_set1_epi32(1)))
) {
int curX = i + template_point[m].x;
if (curX < 0 || curX > 230) [[unlikely]] { continue; }
auto $curY = _mm256_add_epi32($j, _mm256_set1_epi32(template_point[m].y));
auto $notok = _mm256_or_si256($loopterm, _mm256_or_si256(_mm256_srai_epi32($curY, 31), _mm256_cmpgt_epi32($curY, $350)));
int iTx = template_point[m].dx;
int iTy = template_point[m].dy;
float iTm = template_point[m].mag;
/* int offSet = curY * 350 + curX; */
auto $offSet = _mm256_andnot_si256($notok, _mm256_add_epi32(_mm256_mullo_epi32($curY, $350), _mm256_set1_epi32(curX)));
auto $iSxy = _mm256_i32gather_epi32((int *)search_point.data(), $offSet, sizeof(searchFeat));
// isx isy isx isy isx isy isx isy
auto $iSx = _mm256_srai_epi32(_mm256_slli_epi32($iSxy, 16), 16);
auto $iSy = _mm256_srai_epi32($iSxy, 16);
auto $iSm = _mm256_i32gather_ps(1 + (float *)search_point.data(), $offSet, sizeof(searchFeat));
auto $accum = _mm256_andnot_ps(_mm256_castsi256_ps($notok),
_mm256_mul_ps(
_mm256_mul_ps($iSm, _mm256_set1_ps(iTm)),
_mm256_cvtepi32_ps(
_mm256_add_epi32(
_mm256_mullo_epi32($iSx, _mm256_set1_epi32(iTx)),
_mm256_mullo_epi32($iSy, _mm256_set1_epi32(iTy))))));
$PartialSum = _mm256_add_ps($PartialSum, $accum);
auto $cmp = _mm256_andnot_ps(_mm256_castsi256_ps($notok), _mm256_cmp_ps($PartialSum, _mm256_set1_ps(thresholds[m]), _CMP_LT_OQ));
$loopterm = _mm256_or_si256($loopterm, _mm256_castps_si256($cmp));
#ifndef USE_MULTISCORE
if (_mm256_movemask_epi8($loopterm) == -1) {
break;
}
#else
auto popcorn = __builtin_popcount(_mm256_movemask_ps(_mm256_castsi256_ps($loopterm)));
if ((popcorn == 8 || popcorn >= 7) && !(m & 7)) [[unlikely]] {
break;
}
#endif
}
#ifdef USE_FIRSTUNROLL
}
}
#endif
auto $PartialScore = _mm256_div_ps($PartialSum, _mm256_cvtepi32_ps($SumOfCoords));
#ifdef USE_MULTISCORE
auto mask = _mm256_movemask_ps(_mm256_castsi256_ps($loopterm));
if (mask != 0xff && m < template_feat_size) {
auto k = __builtin_ctz(~mask);
$PartialScore[k] = multiScore(
template_point_simd.data(), search_point.data(), thresholds.data(),
template_feat_size, search_feat_size, i, j, m, $PartialSum[k]);
}
#endif
auto $PartialScoreAbove = _mm256_cmp_ps($PartialScore, _mm256_set1_ps(0.4f), _CMP_GT_OQ);
for (int k = 0; k < 8; k++) {
if($PartialScoreAbove[k]) {
/* printf("%d %d %f\n", i, j, PartialScore); */
/* exit(1); */
#ifdef USE_OPENMP
results0Deg[results0DegSize.fetch_add(1, std::memory_order_relaxed)] = {i, j + k, 0, $PartialScore[k]};
#else
results0Deg[results0DegSize++] = {i, j + k, 0, $PartialScore[k]};
#endif
}
}
}
}
#ifdef USE_OPENMP
results0Deg.resize(results0DegSize.load(std::memory_order_relaxed));
#else
results0Deg.resize(results0DegSize);
#endif
return results0Deg;
}
int main()
{
//为了测试函数而生成的随机数:template_point/search_point
std::mt19937 gen(0), gen1(1), gen2(2), gen3(3), gen4(4), gen5(5);
std::uniform_int_distribution<int> intXDist(-18, 32);
std::uniform_int_distribution<int> intYDist(-48, 49);
std::uniform_int_distribution<short> shortDXDist(-348, 349);
std::uniform_int_distribution<short> shortDYDist(-421, 352);
std::uniform_real_distribution<float> floatDist(0.00237112f, 0.0120056f);
std::vector<templateFeat> template_point(215);
for (auto &feat : template_point)
{
feat.x = intXDist(gen1);
feat.y = intYDist(gen2);
feat.dx = shortDXDist(gen3);
feat.dy = shortDYDist(gen4);
feat.mag = floatDist(gen5);
}
std::uniform_int_distribution<short> shortDxDist(-460, 460);
std::uniform_int_distribution<short> shortDyDist(-476, 460);
std::uniform_real_distribution<float> float_Dist(0.0f, 0.707107f);
std::vector<searchFeat> search_point(2304000);
for (auto &feat : search_point)
{
feat.dx = shortDxDist(gen);
/* static int once = printf("%d\n", feat.dx); */
feat.dy = shortDyDist(gen);
feat.mag = float_Dist(gen);
}
//运行算法,计算耗时
auto start = std::chrono::high_resolution_clock::now();
/* for (int i = 0; i < 100; i++) */
/* calSimilarity(template_point,search_point); */
std::vector<matchResult> const &results0Deg = calSimilarity(template_point,search_point);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> duration = end - start;
std::cout << "Function took " << duration.count() << " milliseconds." << std::endl;
//保存匹配结果
saveDataToTxt("/tmp/match_result.txt",results0Deg);
return 0;
} 修复了性能提升不大的问题,现在是418ms -> 21ms了。 |
以下的calSimilarity函数在算法过程中会执行(-180°~180°步长为0.1)很多次,下面时从工程中摘出来的代码,运行单次时,在编译器开O0优化,耗时为2046.41ms;在编译器开O3优化,耗时为310.06ms
gcc normal_calSimilarity.cpp -O0 -o normal_calSimilarity_gcc -lstdc++ -lm
The text was updated successfully, but these errors were encountered: