Skip to content

Commit

Permalink
Better hmin/hmax algorithms for SSE/AVX2 (#510)
Browse files Browse the repository at this point in the history
Use a formulation that automatically produces the same result
in all lanes, avoiding a separate broadcast step.

The same approach would work with floats in principle, but it's
not guaranteed to give the same result in all lanes when NaNs
are involved (due to the way MINPS/MAXPS are defined), so leave
the float versions alone for now.

About 1% encode time reduction when encoding an 8192x8192 test texture
at 6x6 -thorough on a Ryzen 7950X3D.

Co-authored-by: Fabian Giesen <[email protected]>
Co-authored-by: Pete Harris <[email protected]>
  • Loading branch information
3 people authored Nov 4, 2024
1 parent c968f72 commit 521179c
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 18 deletions.
22 changes: 10 additions & 12 deletions Source/astcenc_vecmathlib_avx2_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,13 +458,12 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
*/
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
// NOTE(review): this body is a rendered diff whose +/- markers were stripped,
// so the removed and the added statements appear back to back. The grouping
// below follows the "10 additions & 12 deletions" count for this file.
//
// Removed implementation: fold the two 128-bit halves into one, reduce the
// four remaining lanes into lane 0, then broadcast lane 0 explicitly.
__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
// Lanes 0/1 become min(lane0, lane2) / min(lane1, lane3)
m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
// Lane 0 becomes the minimum of all four lanes
m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
// Separate broadcast of lane 0 to every lane (the step the new code avoids)
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
// Added implementation: every step exchanges partners symmetrically, so all
// lanes carry the same partial result and no final broadcast is needed.
// Build min within groups of 2, then 4, then 8
// Swap adjacent 32-bit lanes inside each 128-bit half -> min of each pair
__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
// Swap the 64-bit halves inside each 128-bit half -> min of each group of 4
m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
// Swap the two 128-bit halves (selector 0x01) -> min of all 8 in every lane
m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

// Removed: rebuild a 256-bit value from the broadcast 128-bit result
// (presumably astcenc_mm256_set_m128i places its arguments in the two halves).
__m256i r = astcenc_mm256_set_m128i(m, m);
vint8 vmin(r);
// Added: the 256-bit reduction result is wrapped directly
vint8 vmin(m);
return vmin;
}

Expand All @@ -481,13 +480,12 @@ ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
*/
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
// NOTE(review): this body is a rendered diff whose +/- markers were stripped,
// so the removed and the added statements appear back to back. The grouping
// below follows the "10 additions & 12 deletions" count for this file.
//
// Removed implementation: fold the two 128-bit halves into one, reduce the
// four remaining lanes into lane 0, then broadcast lane 0 explicitly.
__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
// Lanes 0/1 become max(lane0, lane2) / max(lane1, lane3)
m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
// Lane 0 becomes the maximum of all four lanes
m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
// Separate broadcast of lane 0 to every lane (the step the new code avoids)
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
// Added implementation: every step exchanges partners symmetrically, so all
// lanes carry the same partial result and no final broadcast is needed.
// Build max within groups of 2, then 4, then 8
// Swap adjacent 32-bit lanes inside each 128-bit half -> max of each pair
__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
// Swap the 64-bit halves inside each 128-bit half -> max of each group of 4
m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
// Swap the two 128-bit halves (selector 0x01) -> max of all 8 in every lane
m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

// Removed: rebuild a 256-bit value from the broadcast 128-bit result
// (presumably astcenc_mm256_set_m128i places its arguments in the two halves).
__m256i r = astcenc_mm256_set_m128i(m, m);
vint8 vmax(r);
// Added: the 256-bit reduction result is wrapped directly
vint8 vmax(m);
return vmax;
}

Expand Down
12 changes: 6 additions & 6 deletions Source/astcenc_vecmathlib_sse_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -606,19 +606,19 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
*/
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
// NOTE(review): this body is a rendered diff whose +/- markers were stripped.
// The first three statements are the removed implementation and the last three
// are the replacement (this file shows 6 additions and 6 deletions).
//
// Removed: fold lanes 2/3 onto lanes 0/1, fold lane 1 onto lane 0, then
// broadcast lane 0 to all lanes with a third shuffle.
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
// Added: swap adjacent lanes (min of each pair), then swap the 64-bit halves
// (min of all four). Both exchanges are symmetric, so every lane already
// holds the result and `a` is returned without a broadcast.
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
return a;
}

/*
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
// NOTE(review): this body is a rendered diff whose +/- markers were stripped.
// The first three statements are the removed implementation and the last three
// are the replacement (this file shows 6 additions and 6 deletions).
//
// Removed: fold lanes 2/3 onto lanes 0/1, fold lane 1 onto lane 0, then
// broadcast lane 0 to all lanes with a third shuffle.
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
// Added: swap adjacent lanes (max of each pair), then swap the 64-bit halves
// (max of all four). Both exchanges are symmetric, so every lane already
// holds the result and `a` is returned without a broadcast.
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
return a;
}

/**
Expand Down

0 comments on commit 521179c

Please sign in to comment.