Skip to content

Commit

Permalink
Better hmin/hmax algorithms for SSE/AVX2
Browse files Browse the repository at this point in the history
Use a formulation that automatically produces the same result
in all lanes, avoiding a separate broadcast step.

The same approach would work with floats in principle, but it's
not guaranteed to give the same result in all lanes when NaNs
are involved (due to the way MINPS/MAXPS are defined), so leave
the float versions alone for now.

About 1% encode time reduction when encoding an 8192x8192 test texture
at 6x6 -thorough on a Ryzen 7950X3D.
  • Loading branch information
Fabian Giesen committed Nov 1, 2024
1 parent 2ff200e commit b79248a
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 18 deletions.
22 changes: 10 additions & 12 deletions Source/astcenc_vecmathlib_avx2_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,13 +458,12 @@ ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
*/
ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
{
// Horizontal minimum over the eight signed 32-bit lanes of a, returned as a
// vector that holds the minimum in every lane.
//
// NOTE(review): this body is a rendered diff without +/- markers, so the
// removed and the added implementation appear back to back below.
//
// Removed implementation: fold the upper 128-bit half onto the lower half,
// reduce 4 -> 2 -> 1 elements into element 0, then broadcast element 0.
__m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
// Added implementation: every step pairs each lane with a partner lane and
// both partners receive the same min, so after the last step all eight lanes
// agree and no separate broadcast is required.
// Build min within groups of 2, then 4, then 8
// Step 1: (2, 3, 0, 1) swaps neighbouring elements inside each 128-bit half.
__m256i m = _mm256_min_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
// Step 2: (1, 0, 3, 2) swaps the two element pairs inside each 128-bit half.
m = _mm256_min_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
// Step 3: selector 0x01 swaps the two 128-bit halves of the register.
m = _mm256_min_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

// Removed: widen the 128-bit result back to 256 bits; presumably the helper
// places m in both halves - confirm against its definition.
__m256i r = astcenc_mm256_set_m128i(m, m);
vint8 vmin(r);
// Added: m is already a full 256-bit result, so wrap it directly.
vint8 vmin(m);
return vmin;
}

Expand All @@ -481,13 +480,12 @@ ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
*/
ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
{
// Horizontal maximum over the eight signed 32-bit lanes of a, returned as a
// vector that holds the maximum in every lane.
//
// NOTE(review): this body is a rendered diff without +/- markers, so the
// removed and the added implementation appear back to back below.
//
// Removed implementation: fold the upper 128-bit half onto the lower half,
// reduce 4 -> 2 -> 1 elements into element 0, then broadcast element 0.
__m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
// Added implementation: every step pairs each lane with a partner lane and
// both partners receive the same max, so after the last step all eight lanes
// agree and no separate broadcast is required.
// Build max within groups of 2, then 4, then 8
// Step 1: (2, 3, 0, 1) swaps neighbouring elements inside each 128-bit half.
__m256i m = _mm256_max_epi32(a.m, _mm256_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1)));
// Step 2: (1, 0, 3, 2) swaps the two element pairs inside each 128-bit half.
m = _mm256_max_epi32(m, _mm256_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2)));
// Step 3: selector 0x01 swaps the two 128-bit halves of the register.
m = _mm256_max_epi32(m, _mm256_permute2x128_si256(m, m, 0x01));

// Removed: widen the 128-bit result back to 256 bits; presumably the helper
// places m in both halves - confirm against its definition.
__m256i r = astcenc_mm256_set_m128i(m, m);
vint8 vmax(r);
// Added: m is already a full 256-bit result, so wrap it directly.
vint8 vmax(m);
return vmax;
}

Expand Down
12 changes: 6 additions & 6 deletions Source/astcenc_vecmathlib_sse_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -606,19 +606,19 @@ ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
*/
ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
{
// Horizontal minimum over the four 32-bit lanes of a, returned as a vector
// that holds the minimum in every lane.
//
// NOTE(review): this body is a rendered diff without +/- markers, so the
// removed and the added implementation appear back to back below.
//
// Removed implementation: fold elements 2,3 onto 0,1, then element 1 onto 0,
// and finally broadcast element 0 to all lanes.
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
// Added implementation: (2, 3, 0, 1) swaps neighbouring elements and
// (1, 0, 3, 2) swaps the two element pairs; both partners of each exchange
// receive the same min, so all lanes agree without a broadcast.
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
return a;
}

/**
 * @brief Return the horizontal maximum of a vector.
 *
 * The maximum is returned replicated in every lane of the result.
 */
ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
{
// NOTE(review): this body is a rendered diff without +/- markers, so the
// removed and the added implementation appear back to back below.
//
// Removed implementation: fold elements 2,3 onto 0,1, then element 1 onto 0,
// and finally broadcast element 0 to all lanes.
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
// Added implementation: (2, 3, 0, 1) swaps neighbouring elements and
// (1, 0, 3, 2) swaps the two element pairs; both partners of each exchange
// receive the same max, so all lanes agree without a broadcast.
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(2, 3, 0, 1))));
a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(1, 0, 3, 2))));
return a;
}

/**
Expand Down

0 comments on commit b79248a

Please sign in to comment.