Skip to content

Commit

Permalink
Improve performance by reordering SIMD intrinsics
Browse files: browse the repository at this point in the history
  • Loading branch information
stevenewald committed Jan 11, 2025
1 parent 33b729a commit 4761f25
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 17 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ include(cmake/project-is-top-level.cmake)
include(cmake/variables.cmake)

set(CMAKE_INTERPROCEDURAL_OPTIMIZATION ON)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rocketlake")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
# add_compile_options(-fno-inline -fno-omit-frame-pointer)


Expand Down
26 changes: 10 additions & 16 deletions source/mandelbrot/equations_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,15 @@ std::array<iteration_count, 8> compute_iterations(
__mmask8 active_mask = 0xFF;

for (iteration_count iterations = 0; iterations < max_iters; iterations++) {
// load current values
__m512d x = input_vec_real;
__m512d y = input_vec_imag;

// compute squares and product
__m512d x_squared = _mm512_mul_pd(x, x);
__m512d y_squared = _mm512_mul_pd(y, y);
__m512d xy = _mm512_mul_pd(x, y);
__m512d x_squared = _mm512_mul_pd(input_vec_real, input_vec_real);
__m512d y_squared = _mm512_mul_pd(input_vec_imag, input_vec_imag);
__m512d xy = _mm512_mul_pd(input_vec_real, input_vec_imag);

// update real part: input_vec_real = x_squared - y_squared + constant_reals
__m512d temp_real = _mm512_sub_pd(x_squared, y_squared);
input_vec_real = _mm512_add_pd(temp_real, input_vec_constant_reals);
input_vec_real = _mm512_add_pd(
_mm512_sub_pd(x_squared, y_squared), input_vec_constant_reals
);

// update imaginary part: input_vec_imag = 2 * xy + constant_imags
input_vec_imag =
Expand All @@ -49,24 +46,21 @@ std::array<iteration_count, 8> compute_iterations(
__m512d squared_norms_vec = _mm512_add_pd(x_squared, y_squared);

// determine which elements have diverged
__mmask8 solved_mask =
_mm512_cmp_pd_mask(squared_norms_vec, squared_divergence_vec, _CMP_GT_OS);
active_mask =
_mm512_cmp_pd_mask(squared_norms_vec, squared_divergence_vec, _CMP_LE_OS);

// update iteration counts for elements that have just diverged
solved_its_vec = _mm_mask_blend_epi16(
solved_mask, solved_its_vec,
active_mask, solved_its_vec,
_mm_set1_epi16(static_cast<int16_t>(iterations))
);

// update active mask to skip computations for diverged elements
active_mask = _kandn_mask8(solved_mask, active_mask);

// break if all elements have diverged
if (active_mask == 0) [[unlikely]]
break;
}

__mmask8 mask = _mm_cmpeq_epi16_mask(solved_its_vec, _mm_set1_epi16(0));
__mmask8 mask = _mm_cmpeq_epi16_mask(solved_its_vec, _mm_set1_epi16(max_iters - 1));
solved_its_vec = _mm_mask_mov_epi16(
solved_its_vec, mask, _mm_set1_epi16(static_cast<int16_t>(max_iters))
);
Expand Down

0 comments on commit 4761f25

Please sign in to comment.