From 510bc4d7253530235301c094d3d4b28a4e7b50f4 Mon Sep 17 00:00:00 2001 From: Nicholas K Sauter Date: Tue, 9 Apr 2024 15:59:59 -0700 Subject: [PATCH] Debug case for non-working test. The script tests/tst_memory_policy.py fails with a cuda illegal access. The intention is to get help from NESAP to get a functional test. --- simtbx/kokkos/detector.cpp | 135 +++++++++++++ simtbx/kokkos/detector.h | 82 +------- simtbx/kokkos/simulation.cpp | 173 +++++++++++++++++ simtbx/kokkos/simulation.h | 89 +-------- simtbx/kokkos/simulation_kernels.h | 300 +++++++++++++++++++++++++++++ simtbx/tests/tst_memory_policy.py | 56 +++++- 6 files changed, 664 insertions(+), 171 deletions(-) diff --git a/simtbx/kokkos/detector.cpp b/simtbx/kokkos/detector.cpp index 233b97c7ff6..c4ef795d725 100644 --- a/simtbx/kokkos/detector.cpp +++ b/simtbx/kokkos/detector.cpp @@ -92,5 +92,140 @@ namespace simtbx { namespace Kokkos { } } + template<> + void kokkos_detector::hello(){ + SCITBX_EXAMINE("small small small"); + } + template<> + void kokkos_detector::hello(){ + SCITBX_EXAMINE("large large large"); + } + + template<> void + kokkos_detector::each_image_allocate() { + resize(m_rangemap, m_total_pixel_count); + resize(m_omega_reduction, m_total_pixel_count); + resize(m_max_I_x_reduction, m_total_pixel_count); + resize(m_max_I_y_reduction, m_total_pixel_count); + resize(m_floatimage, m_total_pixel_count); + + resize(m_maskimage, m_total_pixel_count); + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + + // metrology.show(); + + // printf(" rangemap size:%d\n", m_rangemap.span()); + // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); + // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); + // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); + // printf(" maskimage size:%d\n", m_maskimage.span()); + // printf(" floatimage size:%d\n", m_floatimage.span()); + // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); + // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); + // printf(" odet_vector size:%d\n", m_odet_vector.span()); + // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); + // printf(" distance size:%d\n", m_distance.span()); + // printf(" Xbeam size:%d\n", m_Xbeam.span()); + // printf(" Ybeam size:%d\n", m_Ybeam.span()); + + // print_view(m_fdet_vector); + // print_view(m_odet_vector, 1, 3); + + // printf("DONE.\n"); + } + template<> void + kokkos_detector::each_image_allocate() { + resize(m_maskimage, m_total_pixel_count); + kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); + kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); + kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); + kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); + kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); + kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); + kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); + fence(); + } + + template<> + void + kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + } + + template<> + void + kokkos_detector::set_active_pixels_on_GPU(af::shared active_pixel_list_value) { + m_active_pixel_size = active_pixel_list_value.size(); + kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); + active_pixel_list = active_pixel_list_value; + resize(m_rangemap, m_active_pixel_size); + resize(m_omega_reduction, m_active_pixel_size); + resize(m_max_I_x_reduction, m_active_pixel_size); + resize(m_max_I_y_reduction, m_active_pixel_size); + resize(m_floatimage, m_active_pixel_size); + resize(m_accumulate_floatimage, m_active_pixel_size); + fence(); + } + + template<> af::shared + kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { + hello(); + //return the data array for the multipanel detector case, but only for whitelist pixels + vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); + kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + + size_t output_pixel_size = selection.size(); + vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + auto temp = m_accumulate_floatimage; + + parallel_for("get_active_pixel_selection", + range_policy(0, output_pixel_size), + KOKKOS_LAMBDA (const int i) { + size_t index = active_pixel_selection( i ); + active_pixel_results( i ) = temp( index ); + }); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + template<> af::shared + kokkos_detector::get_whitelist_raw_pixels(af::shared selection) { + SCITBX_CHECK_POINT; + hello(); + //return the data array for the multipanel detector case, but only for whitelist pixels + + std::size_t output_pixel_size = selection.size(); + //vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); + + //auto temp = m_accumulate_floatimage; + + //parallel_for("get_active_pixel_selection2", + // range_policy(0, output_pixel_size), + // KOKKOS_LAMBDA (const int i) { + // active_pixel_results( i ) = temp( i ); + //}); + + af::shared output_array(output_pixel_size, af::init_functor_null()); + SCITBX_CHECK_POINT; + kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results); + SCITBX_CHECK_POINT; + + SCITBX_ASSERT(output_array.size() == output_pixel_size); + return output_array; + } + } // Kokkos } // simtbx diff --git a/simtbx/kokkos/detector.h b/simtbx/kokkos/detector.h index c7149eaeb40..74d286ffbed 100644 --- a/simtbx/kokkos/detector.h +++ b/simtbx/kokkos/detector.h @@ -41,10 +41,8 @@ struct packed_metrology{ af::sharedYbeam; }; -struct large_array_policy { -}; -struct small_whitelist_policy { -}; +struct large_array_policy {}; +struct small_whitelist_policy {}; template struct kokkos_detector @@ -63,7 +61,7 @@ struct kokkos_detector //void write_raw_pixels(simtbx::nanoBragg::nanoBragg&); //af::flex_double get_raw_pixels(); //void set_active_pixels_on_GPU(af::shared); - //af::shared get_whitelist_raw_pixels(af::shared); + af::shared get_whitelist_raw_pixels(af::shared); inline void each_image_free(){} //no op in Kokkos int h_deviceID; @@ -157,46 +155,7 @@ struct kokkos_detector return view_floatimage; }; - inline void - each_image_allocate() { - resize(m_rangemap, m_total_pixel_count); - resize(m_omega_reduction, m_total_pixel_count); - resize(m_max_I_x_reduction, m_total_pixel_count); - resize(m_max_I_y_reduction, m_total_pixel_count); - - resize(m_maskimage, m_total_pixel_count); - resize(m_floatimage, m_total_pixel_count); - - kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet); - kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet); - kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet); - kokkostbx::transfer_shared2kokkos(m_pix0_vector, metrology.pix0); - kokkostbx::transfer_shared2kokkos(m_distance, metrology.dists); - kokkostbx::transfer_shared2kokkos(m_Xbeam, metrology.Xbeam); - kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam); - fence(); - - // metrology.show(); - - // printf(" rangemap size:%d\n", m_rangemap.span()); - // printf(" omega_reduction size:%d\n", m_omega_reduction.span()); - // printf(" max_I_x_reduction size:%d\n", m_max_I_x_reduction.span()); - // printf(" max_I_y_reduction size:%d\n", m_max_I_y_reduction.span()); - // printf(" maskimage size:%d\n", m_maskimage.span()); - // printf(" floatimage size:%d\n", m_floatimage.span()); - // printf(" sdet_vector size:%d\n", m_sdet_vector.span()); - // printf(" fdet_vector size:%d\n", m_fdet_vector.span()); - // printf(" odet_vector size:%d\n", m_odet_vector.span()); - // printf(" pix0_vector size:%d\n", m_pix0_vector.span()); - // printf(" distance size:%d\n", m_distance.span()); - // printf(" Xbeam size:%d\n", m_Xbeam.span()); - // printf(" Ybeam size:%d\n", m_Ybeam.span()); - - // print_view(m_fdet_vector); - // print_view(m_odet_vector, 1, 3); - - // printf("DONE.\n"); - } + void each_image_allocate(); inline void scale_in_place(const double& factor){ @@ -206,6 +165,8 @@ struct kokkos_detector }); } + void set_active_pixels_on_GPU(af::shared active_pixel_list_value); + inline void write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) { //only implement the monolithic detector case, one panel @@ -242,37 +203,8 @@ struct kokkos_detector return output_array; } - inline void - set_active_pixels_on_GPU(af::shared active_pixel_list_value) { - m_active_pixel_size = active_pixel_list_value.size(); - kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value); - active_pixel_list = active_pixel_list_value; - } - - inline af::shared - get_whitelist_raw_pixels(af::shared selection) { - //return the data array for the multipanel detector case, but only for whitelist pixels - vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size()); - kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection); + void hello(); - size_t output_pixel_size = selection.size(); - vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size); - - auto temp = m_accumulate_floatimage; - - parallel_for("get_active_pixel_selection", - range_policy(0, output_pixel_size), - KOKKOS_LAMBDA (const int i) { - size_t index = active_pixel_selection( i ); - active_pixel_results( i ) = temp( index ); - }); - - af::shared output_array(output_pixel_size, af::init_functor_null()); - kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results); - - SCITBX_ASSERT(output_array.size() == output_pixel_size); - return output_array; - } }; diff --git a/simtbx/kokkos/simulation.cpp b/simtbx/kokkos/simulation.cpp index b4fd96861b1..cf2d79dd4e1 100644 --- a/simtbx/kokkos/simulation.cpp +++ b/simtbx/kokkos/simulation.cpp @@ -4,6 +4,7 @@ #include "simtbx/kokkos/simulation_kernels.h" #include "kokkostbx/kokkos_utils.h" #include "scitbx/array_family/flex_types.h" +#include "simtbx/kokkos/detector.h" #define THREADS_PER_BLOCK_X 128 #define THREADS_PER_BLOCK_Y 1 @@ -42,5 +43,177 @@ namespace Kokkos { } */ + template<> void + exascale_api::add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_unified.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. */ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + + template<> void + exascale_api::add_energy_multichannel_mask_allpanel( + af::shared const ichannels, + simtbx::Kokkos::kokkos_energy_channels & kec, + simtbx::Kokkos::kokkos_detector & kdt, + af::shared const active_pixel_list, + af::shared const weight + ){ + //SCITBX_CHECK_POINT; // tested by tst_memory_policy.py + kdt.set_active_pixels_on_GPU(active_pixel_list); + + // transfer source_I, source_lambda + // the int arguments are for sizes of the arrays + + SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs + the simulation where to look for structure factors. If -1, skip this source wavelength. */ + SCITBX_ASSERT(SIM.sources == weight.size()); + + for (int ictr = 0; ictr < SIM.sources; ++ictr){ + if (ichannels[ictr] < 0) continue; // the ichannel array + //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], + // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], + // SIM.source_I[ictr], SIM.source_lambda[ictr]); + + ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); + calc_CrystalOrientations( + SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, + SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); + + // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: + vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; + + vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); + vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); + vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); + vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); + vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); + double weighted_I = SIM.source_I[ictr] * weight[ictr]; + kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); + kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); + kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); + + // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) + diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; + + debranch_maskall_low_memory_Kernel( + kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), + SIM.oversample, SIM.point_pixel, + SIM.pixel_size, m_subpixel_size, m_steps, + SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, + kdt.m_sdet_vector, + kdt.m_fdet_vector, + kdt.m_odet_vector, + kdt.m_pix0_vector, + kdt.m_distance, + m_beam_vector, + SIM.dmin, SIM.phisteps, 1, + c_source_X, c_source_Y, + c_source_Z, + c_source_I, c_source_lambda, + SIM.mosaic_domains, m_crystal_orientation, + SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, + m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, + SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, + SIM.default_F, + current_channel_Fhkl, + kec.m_FhklParams, + SIM.nopolar, + m_polar_vector, SIM.polarization, SIM.fudge, + kdt.m_active_pixel_list, + diffuse, + // return arrays: + kdt.m_floatimage, + kdt.m_omega_reduction, + kdt.m_max_I_x_reduction, + kdt.m_max_I_y_reduction, + kdt.m_rangemap); + //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer + // cu_current_channel_Fhkl = NULL; + + add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); + }// loop over channels + } + } // Kokkos } // simtbx diff --git a/simtbx/kokkos/simulation.h b/simtbx/kokkos/simulation.h index 9b9d6a65522..9b69b4ce954 100644 --- a/simtbx/kokkos/simulation.h +++ b/simtbx/kokkos/simulation.h @@ -86,12 +86,12 @@ struct exascale_api { simtbx::Kokkos::kokkos_detector &, af::shared const ); */ - /*void add_energy_multichannel_mask_allpanel(af::shared const, + void add_energy_multichannel_mask_allpanel(af::shared const, simtbx::Kokkos::kokkos_energy_channels &, simtbx::Kokkos::kokkos_detector &, af::shared const, af::shared const - ); */ + ); //void add_background(simtbx::Kokkos::kokkos_detector &, int const&); //af::flex_double add_noise(simtbx::Kokkos::kokkos_detector &); @@ -330,91 +330,6 @@ struct exascale_api { add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); } - inline void - add_energy_multichannel_mask_allpanel( - af::shared const ichannels, - simtbx::Kokkos::kokkos_energy_channels & kec, - simtbx::Kokkos::kokkos_detector & kdt, - af::shared const active_pixel_list, - af::shared const weight - ){ - //SCITBX_CHECK_POINT; // tested by tst_unified.py - kdt.set_active_pixels_on_GPU(active_pixel_list); - - // transfer source_I, source_lambda - // the int arguments are for sizes of the arrays - - SCITBX_ASSERT(SIM.sources == ichannels.size()); /* For each nanoBragg source, this value instructs - the simulation where to look for structure factors. If -1, skip this source wavelength. */ - SCITBX_ASSERT(SIM.sources == weight.size()); - - for (int ictr = 0; ictr < SIM.sources; ++ictr){ - if (ichannels[ictr] < 0) continue; // the ichannel array - //printf("HA HA %4d channel %4d, %5.1f %5.1f %5.1f %10.5f %10.5g\n",ictr,ichannels[ictr], - // SIM.source_X[ictr], SIM.source_Y[ictr], SIM.source_Z[ictr], - // SIM.source_I[ictr], SIM.source_lambda[ictr]); - - ::Kokkos::resize(m_crystal_orientation, SIM.phisteps, SIM.mosaic_domains, 3); - calc_CrystalOrientations( - SIM.phi0, SIM.phistep, SIM.phisteps, m_spindle_vector, m_a0, m_b0, m_c0, SIM.mosaic_spread, - SIM.mosaic_domains, m_mosaic_umats, m_crystal_orientation); - - // magic happens here: take pointer from singleton, temporarily use it for add Bragg iteration: - vector_cudareal_t current_channel_Fhkl = kec.d_channel_Fhkl[ichannels[ictr]]; - - vector_cudareal_t c_source_X = vector_cudareal_t("c_source_X", 0); - vector_cudareal_t c_source_Y = vector_cudareal_t("c_source_Y", 0); - vector_cudareal_t c_source_Z = vector_cudareal_t("c_source_Z", 0); - vector_cudareal_t c_source_I = vector_cudareal_t("c_source_I", 0); - vector_cudareal_t c_source_lambda = vector_cudareal_t("c_source_lambda", 0); - double weighted_I = SIM.source_I[ictr] * weight[ictr]; - kokkostbx::transfer_double2kokkos(c_source_X, &(SIM.source_X[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Y, &(SIM.source_Y[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_Z, &(SIM.source_Z[ictr]), 1); - kokkostbx::transfer_double2kokkos(c_source_I, &(weighted_I), 1); - kokkostbx::transfer_double2kokkos(c_source_lambda, &(SIM.source_lambda[ictr]), 1); - - // set up cell basis vectors for the diffuse parameters (convert vec4 to vec3) - diffuse.a0 << SIM.a0[1],SIM.a0[2],SIM.a0[3]; diffuse.b0 << SIM.b0[1],SIM.b0[2],SIM.b0[3]; diffuse.c0 << SIM.c0[1],SIM.c0[2],SIM.c0[3]; - - debranch_maskall_Kernel( - kdt.m_panel_count, kdt.m_slow_dim_size, kdt.m_fast_dim_size, active_pixel_list.size(), - SIM.oversample, SIM.point_pixel, - SIM.pixel_size, m_subpixel_size, m_steps, - SIM.detector_thickstep, SIM.detector_thicksteps, SIM.detector_thick, SIM.detector_attnlen, - kdt.m_sdet_vector, - kdt.m_fdet_vector, - kdt.m_odet_vector, - kdt.m_pix0_vector, - kdt.m_distance, - m_beam_vector, - SIM.dmin, SIM.phisteps, 1, - c_source_X, c_source_Y, - c_source_Z, - c_source_I, c_source_lambda, - SIM.mosaic_domains, m_crystal_orientation, - SIM.Na, SIM.Nb, SIM.Nc, SIM.V_cell, - m_water_size, m_water_F, m_water_MW, simtbx::nanoBragg::r_e_sqr, - SIM.fluence, simtbx::nanoBragg::Avogadro, SIM.spot_scale, SIM.integral_form, - SIM.default_F, - current_channel_Fhkl, - kec.m_FhklParams, - SIM.nopolar, - m_polar_vector, SIM.polarization, SIM.fudge, - kdt.m_active_pixel_list, - diffuse, - // return arrays: - kdt.m_floatimage, - kdt.m_omega_reduction, - kdt.m_max_I_x_reduction, - kdt.m_max_I_y_reduction, - kdt.m_rangemap); - //don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer - // cu_current_channel_Fhkl = NULL; - add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage); - }// loop over channels - } - inline void add_background(simtbx::Kokkos::kokkos_detector & kdt, int const& override_source) { // cudaSafeCall(cudaSetDevice(SIM.device_Id)); diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h index 7c1fa9eb21b..dd78e002421 100644 --- a/simtbx/kokkos/simulation_kernels.h +++ b/simtbx/kokkos/simulation_kernels.h @@ -643,6 +643,306 @@ void debranch_maskall_Kernel(int npanels, int spixels, int fpixels, int total_pi }); } +void debranch_maskall_low_memory_Kernel(int npanels, int spixels, int fpixels, int total_pixels, + int oversample, int point_pixel, + CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps, + CUDAREAL detector_thickstep, int detector_thicksteps, CUDAREAL detector_thick, CUDAREAL detector_mu, + const view_1d_t sdet_vector, const view_1d_t fdet_vector, + const view_1d_t odet_vector, const view_1d_t pix0_vector, + const vector_cudareal_t close_distance, + const vector_cudareal_t beam_vector, + CUDAREAL dmin, int phisteps, int sources, + const vector_cudareal_t source_X, const vector_cudareal_t source_Y, + const vector_cudareal_t source_Z, + const vector_cudareal_t source_I, const vector_cudareal_t source_lambda, + int mosaic_domains, crystal_orientation_t crystal_orientation, + CUDAREAL Na, CUDAREAL Nb, CUDAREAL Nc, CUDAREAL V_cell, + CUDAREAL water_size, CUDAREAL water_F, CUDAREAL water_MW, CUDAREAL r_e_sqr, + CUDAREAL fluence, CUDAREAL Avogadro, CUDAREAL spot_scale, int integral_form, + CUDAREAL default_F, + const vector_cudareal_t Fhkl, + const hklParams FhklParams, + int nopolar, + const vector_cudareal_t polar_vector, CUDAREAL polarization, CUDAREAL fudge, + const vector_size_t pixel_lookup, + simtbx::Kokkos::diffuse_api diffuse, + vector_float_t floatimage /*out*/, + vector_float_t omega_reduction /*out*/, vector_float_t max_I_x_reduction /*out*/, + vector_float_t max_I_y_reduction /*out*/, vector_bool_t rangemap) { + + + const int s_h_min = FhklParams.h_min; + const int s_k_min = FhklParams.k_min; + const int s_l_min = FhklParams.l_min; + const int s_h_range = FhklParams.h_range; + const int s_k_range = FhklParams.k_range; + const int s_l_range = FhklParams.l_range; + const int s_h_max = s_h_min + s_h_range - 1; + const int s_k_max = s_k_min + s_k_range - 1; + const int s_l_max = s_l_min + s_l_range - 1; + + // set up diffuse scattering if needed + vector_mat3_t laue_mats = vector_mat3_t("laue_mats",24); + vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace",3); + vector_vec3_t dG_dgam = vector_vec3_t("dG_dgam",3); + int num_laue_mats = 0; + int dhh = 0, dkk = 0, dll = 0; + KOKKOS_MAT3 rotate_principal_axes = diffuse.rotate_principal_axes; // (1,0,0,0,1,0,0,0,1); + int laue_group_num = diffuse.laue_group_num; + int stencil_size = diffuse.stencil_size; + KOKKOS_MAT3 anisoG = diffuse.anisoG; // (300.,0,0,0,100.,0,0,0,300.); + KOKKOS_MAT3 anisoU = diffuse.anisoU; // (0.48,0,0,0,0.16,0,0,0,0.16); + KOKKOS_MAT3 Bmat_realspace(1.,0,0,0,1.,0,0,0,1.); // Placeholder + KOKKOS_MAT3 anisoG_local; + CUDAREAL anisoG_determ = 0; + KOKKOS_MAT3 anisoU_local; + bool use_diffuse=diffuse.enable; + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + vector_mat3_t diffuse_scale_mat3 = vector_mat3_t("diffuse_scale_mat3",1); + + if (use_diffuse){ + anisoG_local = anisoG; + anisoU_local = anisoU; + + if (laue_group_num < 1 || laue_group_num >14 ){ + throw std::string("Laue group number not in range 1-14"); + } + + Kokkos::parallel_reduce("prepare diffuse mats", 1, KOKKOS_LAMBDA (const int& i, int& num_laue_mats_temp){ + num_laue_mats_temp = gen_laue_mats(laue_group_num, laue_mats, rotate_principal_axes); + // KOKKOS_MAT3 rotate_principal_axes; + // rotate_principal_axes << 0.70710678, -0.70710678, 0., 0.70710678, 0.70710678, 0., 0., 0., 1.; + + KOKKOS_MAT3 Amatrix(diffuse.a0[0],diffuse.a0[1],diffuse.a0[2],diffuse.b0[0],diffuse.b0[1],diffuse.b0[2], + diffuse.c0[0],diffuse.c0[1],diffuse.c0[2]); + KOKKOS_MAT3 Ainv = Amatrix.inverse()*1.e-10; + CUDAREAL reciprocal_space_volume = 8*M_PI*M_PI*M_PI*Ainv.determinant(); + CUDAREAL _tmpfac = M_PI * 0.63 / fudge; + CUDAREAL diffuse_scale = reciprocal_space_volume * sqrt(_tmpfac*_tmpfac*_tmpfac); + + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + diffuse_scale_mat3(0)(0,0) = diffuse_scale; + + for ( int iL = 0; iL < num_laue_mats_temp; iL++ ){ + laue_mats(iL) = Ainv * laue_mats(iL); + } + const KOKKOS_MAT3 Ginv = anisoG_local.inverse(); + const KOKKOS_MAT3 dG = Bmat_realspace * Ginv; + for (int i_gam=0; i_gam<3; i_gam++){ + dG_dgam(i_gam)[i_gam] = 1; + KOKKOS_MAT3 temp_dgam; + temp_dgam(i_gam, 0) = dG_dgam(i_gam)[0]; + temp_dgam(i_gam, 1) = dG_dgam(i_gam)[1]; + temp_dgam(i_gam, 2) = dG_dgam(i_gam)[2]; + dG_trace(i_gam) = (Ginv*temp_dgam).trace(); + } + }, num_laue_mats); + anisoG_determ = anisoG_local.determinant(); + dhh = dkk = dll = stencil_size; // Limits of stencil for diffuse calc + } + KOKKOS_VEC3 dHH(dhh,dkk,dll); + // end of diffuse setup + +// Implementation notes. This kernel is aggressively debranched, therefore the assumptions are: +// 1) mosaicity non-zero positive +// 2) xtal shape is "Gauss" i.e. 3D spheroid. +// 3) No bounds check for access to the structure factor array. +// 4) No check for Flatt=0. + const CUDAREAL dmin_r = (dmin > 0.0) ? 1/dmin : 0.0; + + // add background from something amorphous, precalculate scaling + const CUDAREAL F_bg = water_F; + const CUDAREAL I_bg = F_bg * F_bg * r_e_sqr * fluence * water_size * water_size * water_size * 1e6 * Avogadro / water_MW; + const CUDAREAL I_factor = r_e_sqr * spot_scale * fluence / steps; + + Kokkos::parallel_for("debranch_maskall", total_pixels, KOKKOS_LAMBDA(const int& pixIdx) { + + vec3 polar_vector_tmp {polar_vector(1), polar_vector(2), polar_vector(3)}; + + // position in pixel array + const int j = pixel_lookup(pixIdx);//pixIdx: index into pixel subset; j: index into the data. + const int i_panel = j / (fpixels*spixels); // the panel number + const int j_panel = j % (fpixels*spixels); // the pixel number within the panel + const int fpixel = j_panel % fpixels; + const int spixel = j_panel / fpixels; + const int outIdx = pixIdx; + + // reset photon count for this pixel + CUDAREAL I = I_bg; + CUDAREAL omega_sub_reduction = 0.0; + CUDAREAL max_I_x_sub_reduction = 0.0; + CUDAREAL max_I_y_sub_reduction = 0.0; + CUDAREAL polar = 0.0; + if (nopolar) { + polar = 1.0; + } + + // add this now to avoid problems with skipping later + // move this to the bottom to avoid accessing global device memory. floatimage[j] = I_bg; + // loop over sub-pixels + int subS, subF; + for (subS = 0; subS < oversample; ++subS) { // Y voxel + for (subF = 0; subF < oversample; ++subF) { // X voxel + // absolute mm position on detector (relative to its origin) + CUDAREAL Fdet = subpixel_size * (fpixel * oversample + subF) + subpixel_size / 2.0; // X voxel + CUDAREAL Sdet = subpixel_size * (spixel * oversample + subS) + subpixel_size / 2.0; // Y voxel + // Fdet = pixel_size*fpixel; + // Sdet = pixel_size*spixel; + + max_I_x_sub_reduction = Fdet; + max_I_y_sub_reduction = Sdet; + + int thick_tic; + for (thick_tic = 0; thick_tic < detector_thicksteps; ++thick_tic) { + // assume "distance" is to the front of the detector sensor layer + CUDAREAL Odet = thick_tic * detector_thickstep; // Z Orthagonal voxel. + + // construct detector subpixel position in 3D space + // pixel_X = distance; + // pixel_Y = Sdet-Ybeam; + // pixel_Z = Fdet-Xbeam; + vec3 pixel_pos; + pixel_pos += Fdet * fdet_vector(i_panel); + pixel_pos += Sdet * sdet_vector(i_panel); + pixel_pos += Odet * odet_vector(i_panel); + pixel_pos += pix0_vector(i_panel); + + // construct the diffracted-beam unit vector to this sub-pixel + CUDAREAL airpath_r = 1 / pixel_pos.length(); + vec3 diffracted = pixel_pos.get_unit_vector(); + + // solid angle subtended by a pixel: (pix/airpath)^2*cos(2theta) + CUDAREAL omega_pixel = pixel_size * pixel_size * airpath_r * airpath_r * close_distance(i_panel) * airpath_r; + // option to turn off obliquity effect, inverse-square-law only + if (point_pixel) { + omega_pixel = airpath_r * airpath_r; + } + + // now calculate detector thickness effects + CUDAREAL capture_fraction = 1.0; + if (detector_thick > 0.0 && detector_mu> 0.0) { + // inverse of effective thickness increase + CUDAREAL parallax = odet_vector(i_panel).dot(diffracted); + capture_fraction = exp(-thick_tic * detector_thickstep / detector_mu / parallax) + - exp(-(thick_tic + 1) * detector_thickstep / detector_mu / parallax); + } + + // loop over sources now + int source; + for (source = 0; source < sources; ++source) { + + // retrieve stuff from cache + vec3 incident = {-source_X(source), -source_Y(source), -source_Z(source)}; + CUDAREAL lambda = source_lambda(source); + CUDAREAL source_fraction = source_I(source); + + // construct the incident beam unit vector while recovering source distance + // TODO[Giles]: Optimization! We can unitize the source vectors before passing them in. + incident.normalize(); + + // construct the scattering vector for this pixel + vec3 scattering = (diffracted - incident) / lambda; + CUDAREAL stol = 0.5 * scattering.length(); + + // rough cut to speed things up when we aren't using whole detector + if (dmin > 0.0 && stol > 0.0) { + // use reciprocal of (dmin > 0.5 / stol) + if (dmin_r <= 2 * stol) { + continue; + } + } + + // polarization factor + if (!nopolar) { + // need to compute polarization factor + polar = polarization_factor(polarization, incident, diffracted, polar_vector_tmp); + } else { + polar = 1.0; + } + + // sweep over phi angles + for (int phi_tic = 0; phi_tic < phisteps; ++phi_tic) { + // enumerate mosaic domains + for (int mos_tic = 0; mos_tic < mosaic_domains; ++mos_tic) { + // apply mosaic rotation after phi rotation + auto a = crystal_orientation(phi_tic, mos_tic, 0); + auto b = crystal_orientation(phi_tic, mos_tic, 1); + auto c = crystal_orientation(phi_tic, mos_tic, 2); + + // construct fractional Miller indicies + CUDAREAL h = a.dot(scattering); + CUDAREAL k = b.dot(scattering); + CUDAREAL l = c.dot(scattering); + + // round off to nearest whole index + int h0 = ceil(h - 0.5); + int k0 = ceil(k - 0.5); + int l0 = ceil(l - 0.5); + + // structure factor of the lattice (paralelpiped crystal) + // F_latt = sin(M_PI*s_Na*h)*sin(M_PI*s_Nb*k)*sin(M_PI*s_Nc*l)/sin(M_PI*h)/sin(M_PI*k)/sin(M_PI*l); + + CUDAREAL I_latt = 1.0; // Shape transform for the crystal. + CUDAREAL hrad_sqr = 0.0; + + // handy radius in reciprocal space, squared + hrad_sqr = (h - h0) * (h - h0) * Na * Na + (k - k0) * (k - k0) * Nb * Nb + (l - l0) * (l - l0) * Nc * Nc; + // fudge the radius so that volume and FWHM are similar to square_xtal spots + double my_arg = hrad_sqr / 0.63 * fudge; + { + CUDAREAL F_latt = Na * Nb * Nc * exp(-(my_arg)); + I_latt = F_latt * F_latt; + } + + // new code for diffuse. + if (use_diffuse) { + KOKKOS_VEC3 H_vec(h,k,l); + KOKKOS_VEC3 H0(h0,k0,l0); + CUDAREAL step_diffuse_param[6]; + // ***NEEDS UPDATE: use legacy API for passing diffuse scale as KOKKOS_MAT3 + calc_diffuse_at_hkl(H_vec,H0,dHH,s_h_min,s_k_min,s_l_min,s_h_max,s_k_max, + s_l_max,s_h_range,s_k_range,s_l_range,diffuse_scale_mat3(0),Fhkl,num_laue_mats,laue_mats, + anisoG_local,dG_trace,anisoG_determ,anisoU_local,dG_dgam,false,&I_latt,step_diffuse_param); + } + // end s_use_diffuse outer + + // structure factor of the unit cell + CUDAREAL F_cell = default_F; + //F_cell = quickFcell_ldg(s_hkls, s_h_max, s_h_min, s_k_max, s_k_min, s_l_max, s_l_min, h0, k0, l0, s_h_range, s_k_range, s_l_range, default_F, Fhkl); + if ( + h0 < s_h_min || + k0 < s_k_min || + l0 < s_l_min || + h0 > s_h_max || + k0 > s_k_max || + l0 > s_l_max + ) { + F_cell = 0.; + } else { + const int hkl_index = (h0-s_h_min)*s_k_range*s_l_range + (k0-s_k_min)*s_l_range + (l0-s_l_min); + F_cell = Fhkl[hkl_index]; + } + + // now we have the structure factor for this pixel + + // convert amplitudes into intensity (photons per steradian) + I += F_cell * F_cell * I_latt * source_fraction * capture_fraction * omega_pixel; + omega_sub_reduction += omega_pixel; + } // end of mosaic loop + } // end of phi loop + } // end of source loop + } // end of detector thickness loop + } // end of sub-pixel y loop + } // end of sub-pixel x loop + const double photons = I_bg + I_factor * polar * I; + floatimage( outIdx ) = photons; + //omega_reduction( outIdx ) = omega_sub_reduction; // shared contention + //max_I_x_reduction( outIdx ) = max_I_x_sub_reduction; + //max_I_y_reduction( outIdx ) = max_I_y_sub_reduction; + //rangemap( outIdx ) = true; + }); +} + // __global__ void nanoBraggSpotsInitCUDAKernel(int spixels, int fpixesl, float * floatimage, float * omega_reduction, // float * max_I_x_reduction, // float * max_I_y_reduction, bool * rangemap); diff --git a/simtbx/tests/tst_memory_policy.py b/simtbx/tests/tst_memory_policy.py index dbb94cee5ed..177d6e5a97e 100644 --- a/simtbx/tests/tst_memory_policy.py +++ b/simtbx/tests/tst_memory_policy.py @@ -233,10 +233,12 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg per_image_scale_factor = self.domains_per_crystal # 1.0 self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU self.reset_pythony_beams(self.SIM) + print("AAA") self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels) + print("BBB") -def get_whitelist_from_refls(prefix,SIM): - image_size = len(SIM.raw_pixels) +def get_whitelist_from_refls(prefix,SIM=None): + #image_size = len(SIM.raw_pixels) from dials.array_family import flex from dxtbx.model.experiment_list import ExperimentListFactory experiments = ExperimentListFactory.from_json_file("%s_imported.expt"%prefix, check_format = False) @@ -353,8 +355,9 @@ def run_all(params): free_gpu_before = get_gpu_memory()[0] del SWCs free_gpu_after = get_gpu_memory()[0] - print((free_gpu_after - free_gpu_before)/NTRIALS,"free") - assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB + old_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(old_memory_use,"free") + assert old_memory_use >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size #accomplish the same with compile-time polymorphism @@ -378,16 +381,51 @@ def run_all(params): free_gpu_before = get_gpu_memory()[0] del SWCs free_gpu_after = get_gpu_memory()[0] - print((free_gpu_after - free_gpu_before)/NTRIALS,"free") - assert (free_gpu_after - free_gpu_before)/NTRIALS >= 50 # old policy uses at least 50 MB per sim., actual value 57.6 MB + new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(new_memory_use,"free") + assert old_memory_use > 4.*new_memory_use # new policy cuts down at least 4-fold on memory, actual value ** - return +def run_subset_for_NESAP_debug(params): + # make the dxtbx objects + BEAM = basic_beam() + DETECTOR = basic_detector() + CRYSTAL = basic_crystal() + SF_model = amplitudes(CRYSTAL) + # Famp = SF_model.Famp # simple uniform amplitudes + SF_model.random_structure(CRYSTAL) + SF_model.ersatz_correct_to_P1() + whitelist_pixels = flex.size_t((11929, + 351293,351294,351295,352828,352829,352830,352831,354364,354365, + 354366,354367,355900,355901,355902,355903,357436,357437,357438, + 357439,352383,352384,352385,353919,353920,353921,355455,355456, + 355457,356991,356992,356993,354120,354121,354122,355656,355657)) + + NTRIALS=5 + #figure out the minimum change needed to reduce memory consumption by factor of image_size/whitelist_size + #accomplish the same with compile-time polymorphism + #have side-by-side test in same python script + # Reproduce whitelist sims with small-memory mechanism + SWCs=[] + for x in range(NTRIALS): + print("Whitelist-only iteration with small memory",x) + SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.]))) + SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True) + #produce an output image file for intermediate debugging + working_raw_pixels = flex.double(image_size) # blank array + working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values) + working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus())) + + free_gpu_before = get_gpu_memory()[0] + del SWCs + free_gpu_after = get_gpu_memory()[0] + new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS + print(new_memory_use,"free") if __name__=="__main__": params,options = parse_input() # Initialize based on GPU context gpu_instance_type = get_exascale("gpu_instance", params.context) gpu_instance = gpu_instance_type(deviceId = 0) - - run_all(params) + #run_all(params) + run_subset_for_NESAP_debug(params) print("OK")