diff --git a/simtbx/diffBragg/device.py b/simtbx/diffBragg/device.py
index 1dade37c57..fcd4bc952a 100644
--- a/simtbx/diffBragg/device.py
+++ b/simtbx/diffBragg/device.py
@@ -21,6 +21,7 @@ def __init__(self, gpu_id, force=False):
         """
         self.gpu_id = gpu_id
         self.force_kokkos_init = force
+        self.is_using_kokkos_gpu = USE_KOKKOS_GPU
 
     def __enter__(self):
         if USE_KOKKOS_GPU or self.force_kokkos_init:
diff --git a/simtbx/diffBragg/phil.py b/simtbx/diffBragg/phil.py
index b6558f77fb..2414d298d4 100644
--- a/simtbx/diffBragg/phil.py
+++ b/simtbx/diffBragg/phil.py
@@ -1319,6 +1319,9 @@
     .help = diffbragg offers CUDA support via the DIFFBRAGG_USE_CUDA=1 environment variable specification
     .help = or openmp support using the OMP_NUM_THREADS flag
     .help = The exascale only uses CUDA (will raise error if CUDA is not confugured)
+  mosaic_samples_override = None
+    .type = int
+    .help = Specify the number of mosaic spread samples
 }
 """
 
diff --git a/simtbx/diffBragg/src/diffBragg.cpp b/simtbx/diffBragg/src/diffBragg.cpp
index a42cc5b960..03fa68afee 100644
--- a/simtbx/diffBragg/src/diffBragg.cpp
+++ b/simtbx/diffBragg/src/diffBragg.cpp
@@ -1967,7 +1967,7 @@ void diffBragg::add_diffBragg_spots(const af::shared<size_t>& panels_fasts_slows
     db_cryst.Nd = Nd;
     db_cryst.Ne = Ne;
     db_cryst.Nf = Nf;
-
+    db_cryst.xtal_shape = xtal_shape;
 
     db_beam.number_of_sources = sources;
     db_beam.source_X = source_X;
diff --git a/simtbx/diffBragg/src/diffBraggCUDA.cu b/simtbx/diffBragg/src/diffBraggCUDA.cu
index 9c58687f40..1e5731e8c3 100644
--- a/simtbx/diffBragg/src/diffBraggCUDA.cu
+++ b/simtbx/diffBragg/src/diffBraggCUDA.cu
@@ -39,13 +39,6 @@ void diffBragg_sum_over_steps_cuda(
         diffBragg_cudaPointers& cp,
         timer_variables& TIMERS){
 
-
-    if (db_cryst.phisteps > 1){
-        printf("PHI (goniometer rotations) not supported in GPU code: phi0=%f phisteps=%d, phistep=%f\n", db_cryst.phi0, db_cryst.phisteps, db_cryst.phistep);
-        printf("SPINDLE VEC: %f %f %f\n",db_cryst.spindle_vec[0], db_cryst.spindle_vec[1], db_cryst.spindle_vec[2]);
-        exit(-1);
-    }
-
     int numblocks;
     int blocksize;
     char* diffBragg_blocks = getenv("DIFFBRAGG_NUM_BLOCKS");
@@ -513,7 +506,9 @@ void diffBragg_sum_over_steps_cuda(
         cp.data_freq, cp.data_trusted,
         cp.FhklLinear_ASUid,
         cp.Fhkl_channels,
-        cp.Fhkl_scale, cp.Fhkl_scale_deriv
+        cp.Fhkl_scale, cp.Fhkl_scale_deriv,
+        db_cryst.xtal_shape==GAUSS_STAR,
+        db_cryst.xtal_shape==SQUARE
         );
 
     error_msg(cudaGetLastError(), "after kernel call");
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
index 273088f765..bca80ba988 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.cpp
@@ -57,12 +57,6 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
     cuda_flags& db_cu_flags,
     // diffBragg_kokkosPointers& kp,
     timer_variables& TIMERS) {
-    if (db_cryst.phi0 != 0 || db_cryst.phisteps > 1) {
-        printf(
-            "PHI (goniometer position) not supported in GPU code: phi0=%f phisteps=%d, phistep=%f\n",
-            db_cryst.phi0, db_cryst.phisteps, db_cryst.phistep);
-        exit(-1);
-    }
 
     Kokkos::Tools::pushRegion("diffBragg_sum_over_steps_kokkos");
     Kokkos::Tools::pushRegion("local detector, beam and crystal");
@@ -529,7 +523,10 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
             m_data_freq, m_data_trusted,
             m_FhklLinear_ASUid,
             m_Fhkl_channels,
-            m_Fhkl_scale, m_Fhkl_scale_deriv);
+            m_Fhkl_scale, m_Fhkl_scale_deriv,
+            db_cryst.xtal_shape==GAUSS_STAR,
+            db_cryst.xtal_shape==SQUARE
+            );
     } else {
         kokkos_sum_over_steps(
             Npix_to_model, m_panels_fasts_slows, m_floatimage, m_wavelenimage, m_d_Umat_images,
@@ -568,7 +565,9 @@ void diffBraggKOKKOS::diffBragg_sum_over_steps_kokkos(
             m_data_freq, m_data_trusted,
             m_FhklLinear_ASUid,
             m_Fhkl_channels,
-            m_Fhkl_scale, m_Fhkl_scale_deriv
+            m_Fhkl_scale, m_Fhkl_scale_deriv,
+            db_cryst.xtal_shape==GAUSS_STAR,
+            db_cryst.xtal_shape==SQUARE
             );
     }
 
diff --git a/simtbx/diffBragg/src/diffBraggKOKKOS.h b/simtbx/diffBragg/src/diffBraggKOKKOS.h
index 3aa07ae27b..7df7346b46 100644
--- a/simtbx/diffBragg/src/diffBraggKOKKOS.h
+++ b/simtbx/diffBragg/src/diffBraggKOKKOS.h
@@ -10,6 +10,7 @@
 #include "simtbx/diffBragg/src/util.h"
 #include "simtbx/diffBragg/src/util_kokkos.h"
 #include "simtbx/diffBragg/src/diffBragg_refine_flag.h"
+#include "simtbx/nanoBragg/nanotypes.h"
 
 using vector_vec3_t = view_1d_t<KOKKOS_VEC3>;
 using vector_mat3_t = view_1d_t<KOKKOS_MAT3>;
diff --git a/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp b/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp
index 94cd9fa501..d3458b7ac2 100644
--- a/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp
+++ b/simtbx/diffBragg/src/diffBragg_cpu_kernel.cpp
@@ -401,6 +401,13 @@ void diffBragg_sum_over_steps(
         double fp_fdp_manager_dI[2] = {0,0};
         double dI_latt_diffuse[6] = {0,0,0,0,0,0};
 
+        // compute unit cell volume
+        Eigen::Vector3d ap_vec(1e10*db_cryst.eig_B(0,0), 1e10*db_cryst.eig_B(1,0), 1e10*db_cryst.eig_B(2,0));
+        Eigen::Vector3d bp_vec(1e10*db_cryst.eig_B(0,1), 1e10*db_cryst.eig_B(1,1), 1e10*db_cryst.eig_B(2,1));
+        Eigen::Vector3d cp_vec(1e10*db_cryst.eig_B(0,2), 1e10*db_cryst.eig_B(1,2), 1e10*db_cryst.eig_B(2,2));
+        double cell_vol = ap_vec.dot(bp_vec.cross(cp_vec)); // cubic Angstrom
+        double xtal_size = pow(db_cryst.Na*db_cryst.Nb*db_cryst.Nc*cell_vol, (double)1/double(3)); // Angstrom
+
         for (int i_step=0; i_step < db_steps.Nsteps; i_step++){
 
             int subS = db_steps.subS_pos[i_step];
@@ -474,25 +481,21 @@ void diffBragg_sum_over_steps(
                 }
             }
 
-            double phi = db_cryst.phi0 + db_cryst.phistep*phi_tic;
-            Eigen::Matrix3d Bmat_realspace = db_cryst.eig_B;
-            if( phi != 0.0 )
-            {
-                double cosphi = cos(phi);
-                double sinphi = sin(phi);
-                Eigen::Vector3d ap_vec(db_cryst.eig_B(0,0), db_cryst.eig_B(1,0), db_cryst.eig_B(2,0));
-                Eigen::Vector3d bp_vec(db_cryst.eig_B(0,1), db_cryst.eig_B(1,1), db_cryst.eig_B(2,1));
-                Eigen::Vector3d cp_vec(db_cryst.eig_B(0,2), db_cryst.eig_B(1,2), db_cryst.eig_B(2,2));
-
-                ap_vec = ap_vec*cosphi + db_cryst.spindle_vec.cross(ap_vec)*sinphi + db_cryst.spindle_vec*(db_cryst.spindle_vec.dot(ap_vec))*(1-cosphi);
-                bp_vec = bp_vec*cosphi + db_cryst.spindle_vec.cross(bp_vec)*sinphi + db_cryst.spindle_vec*(db_cryst.spindle_vec.dot(bp_vec))*(1-cosphi);
-                cp_vec = cp_vec*cosphi + db_cryst.spindle_vec.cross(cp_vec)*sinphi + db_cryst.spindle_vec*(db_cryst.spindle_vec.dot(cp_vec))*(1-cosphi);
-
-                Bmat_realspace << ap_vec[0], bp_vec[0], cp_vec[0],
-                                    ap_vec[1], bp_vec[1], cp_vec[1],
-                                    ap_vec[2], bp_vec[2], cp_vec[2];
-            }
-            Bmat_realspace *= 1e10;
+            Eigen::Matrix3d Bmat_realspace = 1e10*db_cryst.eig_B;
+            //if( phi != 0.0 )
+            //{
+            //    double cosphi = cos(phi);
+            //    double sinphi = sin(phi);
+
+            //    ap_vec = ap_vec*cosphi + db_cryst.spindle_vec.cross(ap_vec)*sinphi + db_cryst.spindle_vec*(db_cryst.spindle_vec.dot(ap_vec))*(1-cosphi);
+            //    bp_vec = bp_vec*cosphi + db_cryst.spindle_vec.cross(bp_vec)*sinphi + db_cryst.spindle_vec*(db_cryst.spindle_vec.dot(bp_vec))*(1-cosphi);
+            //    cp_vec = cp_vec*cosphi + db_cryst.spindle_vec.cross(cp_vec)*sinphi + db_cryst.spindle_vec*(db_cryst.spindle_vec.dot(cp_vec))*(1-cosphi);
+
+            //    Bmat_realspace << ap_vec[0], bp_vec[0], cp_vec[0],
+            //                        ap_vec[1], bp_vec[1], cp_vec[1],
+            //                        ap_vec[2], bp_vec[2], cp_vec[2];
+            //}
+            //Bmat_realspace *= 1e10;
             if (db_flags.use_diffuse && db_flags.gamma_miller_units){
               anisoG_local = anisoG_local * Bmat_realspace;
               for (int i_gam=0; i_gam<3; i_gam++){
@@ -501,9 +504,28 @@ void diffBragg_sum_over_steps(
             }
 
             Eigen::Matrix3d U = db_cryst.eig_U;
-            Eigen::Matrix3d UBO = (db_cryst.UMATS_RXYZ[mos_tic] * U*Bmat_realspace*(db_cryst.eig_O.transpose())).transpose();
 
-                Eigen::Matrix3d Ainv = U*(Bmat_realspace.transpose().inverse())* (db_cryst.eig_O.inverse());
+            Eigen::Matrix3d UBOt=U*Bmat_realspace*(db_cryst.eig_O.transpose());
+            double phi = db_cryst.phi0 + db_cryst.phistep*phi_tic;
+            if (phi != 0){
+                if (i_pix==0){
+                    printf("phistep=%f, phi0=%f, phi=%f\n", db_cryst.phistep, db_cryst.phi0, phi);
+                }
+
+                double c = cos(phi);
+                double omc = 1-c;
+                double s = sin(phi);
+                Eigen::Matrix3d Rphi;
+                double gx = db_cryst.spindle_vec[0];
+                double gy = db_cryst.spindle_vec[1];
+                double gz = db_cryst.spindle_vec[2];
+                Rphi << c + gx*gx*omc,    gx*gy*omc-gz*s,   gx*gz*omc+gy*s,
+                      gy*gx*omc + gz*s,   c + gy*gy*omc,   gy*gz*omc - gx*s,
+                      gz*gx*omc - gy*s,  gz*gy*omc + gx*s, c + gz*gz*omc;
+                UBOt = Rphi*UBOt;
+            }
+            Eigen::Matrix3d UBO = (db_cryst.UMATS_RXYZ[mos_tic]*UBOt).transpose();
+            Eigen::Matrix3d Ainv = UBO.inverse();
             Eigen::Vector3d q_vec(scattering[0], scattering[1], scattering[2]);
             q_vec *= 1e-10;
             Eigen::Vector3d H_vec = UBO*q_vec;
@@ -528,12 +550,30 @@ void diffBragg_sum_over_steps(
             Eigen::Vector3d V = NABC*delta_H;
             double hrad_sqr = V.dot(V);
             double NABC_determ = NABC.determinant();
-            double F_latt;
-            if (db_flags.no_Nabc_scale)
-                F_latt = exp(-( hrad_sqr / 0.63 * db_cryst.fudge ));
-            else
-                //F_latt = db_cryst.Na*db_cryst.Nb*db_cryst.Nc*exp(-( hrad_sqr / 0.63 * db_cryst.fudge ));
-                F_latt = NABC_determ*exp(-( hrad_sqr / 0.63 * db_cryst.fudge ));
+            double F_latt = 0;
+            if (db_cryst.xtal_shape==SQUARE){
+                F_latt = 1;
+                if (db_cryst.Na > 1)
+                        F_latt *= sincg(M_PI * h, db_cryst.Na);
+                if (db_cryst.Nb > 1)
+                        F_latt *= sincg(M_PI * k, db_cryst.Nb);
+                if (db_cryst.Nc > 1)
+                        F_latt *= sincg(M_PI * l, db_cryst.Nc);
+            }
+            else { // gaussian relp model
+                double exparg;
+                if (db_cryst.xtal_shape==GAUSS_STAR){
+                    Eigen::Vector3d delta_Q = UBO.inverse()*delta_H;
+                    double rad_star_sqr = delta_Q.dot(delta_Q)*xtal_size*xtal_size;
+                    exparg = rad_star_sqr *1.9* db_cryst.fudge ;
+                }
+                else
+                    exparg = hrad_sqr / 0.63 * db_cryst.fudge;
+
+                F_latt = exp(-exparg);
+                if (! db_flags.no_Nabc_scale)
+                    F_latt *= NABC_determ;
+            }
 
             /* convert amplitudes into intensity (photons per steradian) */
             if (!db_flags.oversample_omega && !db_flags.Fhkl_gradient_mode)
@@ -749,7 +789,6 @@ void diffBragg_sum_over_steps(
             //    printf("hkl= %f %f %f  hkl1= %d %d %d  Fcell=%f\n", h,k,l,h0,k0,l0, F_cell);
 
             double two_C = 2*C;
-            Eigen::Matrix3d UBOt = U*Bmat_realspace*(db_cryst.eig_O.transpose());
             if (db_flags.refine_Umat[0]){
                 Eigen::Matrix3d RyRzUBOt = db_cryst.RotMats[1]*db_cryst.RotMats[2]*UBOt;
                 Eigen::Vector3d delta_H_prime = (db_cryst.UMATS[mos_tic]*db_cryst.dRotMats[0]*RyRzUBOt).transpose()*q_vec;
diff --git a/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu b/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu
index f74319f810..babbe0b344 100644
--- a/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu
+++ b/simtbx/diffBragg/src/diffBragg_gpu_kernel.cu
@@ -2,6 +2,13 @@
 #include <simtbx/diffBragg/src/diffuse_util.h>
 #include <stdio.h>
 
+/* Fourier transform of a grating */
+__device__ CUDAREAL sincg(CUDAREAL x, CUDAREAL N) {
+	if (x != 0.0)
+		return sin(x * N) / sin(x);
+	return N;
+}
+
 __global__
 void gpu_sum_over_steps(
         int Npix_to_model, unsigned int* panels_fasts_slows,
@@ -66,11 +73,16 @@ void gpu_sum_over_steps(
         const int* __restrict__ data_freq, const bool* __restrict__ data_trusted,
         const int* __restrict__ FhklLinear_ASUid,
         const CUDAREAL* __restrict__ Fhkl_channels,
-        const CUDAREAL* __restrict__ Fhkl_scale, CUDAREAL* Fhkl_scale_deriv)
+        const CUDAREAL* __restrict__ Fhkl_scale, CUDAREAL* Fhkl_scale_deriv,
+        bool gaussian_star_shape, bool square_shape)
 { // BEGIN GPU kernel
 
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     int thread_stride = blockDim.x * gridDim.x;
+    __shared__ CUDAREAL s_phi0, s_phistep, gx,gy,gz;
+    __shared__ int s_phisteps;
+    __shared__ bool s_gaussian_star_shape;
+    __shared__ bool s_square_shape;
     __shared__ bool s_Fhkl_channels_empty;
     __shared__ bool s_Fhkl_have_scale_factors;
     __shared__ bool s_Fhkl_gradient_mode;
@@ -95,9 +107,9 @@ void gpu_sum_over_steps(
     __shared__ CUDAREAL two_C;
     __shared__ MAT3 Bmat_realspace;
     __shared__ MAT3 Amat_init;
-    //__shared__ CUDAREAL s_Na;
-    //__shared__ CUDAREAL s_Nb;
-    //__shared__ CUDAREAL s_Nc;
+    __shared__ CUDAREAL s_Na;
+    __shared__ CUDAREAL s_Nb;
+    __shared__ CUDAREAL s_Nc;
     __shared__ CUDAREAL s_NaNbNc_squared;
     __shared__ int s_h_max, s_k_max, s_l_max, s_h_min, s_k_min, s_l_min, s_h_range, s_k_range, s_l_range;
     __shared__ int s_oversample, s_detector_thicksteps, s_sources, s_mosaic_domains,  s_printout_fpixel,
@@ -126,8 +138,17 @@ void gpu_sum_over_steps(
     __shared__ VEC3 Hmin, Hmax, dHH, Hrange;
     //extern __shared__ CUDAREAL det_vecs[];
     //__shared__ int det_stride;
+    __shared__ CUDAREAL s_xtal_size_sq;
 
     if (threadIdx.x==0){ // TODO can we get speed gains by dividing up the following definitions over more threads ?
+        s_phisteps = phisteps;
+        s_phi0 = phi0;
+        s_phistep = phistep;
+        gx = spindle_vec[0];
+        gy = spindle_vec[1];
+        gz = spindle_vec[2];
+        s_gaussian_star_shape = gaussian_star_shape;
+        s_square_shape = square_shape;
         for (int i=0; i<3; i++){
             s_refine_Ncells[i] = refine_Ncells[i];
             s_refine_Umat[i] = refine_Umat[i];
@@ -170,9 +191,9 @@ void gpu_sum_over_steps(
         s_NABC_det_sq = s_NABC_det*s_NABC_det;
         C = 2 / 0.63 * fudge;
         two_C = 2*C;
-        //s_Na = Na;
-        //s_Nb = Nb;
-        //s_Nc = Nc;
+        s_Na = Na;
+        s_Nb = Nb;
+        s_Nc = Nc;
         s_NaNbNc_squared = (Na*Nb*Nc);
         s_NaNbNc_squared *= s_NaNbNc_squared;
         s_h_max = h_max;
@@ -238,6 +259,12 @@ void gpu_sum_over_steps(
         //    det_vecs[i] = fdet_vectors[i];
         //    det_vecs[i+det_stride] = sdet_vectors[i];
         //}
+        MAT3 Amat0 = Amatrices[0];
+        VEC3 A(Amat0(0,0),Amat0(0,1), Amat0(0,2));
+        VEC3 B(Amat0(1,0),Amat0(1,1), Amat0(1,2));
+        VEC3 C(Amat0(2,0),Amat0(2,1), Amat0(2,2));
+        CUDAREAL cell_vol = A.dot(B.cross(C));
+        s_xtal_size_sq = pow(s_NABC_det*cell_vol, (CUDAREAL)2/(CUDAREAL)3);
     }
 
     //extern __shared__ CUDAREAL source_data[];
@@ -432,10 +459,26 @@ void gpu_sum_over_steps(
             texture_scale *= cap_frac_times_omega;
             texture_scale *= sI;
 
-        for(int _mos_tic=0;_mos_tic<s_mosaic_domains;++_mos_tic){
 
+        for (int _phi_tic=0; _phi_tic<s_phisteps; ++_phi_tic){
+            MAT3 Rphi;
+            CUDAREAL phi = s_phi0 + s_phistep*_phi_tic;
+            if (phi != 0){
+                CUDAREAL c = cos(phi);
+                CUDAREAL omc = 1-c;
+                CUDAREAL s = sin(phi);
+                Rphi << c + gx*gx*omc,    gx*gy*omc-gz*s,   gx*gz*omc+gy*s,
+                      gy*gx*omc + gz*s,   c + gy*gy*omc,   gy*gz*omc - gx*s,
+                      gz*gx*omc - gy*s,  gz*gy*omc + gx*s, c + gz*gz*omc;
+            }
+
+        for(int _mos_tic=0;_mos_tic<s_mosaic_domains;++_mos_tic){
             int amat_idx = _mos_tic;
             MAT3 UBO = Amatrices[amat_idx];
+            if (phi != 0){
+                MAT3 Um = UMATS_RXYZ[_mos_tic]; // note, this will be slow - check if we can simply allow Um and Rphi to commute ...
+                UBO = UBO*Um*Rphi.transpose()*Um.transpose();
+            }
 
             VEC3 H_vec = UBO*q_vec;
             CUDAREAL _h = H_vec[0];
@@ -451,14 +494,33 @@ void gpu_sum_over_steps(
             VEC3 delta_H = H_vec - H0;
             VEC3 V = _NABC*delta_H;
             CUDAREAL _hrad_sqr = V.dot(V);
-            CUDAREAL exparg = _hrad_sqr*C/2;
-            CUDAREAL I0 =0;
-
-            if (exparg< 35)
-                if (s_no_Nabc_scale)
-                    I0 = exp(-2*exparg);
+            CUDAREAL I0=0;
+            if (s_square_shape){
+                I0 = 1;
+                if(s_Na>1)
+                    I0 *= sincg(M_PI*_h,s_Na);
+                if(s_Nb>1)
+                    I0 *= sincg(M_PI*_k,s_Nb);
+                if(s_Nc>1)
+                    I0 *= sincg(M_PI*_l,s_Nc);
+                I0 *= I0;
+            }
+            else{
+                CUDAREAL exparg;
+                if (s_gaussian_star_shape){
+                    MAT3 Ainv = UBO.inverse();
+                    VEC3 delta_Q = Ainv*delta_H;
+                    CUDAREAL rad_star_sqr = delta_Q.dot(delta_Q)*s_xtal_size_sq;
+                    exparg = rad_star_sqr*1.9*fudge;
+                }
                 else
-                    I0 = (s_NABC_det_sq)*exp(-2*exparg);
+                    exparg = _hrad_sqr*C/2;
+                if (exparg< 35)
+                    if (s_no_Nabc_scale)
+                        I0 = exp(-2*exparg);
+                    else
+                        I0 = (s_NABC_det_sq)*exp(-2*exparg);
+            }
 
             // are we doing diffuse scattering
             CUDAREAL step_diffuse_param[6]  = {0,0,0,0,0,0};
@@ -606,6 +668,8 @@ void gpu_sum_over_steps(
             MAT3 UBOt;
             if (s_refine_Umat[0] || s_refine_Umat[1] ||s_refine_Umat[2] || s_refine_eta){
                 UBOt = Amat_init;
+                if (phi != 0)
+                    UBOt = Rphi*UBOt;
             }
             if (s_refine_Umat[0]){
                 MAT3 RyRzUBOt = RotMats[1]*RotMats[2]*UBOt;
@@ -910,6 +974,7 @@ void gpu_sum_over_steps(
             }
 
             } // end of mos_tic loop
+            } // end of phi_tic loop
            } // end of source loop
           } // end of thick step loop
          } // end of fpos loop
diff --git a/simtbx/diffBragg/src/diffBragg_gpu_kernel.h b/simtbx/diffBragg/src/diffBragg_gpu_kernel.h
index fb0f2b218c..686e933ed7 100644
--- a/simtbx/diffBragg/src/diffBragg_gpu_kernel.h
+++ b/simtbx/diffBragg/src/diffBragg_gpu_kernel.h
@@ -75,6 +75,6 @@ __global__ void gpu_sum_over_steps(
         const int* __restrict__ data_freq, const bool* __restrict__ data_trusted,
         const int* __restrict__ FhklLinear_ASUid,
         const CUDAREAL* __restrict__ Fhkl_channels,
-        const CUDAREAL* __restrict__ Fhkl_scale, CUDAREAL* Fhkl_scale_deriv
-
+        const CUDAREAL* __restrict__ Fhkl_scale, CUDAREAL* Fhkl_scale_deriv,
+        bool gaussian_star_shape, bool square_shape
         );
diff --git a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp
index 5705914642..cd49361ba0 100644
--- a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp
+++ b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.cpp
@@ -1,5 +1,6 @@
 #include <cstdio>
 #include "diffBraggKOKKOS.h"
+#include <simtbx/kokkos/kernel_math.h>
 #include <simtbx/diffBragg/src/diffuse_util_kokkos.h>
 
 void kokkos_sum_over_steps(
@@ -150,7 +151,9 @@ void kokkos_sum_over_steps(
     const vector_int_t FhklLinear_ASUid,
     const vector_int_t Fhkl_channels,
     const vector_cudareal_t Fhkl_scale,
-    vector_cudareal_t Fhkl_scale_deriv) {  // BEGIN GPU kernel
+    vector_cudareal_t Fhkl_scale_deriv,
+    bool gaussian_star_shape, bool square_shape
+    ) {  // BEGIN GPU kernel
 
     const KOKKOS_MAT3 Bmat_realspace = eig_B * 1e10;
     const KOKKOS_MAT3 eig_Otranspose = eig_O.transpose();
@@ -174,6 +177,9 @@ void kokkos_sum_over_steps(
     vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace", 3);
     int num_laue_mats = 0;
     int dhh = 0, dkk = 0, dll = 0;
+    CUDAREAL gx = spindle_vec[0];
+    CUDAREAL gy = spindle_vec[1];
+    CUDAREAL gz = spindle_vec[2];
 
     Kokkos::View<KOKKOS_MAT3*[3]> UMATS_prime("UMATS_prime", mosaic_domains);
     Kokkos::View<KOKKOS_MAT3*[3]> UMATS_dbl_prime("UMATS_dbl_prime", mosaic_domains);
@@ -418,9 +424,24 @@ void kokkos_sum_over_steps(
 
                         // TODO rename
                         CUDAREAL texture_scale = _capture_fraction * _omega_pixel * sI;
+                        for (int _phi_tic=0; _phi_tic<phisteps; ++_phi_tic){
+                            KOKKOS_MAT3 Rphi;
+                            CUDAREAL phi = phi0 + phistep*_phi_tic;
+                            if (phi != 0){
+                                CUDAREAL c = cos(phi);
+                                CUDAREAL omc = 1-c;
+                                CUDAREAL s = sin(phi);
+                                Rphi = KOKKOS_MAT3{c + gx*gx*omc,    gx*gy*omc-gz*s,   gx*gz*omc+gy*s,
+                                                 gy*gx*omc + gz*s,   c + gy*gy*omc,   gy*gz*omc - gx*s,
+                                                 gz*gx*omc - gy*s,  gz*gy*omc + gx*s, c + gz*gz*omc};
+                            }
 
                         for (int _mos_tic = 0; _mos_tic < mosaic_domains; ++_mos_tic) {
-                            const KOKKOS_MAT3 UBO = Amatrices(_mos_tic);
+                            KOKKOS_MAT3 UBO = Amatrices(_mos_tic);
+                            if (phi != 0){
+                                KOKKOS_MAT3 Um = UMATS_RXYZ(_mos_tic);
+                                UBO = UBO*Um*Rphi.transpose()*Um.transpose();
+                            }
 
                             KOKKOS_VEC3 H_vec = UBO * q_vec;
                             CUDAREAL _h = H_vec[0];
@@ -436,15 +457,41 @@ void kokkos_sum_over_steps(
                             KOKKOS_VEC3 delta_H = H_vec - H0;
                             KOKKOS_VEC3 V = _NABC * delta_H;
                             CUDAREAL _hrad_sqr = V.length_sqr();
-                            CUDAREAL exparg = _hrad_sqr * C / 2;
-                            CUDAREAL I0 = 0;
-
-                            if (exparg < 35)
-                                if (no_Nabc_scale)
-                                    I0 = ::Kokkos::exp(-2 * exparg);
+                            CUDAREAL I0;
+                            if(square_shape){
+                                CUDAREAL F_latt = 1.0;
+                               if(Na>1)
+                                   F_latt *= sincg(M_PI*_h,Na);
+                               if(Nb>1)
+                                   F_latt *= sincg(M_PI*_k,Nb);
+                               if(Nc>1)
+                                   F_latt *= sincg(M_PI*_l,Nc);
+                               I0 = F_latt*F_latt;
+                            }
+                            else {
+                                CUDAREAL exparg;
+                                if (gaussian_star_shape){
+                                    // TODO can precompute xtal_size_sq to save time
+                                    KOKKOS_VEC3 A {UBO[0], UBO[1], UBO[2]};
+                                    KOKKOS_VEC3 B {UBO[3], UBO[4], UBO[5]};
+                                    KOKKOS_VEC3 C {UBO[6], UBO[7], UBO[8]};
+                                    CUDAREAL cell_vol = A.dot(B.cross(C));
+                                    CUDAREAL xtal_size_sq = pow(NABC_det*cell_vol, CUDAREAL(2)/CUDAREAL(3));
+                                    KOKKOS_MAT3 Ainv = UBO.inverse();
+                                    KOKKOS_VEC3 delta_Q = Ainv*delta_H;
+                                    CUDAREAL rad_star_sqr = delta_Q.dot(delta_Q)*xtal_size_sq;
+                                    exparg = rad_star_sqr*1.9*fudge ;
+                                }
                                 else
-                                    I0 = (NABC_det_sq) *
-                                            ::Kokkos::exp(-2 * exparg);
+                                    exparg = _hrad_sqr * C / 2;
+                                I0 = 0;
+                                if (exparg < 35)
+                                    if (no_Nabc_scale)
+                                        I0 = ::Kokkos::exp(-2 * exparg);
+                                    else
+                                        I0 = (NABC_det_sq) *
+                                                ::Kokkos::exp(-2 * exparg);
+                            }
 
                             // are we doing diffuse scattering
                             CUDAREAL step_diffuse_param[6] = {0, 0, 0, 0, 0, 0};
@@ -613,6 +660,8 @@ void kokkos_sum_over_steps(
                             KOKKOS_MAT3 UBOt;
                              if (refine_flag & (REFINE_UMAT | REFINE_ETA)) {
                                 UBOt = Amat_init;
+                                if (phi != 0)
+                                    UBOt = Rphi*UBOt;
                             }
                             if (refine_flag & REFINE_UMAT1) {
                                 const KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 0) * q_vec;
@@ -935,6 +984,7 @@ void kokkos_sum_over_steps(
                             } // end of printout if
 
                         } // end of mos_tic loop
+                      } // end of phi_tic loop
                     } // end of source loop
                 } // end of thick step loop
             } // end of fpos loop
@@ -1334,7 +1384,8 @@ void kokkos_sum_over_steps(
     const vector_int_t FhklLinear_ASUid,
     const vector_int_t Fhkl_channels,
     const vector_cudareal_t Fhkl_scale,
-    vector_cudareal_t Fhkl_scale_deriv) {  // BEGIN GPU kernel
+    vector_cudareal_t Fhkl_scale_deriv,
+    bool gaussian_star_shape, bool square_shape) {  // BEGIN GPU kernel
 
     const KOKKOS_MAT3 Bmat_realspace = eig_B * 1e10;
     const KOKKOS_MAT3 eig_Otranspose = eig_O.transpose();
@@ -1358,6 +1409,9 @@ void kokkos_sum_over_steps(
     vector_cudareal_t dG_trace = vector_cudareal_t("dG_trace", 3);
     int num_laue_mats = 0;
     int dhh = 0, dkk = 0, dll = 0;
+    CUDAREAL gx = spindle_vec[0];
+    CUDAREAL gy = spindle_vec[1];
+    CUDAREAL gz = spindle_vec[2];
 
     Kokkos::View<KOKKOS_MAT3*[3]> UMATS_prime("UMATS_prime", mosaic_domains);
     Kokkos::View<KOKKOS_MAT3*[3]> UMATS_dbl_prime("UMATS_dbl_prime", mosaic_domains);
@@ -1599,9 +1653,24 @@ void kokkos_sum_over_steps(
 
                         // TODO rename
                         CUDAREAL texture_scale = _capture_fraction * _omega_pixel * sI;
+                        for (int _phi_tic=0; _phi_tic<phisteps; ++_phi_tic){
+                            KOKKOS_MAT3 Rphi;
+                            CUDAREAL phi = phi0 + phistep*_phi_tic;
+                            if (phi != 0){
+                                CUDAREAL c = cos(phi);
+                                CUDAREAL omc = 1-c;
+                                CUDAREAL s = sin(phi);
+                                Rphi = KOKKOS_MAT3{c + gx*gx*omc,    gx*gy*omc-gz*s,   gx*gz*omc+gy*s,
+                                                 gy*gx*omc + gz*s,   c + gy*gy*omc,   gy*gz*omc - gx*s,
+                                                 gz*gx*omc - gy*s,  gz*gy*omc + gx*s, c + gz*gz*omc};
+                            }
 
                         for (int _mos_tic = 0; _mos_tic < mosaic_domains; ++_mos_tic) {
-                            const KOKKOS_MAT3 UBO = Amatrices(_mos_tic);
+                            KOKKOS_MAT3 UBO = Amatrices(_mos_tic);
+                            if (phi != 0){
+                                KOKKOS_MAT3 Um = UMATS_RXYZ(_mos_tic);
+                                UBO = UBO*Um*Rphi.transpose()*Um.transpose();
+                            }
 
                             KOKKOS_VEC3 H_vec = UBO * q_vec;
                             CUDAREAL _h = H_vec[0];
@@ -1617,15 +1686,42 @@ void kokkos_sum_over_steps(
                             KOKKOS_VEC3 delta_H = H_vec - H0;
                             KOKKOS_VEC3 V = _NABC * delta_H;
                             CUDAREAL _hrad_sqr = V.length_sqr();
-                            CUDAREAL exparg = _hrad_sqr * C / 2;
-                            CUDAREAL I0 = 0;
-
-                            if (exparg < 35)
-                                if (no_Nabc_scale)
-                                    I0 = ::Kokkos::exp(-2 * exparg);
+                            CUDAREAL I0=0;
+                            if(square_shape){
+                               CUDAREAL F_latt = 1.0;
+                               /* xtal is a paralelpiped */
+                               if(Na>1)
+                                   F_latt *= sincg(M_PI*_h,Na);
+                               if(Nb>1)
+                                   F_latt *= sincg(M_PI*_k,Nb);
+                               if(Nc>1)
+                                   F_latt *= sincg(M_PI*_l,Nc);
+                               I0 = F_latt*F_latt;
+                            }
+                            else{ // using a gaussian model
+                                CUDAREAL exparg;
+                                if (gaussian_star_shape){
+                                    // TODO: can precompute xtal_vol to save time ... but prob not necessary as gaussian star model only used for forward calc
+                                    KOKKOS_VEC3 A {UBO[0], UBO[1], UBO[2]};
+                                    KOKKOS_VEC3 B {UBO[3], UBO[4], UBO[5]};
+                                    KOKKOS_VEC3 C {UBO[6], UBO[7], UBO[8]};
+                                    CUDAREAL cell_vol = A.dot(B.cross(C));
+                                    CUDAREAL xtal_size_sq = pow(NABC_det*cell_vol, CUDAREAL(2)/CUDAREAL(3));
+                                    KOKKOS_MAT3 Ainv = UBO.inverse();
+                                    KOKKOS_VEC3 delta_Q = Ainv*delta_H;
+                                    CUDAREAL rad_star_sqr = delta_Q.dot(delta_Q)*xtal_size_sq;
+                                    exparg = rad_star_sqr*1.9*fudge ;
+                                }
                                 else
-                                    I0 = (NABC_det_sq) *
-                                            ::Kokkos::exp(-2 * exparg);
+                                    exparg = _hrad_sqr * C / 2;
+
+                                if (exparg < 35)
+                                    if (no_Nabc_scale)
+                                        I0 = ::Kokkos::exp(-2 * exparg);
+                                    else
+                                        I0 = (NABC_det_sq) *
+                                                ::Kokkos::exp(-2 * exparg);
+                            }
 
                             // are we doing diffuse scattering
                             CUDAREAL step_diffuse_param[6] = {0, 0, 0, 0, 0, 0};
@@ -1790,6 +1886,8 @@ void kokkos_sum_over_steps(
                             KOKKOS_MAT3 UBOt;
                              if (refine_flag & (REFINE_UMAT | REFINE_ETA)) {
                                 UBOt = Amat_init;
+                                if (phi != 0)
+                                    UBOt = Rphi*UBOt;
                             }
                             if (refine_flag & REFINE_UMAT1) {
                                 const KOKKOS_VEC3 dV = UMATS_prime(_mos_tic, 0) * q_vec;
@@ -2112,6 +2210,7 @@ void kokkos_sum_over_steps(
                             } // end of printout if
 
                         } // end of mos_tic loop
+                      } // end of phi_tic loop
                     } // end of source loop
                 } // end of thick step loop
             } // end of fpos loop
@@ -2506,4 +2605,5 @@ kokkos_sum_over_steps<
     const vector_int_t FhklLinear_ASUid,
     const vector_int_t Fhkl_channels,
     const vector_cudareal_t Fhkl_scale,
-    vector_cudareal_t Fhkl_scale_deriv);
+    vector_cudareal_t Fhkl_scale_deriv,
+    bool gaussian_star_shape, bool square_shape);
diff --git a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h
index 3586183087..36d8c7939f 100644
--- a/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h
+++ b/simtbx/diffBragg/src/diffBragg_kokkos_kernel.h
@@ -151,7 +151,8 @@ void kokkos_sum_over_steps(
     const vector_int_t FhklLinear_ASUid,
     const vector_int_t Fhkl_channels,
     const vector_cudareal_t Fhkl_scale,
-    vector_cudareal_t Fhkl_scale_deriv
+    vector_cudareal_t Fhkl_scale_deriv,
+    bool gaussian_star_shape, bool square_shape
 );
 
 template <
@@ -313,6 +314,7 @@ void kokkos_sum_over_steps(
     const vector_int_t FhklLinear_ASUid,
     const vector_int_t Fhkl_channels,
     const vector_cudareal_t Fhkl_scale,
-    vector_cudareal_t Fhkl_scale_deriv
+    vector_cudareal_t Fhkl_scale_deriv,
+    bool gaussian_star_shape, bool square_shape
 );
 #endif
diff --git a/simtbx/diffBragg/src/util.h b/simtbx/diffBragg/src/util.h
index ec197c1dc9..ebe1244ea7 100644
--- a/simtbx/diffBragg/src/util.h
+++ b/simtbx/diffBragg/src/util.h
@@ -14,6 +14,12 @@
     #define CUDAREAL double
 #endif
 
+#include "simtbx/nanoBragg/nanotypes.h"
+using simtbx::nanoBragg::shapetype;
+using simtbx::nanoBragg::SQUARE;
+using simtbx::nanoBragg::GAUSS;
+using simtbx::nanoBragg::GAUSS_STAR;
+
 using image_type = std::vector<CUDAREAL>;
 typedef Eigen::Matrix<double,3,1> VEC3;
 typedef Eigen::Matrix<double,3,3> MAT3;
@@ -152,6 +158,7 @@ struct flags{
 };
 
 struct crystal{
+    shapetype xtal_shape = GAUSS;
     double Friedel_beta = 1e10; // restraint factor for Friedel pairs
     double Finit_beta = 1e10; // restraint factor for Friedel pairs
     std::vector<int> pos_inds; // indices of the positive Friedel mate
diff --git a/simtbx/diffBragg/tests/tst_diffBragg_nanoBragg_congruency.py b/simtbx/diffBragg/tests/tst_diffBragg_nanoBragg_congruency.py
deleted file mode 100644
index 194607a9c6..0000000000
--- a/simtbx/diffBragg/tests/tst_diffBragg_nanoBragg_congruency.py
+++ /dev/null
@@ -1,39 +0,0 @@
-from __future__ import division
-##from simtbx.kokkos import gpu_instance
-#kokkos_run = gpu_instance(deviceId = 0)
-from simtbx.diffBragg.utils import find_diffBragg_instances
-import numpy as np
-
-
-def main():
-    from simtbx.nanoBragg.sim_data import SimData
-
-    S = SimData(use_default_crystal=True)
-    S.instantiate_diffBragg()
-    S.D.nopolar = True
-    S.D.oversample = 3
-
-    S.D.add_diffBragg_spots()
-    S._add_background()
-
-    diff_img = S.D.raw_pixels.as_numpy_array()
-
-    S.D.raw_pixels *= 0
-    S.D.add_nanoBragg_spots()
-    S._add_background()
-
-    nano_img = S.D.raw_pixels.as_numpy_array()
-
-    assert np.allclose(diff_img, nano_img, atol=1e-9)
-    for name in find_diffBragg_instances(globals()): del globals()[name]
-
-
-if __name__ == "__main__":
-    import sys
-    if "--kokkos" in sys.argv:
-        import os
-        os.environ["DIFFBRAGG_USE_KOKKOS"]="1"
-    from simtbx.diffBragg.device import DeviceWrapper
-    with DeviceWrapper(0) as _:
-        main()
-    print ("OK!")
diff --git a/simtbx/diffBragg/tests/tst_diffBragg_rotation.py b/simtbx/diffBragg/tests/tst_diffBragg_rotation.py
index 339e3ffcfc..bf444e7ea3 100644
--- a/simtbx/diffBragg/tests/tst_diffBragg_rotation.py
+++ b/simtbx/diffBragg/tests/tst_diffBragg_rotation.py
@@ -1,7 +1,6 @@
 from __future__ import absolute_import, division, print_function
-from simtbx.kokkos import gpu_instance
-kokkos_run = gpu_instance(deviceId = 0)
 from simtbx.diffBragg import utils, hopper_utils
+from simtbx.nanoBragg import nanoBragg
 from simtbx.nanoBragg.tst_nanoBragg_multipanel import beam as dxtbx_beam
 from simtbx.nanoBragg.tst_nanoBragg_multipanel import whole_det as dxtbx_det
 from simtbx.nanoBragg import nanoBragg_crystal, nanoBragg_beam, sim_data
@@ -9,8 +8,7 @@
 from dxtbx_model_ext import Crystal
 from scitbx import matrix
 import numpy as np
-# from simtbx.nanoBragg import shapetype
-# from simtbx.nanoBragg import nanoBragg
+import os
 import libtbx.load_env # possibly implicit
 
 pdb_lines = """HEADER TEST
@@ -32,12 +30,20 @@ def write_test_pdb(fileout="test.pdb"):
   pdb_inp.write_pdb_file(fileout)
   return fileout
 
-def run_sim(testpdb, version="nanoBragg", spindle_axis=(1,0,0), phi_start=0, phistep_deg=-1, phisteps=-1, osc_deg=-1):
+def run_sim(testpdb, version="nanoBragg", spindle_axis=(1,0,0), phi_start=0, osc_deg=-1, phisteps=-1, verbose=False,
+            manual_mode=False, manual_angle=0.):
   # crystal
   symmetry=extract_from(testpdb)
   sg = str(symmetry.space_group_info())
   fmat = matrix.sqr(symmetry.unit_cell().fractionalization_matrix())
   dxtbx_cryst = Crystal(fmat, sg)
+  if manual_mode:
+      sa = matrix.col(spindle_axis)
+      U = matrix.sqr(dxtbx_cryst.get_U())
+      Rphi = sa.axis_and_angle_as_r3_rotation_matrix(manual_angle, deg=True)
+      U = Rphi*U
+      dxtbx_cryst.set_U(U.elems)
+
   crystal = nanoBragg_crystal.NBcrystal(init_defaults=True)
   crystal.isotropic_ncells = False
   crystal.dxtbx_crystal = dxtbx_cryst
@@ -63,69 +69,112 @@ def run_sim(testpdb, version="nanoBragg", spindle_axis=(1,0,0), phi_start=0, phi
   SIM.detector = utils.strip_thickness_from_detector(dxtbx_det)
   SIM.detector = dxtbx_det
   SIM.crystal = crystal
+
   SIM.beam = beam
   SIM.panel_id = 0
   def setup_rotation(SIM):
-    SIM.D.spindle_axis=spindle_axis
-    SIM.D.phi_deg = phi_start
-    SIM.D.phistep_deg = phistep_deg
-    SIM.D.phisteps = phisteps
+    if not manual_mode:
+      SIM.D.spindle_axis = spindle_axis
+      SIM.D.phi_deg = phi_start
+      SIM.D.osc_deg = osc_deg
+      SIM.D.phisteps = phisteps
   if version == "nanoBragg":
-    SIM.instantiate_nanoBragg(oversample=1, device_Id=0, default_F=0)
-    SIM.D.printout_pixel_fastslow = FAST,SLOW
+    SIM.instantiate_nanoBragg(oversample=1, device_Id=0, default_F=0, interpolate=0)
     setup_rotation(SIM)
-    SIM.D.show_params()
+    if verbose:
+      SIM.D.printout_pixel_fastslow = FAST,SLOW
+      SIM.D.show_params()
 
     SIM.D.add_nanoBragg_spots()
     pix = SIM.D.raw_pixels.as_numpy_array()
     SIM.D.free_all()
     return pix
   else:
-    SIM.instantiate_diffBragg(oversample=1, device_Id=0, default_F=0)
+    SIM.instantiate_diffBragg(oversample=1, device_Id=0, default_F=0, interpolate=0)
     SIM.D.xray_beams = SIM.beam.xray_beams
     ucell_man = utils.manager_from_params(ucell_p)
     Bmatrix = ucell_man.B_recipspace
     SIM.D.Bmatrix = Bmatrix
 
     npix = int(len(pfs)/3)
-    SIM.D.printout_pixel_fastslow = FAST,SLOW
     setup_rotation(SIM)
-    SIM.D.show_params()
+    if verbose:
+      SIM.D.printout_pixel_fastslow = FAST,SLOW
+      SIM.D.show_params()
 
     SIM.D.add_diffBragg_spots_full()
     pix = SIM.D.raw_pixels_roi.as_numpy_array()
-    SIM.D.free_all()
+    hopper_utils.free_SIM_mem(SIM)
     return pix
 
 def assert_identical_pixels(pix1, pix2):
-  assert np.allclose(pix1,pix2)
+  assert np.allclose(pix1,pix2, atol=0.1)
+
+def test_manual_rotation():
+    testpdb = write_test_pdb(fileout="test.pdb")
+    SIM = nanoBragg()
+    SIM.phi_deg = 0
+    SIM.osc_deg = 0.5
+    SIM.phisteps = 50
+    phistep_deg = SIM.osc_deg / SIM.phisteps
+    assert phistep_deg == SIM.phistep_deg # Note, this was broken by a bug recently
+
+    pix_sums = []
+    for version in ("nanoBragg", "diffBragg"):
+        pix = None
+        for phi_tic in range(SIM.phisteps):
+            manual_angle = SIM.phi_deg + phi_tic*phistep_deg
+            print("phiTic=%d, deltaPhi=%f deg." % (phi_tic, manual_angle))
+            temp = run_sim(testpdb, version=version, spindle_axis=(1, 0, 0), phi_start=0, osc_deg=-1, phisteps=-1,
+                    verbose=False, manual_mode=True, manual_angle=manual_angle)
+            if pix is None:
+                pix = temp
+            else:
+                pix += temp
+        pix /= SIM.phisteps
+        pix_sums.append(pix)
+    nb_pix, db_pix = pix_sums
+    assert np.allclose(nb_pix.ravel(), db_pix)
+    print("tst manual diffBragg vs manual nanoBragg: OK!")
+
+    temp_db = run_sim(testpdb, version="diffBragg", spindle_axis=(1, 0, 0), phi_start=SIM.phi_deg, osc_deg=SIM.osc_deg,
+                   phisteps=SIM.phisteps, verbose=False, manual_mode=False)
+    assert np.allclose(db_pix, temp_db)
+    print("tst manual vs internal diffBragg: OK!")
+
+    temp_nb = run_sim(testpdb, version="nanoBragg", spindle_axis=(1, 0, 0), phi_start=SIM.phi_deg, osc_deg=SIM.osc_deg,
+                      phisteps=SIM.phisteps, verbose=False, manual_mode=False)
+    # for some reason, nanoBragg phi rotation seems to propagate rounding error maybe, so we must use a larger atol...
+    assert np.allclose(nb_pix, temp_nb, atol=0.1)
+    print("tst manual vs internal nanoBragg: OK!")
+    SIM.free_all()
 
 def test_range_of_rotation_steps():
-  # automatic param setting for phistep_deg, phisteps and osc_deg when passed values are <0
+  # automatic param setting for phisteps and osc_deg when passed values are <0
   # conflicting params are tolerated and should be handled with the same decision tree
   testpdb = write_test_pdb(fileout="test.pdb")
   for phi_start in (0, 30):
-    for phistep_deg in (-1, 0.01, 0.1):
-      for phisteps in (-1, 1, 10):
-        for osc_deg in (-1, 0.1, 1):
-          pix1 = run_sim(
-                  testpdb,
-                  version="nanoBragg",
-                  spindle_axis=(1,0,0),
-                  phi_start=phi_start,
-                  phistep_deg=phistep_deg,
-                  phisteps=phisteps,
-                  osc_deg=osc_deg)#, fileout="rotimg_nano_%03d.h5")
-          pix2 = run_sim(
-                  testpdb,
-                  version="diffBragg",
-                  spindle_axis=(1,0,0),
-                  phi_start=phi_start,
-                  phistep_deg=phistep_deg,
-                  phisteps=phisteps,
-                  osc_deg=osc_deg)#, fileout="rotimg_diff_%03d.h5")
-          pix2 = pix2.reshape(pix1.shape)
-          assert_identical_pixels(pix1, pix2)
+    for osc_deg in (0., 0.01, 0.1):
+      for phisteps in (1, 10, 15):
+        pix1 = run_sim(
+                testpdb,
+                version="nanoBragg",
+                spindle_axis=(1,0,0),
+                phi_start=phi_start,
+                osc_deg=osc_deg,
+                phisteps=phisteps
+                )#, fileout="rotimg_nano_%03d.h5")
+        pix2 = run_sim(
+                testpdb,
+                version="diffBragg",
+                spindle_axis=(1,0,0),
+                phi_start=phi_start,
+                osc_deg=osc_deg,
+                phisteps=phisteps
+                )#, fileout="rotimg_diff_%03d.h5")
+        pix2 = pix2.reshape(pix1.shape)
+        assert_identical_pixels(pix1, pix2)
+        print("tst range: phisteps=%d, osc_deg=%f deg., phistart=%f deg. ... OK!" %(phisteps, osc_deg, phi_start))
 
 def test_sweep():
   testpdb = write_test_pdb(fileout="test.pdb")
@@ -135,29 +184,34 @@ def test_sweep():
             version="nanoBragg",
             spindle_axis=(1,0,0),
             phi_start=phi_start,
-            phistep_deg=0.05,
-            phisteps=20,
-            osc_deg=1)#, fileout="rotimg_nano_%03d.h5")
+            osc_deg=0.05,
+            phisteps=20
+            )#, fileout="rotimg_nano_%03d.h5")
     pix2 = run_sim(
             testpdb,
             version="diffBragg",
             spindle_axis=(1,0,0),
             phi_start=phi_start,
-            phistep_deg=0.05,
-            phisteps=20,
-            osc_deg=1)#, fileout="rotimg_diff_%03d.h5")
+            osc_deg=0.05,
+            phisteps=20
+            )#, fileout="rotimg_diff_%03d.h5")
     pix2 = pix2.reshape(pix1.shape)
     assert_identical_pixels(pix1, pix2)
+    print("tst sweep: phisteps=20, osc_deg=0.05 deg., phistart=%f deg. ... OK!" % (phi_start, ))
 
 def tst_all():
-  import os
-  # currently only tests CPU implementation
-  if "DIFFBRAGG_USE_CUDA" in os.environ:
-    os.environ.unsetenv("DIFFBRAGG_USE_CUDA")
-  #os.environ["DIFFBRAGG_USE_CUDA"]=1
+  test_manual_rotation()
   test_range_of_rotation_steps()
   test_sweep()
 
 if __name__=="__main__":
-  tst_all()
-  print("OK")
+    import sys
+    if "--kokkos" in sys.argv:
+        os.environ["DIFFBRAGG_USE_KOKKOS"] = "1"
+    from simtbx.diffBragg.utils import find_diffBragg_instances
+    from simtbx.diffBragg.device import DeviceWrapper
+
+    with DeviceWrapper(0) as _:
+        tst_all()
+        for name in find_diffBragg_instances(globals()): del globals()[name]
+    print("OK")
diff --git a/simtbx/diffBragg/utils.py b/simtbx/diffBragg/utils.py
index d8c439e26d..340bf9360e 100644
--- a/simtbx/diffBragg/utils.py
+++ b/simtbx/diffBragg/utils.py
@@ -837,6 +837,7 @@ def simulator_from_expt_and_params(expt, params=None):
 
     # create nanoBragg crystal
     crystal = NBcrystal(init_defaults=False)
+    crystal.xtal_shape = "gauss"
     crystal.isotropic_ncells = has_isotropic_ncells
     if params.simulator.crystal.rotXYZ_ucell is not None:
         rotXYZ = params.simulator.crystal.rotXYZ_ucell[:3]
@@ -902,7 +903,7 @@ def simulator_from_expt_and_params(expt, params=None):
     return SIM
 
 
-def update_SIM_with_gonio(SIM, params=None, delta_phi=None, num_phi_steps=5):
+def update_SIM_with_gonio(SIM, params=None, delta_phi=None, num_phi_steps=5, spindle_axis=(1,0,0)):
     """
 
     :param SIM: sim_data instance
@@ -918,9 +919,10 @@ def update_SIM_with_gonio(SIM, params=None, delta_phi=None, num_phi_steps=5):
         num_phi_steps = params.simulator.gonio.phi_steps
 
     if delta_phi is not None:
+        SIM.D.spindle_axis = spindle_axis
         SIM.D.phi_deg = 0
         SIM.D.osc_deg = delta_phi
-        SIM.D.phisteps = num_phi_steps
+        SIM.D.phisteps = int(num_phi_steps)
 
 
 def get_complex_fcalc_from_pdb(
diff --git a/simtbx/kokkos/simulation_kernels.h b/simtbx/kokkos/simulation_kernels.h
index 0a91f33e59..31e5c00c75 100644
--- a/simtbx/kokkos/simulation_kernels.h
+++ b/simtbx/kokkos/simulation_kernels.h
@@ -15,6 +15,7 @@ using simtbx::nanoBragg::SQUARE;
 using simtbx::nanoBragg::ROUND;
 using simtbx::nanoBragg::GAUSS;
 using simtbx::nanoBragg::GAUSS_ARGCHK;
+using simtbx::nanoBragg::GAUSS_STAR;
 using simtbx::nanoBragg::TOPHAT;
 
 void calc_CrystalOrientations(CUDAREAL phi0,
@@ -297,6 +298,18 @@ void kokkosSpotsKernel(int spixels, int fpixels, int roi_xmin, int roi_xmax,
                                                                         if (my_arg<35.){ F_latt = Na * Nb * Nc * exp(-(my_arg));
                                                                         } else { F_latt = 0.; } // warps coalesce when blocks of 32 pixels have no Bragg signal
                                                                 }
+                                                                if (xtal_shape == GAUSS_STAR){
+                                                                        mat3 At (a[0],a[1],a[2],
+                                                                                 b[0],b[1],b[2],
+                                                                                 c[0],c[1],c[2]);
+                                                                        vec3 delta_H {h-h0, k-k0, l-l0};
+                                                                        vec3 delta_Q = (1e10*At).inverse()*delta_H;
+                                                                        CUDAREAL Nvol=Na*Nb*Nc;
+                                                                        CUDAREAL xtal_size_sq = pow(Nvol*V_cell, double(2)/double(3));
+                                                                        CUDAREAL rad_star_sqr = delta_Q.dot(delta_Q)*xtal_size_sq;
+                                                                        F_latt = Nvol*exp(-( rad_star_sqr *1.9 * fudge ));
+                                                                }
+
                                                                 if (xtal_shape == TOPHAT) {
                                                                         // make a flat-top spot of same height and volume as square_xtal spots
                                                                         F_latt = Na * Nb * Nc * (hrad_sqr * fudge < 0.3969);
diff --git a/simtbx/modeling/forward_models.py b/simtbx/modeling/forward_models.py
index 090290e8fc..b1bc63e751 100644
--- a/simtbx/modeling/forward_models.py
+++ b/simtbx/modeling/forward_models.py
@@ -70,7 +70,7 @@ def model_spots_from_pandas(pandas_frame,  rois_per_panel=None,
                           symbol_override=None, quiet=False, reset_Bmatrix=False, nopolar=False,
                           force_no_detector_thickness=False, printout_pix=None, norm_by_nsource=False,
                           use_exascale_api=False, use_db=False, show_timings=False, perpixel_wavelen=False,
-                          det_thicksteps=None, from_pdb=None):
+                          det_thicksteps=None, from_pdb=None, mosaic_samples_override=None):
     if perpixel_wavelen and not use_db:
         raise NotImplementedError("to get perpixel wavelengths set use_db=True to use the diffBragg backend")
     if use_exascale_api:
@@ -196,6 +196,8 @@ def model_spots_from_pandas(pandas_frame,  rois_per_panel=None,
         mos_dom = 1
         if "num_mosaicity_samples" in list(df):
             mos_dom = int(df.num_mosaicity_samples.values[0])
+        if mosaic_samples_override is not None:
+            mos_dom = mosaic_samples_override
         eta_abc = df.eta_abc.values[0]
         LOGGER.debug("Num mos samples=%d, eta_abc=%f %f %f" % ((mos_dom,)+ eta_abc )  )
         LOGGER.debug("Num energy channels=%d" % len(energies))
@@ -244,7 +246,8 @@ def diffBragg_forward(CRYSTAL, DETECTOR, BEAM, Famp, energies, fluxes,
                       nopolar=False, diffuse_params=None, cuda=False,
                       show_timings=False,perpixel_wavelen=False,
                       det_thicksteps=None, eta_abc=None, Ncells_def=None,
-                      num_phi_steps=1, delta_phi=None, div_mrad=0, divsteps=0):
+                      num_phi_steps=1, delta_phi=None, div_mrad=0, divsteps=0,
+                      spindle_axis=(1,0,0)):
 
     if cuda:
         os.environ["DIFFBRAGG_USE_CUDA"] = "1"
@@ -308,7 +311,7 @@ def diffBragg_forward(CRYSTAL, DETECTOR, BEAM, Famp, energies, fluxes,
         S.D.diffuse_sigma = diffuse_params["sigma"]
 
     if delta_phi is not None:
-        utils.update_SIM_with_gonio(S, delta_phi=delta_phi, num_phi_steps=num_phi_steps )
+        utils.update_SIM_with_gonio(S, delta_phi=delta_phi, num_phi_steps=num_phi_steps, spindle_axis=spindle_axis)
     S.D.add_diffBragg_spots_full()
     if show_timings or LOGGER.level <= 10:
         S.D.show_timings()
diff --git a/simtbx/nanoBragg/nanoBragg.cpp b/simtbx/nanoBragg/nanoBragg.cpp
index 14ba8b26b8..2ff5ba28ce 100644
--- a/simtbx/nanoBragg/nanoBragg.cpp
+++ b/simtbx/nanoBragg/nanoBragg.cpp
@@ -2380,6 +2380,7 @@ nanoBragg::show_params()
     if(xtal_shape == SQUARE) printf("parallelpiped");
     if(xtal_shape == GAUSS ) printf("gaussian");
     if(xtal_shape == GAUSS_ARGCHK ) printf("gaussian_argchk");
+    if(xtal_shape == GAUSS_STAR ) printf("gaussian_star");
     if(xtal_shape == TOPHAT) printf("tophat-spot");
     printf(" xtal: %.1fx%.1fx%.1f cells\n",Na,Nb,Nc);
     printf("Unit Cell: %g %g %g %g %g %g\n", a_A[0],b_A[0],c_A[0],alpha*RTD,beta*RTD,gamma*RTD);
@@ -2673,6 +2674,15 @@ nanoBragg::add_nanoBragg_spots()
                                         if (my_arg<35.){ F_latt = Na * Nb * Nc * exp(-(my_arg));}
                                         else { F_latt = 0.; } // not expected to give performance gain on optimized C++, only on GPU
                                     }
+                                    if (xtal_shape == GAUSS_STAR){
+                                        double xtal_size_sq = pow(Na*Nb*Nc*V_cell, double(2)/double(3));
+                                        vec3 delta_H {h-h0, k-k0, l-l0};
+                                        mat3 At(a[1], a[2], a[3], b[1], b[2], b[3], c[1], c[2], c[3]);
+                                        mat3 Ainverse =  (1e10*At).inverse();
+                                        vec3 delta_Q = Ainverse*delta_H;
+                                        double rad_star_sqr = (delta_Q*delta_Q)*xtal_size_sq; // dot product
+                                        F_latt = Na*Nb*Nc*exp(-( rad_star_sqr*1.9*fudge ));
+                                    }
 
                                     if(xtal_shape == TOPHAT)
                                     {
diff --git a/simtbx/nanoBragg/nanoBraggCUDA.cu b/simtbx/nanoBragg/nanoBraggCUDA.cu
index fe153bbf20..025af9180c 100644
--- a/simtbx/nanoBragg/nanoBraggCUDA.cu
+++ b/simtbx/nanoBragg/nanoBraggCUDA.cu
@@ -21,6 +21,7 @@ using simtbx::nanoBragg::SQUARE;
 using simtbx::nanoBragg::ROUND;
 using simtbx::nanoBragg::GAUSS;
 using simtbx::nanoBragg::GAUSS_ARGCHK;
+using simtbx::nanoBragg::GAUSS_STAR;
 using simtbx::nanoBragg::TOPHAT;
 
 static void CheckCudaErrorAux(const char *, unsigned, const char *, cudaError_t);
@@ -740,12 +741,39 @@ CUDAREAL pixel_size, CUDAREAL subpixel_size, int steps, CUDAREAL detector_thicks
 									/* fudge the radius so that volume and FWHM are similar to square_xtal spots */
 									F_latt = Na * Nb * Nc * exp(-(hrad_sqr / 0.63 * fudge));
 								}
-                                                                if (s_xtal_shape == GAUSS_ARGCHK) {
-                                                                        /* fudge the radius so that volume and FWHM are similar to square_xtal spots */
-                                                                        double my_arg = hrad_sqr / 0.63 * fudge;
-                                                                        if (my_arg<35.){ F_latt = Na * Nb * Nc * exp(-(my_arg));
-                                                                        } else { F_latt = 0.; } // warps coalesce when blocks of 32 pixels have no Bragg signal
-                                                                }
+                                if (s_xtal_shape == GAUSS_ARGCHK) {
+                                   /* fudge the radius so that volume and FWHM are similar to square_xtal spots */
+                                   double my_arg = hrad_sqr / 0.63 * fudge;
+                                   if (my_arg<35.){ F_latt = Na * Nb * Nc * exp(-(my_arg));
+                                   } else { F_latt = 0.; } // warps coalesce when blocks of 32 pixels have no Bragg signal
+                                }
+                                if (s_xtal_shape == GAUSS_STAR){
+                                   CUDAREAL a_cross_b[] = { 0.0, 0.0, 0.0, 0.0 };
+                                   CUDAREAL b_cross_c[] = { 0.0, 0.0, 0.0, 0.0 };
+                                   CUDAREAL c_cross_a[] = { 0.0, 0.0, 0.0, 0.0 };
+                                   cross_product(a,b,a_cross_b);
+                                   cross_product(b,c,b_cross_c);
+                                   cross_product(c,a,c_cross_a);
+                                   /* new reciprocal-space cell vectors */
+                                   CUDAREAL a_star_tic[] = {0,0,0,0};
+                                   CUDAREAL b_star_tic[] = {0,0,0,0};
+                                   CUDAREAL c_star_tic[] = {0,0,0,0};
+                                   vector_scale(b_cross_c,a_star_tic,1e20/V_cell);
+                                   vector_scale(c_cross_a,b_star_tic,1e20/V_cell);
+                                   vector_scale(a_cross_b,c_star_tic,1e20/V_cell);
+                                   //}
+                                   CUDAREAL dh=h-h0;
+                                   CUDAREAL dk=k-k0;
+                                   CUDAREAL dl=l-l0;
+                                   CUDAREAL dx_star = dh*a_star_tic[1] + dk*b_star_tic[1] + dl*c_star_tic[1];
+                                   CUDAREAL dy_star = dh*a_star_tic[2] + dk*b_star_tic[2] + dl*c_star_tic[2];
+                                   CUDAREAL dz_star = dh*a_star_tic[3] + dk*b_star_tic[3] + dl*c_star_tic[3];
+                                   CUDAREAL Nvol=Na*Nb*Nc;
+                                   CUDAREAL xtal_size_sq = pow(Nvol*V_cell, CUDAREAL(2)/CUDAREAL(3));
+                                   CUDAREAL rad_star_sqr = ( dx_star*dx_star + dy_star*dy_star + dz_star*dz_star )
+                                                  *xtal_size_sq;
+                                   F_latt = Nvol*exp(-( rad_star_sqr *1.9 * fudge ));
+                                }
 								if (s_xtal_shape == TOPHAT) {
 									/* make a flat-top spot of same height and volume as square_xtal spots */
 									F_latt = Na * Nb * Nc * (hrad_sqr * fudge < 0.3969);
diff --git a/simtbx/nanoBragg/nanoBragg_crystal.py b/simtbx/nanoBragg/nanoBragg_crystal.py
index 397c0fcb2e..3c6408a7b9 100644
--- a/simtbx/nanoBragg/nanoBragg_crystal.py
+++ b/simtbx/nanoBragg/nanoBragg_crystal.py
@@ -162,6 +162,8 @@ def n_mos_domains(self, val):
   def xtal_shape(self):
     if self._xtal_shape == "gauss":
       return shapetype.Gauss
+    if self._xtal_shape == "gauss_star":
+      return shapetype.Gauss_star
     elif self._xtal_shape == "gauss_argchk":
       return shapetype.Gauss_argchk
     elif self._xtal_shape == "round":
diff --git a/simtbx/nanoBragg/nanoBragg_ext.cpp b/simtbx/nanoBragg/nanoBragg_ext.cpp
index 269c11634f..ea6b0eaa87 100644
--- a/simtbx/nanoBragg/nanoBragg_ext.cpp
+++ b/simtbx/nanoBragg/nanoBragg_ext.cpp
@@ -1154,7 +1154,7 @@ printf("DEBUG: pythony_stolFbg[1]=(%g,%g)\n",nanoBragg.pythony_stolFbg[1][0],nan
 
   /* spindle angle phi step, in deg */
   static double get_phistep_deg(nanoBragg const& nanoBragg) {
-      return nanoBragg.phi0*RTD;;
+      return nanoBragg.phistep*RTD;;
   }
   static void   set_phistep_deg(nanoBragg& nanoBragg, double const& value) {
       nanoBragg.phistep = value/RTD;
@@ -1337,6 +1337,7 @@ printf("DEBUG: pythony_stolFbg[1]=(%g,%g)\n",nanoBragg.pythony_stolFbg[1][0],nan
       .value("Round",ROUND)
       .value("Gauss",GAUSS)
       .value("Gauss_argchk",GAUSS_ARGCHK)
+      .value("Gauss_star",GAUSS_STAR)
       .value("Tophat",TOPHAT)
       .value("Fiber",FIBER)
       .value("Unknown",UNKNOWN)
@@ -1862,6 +1863,10 @@ printf("DEBUG: pythony_stolFbg[1]=(%g,%g)\n",nanoBragg.pythony_stolFbg[1][0],nan
                      "random number seed for detector calibration error, keep the same for a given detector")
 
       /* noise generation parameters */
+      .add_property("fudge",
+                     make_getter(&nanoBragg::fudge,rbv()),
+                     make_setter(&nanoBragg::fudge,dcp()),
+                     "fudge factor on F_latt falloff (default=1)")
       .add_property("spot_scale",
                      make_getter(&nanoBragg::spot_scale,rbv()),
                      make_setter(&nanoBragg::spot_scale,dcp()),
diff --git a/simtbx/nanoBragg/nanotypes.h b/simtbx/nanoBragg/nanotypes.h
index 524bd2371c..fbc73bb32f 100644
--- a/simtbx/nanoBragg/nanotypes.h
+++ b/simtbx/nanoBragg/nanotypes.h
@@ -25,7 +25,7 @@ struct hklParams {
 };
 
 typedef enum { SAMPLE, BEAM } pivot;
-typedef enum { UNKNOWN, SQUARE, ROUND, GAUSS, GAUSS_ARGCHK, TOPHAT, FIBER } shapetype;
+typedef enum { UNKNOWN, SQUARE, ROUND, GAUSS, GAUSS_ARGCHK, TOPHAT, FIBER, GAUSS_STAR } shapetype;
 // GAUSS_ARGCHK provides a lightweight backdoor for efficient implementation of
 // the GAUSS shapetype in CUDA. Precaluates the argument of the exponential.
 typedef enum { CUSTOM, ADXV, MOSFLM, XDS, DIALS, DENZO } convention;
diff --git a/simtbx/nanoBragg/sim_data.py b/simtbx/nanoBragg/sim_data.py
index f40c2581da..604c442adb 100644
--- a/simtbx/nanoBragg/sim_data.py
+++ b/simtbx/nanoBragg/sim_data.py
@@ -371,9 +371,11 @@ def update_Fhkl_tuple(self):
         Fimag = flex.double(Fimag)
         self.D.Fhkl_tuple = ma_on_detector.indices(), Freal, Fimag
       elif self.using_diffBragg_spots:
-        self.D.Fhkl_tuple = ma_on_detector.indices(), ma_on_detector.data(), None
+        inds = ma_on_detector.indices()
+        dat = ma_on_detector.data()
+        self.D.Fhkl_tuple = inds, dat, None
       else:
-        self.D.Fhkl_tuple = self.crystal.miller_array.indices(), self.crystal.miller_array.data(), None
+        self.D.Fhkl_tuple = ma_on_detector.indices(), ma_on_detector.data(), None
 
   def set_dspace_binning(self, nbins, verbose=False):
     dsp = self.D.Fhkl.d_spacings().data().as_numpy_array()
@@ -406,10 +408,13 @@ def _crystal_properties(self):
       self.D.Omatrix = self.crystal.Omatrix
       self.D.Bmatrix = self.crystal.dxtbx_crystal.get_B() #
       self.D.Umatrix = self.crystal.dxtbx_crystal.get_U()
-    self.update_Fhkl_tuple()
-    self.D.unit_cell_tuple = self.crystal.dxtbx_crystal.get_unit_cell().parameters()
 
     if self.using_diffBragg_spots:
+      shape = str(self.crystal.xtal_shape)
+      if shape not in ["Gauss", "Gauss_star", "Square"]:
+        raise ValueError("nanoBragg_crystal xtal_shape should be square, gauss or gauss_star")
+      self.update_Fhkl_tuple()
+      self.D.unit_cell_tuple = self.crystal.dxtbx_crystal.get_unit_cell().parameters()
       # init mosaic domain size:
       if self.crystal.isotropic_ncells:
         self.D.Ncells_abc = self.crystal.Ncells_abc[0]
diff --git a/simtbx/nanoBragg/tst_nanoBragg_Flatt_sym.py b/simtbx/nanoBragg/tst_nanoBragg_Flatt_sym.py
new file mode 100644
index 0000000000..f8d9554952
--- /dev/null
+++ b/simtbx/nanoBragg/tst_nanoBragg_Flatt_sym.py
@@ -0,0 +1,242 @@
+from __future__ import division
+"""
+The script attempts to simulate diffraction patterns
+for symmetrically equivalent crystal orientations, and tests if the intensities change.
+If changes in intensity are detected, the stdout will indicate Failure.
+"""
+from argparse import ArgumentParser
+parser = ArgumentParser()
+parser.add_argument("sym", type=str, choices=["C","I","F","P1","P2","P3","P4","P6"], help="crystal system")
+parser.add_argument("--cuda", action="store_true", help="Uses Giles nanoBragg CUDA kernel")
+parser.add_argument("--fix", action="store_true", help="fix using symmetric mosaic blocks (exaFEL fix)")
+parser.add_argument("--mos", action="store_true", help="add mosaic spread")
+parser.add_argument("--plot", action="store_true", help="display plots at the end")
+parser.add_argument("--gauss", action="store_true", help="Use the original Gaussian RELP shape")
+parser.add_argument("--square", action="store_true", help="Use the Square RELP shape")
+args = parser.parse_args()
+from scipy.spatial.transform import Rotation
+import numpy as np
+from simtbx.diffBragg import utils as db_utils
+
+from simtbx.nanoBragg import nanoBragg, shapetype
+from scitbx.matrix import sqr
+from dxtbx.model import Crystal
+from dxtbx.model import DetectorFactory, BeamFactory
+from cctbx import sgtbx, miller, crystal, uctbx
+from cctbx.array_family import flex
+from cctbx.sgtbx.literal_description import literal_description
+
+
+# ===================
+# prepare the miller array for nanoBragg
+def prep_miller_array(mill_arr):
+    """prep for nanoBragg"""
+    # TODO: is this correct order of things?
+    cb_op = mill_arr.space_group_info().change_of_basis_op_to_primitive_setting()
+    mill_arr = mill_arr.expand_to_p1()
+    mill_arr = mill_arr.generate_bijvoet_mates()
+    dtrm = cb_op.c().r().determinant()
+    if not dtrm == 1:
+        mill_arr = mill_arr.change_basis(cb_op)
+    return mill_arr
+
+
+# simple method to check whether the structure factors in nanoBragg obey the symmetry
+def sanity_check_Fs(SIM, sg, ucell_p1):
+    """
+    SIM: nanoBragg instance
+    sg: space group operator
+    ucell_p1: p1 unit cell params
+    """
+    inds, amps = SIM.Fhkl_tuple
+    print(ucell_p1)
+
+    sym = crystal.symmetry(ucell_p1, "P1")
+    mset = miller.set(sym, flex.miller_index(inds), True)
+    ma = miller.array(mset, flex.double(amps)).set_observation_type_xray_amplitude().resolution_filter(-1,
+                                                                                                       2)
+    print(sg.info())
+    ops = [o for o in sg.build_derived_laue_group().all_ops() if o.r().determinant() == 1]
+    for o in ops:
+        print("Op=", o.as_xyz())
+        ma2 = ma.change_basis(sgtbx.change_of_basis_op(o))
+        r = ma.r1_factor(ma2, assume_index_matching=True)
+        print("R=", r, "\n")
+        assert r == 0
+# ===================
+
+add_spots_func="add_nanoBragg_spots"
+if args.cuda:
+    add_spots_func = "add_nanoBragg_spots_cuda"
+
+syms = {
+ 'C':  # 1hk5
+     [(190.14, 39.04, 96.23, 90, 105.07, 90),
+    "C 1 2 1"],
+ 'I':  # 3dll
+     [(169.7, 410, 695, 90, 90, 90),
+     "I 2 2 2"],
+ 'F':  # 1r03
+     [(181.41, 181.41, 181.41, 90, 90, 90),
+     "F 4 3 2"],
+ 'P6':  # 4pgu
+     [(62.207, 62.207, 293.474, 90, 90, 120),
+     "P 65 2 2"],
+ 'P3':  # 3dxj
+     [(235.09, 235.09, 250.88, 90, 90, 120),
+     "P 32"],
+ 'P4':  # 1z35
+     [(136.8, 136.8, 136.8, 90, 90, 90),
+     "P 41 3 2"],
+ 'P2':  # 3nxs
+     [(78.63, 91.46, 57.47, 90, 90, 90),
+     "P 21 21 2"],
+ 'P1':  # 3l89
+     [(94.53, 107.71, 154.1, 90.01, 90.1, 104.7),
+     "P 1"]}
+
+ucell_p, symbol = syms[args.sym]
+ucell = uctbx.unit_cell(ucell_p)
+F = db_utils.make_miller_array(unit_cell=ucell_p, symbol=symbol)
+
+sg = F.space_group()
+sgi = sg.info()
+print("unit cell, space group:\n", F, "\n")
+
+O = ucell.orthogonalization_matrix()
+# real space vectors
+a = O[0], O[3], O[6]
+b = O[1], O[4], O[7]
+c = O[2], O[5], O[8]
+C_sg = Crystal(a,b,c,sg)
+# this should have the identity as a Umat
+assert np.allclose(C_sg.get_U(), (1,0,0,0,1,0,0,0,1))
+
+# convert crystal to P1
+to_p1_op = sgi.change_of_basis_op_to_primitive_setting()
+C_p1 = C_sg.change_basis(to_p1_op)
+
+# random orientation
+Misset = sqr(Rotation.random(random_state=1).as_matrix().flatten())
+# reorient the P1 crystal
+U_p1 = sqr(C_p1.get_U())
+C_p1.set_U(Misset*U_p1)
+
+# make a detector and a beam
+beam = BeamFactory.simple(wavelength=1)
+detector = DetectorFactory.simple(
+    sensor='PAD',
+    distance=200,
+    beam_centre=(51.25,51.25),
+    fast_direction='+x',
+    slow_direction='-y',
+    pixel_size=(.1,.1),
+    image_size=(1024,1024))
+
+SIM = nanoBragg(detector=detector, beam=beam)
+#SIM.printout_pixel_fastslow = 504, 422
+SIM.xtal_shape = shapetype.Gauss_star
+if args.gauss and args.square:
+    raise RuntimeError("Can only have gauss, square or neither (default, which is Gauss_star)")
+if args.gauss:
+    SIM.xtal_shape = shapetype.Gauss
+if args.square:
+    SIM.xtal_shape = shapetype.Square
+
+SIM.oversample = 1
+SIM.interpolate = 0
+# convert to P1 (and change basis depending on space group)
+F_p1 = prep_miller_array(F)
+SIM.Fhkl_tuple = F_p1.indices(), F_p1.data()
+SIM.Amatrix = sqr(C_p1.get_A()).transpose()
+if args.mos:
+    SIM.mosaic_spread_deg = 1
+    SIM.mosaic_domains = 10
+assert np.allclose(SIM.unit_cell_tuple, C_p1.get_unit_cell().parameters())
+SIM.Ncells_abc = 12, 12, 12
+if args.fix:
+    SIM.set_mosaic_blocks_sym(C_p1, reference_symbol=sgi.type().lookup_symbol(), orig_mos_domains=1)
+getattr(SIM, add_spots_func)()
+reference = SIM.raw_pixels.as_numpy_array()
+print(SIM.fudge)
+#exit()
+
+#sanity_check_Fs(SIM, sg, C_p1.get_unit_cell().parameters())
+print("USINGSHAPE:", SIM.xtal_shape)
+
+ops = sg.build_derived_laue_group().all_ops()
+ops = [o for o in ops if o.r().determinant() == 1]
+
+results = []
+for o in ops:
+    sg_op = sgtbx.change_of_basis_op(o)
+    C_o = C_sg.change_basis(sg_op).change_basis(to_p1_op)
+    U = sqr(C_o.get_U())
+    C_o.set_U(Misset*U)
+    Amat = sqr(C_o.get_A())
+    if args.fix:
+        SIM.set_mosaic_blocks_sym(C_o, sgi.type().lookup_symbol(), orig_mos_domains=1)
+    SIM.raw_pixels *= 0
+    SIM.Amatrix = Amat.transpose()
+
+    print("operator:", o.as_xyz())
+    forms = literal_description(o)
+    print(forms.long_form())
+    getattr(SIM, add_spots_func)()
+    img = SIM.raw_pixels.as_numpy_array()
+    if not np.allclose(img, reference, atol=1e-7):
+        passed=False
+        print("FAILED\n")
+        SIM.raw_pixels *= 0
+        getattr(SIM, add_spots_func)()
+    else:
+        passed=True
+        print("PASSED\n")
+    results.append((passed, img, o))
+
+SIM.free_all()
+nfail = sum([not passed for passed,_,_ in results ])
+nops = len(results)
+if nfail > 0:
+    print("Test failed for %d / %d ops." %(nfail, len(results)))
+else:
+    print("All tests pass (%d ops) ! " %(len(results)))
+    print("Ok!")
+
+if args.plot:
+    try:
+        from pylab import *
+        from itertools import cycle
+        imgs = cycle(results)
+        m = reference[reference >0].mean()
+        s = reference[reference>0].std()
+        vmax=m+0.5*s
+        fig, (ax1,ax2) = subplots(nrows=1, ncols=2, layout='constrained')
+        fig.set_size_inches((8,4))
+        ax1.imshow(reference, vmin=0, vmax=vmax)
+        ax2.imshow(results[0][1], vmin=0, vmax=vmax)
+        ax1.set_title("+x,+y,+z", fontsize=16)
+        suptitle("%s" % str(sgi), fontsize=16)
+        for ax in (ax1, ax2):
+            ax.set_xlim(300, 700)
+            ax.set_ylim(700, 300)
+        while 1:
+            if not fignum_exists(fig.number):
+                break
+            passed, img, op = next(imgs)
+            ax2.images[0].set_data(img)
+            pass_s = "passed" if passed else "failed"
+            def check_sign(x):
+                if not x.startswith("-"):
+                    x = "+"+x
+                return x
+            x,y,z = map(check_sign, op.as_xyz().split(','))
+            xyz = ",".join([x,y,z])
+
+            ax2.set_title("%s (%s)" % (xyz, pass_s), fontsize=16)
+            plt.draw()
+            plt.pause(2)
+
+    except KeyboardInterrupt:
+        close()
+        exit()
diff --git a/simtbx/run_tests.py b/simtbx/run_tests.py
index 2a677dc246..6b8dbfa977 100644
--- a/simtbx/run_tests.py
+++ b/simtbx/run_tests.py
@@ -24,7 +24,6 @@
     "$D/diffBragg/tests/tst_diffBragg_update_dxtbx_geoms.py",
     "$D/diffBragg/tests/tst_diffBragg_deriv_rois.py",
     "$D/diffBragg/tests/tst_diffBragg_detdist_derivatives.py",
-    "$D/diffBragg/tests/tst_diffBragg_nanoBragg_congruency.py",
     "$D/diffBragg/tests/tst_diffBragg_ncells_property.py",
     "$D/diffBragg/tests/tst_diffBragg_ncells_offdiag_property.py",
     ["$D/diffBragg/tests/tst_diffBragg_ncells_offdiag_property.py", "--idx 1"],
@@ -75,6 +74,7 @@
 tst_list = nb_tst_list
 if OPT.enable_cxx11 and sys.platform != 'win32':
   tst_list += db_tst_list+db_tst_list_nonCuda
+  tst_list += ["$D/tests/tst_api_congruency.py"]
 
 if OPT.enable_cuda:
   tst_list_parallel += [
@@ -109,6 +109,7 @@
       tst_list_parallel.append(par_tst)
 
     tst_list_parallel += db_tst_list_onlyCuda
+    tst_list_parallel += ["$D/tests/tst_api_congruency.py", "--kokkos"]
 
 def run():
   build_dir = libtbx.env.under_build("simtbx")
diff --git a/simtbx/tests/tst_api_congruency.py b/simtbx/tests/tst_api_congruency.py
new file mode 100644
index 0000000000..fb0f61a02f
--- /dev/null
+++ b/simtbx/tests/tst_api_congruency.py
@@ -0,0 +1,115 @@
+from __future__ import division
+from simtbx.diffBragg.utils import find_diffBragg_instances
+import numpy as np
+
+""""
+CHECK NANOBRAGG, DIFFBRAGG, and EXAFEL+KOKKOS PRODUCE EQUIVALENT RESULTS
+NOTE: Cannot --kokkos flag will skip the exafel API
+"""
+
+import sys
+import libtbx.load_env
+HAS_KOKKOS_EXAFEL = False
+if libtbx.env.build_options.enable_kokkos and sys.platform.startswith('linux'):
+    # the Exafel KOKKOS API:
+    from simtbx.kokkos import gpu_energy_channels
+    from simtbx.kokkos import exascale_api
+    from simtbx.kokkos import gpu_detector as kokkosd
+    HAS_KOKKOS_EXAFEL = True
+
+
+def _test_exafel(S, reference, add_background=False):
+    """
+    # ONLY RUN THIS IF KOKKOS INSTANCE EXISTS
+    :param S:  SimData instance (nanoBragg/sim_data.py)
+    :param reference: reference 2D image from different API to test against
+    :param add_background: whether or not to include background
+    """
+    kokkos_channels_singleton = gpu_energy_channels()
+    inds, amps = S.D.Fhkl_tuple
+    kokkos_channels_singleton.structure_factors_to_GPU_direct(0, inds, amps)
+    kokkos_simulation = exascale_api(nanoBragg=S.D)
+    kokkos_simulation.allocate()
+
+    kokkos_detector = kokkosd(detector=S.detector, beam=S.D.xray_beams[0])
+    kokkos_detector.each_image_allocate()
+
+    # loop over energies
+    kokkos_simulation.add_energy_channel_from_gpu_amplitudes(
+        0, kokkos_channels_singleton, kokkos_detector)
+    if add_background:
+        kokkos_simulation.add_background(kokkos_detector)
+    kokkos_detector.write_raw_pixels(S.D)
+    kokkos_img = S.D.raw_pixels.as_numpy_array()
+    if add_background:
+        # note, why is there a difference in the images when adding a background?
+        assert np.allclose(kokkos_img, reference, atol=1e-3, rtol=0)
+    else:
+        assert np.allclose(kokkos_img, reference)
+
+
+def main(shape="gauss_star", background=True, test_exafel=False, nb_cuda=False):
+    # TODO: test non-P space groups
+    # TODO: test divergence model
+    # TODO: test dispersion model
+
+    from simtbx.nanoBragg.sim_data import SimData
+    S = SimData(use_default_crystal=True)
+    # Important, only add water OR air, but not both, because the exafel API will only look for one background model
+    S.add_water = True
+    S.add_air = False
+    S.crystal.xtal_shape = shape
+    S.crystal.mos_spread_deg = 1
+    S.crystal.n_mos_domains = 20
+    S.instantiate_diffBragg()
+    S.D.nopolar = True
+    S.D.oversample = 3
+    S.D.spot_scale = 10
+
+    S.D.add_diffBragg_spots()
+    if background:
+        S._add_background()
+
+    diff_img = S.D.raw_pixels.as_numpy_array()
+
+    S.D.raw_pixels *= 0
+    if nb_cuda:
+        S.D.add_nanoBragg_spots_cuda()
+    else:
+        S.D.add_nanoBragg_spots()
+    if background:
+        S._add_background()
+
+    nano_img = S.D.raw_pixels.as_numpy_array()
+
+    assert np.allclose(diff_img, nano_img, atol=1e-9)
+    a = nano_img.mean()
+    b = nano_img.max()
+    c = nano_img.min()
+    d = nano_img.std()
+    e = np.median(nano_img)
+    vals = a,b,c,d,e
+    print("Reference image has pixel values Mean,Max,Min,Median,Std=%f,%f,%f,%f,%f" %vals)
+    if HAS_KOKKOS_EXAFEL and test_exafel:
+        S.D.raw_pixels *= 0
+        _test_exafel(S, nano_img, background)
+    else:
+        print("No Kokkos GPU instance, skipping _test_exafel ... ")
+    for name in find_diffBragg_instances(globals()): del globals()[name]
+
+
+if __name__ == "__main__":
+    import sys
+    import os
+    if "--kokkos" in sys.argv:
+        # If we use this flag, then the deviceWrapper will instantiate KOKKOS instance
+        os.environ["DIFFBRAGG_USE_KOKKOS"]="1"
+    nb_cuda = "--cuda" in sys.argv
+
+    from simtbx.diffBragg.device import DeviceWrapper
+    with DeviceWrapper(0) as wrapper:
+        for background in [False,True]:
+            test_exafel = wrapper.is_using_kokkos_gpu
+            main("gauss", background=background, test_exafel=test_exafel, nb_cuda=nb_cuda)
+            main("gauss_star", background=background, test_exafel=test_exafel, nb_cuda=nb_cuda)
+    print ("OK!")