Skip to content

Commit

Permalink
replaced old rotate by call to new rotate
Browse files Browse the repository at this point in the history
  • Loading branch information
gitpeterwind committed Dec 12, 2023
1 parent 83df62a commit 6297908
Showing 1 changed file with 10 additions and 372 deletions.
382 changes: 10 additions & 372 deletions src/utils/ComplexFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,14 @@ void cplxfunc::multiply_imag(ComplexFunction &out, ComplexFunction inp_a, Comple
namespace mpifuncvec {


/** @brief Make a linear combination of functions
*
* Uses "local" representation: treats one node at a time.
* For each node, all functions are transformed simultaneously
* by a dense matrix multiplication.
* Phi input functions, Psi output functions
*
*/
void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, double prec) {

Check warning on line 665 in src/utils/ComplexFunction.cpp

View check run for this annotation

Codecov / codecov/patch

src/utils/ComplexFunction.cpp#L665

Added line #L665 was not covered by tests

// The principle of this routine is that nodes are rotated one by one using matrix multiplication.
Expand Down Expand Up @@ -1036,378 +1044,8 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, do


void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) {

// The principle of this routine is that nodes are rotated one by one using matrix multiplication.
// The routine does avoid when possible to move data, but uses pointers and indices manipulation.
// MPI version does not use OMP yet, Serial version uses OMP
int N = Phi.size();

// 1) make union tree without coefficients
FunctionTree<3> refTree(*Phi.vecMRA);
mpi::allreduce_Tree_noCoeff(refTree, Phi, mpi::comm_wrk);

int sizecoeff = (1 << refTree.getDim()) * refTree.getKp1_d();
int sizecoeffW = ((1 << refTree.getDim()) - 1) * refTree.getKp1_d();
std::vector<double> scalefac_ref;
std::vector<double *> coeffVec_ref; // not used!
std::vector<int> indexVec_ref; // serialIx of the nodes
std::vector<int> parindexVec_ref; // serialIx of the parent nodes
int max_ix;
// get a list of all nodes in union tree, identified by their serialIx indices
refTree.makeCoeffVector(coeffVec_ref, indexVec_ref, parindexVec_ref, scalefac_ref, max_ix, refTree);
int max_n = indexVec_ref.size();

// 2) We work with real numbers only. Make real blocks for U matrix
bool UhasReal = false;
bool UhasImag = false;
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
if (std::abs(U(i, j).real()) > MachineZero) UhasReal = true;
if (std::abs(U(i, j).imag()) > MachineZero) UhasImag = true;
}
}

IntVector PsihasReIm = IntVector::Zero(2);
for (int i = 0; i < N; i++) {
if (!mpi::my_orb(i)) continue;
PsihasReIm[0] = (Phi[i].hasReal()) ? 1 : 0;
PsihasReIm[1] = (Phi[i].hasImag()) ? 1 : 0;
}
mpi::allreduce_vector(PsihasReIm, mpi::comm_wrk);
if (not PsihasReIm[0] and not PsihasReIm[1]) {
return; // do nothing
}

bool makeReal = (UhasReal and PsihasReIm[0]) or (UhasImag and PsihasReIm[1]);
bool makeImag = (UhasReal and PsihasReIm[1]) or (UhasImag and PsihasReIm[0]);

for (int i = 0; i < N; i++) {
if (!mpi::my_orb(i)) continue;
if (not makeReal and Phi[i].hasReal()) Phi[i].free(NUMBER::Real);
if (not makeImag and Phi[i].hasImag()) Phi[i].free(NUMBER::Imag);
}

if (not makeReal and not makeImag) { return; }

int Neff = N; // effective number of orbitals
if (makeImag) Neff = 2 * N; // Imag and Real treated independently. We always use real part of U

IntVector conjMat = IntVector::Zero(Neff);
for (int i = 0; i < Neff; i++) {
if (!mpi::my_orb(i % N)) continue;
conjMat[i] = (Phi[i % N].conjugate()) ? -1 : 1;
}
mpi::allreduce_vector(conjMat, mpi::comm_wrk);

// we make a real matrix = U, but organized as one or four real blocks
// out_r = U_rr*in_r - U_ir*in_i*conjMat
// out_i = U_ri*in_r - U_ii*in_i*conjMat
// the first index of U is the one used on input Phi
DoubleMatrix Ureal(Neff, Neff); // four blocks, for rr ri ir ii
for (int i = 0; i < Neff; i++) {
for (int j = 0; j < Neff; j++) {
double sign = 1.0;
if (i < N and j < N) {
// real U applied on real Phi
Ureal(i, j) = U.real()(i % N, j % N);
} else if (i >= N and j >= N) {
// real U applied on imag Phi
Ureal(i, j) = conjMat[i] * U.real()(i % N, j % N);
} else if (i < N and j >= N) {
// imag U applied on real Phi
Ureal(i, j) = U.imag()(i % N, j % N);
} else {
// imag U applied on imag Phi
Ureal(i, j) = -1.0 * conjMat[i] * U.imag()(i % N, j % N);
}
}
}

// 3) In the serial case we store the coeff pointers in coeffVec. In the mpi case the coeff are stored in the bank

bool serial = mpi::wrk_size == 1; // flag for serial/MPI switch
BankAccount nodesPhi; // to put the original nodes
BankAccount nodesRotated; // to put the rotated nodes

// used for serial only:
std::vector<std::vector<double *>> coeffVec(Neff);
std::vector<std::vector<int>> indexVec(Neff); // serialIx of the nodes
std::map<int, std::vector<int>> node2orbVec; // for each node index, gives a vector with the indices of the orbitals using this node
std::vector<std::map<int, int>> orb2node(Neff); // for a given orbital and a given node, gives the node index in the
// orbital given the node index in the reference tree
if (serial) {

// make list of all coefficients (coeffVec), and their reference indices (indexVec)
std::vector<int> parindexVec; // serialIx of the parent nodes
std::vector<double> scalefac;
for (int j = 0; j < N; j++) {
// make vector with all coef pointers and their indices in the union grid
if (Phi[j].hasReal()) {
Phi[j].real().makeCoeffVector(coeffVec[j], indexVec[j], parindexVec, scalefac, max_ix, refTree);
// make a map that gives j from indexVec
int orb_node_ix = 0;
for (int ix : indexVec[j]) {
orb2node[j][ix] = orb_node_ix++;
if (ix < 0) continue;
node2orbVec[ix].push_back(j);
}
}
if (Phi[j].hasImag()) {
Phi[j].imag().makeCoeffVector(coeffVec[j + N], indexVec[j + N], parindexVec, scalefac, max_ix, refTree);
// make a map that gives j from indexVec
int orb_node_ix = 0;
for (int ix : indexVec[j + N]) {
orb2node[j + N][ix] = orb_node_ix++;
if (ix < 0) continue;
node2orbVec[ix].push_back(j + N);
}
}
}
} else { // MPI case

// send own nodes to bank, identifying them through the serialIx of refTree
mpifuncvec::save_nodes(Phi, refTree, nodesPhi);
mpi::barrier(mpi::comm_wrk); // required for now, as the blockdata functionality has no queue yet.
}

// 4) rotate all the nodes
IntMatrix split_serial; // in the serial case all split are store in one array
std::vector<double> split(Neff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank
std::vector<double> needsplit(Neff, 1.0); // which orbitals need splitting
std::vector<std::vector<double *>> coeffpVec(Neff); // to put pointers to the rotated coefficient for each orbital in serial case
std::vector<std::map<int, int>> ix2coef(Neff); // to find the index in for example rotCoeffVec[] corresponding to a serialIx
int csize; // size of the current coefficients (different for roots and branches)
std::vector<DoubleMatrix> rotatedCoeffVec; // just to ensure that the data from rotatedCoeff is not deleted, since we point to it.
// j indices are for unrotated orbitals, i indices are for rotated orbitals
if (serial) {
std::map<int, int> ix2coef_ref; // to find the index n corresponding to a serialIx
split_serial.resize(Neff, max_n); // not use in the MPI case
for (int n = 0; n < max_n; n++) {
int node_ix = indexVec_ref[n]; // SerialIx for this node in the reference tree
ix2coef_ref[node_ix] = n;
for (int i = 0; i < Neff; i++) split_serial(i, n) = 1;
}

std::vector<int> nodeReady(max_n, 0); // To indicate to OMP threads that the parent is ready (for splits)

// assumes the nodes are ordered such that parent are treated before children. BFS or DFS ok.
// NB: the n must be traversed approximately in right order: Thread n may have to wait until som other preceding
// n is finished.
#pragma omp parallel for schedule(dynamic)
for (int n = 0; n < max_n; n++) {
int csize;
int node_ix = indexVec_ref[n]; // SerialIx for this node in the reference tree
// 4a) make a dense contiguous matrix with the coefficient from all the orbitals using node n
std::vector<int> orbjVec; // to remember which orbital correspond to each orbVec.size();
if (node2orbVec[node_ix].size() <= 0) continue;
csize = sizecoeffW;
if (parindexVec_ref[n] < 0) csize = sizecoeff; // for root nodes we include scaling coeff

int shift = sizecoeff - sizecoeffW; // to copy only wavelet part
if (parindexVec_ref[n] < 0) shift = 0;
DoubleMatrix coeffBlock(csize, node2orbVec[node_ix].size());
for (int j : node2orbVec[node_ix]) { // loop over indices of the orbitals using this node
int orb_node_ix = orb2node[j][node_ix];
for (int k = 0; k < csize; k++) coeffBlock(k, orbjVec.size()) = coeffVec[j][orb_node_ix][k + shift];
orbjVec.push_back(j);
}

// 4b) make a list of rotated orbitals needed for this node
// OMP must wait until parent is ready
while (parindexVec_ref[n] >= 0 and nodeReady[ix2coef_ref[parindexVec_ref[n]]] == 0) {
#pragma omp flush
};

std::vector<int> orbiVec;
for (int i = 0; i < Neff; i++) { // loop over all rotated orbitals
if (not makeReal and i < N) continue;
if (not makeImag and i >= N) continue;
if (parindexVec_ref[n] >= 0 and split_serial(i, ix2coef_ref[parindexVec_ref[n]]) == 0) continue; // parent node has too small wavelets
orbiVec.push_back(i);
}

// 4c) rotate this node
DoubleMatrix Un(orbjVec.size(), orbiVec.size()); // chunk of U, with reorganized indices
for (int i = 0; i < orbiVec.size(); i++) { // loop over rotated orbitals
for (int j = 0; j < orbjVec.size(); j++) { Un(j, i) = Ureal(orbjVec[j], orbiVec[i]); }
}
DoubleMatrix rotatedCoeff(csize, orbiVec.size());
// HERE IT HAPPENS!
rotatedCoeff.noalias() = coeffBlock * Un; // Matrix mutiplication

// 4d) store and make rotated node pointers
// for now we allocate in buffer, in future could be directly allocated in the final trees
double thres = prec * prec * scalefac_ref[n] * scalefac_ref[n];
// make all norms:
for (int i = 0; i < orbiVec.size(); i++) {
// check if parent must be split
if (parindexVec_ref[n] == -1 or split_serial(orbiVec[i], ix2coef_ref[parindexVec_ref[n]])) {
// mark this node for this orbital for later split
#pragma omp critical
{
ix2coef[orbiVec[i]][node_ix] = coeffpVec[orbiVec[i]].size();
coeffpVec[orbiVec[i]].push_back(&(rotatedCoeff(0, i))); // list of coefficient pointers
}
// check norms for split
double wnorm = 0.0; // rotatedCoeff(k, i) is already in cache here
int kstart = 0;
if (parindexVec_ref[n] < 0) kstart = sizecoeff - sizecoeffW; // do not include scaling, even for roots
for (int k = kstart; k < csize; k++) wnorm += rotatedCoeff(k, i) * rotatedCoeff(k, i);
if (thres < wnorm or prec < 0)
split_serial(orbiVec[i], n) = 1;
else
split_serial(orbiVec[i], n) = 0;
} else {
ix2coef[orbiVec[i]][node_ix] = max_n + 1; // should not be used
split_serial(orbiVec[i], n) = 0; // do not split if parent does not need to be split
}
}
nodeReady[n] = 1;
#pragma omp critical
{
// this ensures that rotatedCoeff is not deleted, when getting out of scope
rotatedCoeffVec.push_back(std::move(rotatedCoeff));
}
}

} else { // MPI case

// TODO? rotate in bank, so that we do not get and put. Requires clever handling of splits.
std::vector<double> split(Neff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank
std::vector<double> needsplit(Neff, 1.0); // which orbitals need splitting
BankAccount nodeSplits;
mpi::barrier(mpi::comm_wrk); // required for now, as the blockdata functionality has no queue yet.

DoubleMatrix coeffBlock(sizecoeff, Neff);
max_ix++; // largest node index + 1. to store rotated orbitals with different id
TaskManager tasks(max_n);
for (int nn = 0; nn < max_n; nn++) {
int n = tasks.next_task();
if (n < 0) break;
double thres = prec * prec * scalefac_ref[n] * scalefac_ref[n];
// 4a) make list of orbitals that should split the parent node, i.e. include this node
int parentid = parindexVec_ref[n];
if (parentid == -1) {
// root node, split if output needed
for (int i = 0; i < N; i++) {
if (makeReal)
split[i] = 1.0;
else
split[i] = -1.0;
}
for (int i = N; i < Neff; i++) {
if (makeImag)
split[i] = 1.0;
else
split[i] = -1.0;
}
csize = sizecoeff;
} else {
// note that it will wait until data is available
nodeSplits.get_data(parentid, Neff, split.data());
csize = sizecoeffW;
}
std::vector<int> orbiVec;
std::vector<int> orbjVec;
for (int i = 0; i < Neff; i++) { // loop over rotated orbitals
if (split[i] < 0.0) continue; // parent node has too small wavelets
orbiVec.push_back(i);
}

// 4b) rotate this node
DoubleMatrix coeffBlock(csize, Neff); // largest possible used size
nodesPhi.get_nodeblock(indexVec_ref[n], coeffBlock.data(), orbjVec);
coeffBlock.conservativeResize(Eigen::NoChange, orbjVec.size()); // keep only used part

// chunk of U, with reorganized indices and separate blocks for real and imag:
DoubleMatrix Un(orbjVec.size(), orbiVec.size());
DoubleMatrix rotatedCoeff(csize, orbiVec.size());

for (int i = 0; i < orbiVec.size(); i++) { // loop over included rotated real and imag part of orbitals
for (int j = 0; j < orbjVec.size(); j++) { // loop over input orbital, possibly imaginary parts
Un(j, i) = Ureal(orbjVec[j], orbiVec[i]);
}
}

// HERE IT HAPPENS
rotatedCoeff.noalias() = coeffBlock * Un; // Matrix mutiplication

// 3c) find which orbitals need to further refine this node, and store rotated node (after each other while
// in cache).
for (int i = 0; i < orbiVec.size(); i++) { // loop over rotated orbitals
needsplit[orbiVec[i]] = -1.0; // default, do not split
// check if this node/orbital needs further refinement
double wnorm = 0.0;
int kwstart = csize - sizecoeffW; // do not include scaling
for (int k = kwstart; k < csize; k++) wnorm += rotatedCoeff.col(i)[k] * rotatedCoeff.col(i)[k];
if (thres < wnorm or prec < 0) needsplit[orbiVec[i]] = 1.0;
nodesRotated.put_nodedata(orbiVec[i], indexVec_ref[n] + max_ix, csize, rotatedCoeff.col(i).data());
}
nodeSplits.put_data(indexVec_ref[n], Neff, needsplit.data());
}
mpi::barrier(mpi::comm_wrk); // wait until all rotated nodes are ready
}

// 5) reconstruct trees using rotated nodes.

// only serial case can use OMP, because MPI cannot be used by threads
if (serial) {
// OMP parallelized, but does not scale well, because the total memory bandwidth is a bottleneck. (the main
// operation is writing the coefficient into the tree)

#pragma omp parallel for schedule(static)
for (int j = 0; j < Neff; j++) {
if (j < N) {
if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real);
Phi[j].real().clear();
Phi[j].real().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec);
} else {
if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag);
Phi[j % N].imag().clear();
Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec);
}
}

} else { // MPI case

for (int j = 0; j < Neff; j++) {
if (not mpi::my_orb(j % N)) continue;
// traverse possible nodes, and stop descending when norm is zero (leaf in out[j])
std::vector<double *> coeffpVec; //
std::map<int, int> ix2coef; // to find the index in coeffVec[] corresponding to a serialIx
int ix = 0;
std::vector<double *> pointerstodelete; // list of temporary arrays to clean up
for (int ibank = 0; ibank < mpi::bank_size; ibank++) {
std::vector<int> nodeidVec;
double *dataVec; // will be allocated by bank
nodesRotated.get_orbblock(j, dataVec, nodeidVec, ibank);
if (nodeidVec.size() > 0) pointerstodelete.push_back(dataVec);
int shift = 0;
for (int n = 0; n < nodeidVec.size(); n++) {
assert(nodeidVec[n] - max_ix >= 0); // unrotated nodes have been deleted
assert(ix2coef.count(nodeidVec[n] - max_ix) == 0); // each nodeid treated once
ix2coef[nodeidVec[n] - max_ix] = ix++;
csize = sizecoeffW;
if (parindexVec_ref[nodeidVec[n] - max_ix] < 0) csize = sizecoeff;
coeffpVec.push_back(&dataVec[shift]); // list of coeff pointers
shift += csize;
}
}
if (j < N) {
// Real part
if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real);
Phi[j].real().clear();
Phi[j].real().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec);
} else {
// Imag part
if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag);
Phi[j % N].imag().clear();
Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec);
}
for (double *p : pointerstodelete) delete[] p;
pointerstodelete.clear();
}
}
rotate(Phi, U, Phi, prec);
return;

Check warning on line 1048 in src/utils/ComplexFunction.cpp

View check run for this annotation

Codecov / codecov/patch

src/utils/ComplexFunction.cpp#L1046-L1048

Added lines #L1046 - L1048 were not covered by tests
}

/** @brief Save all nodes in bank; identify them using serialIx from refTree
Expand Down

0 comments on commit 6297908

Please sign in to comment.