From 6297908d63c81b5d2f507135ad45901376c25199 Mon Sep 17 00:00:00 2001 From: gitpeterwind Date: Tue, 12 Dec 2023 09:48:35 +0100 Subject: [PATCH] replaced old rotate by call to new rotate --- src/utils/ComplexFunction.cpp | 382 +--------------------------------- 1 file changed, 10 insertions(+), 372 deletions(-) diff --git a/src/utils/ComplexFunction.cpp b/src/utils/ComplexFunction.cpp index cf25a67af..d49048532 100644 --- a/src/utils/ComplexFunction.cpp +++ b/src/utils/ComplexFunction.cpp @@ -654,6 +654,14 @@ void cplxfunc::multiply_imag(ComplexFunction &out, ComplexFunction inp_a, Comple namespace mpifuncvec { +/** @brief Make a linear combination of functions + * + * Uses "local" representation: treats one node at a time. + * For each node, all functions are transformed simultaneously + * by a dense matrix multiplication. + * Phi input functions, Psi output functions + * + */ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, double prec) { // The principle of this routine is that nodes are rotated one by one using matrix multiplication. @@ -1036,378 +1044,8 @@ void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, MPI_FuncVector &Psi, do void rotate(MPI_FuncVector &Phi, const ComplexMatrix &U, double prec) { - - // The principle of this routine is that nodes are rotated one by one using matrix multiplication. - // The routine does avoid when possible to move data, but uses pointers and indices manipulation. - // MPI version does not use OMP yet, Serial version uses OMP - int N = Phi.size(); - - // 1) make union tree without coefficients - FunctionTree<3> refTree(*Phi.vecMRA); - mpi::allreduce_Tree_noCoeff(refTree, Phi, mpi::comm_wrk); - - int sizecoeff = (1 << refTree.getDim()) * refTree.getKp1_d(); - int sizecoeffW = ((1 << refTree.getDim()) - 1) * refTree.getKp1_d(); - std::vector scalefac_ref; - std::vector coeffVec_ref; // not used! - std::vector indexVec_ref; // serialIx of the nodes - std::vector parindexVec_ref; // serialIx of the parent nodes - int max_ix; - // get a list of all nodes in union tree, identified by their serialIx indices - refTree.makeCoeffVector(coeffVec_ref, indexVec_ref, parindexVec_ref, scalefac_ref, max_ix, refTree); - int max_n = indexVec_ref.size(); - - // 2) We work with real numbers only. Make real blocks for U matrix - bool UhasReal = false; - bool UhasImag = false; - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - if (std::abs(U(i, j).real()) > MachineZero) UhasReal = true; - if (std::abs(U(i, j).imag()) > MachineZero) UhasImag = true; - } - } - - IntVector PsihasReIm = IntVector::Zero(2); - for (int i = 0; i < N; i++) { - if (!mpi::my_orb(i)) continue; - PsihasReIm[0] = (Phi[i].hasReal()) ? 1 : 0; - PsihasReIm[1] = (Phi[i].hasImag()) ? 1 : 0; - } - mpi::allreduce_vector(PsihasReIm, mpi::comm_wrk); - if (not PsihasReIm[0] and not PsihasReIm[1]) { - return; // do nothing - } - - bool makeReal = (UhasReal and PsihasReIm[0]) or (UhasImag and PsihasReIm[1]); - bool makeImag = (UhasReal and PsihasReIm[1]) or (UhasImag and PsihasReIm[0]); - - for (int i = 0; i < N; i++) { - if (!mpi::my_orb(i)) continue; - if (not makeReal and Phi[i].hasReal()) Phi[i].free(NUMBER::Real); - if (not makeImag and Phi[i].hasImag()) Phi[i].free(NUMBER::Imag); - } - - if (not makeReal and not makeImag) { return; } - - int Neff = N; // effective number of orbitals - if (makeImag) Neff = 2 * N; // Imag and Real treated independently. We always use real part of U - - IntVector conjMat = IntVector::Zero(Neff); - for (int i = 0; i < Neff; i++) { - if (!mpi::my_orb(i % N)) continue; - conjMat[i] = (Phi[i % N].conjugate()) ? -1 : 1; - } - mpi::allreduce_vector(conjMat, mpi::comm_wrk); - - // we make a real matrix = U, but organized as one or four real blocks - // out_r = U_rr*in_r - U_ir*in_i*conjMat - // out_i = U_ri*in_r - U_ii*in_i*conjMat - // the first index of U is the one used on input Phi - DoubleMatrix Ureal(Neff, Neff); // four blocks, for rr ri ir ii - for (int i = 0; i < Neff; i++) { - for (int j = 0; j < Neff; j++) { - double sign = 1.0; - if (i < N and j < N) { - // real U applied on real Phi - Ureal(i, j) = U.real()(i % N, j % N); - } else if (i >= N and j >= N) { - // real U applied on imag Phi - Ureal(i, j) = conjMat[i] * U.real()(i % N, j % N); - } else if (i < N and j >= N) { - // imag U applied on real Phi - Ureal(i, j) = U.imag()(i % N, j % N); - } else { - // imag U applied on imag Phi - Ureal(i, j) = -1.0 * conjMat[i] * U.imag()(i % N, j % N); - } - } - } - - // 3) In the serial case we store the coeff pointers in coeffVec. In the mpi case the coeff are stored in the bank - - bool serial = mpi::wrk_size == 1; // flag for serial/MPI switch - BankAccount nodesPhi; // to put the original nodes - BankAccount nodesRotated; // to put the rotated nodes - - // used for serial only: - std::vector> coeffVec(Neff); - std::vector> indexVec(Neff); // serialIx of the nodes - std::map> node2orbVec; // for each node index, gives a vector with the indices of the orbitals using this node - std::vector> orb2node(Neff); // for a given orbital and a given node, gives the node index in the - // orbital given the node index in the reference tree - if (serial) { - - // make list of all coefficients (coeffVec), and their reference indices (indexVec) - std::vector parindexVec; // serialIx of the parent nodes - std::vector scalefac; - for (int j = 0; j < N; j++) { - // make vector with all coef pointers and their indices in the union grid - if (Phi[j].hasReal()) { - Phi[j].real().makeCoeffVector(coeffVec[j], indexVec[j], parindexVec, scalefac, max_ix, refTree); - // make a map that gives j from indexVec - int orb_node_ix = 0; - for (int ix : indexVec[j]) { - orb2node[j][ix] = orb_node_ix++; - if (ix < 0) continue; - node2orbVec[ix].push_back(j); - } - } - if (Phi[j].hasImag()) { - Phi[j].imag().makeCoeffVector(coeffVec[j + N], indexVec[j + N], parindexVec, scalefac, max_ix, refTree); - // make a map that gives j from indexVec - int orb_node_ix = 0; - for (int ix : indexVec[j + N]) { - orb2node[j + N][ix] = orb_node_ix++; - if (ix < 0) continue; - node2orbVec[ix].push_back(j + N); - } - } - } - } else { // MPI case - - // send own nodes to bank, identifying them through the serialIx of refTree - mpifuncvec::save_nodes(Phi, refTree, nodesPhi); - mpi::barrier(mpi::comm_wrk); // required for now, as the blockdata functionality has no queue yet. - } - - // 4) rotate all the nodes - IntMatrix split_serial; // in the serial case all split are store in one array - std::vector split(Neff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank - std::vector needsplit(Neff, 1.0); // which orbitals need splitting - std::vector> coeffpVec(Neff); // to put pointers to the rotated coefficient for each orbital in serial case - std::vector> ix2coef(Neff); // to find the index in for example rotCoeffVec[] corresponding to a serialIx - int csize; // size of the current coefficients (different for roots and branches) - std::vector rotatedCoeffVec; // just to ensure that the data from rotatedCoeff is not deleted, since we point to it. - // j indices are for unrotated orbitals, i indices are for rotated orbitals - if (serial) { - std::map ix2coef_ref; // to find the index n corresponding to a serialIx - split_serial.resize(Neff, max_n); // not use in the MPI case - for (int n = 0; n < max_n; n++) { - int node_ix = indexVec_ref[n]; // SerialIx for this node in the reference tree - ix2coef_ref[node_ix] = n; - for (int i = 0; i < Neff; i++) split_serial(i, n) = 1; - } - - std::vector nodeReady(max_n, 0); // To indicate to OMP threads that the parent is ready (for splits) - - // assumes the nodes are ordered such that parent are treated before children. BFS or DFS ok. - // NB: the n must be traversed approximately in right order: Thread n may have to wait until som other preceding - // n is finished. -#pragma omp parallel for schedule(dynamic) - for (int n = 0; n < max_n; n++) { - int csize; - int node_ix = indexVec_ref[n]; // SerialIx for this node in the reference tree - // 4a) make a dense contiguous matrix with the coefficient from all the orbitals using node n - std::vector orbjVec; // to remember which orbital correspond to each orbVec.size(); - if (node2orbVec[node_ix].size() <= 0) continue; - csize = sizecoeffW; - if (parindexVec_ref[n] < 0) csize = sizecoeff; // for root nodes we include scaling coeff - - int shift = sizecoeff - sizecoeffW; // to copy only wavelet part - if (parindexVec_ref[n] < 0) shift = 0; - DoubleMatrix coeffBlock(csize, node2orbVec[node_ix].size()); - for (int j : node2orbVec[node_ix]) { // loop over indices of the orbitals using this node - int orb_node_ix = orb2node[j][node_ix]; - for (int k = 0; k < csize; k++) coeffBlock(k, orbjVec.size()) = coeffVec[j][orb_node_ix][k + shift]; - orbjVec.push_back(j); - } - - // 4b) make a list of rotated orbitals needed for this node - // OMP must wait until parent is ready - while (parindexVec_ref[n] >= 0 and nodeReady[ix2coef_ref[parindexVec_ref[n]]] == 0) { -#pragma omp flush - }; - - std::vector orbiVec; - for (int i = 0; i < Neff; i++) { // loop over all rotated orbitals - if (not makeReal and i < N) continue; - if (not makeImag and i >= N) continue; - if (parindexVec_ref[n] >= 0 and split_serial(i, ix2coef_ref[parindexVec_ref[n]]) == 0) continue; // parent node has too small wavelets - orbiVec.push_back(i); - } - - // 4c) rotate this node - DoubleMatrix Un(orbjVec.size(), orbiVec.size()); // chunk of U, with reorganized indices - for (int i = 0; i < orbiVec.size(); i++) { // loop over rotated orbitals - for (int j = 0; j < orbjVec.size(); j++) { Un(j, i) = Ureal(orbjVec[j], orbiVec[i]); } - } - DoubleMatrix rotatedCoeff(csize, orbiVec.size()); - // HERE IT HAPPENS! - rotatedCoeff.noalias() = coeffBlock * Un; // Matrix mutiplication - - // 4d) store and make rotated node pointers - // for now we allocate in buffer, in future could be directly allocated in the final trees - double thres = prec * prec * scalefac_ref[n] * scalefac_ref[n]; - // make all norms: - for (int i = 0; i < orbiVec.size(); i++) { - // check if parent must be split - if (parindexVec_ref[n] == -1 or split_serial(orbiVec[i], ix2coef_ref[parindexVec_ref[n]])) { - // mark this node for this orbital for later split -#pragma omp critical - { - ix2coef[orbiVec[i]][node_ix] = coeffpVec[orbiVec[i]].size(); - coeffpVec[orbiVec[i]].push_back(&(rotatedCoeff(0, i))); // list of coefficient pointers - } - // check norms for split - double wnorm = 0.0; // rotatedCoeff(k, i) is already in cache here - int kstart = 0; - if (parindexVec_ref[n] < 0) kstart = sizecoeff - sizecoeffW; // do not include scaling, even for roots - for (int k = kstart; k < csize; k++) wnorm += rotatedCoeff(k, i) * rotatedCoeff(k, i); - if (thres < wnorm or prec < 0) - split_serial(orbiVec[i], n) = 1; - else - split_serial(orbiVec[i], n) = 0; - } else { - ix2coef[orbiVec[i]][node_ix] = max_n + 1; // should not be used - split_serial(orbiVec[i], n) = 0; // do not split if parent does not need to be split - } - } - nodeReady[n] = 1; -#pragma omp critical - { - // this ensures that rotatedCoeff is not deleted, when getting out of scope - rotatedCoeffVec.push_back(std::move(rotatedCoeff)); - } - } - - } else { // MPI case - - // TODO? rotate in bank, so that we do not get and put. Requires clever handling of splits. - std::vector split(Neff, -1.0); // which orbitals need splitting (at a given node). For now double for compatibilty with bank - std::vector needsplit(Neff, 1.0); // which orbitals need splitting - BankAccount nodeSplits; - mpi::barrier(mpi::comm_wrk); // required for now, as the blockdata functionality has no queue yet. - - DoubleMatrix coeffBlock(sizecoeff, Neff); - max_ix++; // largest node index + 1. to store rotated orbitals with different id - TaskManager tasks(max_n); - for (int nn = 0; nn < max_n; nn++) { - int n = tasks.next_task(); - if (n < 0) break; - double thres = prec * prec * scalefac_ref[n] * scalefac_ref[n]; - // 4a) make list of orbitals that should split the parent node, i.e. include this node - int parentid = parindexVec_ref[n]; - if (parentid == -1) { - // root node, split if output needed - for (int i = 0; i < N; i++) { - if (makeReal) - split[i] = 1.0; - else - split[i] = -1.0; - } - for (int i = N; i < Neff; i++) { - if (makeImag) - split[i] = 1.0; - else - split[i] = -1.0; - } - csize = sizecoeff; - } else { - // note that it will wait until data is available - nodeSplits.get_data(parentid, Neff, split.data()); - csize = sizecoeffW; - } - std::vector orbiVec; - std::vector orbjVec; - for (int i = 0; i < Neff; i++) { // loop over rotated orbitals - if (split[i] < 0.0) continue; // parent node has too small wavelets - orbiVec.push_back(i); - } - - // 4b) rotate this node - DoubleMatrix coeffBlock(csize, Neff); // largest possible used size - nodesPhi.get_nodeblock(indexVec_ref[n], coeffBlock.data(), orbjVec); - coeffBlock.conservativeResize(Eigen::NoChange, orbjVec.size()); // keep only used part - - // chunk of U, with reorganized indices and separate blocks for real and imag: - DoubleMatrix Un(orbjVec.size(), orbiVec.size()); - DoubleMatrix rotatedCoeff(csize, orbiVec.size()); - - for (int i = 0; i < orbiVec.size(); i++) { // loop over included rotated real and imag part of orbitals - for (int j = 0; j < orbjVec.size(); j++) { // loop over input orbital, possibly imaginary parts - Un(j, i) = Ureal(orbjVec[j], orbiVec[i]); - } - } - - // HERE IT HAPPENS - rotatedCoeff.noalias() = coeffBlock * Un; // Matrix mutiplication - - // 3c) find which orbitals need to further refine this node, and store rotated node (after each other while - // in cache). - for (int i = 0; i < orbiVec.size(); i++) { // loop over rotated orbitals - needsplit[orbiVec[i]] = -1.0; // default, do not split - // check if this node/orbital needs further refinement - double wnorm = 0.0; - int kwstart = csize - sizecoeffW; // do not include scaling - for (int k = kwstart; k < csize; k++) wnorm += rotatedCoeff.col(i)[k] * rotatedCoeff.col(i)[k]; - if (thres < wnorm or prec < 0) needsplit[orbiVec[i]] = 1.0; - nodesRotated.put_nodedata(orbiVec[i], indexVec_ref[n] + max_ix, csize, rotatedCoeff.col(i).data()); - } - nodeSplits.put_data(indexVec_ref[n], Neff, needsplit.data()); - } - mpi::barrier(mpi::comm_wrk); // wait until all rotated nodes are ready - } - - // 5) reconstruct trees using rotated nodes. - - // only serial case can use OMP, because MPI cannot be used by threads - if (serial) { - // OMP parallelized, but does not scale well, because the total memory bandwidth is a bottleneck. (the main - // operation is writing the coefficient into the tree) - -#pragma omp parallel for schedule(static) - for (int j = 0; j < Neff; j++) { - if (j < N) { - if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real); - Phi[j].real().clear(); - Phi[j].real().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec); - } else { - if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag); - Phi[j % N].imag().clear(); - Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec[j], ix2coef[j], prec); - } - } - - } else { // MPI case - - for (int j = 0; j < Neff; j++) { - if (not mpi::my_orb(j % N)) continue; - // traverse possible nodes, and stop descending when norm is zero (leaf in out[j]) - std::vector coeffpVec; // - std::map ix2coef; // to find the index in coeffVec[] corresponding to a serialIx - int ix = 0; - std::vector pointerstodelete; // list of temporary arrays to clean up - for (int ibank = 0; ibank < mpi::bank_size; ibank++) { - std::vector nodeidVec; - double *dataVec; // will be allocated by bank - nodesRotated.get_orbblock(j, dataVec, nodeidVec, ibank); - if (nodeidVec.size() > 0) pointerstodelete.push_back(dataVec); - int shift = 0; - for (int n = 0; n < nodeidVec.size(); n++) { - assert(nodeidVec[n] - max_ix >= 0); // unrotated nodes have been deleted - assert(ix2coef.count(nodeidVec[n] - max_ix) == 0); // each nodeid treated once - ix2coef[nodeidVec[n] - max_ix] = ix++; - csize = sizecoeffW; - if (parindexVec_ref[nodeidVec[n] - max_ix] < 0) csize = sizecoeff; - coeffpVec.push_back(&dataVec[shift]); // list of coeff pointers - shift += csize; - } - } - if (j < N) { - // Real part - if (!Phi[j].hasReal()) Phi[j].alloc(NUMBER::Real); - Phi[j].real().clear(); - Phi[j].real().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec); - } else { - // Imag part - if (!Phi[j % N].hasImag()) Phi[j % N].alloc(NUMBER::Imag); - Phi[j % N].imag().clear(); - Phi[j % N].imag().makeTreefromCoeff(refTree, coeffpVec, ix2coef, prec); - } - for (double *p : pointerstodelete) delete[] p; - pointerstodelete.clear(); - } - } + rotate(Phi, U, Phi, prec); + return; } /** @brief Save all nodes in bank; identify them using serialIx from refTree