From 79dbd107e0bf085d03cbee1778f1c4f1d082f855 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 09:37:41 -0500 Subject: [PATCH 01/26] Add more guards. Add brackets to maintain code blocks to avoid redeclaring the threadID var --- src/helpme_standalone.h | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 8b0316a34d..184437bffa 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -2367,7 +2367,9 @@ namespace helpme { template void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ cbaPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) for (int A = 0; A <= -1 + aDimension; ++A) @@ -2387,7 +2389,9 @@ void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int template void permuteABCtoACB(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ acbPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int A = 0; A <= -1 + aDimension; ++A) for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) @@ -3059,7 +3063,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3166,7 +3172,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. 
+#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3325,7 +3333,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3774,9 +3784,9 @@ class PMEInstance { int nComponents = nCartesian(parameterAngMom); size_t numBA = (size_t)myGridDimensionB_ * myGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -3792,7 +3802,9 @@ class PMEInstance { const auto &splineC = cacheEntry.cSpline; spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, parameters, threadID); } +#ifdef _OPENMP } +#endif return realGrid; } @@ -3812,11 +3824,12 @@ class PMEInstance { gridAtomList_.resize(gridDimensionC_); // Classify atoms to their worker threads first, then construct splines for each thread +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif for (size_t row = threadID; row < gridDimensionC_; row += nThreads_) { @@ -3856,7 +3869,6 @@ class PMEInstance { } numAtomsPerThread_[threadID] = myNumAtoms; } - // We could intervene here and do some load balancing by inspecting the list. Currently // the lazy approach of just assuming that the atoms are evenly distributed along c is used. 
@@ -3875,11 +3887,12 @@ class PMEInstance { threadOffset[thread] = threadOffset[thread - 1] + numAtomsPerThread_[thread - 1]; } +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif size_t entry = threadOffset[threadID]; @@ -3911,13 +3924,13 @@ class PMEInstance { } } } - // Finally, find all of the splines that this thread will need to handle +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto &mySplineList = splinesPerThread_[threadID]; @@ -4145,15 +4158,17 @@ class PMEInstance { helpme::vector buffer(nThreads_ * scratchRowDim); // A transform, with instant sort to CAB ordering for each local block +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; +#ifdef _OPENMP #pragma omp for +#endif for (int c = 0; c < subsetOfCAlongA_; ++c) { for (int b = 0; b < myGridDimensionB_; ++b) { Real *gridPtr = realGrid + c * myGridDimensionB_ * gridDimensionA_ + b * gridDimensionA_; @@ -4166,7 +4181,9 @@ class PMEInstance { } } } +#ifdef _OPENMP } +#endif #if HAVE_MPI == 1 // Communicate A back to blocks From 576acc1325a3eb4d5e277c8269663a866f1e944f Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Mon, 22 Nov 2021 09:45:27 -0500 Subject: [PATCH 02/26] Three more guards --- src/helpme_standalone.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 184437bffa..430392e9ad 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -4215,7 +4215,9 @@ class PMEInstance { // B transform size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_FORWARD); } @@ -4263,7 +4265,9 @@ class PMEInstance { #endif // C transform size_t numBA = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ba = 0; ba < numBA; ++ba) { fftHelperC_.transform(buffer2 + ba * gridDimensionC_, FFTW_FORWARD); } @@ -4290,7 +4294,9 @@ class PMEInstance { // C transform size_t numYX = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t yx = 0; yx < numYX; ++yx) { fftHelperC_.transform(convolvedGrid + yx * gridDimensionC_, FFTW_BACKWARD); } From afb12d9d437f622286e2f57189635d6c711c52ec Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Mon, 22 Nov 2021 15:49:07 -0500 Subject: [PATCH 03/26] More pragma guards --- src/helpme_standalone.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 430392e9ad..780a013950 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -4348,11 +4348,15 @@ class PMEInstance { // B transform with instant sort of local blocks from CAB -> CBA order size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_BACKWARD); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int c = 0; c < subsetOfCAlongB_; ++c) { for (int a = 0; a < myComplexGridDimensionA_; ++a) { int cx = c * myComplexGridDimensionA_ * gridDimensionB_ + a * gridDimensionB_; @@ -4399,7 +4403,9 @@ class PMEInstance { // A transform Real *realGrid = reinterpret_cast(buffer2); +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int cb = 0; cb < subsetOfCAlongA_ * myGridDimensionB_; ++cb) { fftHelperA_.transform(buffer1 + cb * complexGridDimensionA_, realGrid + cb * gridDimensionA_); } From a6fe6cca5c6bc7a8459769099e8d544ff5143b0c Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 15:55:05 -0500 Subject: [PATCH 04/26] 2 more guards --- src/helpme_standalone.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 780a013950..8e18ccee4d 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -4518,7 +4518,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = 0; // Writing the three nested loops in one allows for better load balancing in parallel. 
+#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { energy += transformedGrid[yxz] * transformedGrid[yxz] * influenceFunction[yxz]; transformedGrid[yxz] *= influenceFunction[yxz]; @@ -4549,7 +4551,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = Complex(0, 0); const size_t numCTerms(myNumKSumTermsC_); +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; int kx = firstKSumTermA_ + xz / numCTerms; From 4e9588b6aa91e6cc686135e9641eb5746f357950 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 16:07:43 -0500 Subject: [PATCH 05/26] Revert accidentally pushed change --- src/helpme_standalone.h | 47 ++++++----------------------------------- 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 8e18ccee4d..8b0316a34d 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -2367,9 +2367,7 @@ namespace helpme { template void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ cbaPtr, size_t nThreads = 1) { -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) -#endif for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) for (int A = 0; A <= -1 + aDimension; ++A) @@ -2389,9 +2387,7 @@ void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int template void permuteABCtoACB(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ acbPtr, size_t nThreads = 1) { -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) -#endif for (int A = 0; A <= -1 + aDimension; ++A) for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) @@ -3063,9 +3059,7 @@ 
class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. -#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) -#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3172,9 +3166,7 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. -#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) -#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3333,9 +3325,7 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) -#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3784,9 +3774,9 @@ class PMEInstance { int nComponents = nCartesian(parameterAngMom); size_t numBA = (size_t)myGridDimensionB_ * myGridDimensionA_; -#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { +#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -3802,9 +3792,7 @@ class PMEInstance { const auto &splineC = cacheEntry.cSpline; spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, parameters, threadID); } -#ifdef _OPENMP } -#endif return realGrid; } @@ -3824,12 +3812,11 @@ class PMEInstance { gridAtomList_.resize(gridDimensionC_); // Classify atoms to their worker threads first, then construct splines for each thread -#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { +#ifdef _OPENMP int threadID = omp_get_thread_num(); #else - { int threadID = 0; #endif for (size_t row = threadID; row < 
gridDimensionC_; row += nThreads_) { @@ -3869,6 +3856,7 @@ class PMEInstance { } numAtomsPerThread_[threadID] = myNumAtoms; } + // We could intervene here and do some load balancing by inspecting the list. Currently // the lazy approach of just assuming that the atoms are evenly distributed along c is used. @@ -3887,12 +3875,11 @@ class PMEInstance { threadOffset[thread] = threadOffset[thread - 1] + numAtomsPerThread_[thread - 1]; } -#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { +#ifdef _OPENMP int threadID = omp_get_thread_num(); #else - { int threadID = 0; #endif size_t entry = threadOffset[threadID]; @@ -3924,13 +3911,13 @@ class PMEInstance { } } } + // Finally, find all of the splines that this thread will need to handle -#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { +#ifdef _OPENMP int threadID = omp_get_thread_num(); #else - { int threadID = 0; #endif auto &mySplineList = splinesPerThread_[threadID]; @@ -4158,17 +4145,15 @@ class PMEInstance { helpme::vector buffer(nThreads_ * scratchRowDim); // A transform, with instant sort to CAB ordering for each local block -#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { +#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; -#ifdef _OPENMP #pragma omp for -#endif for (int c = 0; c < subsetOfCAlongA_; ++c) { for (int b = 0; b < myGridDimensionB_; ++b) { Real *gridPtr = realGrid + c * myGridDimensionB_ * gridDimensionA_ + b * gridDimensionA_; @@ -4181,9 +4166,7 @@ class PMEInstance { } } } -#ifdef _OPENMP } -#endif #if HAVE_MPI == 1 // Communicate A back to blocks @@ -4215,9 +4198,7 @@ class PMEInstance { // B transform size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) -#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_FORWARD); } @@ -4265,9 +4246,7 @@ class 
PMEInstance { #endif // C transform size_t numBA = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) -#endif for (size_t ba = 0; ba < numBA; ++ba) { fftHelperC_.transform(buffer2 + ba * gridDimensionC_, FFTW_FORWARD); } @@ -4294,9 +4273,7 @@ class PMEInstance { // C transform size_t numYX = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) -#endif for (size_t yx = 0; yx < numYX; ++yx) { fftHelperC_.transform(convolvedGrid + yx * gridDimensionC_, FFTW_BACKWARD); } @@ -4348,15 +4325,11 @@ class PMEInstance { // B transform with instant sort of local blocks from CAB -> CBA order size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) -#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_BACKWARD); } -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) -#endif for (int c = 0; c < subsetOfCAlongB_; ++c) { for (int a = 0; a < myComplexGridDimensionA_; ++a) { int cx = c * myComplexGridDimensionA_ * gridDimensionB_ + a * gridDimensionB_; @@ -4403,9 +4376,7 @@ class PMEInstance { // A transform Real *realGrid = reinterpret_cast(buffer2); -#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) -#endif for (int cb = 0; cb < subsetOfCAlongA_ * myGridDimensionB_; ++cb) { fftHelperA_.transform(buffer1 + cb * complexGridDimensionA_, realGrid + cb * gridDimensionA_); } @@ -4518,9 +4489,7 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = 0; // Writing the three nested loops in one allows for better load balancing in parallel. 
-#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) -#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { energy += transformedGrid[yxz] * transformedGrid[yxz] * influenceFunction[yxz]; transformedGrid[yxz] *= influenceFunction[yxz]; @@ -4551,9 +4520,7 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = Complex(0, 0); const size_t numCTerms(myNumKSumTermsC_); -#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) -#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; int kx = firstKSumTermA_ + xz / numCTerms; From 44686a3f4c4865f53766303ec02e94fa77e090f0 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 16:10:50 -0500 Subject: [PATCH 06/26] Add lots of openmp guards --- src/helpme_standalone.h | 47 +++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 8b0316a34d..8e18ccee4d 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -2367,7 +2367,9 @@ namespace helpme { template void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ cbaPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) for (int A = 0; A <= -1 + aDimension; ++A) @@ -2387,7 +2389,9 @@ void permuteABCtoCBA(Real const *__restrict__ abcPtr, int const aDimension, int template void permuteABCtoACB(Real const *__restrict__ abcPtr, int const aDimension, int const bDimension, int const cDimension, Real *__restrict__ acbPtr, size_t nThreads = 1) { +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (int A = 0; A <= -1 + aDimension; ++A) for (int C = 0; C <= -1 + cDimension; ++C) for (int B = 0; B <= -1 + bDimension; ++B) @@ -3059,7 +3063,9 @@ class 
PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3166,7 +3172,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy, Vxx, Vxy, Vyy, Vxz, Vyz, Vzz) num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3325,7 +3333,9 @@ class PMEInstance { // Exclude m=0 cell. int start = (nodeZero ? 1 : 0); // Writing the three nested loops in one allows for better load balancing in parallel. +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads) +#endif for (size_t yxz = start; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; short ky = yxz / nxz; @@ -3774,9 +3784,9 @@ class PMEInstance { int nComponents = nCartesian(parameterAngMom); size_t numBA = (size_t)myGridDimensionB_ * myGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; @@ -3792,7 +3802,9 @@ class PMEInstance { const auto &splineC = cacheEntry.cSpline; spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, parameters, threadID); } +#ifdef _OPENMP } +#endif return realGrid; } @@ -3812,11 +3824,12 @@ class PMEInstance { gridAtomList_.resize(gridDimensionC_); // Classify atoms to their worker threads first, then construct splines for each thread +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif for (size_t row = threadID; row < 
gridDimensionC_; row += nThreads_) { @@ -3856,7 +3869,6 @@ class PMEInstance { } numAtomsPerThread_[threadID] = myNumAtoms; } - // We could intervene here and do some load balancing by inspecting the list. Currently // the lazy approach of just assuming that the atoms are evenly distributed along c is used. @@ -3875,11 +3887,12 @@ class PMEInstance { threadOffset[thread] = threadOffset[thread - 1] + numAtomsPerThread_[thread - 1]; } +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif size_t entry = threadOffset[threadID]; @@ -3911,13 +3924,13 @@ class PMEInstance { } } } - // Finally, find all of the splines that this thread will need to handle +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto &mySplineList = splinesPerThread_[threadID]; @@ -4145,15 +4158,17 @@ class PMEInstance { helpme::vector buffer(nThreads_ * scratchRowDim); // A transform, with instant sort to CAB ordering for each local block +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); #else int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; +#ifdef _OPENMP #pragma omp for +#endif for (int c = 0; c < subsetOfCAlongA_; ++c) { for (int b = 0; b < myGridDimensionB_; ++b) { Real *gridPtr = realGrid + c * myGridDimensionB_ * gridDimensionA_ + b * gridDimensionA_; @@ -4166,7 +4181,9 @@ class PMEInstance { } } } +#ifdef _OPENMP } +#endif #if HAVE_MPI == 1 // Communicate A back to blocks @@ -4198,7 +4215,9 @@ class PMEInstance { // B transform size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_FORWARD); } @@ -4246,7 +4265,9 @@ class 
PMEInstance { #endif // C transform size_t numBA = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ba = 0; ba < numBA; ++ba) { fftHelperC_.transform(buffer2 + ba * gridDimensionC_, FFTW_FORWARD); } @@ -4273,7 +4294,9 @@ class PMEInstance { // C transform size_t numYX = (size_t)subsetOfBAlongC_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t yx = 0; yx < numYX; ++yx) { fftHelperC_.transform(convolvedGrid + yx * gridDimensionC_, FFTW_BACKWARD); } @@ -4325,11 +4348,15 @@ class PMEInstance { // B transform with instant sort of local blocks from CAB -> CBA order size_t numCA = (size_t)subsetOfCAlongB_ * myComplexGridDimensionA_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t ca = 0; ca < numCA; ++ca) { fftHelperB_.transform(buffer1 + ca * gridDimensionB_, FFTW_BACKWARD); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int c = 0; c < subsetOfCAlongB_; ++c) { for (int a = 0; a < myComplexGridDimensionA_; ++a) { int cx = c * myComplexGridDimensionA_ * gridDimensionB_ + a * gridDimensionB_; @@ -4376,7 +4403,9 @@ class PMEInstance { // A transform Real *realGrid = reinterpret_cast(buffer2); +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (int cb = 0; cb < subsetOfCAlongA_ * myGridDimensionB_; ++cb) { fftHelperA_.transform(buffer1 + cb * complexGridDimensionA_, realGrid + cb * gridDimensionA_); } @@ -4489,7 +4518,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = 0; // Writing the three nested loops in one allows for better load balancing in parallel. 
+#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { energy += transformedGrid[yxz] * transformedGrid[yxz] * influenceFunction[yxz]; transformedGrid[yxz] *= influenceFunction[yxz]; @@ -4520,7 +4551,9 @@ class PMEInstance { } if (iAmNodeZero) transformedGrid[0] = Complex(0, 0); const size_t numCTerms(myNumKSumTermsC_); +#ifdef _OPENMP #pragma omp parallel for reduction(+ : energy) num_threads(nThreads_) +#endif for (size_t yxz = 0; yxz < nyxz; ++yxz) { size_t xz = yxz % nxz; int kx = firstKSumTermA_ + xz / numCTerms; From ec7c76d9ea09c28b22fa8f876d8f370ad4c114a1 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 17:10:29 -0500 Subject: [PATCH 07/26] Add a guard and a bracket --- src/helpme_standalone.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 8e18ccee4d..b2417868f0 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -4637,14 +4637,15 @@ class PMEInstance { fractionalPhis_ = RealMat(nThreads_, rowSize); } size_t nAtoms = std::accumulate(numAtomsPerThread_.begin(), numAtomsPerThread_.end(), 0); +#ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { -#ifdef _OPENMP int threadID = omp_get_thread_num(); +#pragma omp for #else + { int threadID = 0; #endif -#pragma omp for for (size_t atom = 0; atom < nAtoms; ++atom) { const auto &cacheEntry = splineCache_[atom]; const auto &absAtom = cacheEntry.absoluteAtomNumber; From e36fdba5edc88a3baef70391575ff6cf694892a4 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Mon, 22 Nov 2021 17:21:01 -0500 Subject: [PATCH 08/26] Add brackets in non-openmp to be consistent. Guard 2 more. 
--- src/helpme_standalone.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index b2417868f0..63503eb388 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3789,6 +3789,7 @@ class PMEInstance { { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif for (size_t row = threadID; row < myGridDimensionC_; row += nThreads_) { @@ -3802,9 +3803,7 @@ class PMEInstance { const auto &splineC = cacheEntry.cSpline; spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, parameters, threadID); } -#ifdef _OPENMP } -#endif return realGrid; } @@ -4163,6 +4162,7 @@ class PMEInstance { { int threadID = omp_get_thread_num(); #else + { int threadID = 0; #endif auto scratch = &buffer[threadID * scratchRowDim]; @@ -4181,9 +4181,7 @@ class PMEInstance { } } } -#ifdef _OPENMP } -#endif #if HAVE_MPI == 1 // Communicate A back to blocks @@ -5014,7 +5012,9 @@ class PMEInstance { // Direct space, using simple O(N^2) algorithm. This can be improved using a nonbonded list if needed. Real cutoffSquared = sphericalCutoff * sphericalCutoff; Real kappaSquared = kappa_ * kappa_; +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t i = 0; i < nAtoms; ++i) { const auto &coordsI = coordinates.row(i); Real *phiPtr = potential[i]; @@ -5046,7 +5046,9 @@ class PMEInstance { } else { std::logic_error("Unknown algorithm in helpme::computePAtAtomicSites"); } +#ifdef _OPENMP #pragma omp parallel for num_threads(nThreads_) +#endif for (size_t atom = 0; atom < nAtoms; ++atom) { const auto &cacheEntry = splineCache_[atom]; const auto &absAtom = cacheEntry.absoluteAtomNumber; From d63a800d7f5fa94e186106c395e6812164ac747f Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 06:30:30 -0500 Subject: [PATCH 09/26] Fix signed/unsigned comparison --- src/helpme_standalone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 63503eb388..c43dd1273a 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -1024,10 +1024,10 @@ class Matrix { unsortedEigenVectors.transposeInPlace(); std::vector> eigenPairs; - for (int val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); + for (size_t val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); std::sort(eigenPairs.begin(), eigenPairs.end()); if (order == SortOrder::Descending) std::reverse(eigenPairs.begin(), eigenPairs.end()); - for (int val = 0; val < nRows_; ++val) { + for (size_t val = 0; val < nRows_; ++val) { const auto& e = eigenPairs[val]; eigenValues.data_[val] = std::get<0>(e); std::copy(std::get<1>(e), std::get<1>(e) + nCols_, sortedEigenVectors[val]); From ff2db56700ead08431d3aa037018b99ebaa7e7c5 Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 06:55:40 -0500 Subject: [PATCH 10/26] Fix signed/unsigned comparison --- src/helpme_standalone.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index c43dd1273a..9258d7ae0f 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3283,7 +3283,9 @@ class PMEInstance { if (coordinates.nRows() != parameters.nRows()) throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); - if (parameters.nCols() != (nCartesian(parameterAngMom) - cartesianOffset)) + int n_param_cols = nCartesian(parameterAngMom) - cartesianOffset; + if (n_param_cols < 0 || + parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( "Mismatch in the number of parameters provided and the parameter angular momentum"); } From 904e57c396840ffc03fd0dec61619623d4a9a7fa Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:02:47 -0500 Subject: [PATCH 11/26] Fix signed/unsigned comparison --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 9258d7ae0f..20e8fdf9ad 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3833,7 +3833,7 @@ class PMEInstance { { int threadID = 0; #endif - for (size_t row = threadID; row < gridDimensionC_; row += nThreads_) { + for (int row = threadID; row < gridDimensionC_; row += nThreads_) { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; From 4066e0f81195d747704b883094f61f7da8085fae Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:23:47 -0500 Subject: [PATCH 12/26] Use int and cast to int to be consistent and avoid losing bits. 
--- src/helpme_standalone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 20e8fdf9ad..60558b010a 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3840,13 +3840,13 @@ class PMEInstance { const auto &gridIteratorC = threadedGridIteratorC_[threadID]; mySplineList.clear(); size_t myNumAtoms = 0; - for (int atom = 0; atom < nAtoms; ++atom) { + for (size_t atom = 0; atom < nAtoms; ++atom) { const Real *atomCoords = coords[atom]; Real cCoord = atomCoords[0] * recVecs_(0, 2) + atomCoords[1] * recVecs_(1, 2) + atomCoords[2] * recVecs_(2, 2) - EPS; cCoord -= floor(cCoord); short cStartingGridPoint = gridDimensionC_ * cCoord; - size_t thisAtomsThread = cStartingGridPoint % nThreads_; + int thisAtomsThread = (int)cStartingGridPoint % nThreads_; const auto &cGridIterator = gridIteratorC_[cStartingGridPoint]; if (cGridIterator.size() && thisAtomsThread == threadID) { Real aCoord = atomCoords[0] * recVecs_(0, 0) + atomCoords[1] * recVecs_(1, 0) + From 24aa72e1089381f77ba7ddbdc4e772ff3a928239 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:29:12 -0500 Subject: [PATCH 13/26] Comment out currently unused variable. Need to check if this should be gridIteratorC_ or not. --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 60558b010a..47db6b9015 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3837,7 +3837,7 @@ class PMEInstance { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; - const auto &gridIteratorC = threadedGridIteratorC_[threadID]; + //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; for (size_t atom = 0; atom < nAtoms; ++atom) { From 6463a88c662c64639dc78da449a48c98faaf420a Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 07:32:05 -0500 Subject: [PATCH 14/26] Fix signed/unsigned comparison --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 47db6b9015..5be77169af 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3880,7 +3880,7 @@ class PMEInstance { // certain scale factor to try and minimize allocations in a not-too-wasteful manner. if (splineCache_.size() < numCacheEntries) { size_t newSize = static_cast(1.2 * numCacheEntries); - for (int atom = splineCache_.size(); atom < newSize; ++atom) + for (size_t atom = splineCache_.size(); atom < newSize; ++atom) splineCache_.emplace_back(splineOrder_, splineDerivativeLevel); } std::vector threadOffset(nThreads_, 0); From 658355b1ebeb8bb9e85b1a80bd6211804c434a71 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:35:27 -0500 Subject: [PATCH 15/26] Make loop var int since all other vars are int --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 5be77169af..786fecb6a3 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3897,7 +3897,7 @@ class PMEInstance { int threadID = 0; #endif size_t entry = threadOffset[threadID]; - for (size_t cRow = threadID; cRow < gridDimensionC_; cRow += nThreads_) { + for (int cRow = threadID; cRow < gridDimensionC_; cRow += nThreads_) { for (const auto &gridPointAndAtom : gridAtomList_[cRow]) { size_t atom = gridPointAndAtom.second; const Real *atomCoords = coords[atom]; From 132621928d186ee57371c3bbeec4ae10c718503b Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 07:37:15 -0500 Subject: [PATCH 16/26] Make loop var int --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 786fecb6a3..e4685dfef0 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -3794,7 +3794,7 @@ class PMEInstance { { int threadID = 0; #endif - for (size_t row = threadID; row < myGridDimensionC_; row += nThreads_) { + for (int row = threadID; row < myGridDimensionC_; row += nThreads_) { std::fill(&realGrid[row * numBA], &realGrid[(row + 1) * numBA], Real(0)); } for (const auto &spline : splinesPerThread_[threadID]) { From f1c9b82b3ff2ba5decf9504c5f8b1379b1aa869f Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 07:49:42 -0500 Subject: [PATCH 17/26] Protect MPI-only variable --- src/helpme_standalone.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index e4685dfef0..fa18aabf99 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -4119,14 +4119,20 @@ class PMEInstance { * \return Pointer to the transformed grid, which is stored in one of the buffers in BAC order. */ Complex *forwardTransform(Real *realGrid) { +#if HAVE_MPI == 1 Real *__restrict__ realCBA; +#endif Complex *__restrict__ buffer1, *__restrict__ buffer2; if (realGrid == reinterpret_cast(workSpace1_.data())) { +#if HAVE_MPI == 1 realCBA = reinterpret_cast(workSpace2_.data()); +#endif buffer1 = workSpace2_.data(); buffer2 = workSpace1_.data(); } else { +#if HAVE_MPI == 1 realCBA = reinterpret_cast(workSpace1_.data()); +#endif buffer1 = workSpace1_.data(); buffer2 = workSpace2_.data(); } From 55cc1293ebc499563d29885bad9a7676d374c066 Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 08:33:01 -0500 Subject: [PATCH 18/26] Change int to size_t to avoid signed/unsigned comparison --- src/helpme_standalone.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index fa18aabf99..dec739aa96 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -793,8 +793,8 @@ class Matrix { */ void assertSymmetric(const Real& threshold = 1e-10f) const { assertSquare(); - for (int row = 0; row < nRows_; ++row) { - for (int col = 0; col < row; ++col) { + for (size_t row = 0; row < nRows_; ++row) { + for (size_t col = 0; col < row; ++col) { if (std::abs(data_[row * nCols_ + col] - data_[col * nCols_ + row]) > threshold) throw std::runtime_error("Unexpected non-symmetric matrix found."); } From 4e8a16468b5b0dec77ca2cb4b9ce3a344b75b5f3 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 08:39:50 -0500 Subject: [PATCH 19/26] Comment out unused var --- src/helpme_standalone.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index dec739aa96..dcd83fbeb1 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -210,7 +210,8 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re Real threshold_norm; Real threshold; Real tan_phi, sin_phi, cos_phi, tan2_phi, sin2_phi, cos2_phi; - Real sin_2phi, cos_2phi, cot_2phi; + Real sin_2phi, cot_2phi; + //Real cos_2phi; Real dum1; Real dum2; Real dum3; @@ -263,7 +264,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re if (tan_phi < 0) sin_phi = -sin_phi; cos_phi = sqrt(cos2_phi); sin_2phi = 2 * sin_phi * cos_phi; - cos_2phi = cos2_phi - sin2_phi; + //cos_2phi = cos2_phi - sin2_phi; // Rotate columns k and m for both the matrix A // and the matrix of eigenvectors. From f520a61991df790f0110ee2776851e9c0d6bb994 Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 08:42:38 -0500 Subject: [PATCH 20/26] Change to size_t to avoid signed/unsigned comparison --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index dcd83fbeb1..2bad9b3fbb 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -822,7 +822,7 @@ class Matrix { Matrix evecs = std::get<1>(eigenPairs); evalsReal.applyOperationToEachElement(function); Matrix evecsT = evecs.transpose(); - for (int row = 0; row < nRows_; ++row) { + for (size_t row = 0; row < nRows_; ++row) { Real transformedEigenvalue = evalsReal[row][0]; std::for_each(evecsT.data_ + row * nCols_, evecsT.data_ + (row + 1) * nCols_, [&](Real& val) { val *= transformedEigenvalue; }); From d5205bc4d86a98136855192b205b3bcb74284f32 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 10:25:23 -0500 Subject: [PATCH 21/26] Protect against signed int comparison to unsigned overflow --- src/helpme_standalone.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 2bad9b3fbb..4c48ef36bf 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -2253,7 +2253,10 @@ class BSpline { derivativeLevel_ = derivativeLevel; // The +1 is to account for the fact that we need to store entries up to and including the max. - if (splines_.nRows() < derivativeLevel + 1 || splines_.nCols() != order) + if (derivativeLevel < 0 || order < 0) { + throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); + } + if (splines_.nRows() < (size_t)derivativeLevel + 1 || splines_.nCols() != (size_t)order) splines_ = Matrix(derivativeLevel + 1, order); splines_.setZero(); From 2721b91947e8c2dc944172de84e51ed32b1fc026 Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Tue, 23 Nov 2021 10:29:40 -0500 Subject: [PATCH 22/26] Fix 3 signed/unsigned comparison --- src/helpme_standalone.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 4c48ef36bf..6cb85fe6d0 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -858,10 +858,10 @@ class Matrix { throw std::runtime_error("Attempting to multiply matrices with incompatible dimensions."); Matrix product(nRows_, other.nCols_); Real* output = product.data_; - for (int row = 0; row < nRows_; ++row) { + for (size_t row = 0; row < nRows_; ++row) { const Real* rowPtr = data_ + row * nCols_; - for (int col = 0; col < other.nCols_; ++col) { - for (int link = 0; link < nCols_; ++link) { + for (size_t col = 0; col < other.nCols_; ++col) { + for (size_t link = 0; link < nCols_; ++link) { *output += rowPtr[link] * other.data_[link * other.nCols_ + col]; } ++output; From 6c9ab4b9495b5a579528cb531b94d6a9123f5365 Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Tue, 23 Nov 2021 10:35:46 -0500 Subject: [PATCH 23/26] Init uninitialized var --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 6cb85fe6d0..037d6ee059 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -5242,7 +5242,7 @@ class PMEInstance { sanityChecks(parameterAngMom, parameters, coordinates); filterAtomsAndBuildSplineCache(parameterAngMom, coordinates); auto realGrid = spreadParameters(parameterAngMom, parameters); - Real energy; + Real energy = 0; if (algorithmType_ == AlgorithmType::PME) { auto gridAddress = forwardTransform(realGrid); energy = convolveE(gridAddress); From d669753585b1b1c54a807b2d05037345b2c6a99c Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Wed, 24 Nov 2021 09:36:51 -0500 Subject: [PATCH 24/26] Update to latest helpme, init uninitialized var in Matrix constructor --- src/helpme_standalone.h | 529 +++++++++++++++++++++++++++++----------- 1 file changed, 385 insertions(+), 144 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 037d6ee059..c95d3ae4b6 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -5,7 +5,7 @@ // -// original file: ../src/helpme.h +// original file: src/helpme.h // BEGINLICENSE // @@ -40,7 +40,7 @@ #include #include -// original file: ../src/cartesiantransform.h +// original file: src/cartesiantransform.h // BEGINLICENSE // @@ -53,7 +53,7 @@ #ifndef _HELPME_STANDALONE_CARTESIANTRANSFORM_H_ #define _HELPME_STANDALONE_CARTESIANTRANSFORM_H_ -// original file: ../src/matrix.h +// original file: src/matrix.h // BEGINLICENSE // @@ -78,7 +78,7 @@ #include #include -// original file: ../src/lapack_wrapper.h +// original file: src/lapack_wrapper.h // BEGINLICENSE // @@ -317,7 +317,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re } // Namespace helpme #endif // Header guard -// original file: ../src/string_utils.h +// original file: src/string_utils.h // BEGINLICENSE // @@ -392,7 +392,7 @@ std::string stringify(T *data, size_t size, size_t rowDim, int width = 14, int p } // Namespace helpme #endif // Header guard -// original file: ../src/memory.h +// original file: src/memory.h // BEGINLICENSE // @@ -646,7 +646,7 @@ class Matrix { /*! * \brief Matrix Constructs an empty matrix. */ - Matrix() : nRows_(0), nCols_(0) {} + Matrix() : nRows_(0), nCols_(0), data_(0) {} /*! * \brief Matrix Constructs a new matrix, allocating memory. @@ -877,6 +877,17 @@ class Matrix { */ Matrix operator*(const Matrix& other) const { return this->multiply(other); } + /*! + * \brief operator * scale a copy of this matrix by a constant, leaving the original untouched. + * \param scaleFac the scale factor to apply.
+ * \return the scaled version of this matrix. + */ + Matrix operator*(Real scaleFac) const { + auto scaled = this->clone(); + scaled.applyOperationToEachElement([&](Real& element) { element *= scaleFac; }); + return scaled; + } + /*! * \brief increment this matrix with another, returning a new matrix containing the sum. * \param other the right hand side of the matrix sum. @@ -1165,7 +1176,7 @@ Matrix cartesianTransform(int maxAngularMomentum, bool transformOnlyThisSh } // Namespace helpme #endif // Header guard -// original file: ../src/fftw_wrapper.h +// original file: src/fftw_wrapper.h // BEGINLICENSE // @@ -1411,7 +1422,7 @@ class FFTWWrapper { } // Namespace helpme #endif // Header guard -// original file: ../src/gamma.h +// original file: src/gamma.h // BEGINLICENSE // @@ -1808,7 +1819,7 @@ Real nonTemplateGammaComputer(int twoS) { } // Namespace helpme #endif // Header guard -// original file: ../src/gridsize.h +// original file: src/gridsize.h // BEGINLICENSE // @@ -1886,7 +1897,7 @@ int findGridSize(T inputSize, const std::initializer_list &requiredDivisors) #endif // #include "memory.h" #if HAVE_MPI == 1 -// original file: ../src/mpi_wrapper.h +// original file: src/mpi_wrapper.h // BEGINLICENSE // @@ -2086,7 +2097,7 @@ std::ostream& operator<<(std::ostream& os, const std::unique_ptr= expectedNTerms) return; angMomIterator_.resize(expectedNTerms); - for (short l = 0, count = 0; l <= L; ++l) { - for (short lz = 0; lz <= l; ++lz) { - for (short ly = 0; ly <= l - lz; ++ly) { - short lx = l - ly - lz; + for (int l = 0, count = 0; l <= L; ++l) { + for (int lz = 0; lz <= l; ++lz) { + for (int ly = 0; ly <= l - lz; ++ly) { + int lx = l - ly - lz; angMomIterator_[count] = {{static_cast(lx), static_cast(ly), static_cast(lz)}}; ++count; } @@ -2764,6 +2780,162 @@ class PMEInstance { } } + /*! 
+ * \brief Runs a PME reciprocal space calculation, computing the potential and, optionally, its derivatives as + * well as the volume dependent part of the virial that comes from the structure factor. + * \param parameterAngMom the angular momentum of the parameters (0 for charges, C6 coefficients, 2 for + * quadrupoles, etc.). A negative value indicates that only the shell with |parameterAngMom| is to be considered, + * e.g. a value of -2 specifies that only quadrupoles (and not dipoles or charges) will be provided; the input + * matrix should have dimensions corresponding only to the number of terms in this shell. + * \param parameters the list of parameters associated with each atom (charges, C6 + * coefficients, multipoles, etc...). For a parameter with angular momentum L, a matrix of dimension nAtoms x nL + * is expected, where nL = (L+1)*(L+2)*(L+3)/6 and the fast running index nL has the ordering + * + * 0 X Y Z XX XY YY XZ YZ ZZ XXX XXY XYY YYY XXZ XYZ YYZ XZZ YZZ ZZZ ... + * + * i.e. generated by the python loops + * \code{.py} + * for L in range(maxAM+1): + * for Lz in range(0,L+1): + * for Ly in range(0, L - Lz + 1): + * Lx = L - Ly - Lz + * \endcode + * \param coordinates the cartesian coordinates, ordered in memory as {x1,y1,z1,x2,y2,z2,....xN,yN,zN}. + * \param energy pointer to the variable holding the energy; this is incremented, not assigned. + * \param gridPoints the list of grid points at which the potential is needed; can be the same as the + * coordinates. + * \param derivativeLevel the order of the potential derivatives required; 0 is the potential, 1 is + * (minus) the field, etc. A negative value indicates that only the derivative with order |parameterAngMom| + * is to be generated, e.g. -2 specifies that only the second derivative (not the potential or its gradient) + * will be returned as output. The output matrix should have space for only these terms, accordingly. + * \param potential the array holding the potential. 
This is a matrix of dimensions + * nAtoms x nD, where nD is the derivative level requested. See the details of the parameters argument for + * information about ordering of derivative components. N.B. this array is incremented with the potential, not + * assigned, so take care to zero it first if only the current results are desired. + * \param virial a vector of length 6 containing the unique virial elements, in the order XX XY YY XZ YZ ZZ. + * This vector is incremented, not assigned. + */ + void computePRecHelper(int parameterAngMom, const RealMat &parameters, const RealMat &coordinates, + const RealMat &gridPoints, int derivativeLevel, RealMat &potential, RealMat &virial) { + bool onlyOneShellForInput = parameterAngMom < 0; + bool onlyOneShellForOutput = derivativeLevel < 0; + parameterAngMom = std::abs(parameterAngMom); + derivativeLevel = std::abs(derivativeLevel); + int cartesianOffset = onlyOneShellForInput ? nCartesian(parameterAngMom - 1) : 0; + sanityChecks(parameterAngMom, parameters, coordinates, cartesianOffset); + updateAngMomIterator(std::max(parameterAngMom, derivativeLevel)); + // Note: we're calling the version of spread parameters that computes its own splines here. + // This is quite inefficient, but allow the potential to be computed at arbitrary locations by + // simply regenerating splines on demand in the probing stage. If this becomes too slow, it's + // easy to write some logic to check whether gridPoints and coordinates are the same, and + // handle that special case using spline cacheing machinery for efficiency.
+ Real *realGrid = reinterpret_cast(workSpace1_.data()); + std::fill(workSpace1_.begin(), workSpace1_.end(), 0); + updateAngMomIterator(parameterAngMom); + auto fractionalParameters = + cartesianTransform(parameterAngMom, onlyOneShellForInput, scaledRecVecs_.transpose(), parameters); + int nComponents = nCartesian(parameterAngMom) - cartesianOffset; + size_t nAtoms = coordinates.nRows(); + for (size_t atom = 0; atom < nAtoms; ++atom) { + // Blindly reconstruct splines for this atom, assuming nothing about the validity of the cache. + // Note that this incurs a somewhat steep cost due to repeated memory allocations. + auto bSplines = makeBSplines(coordinates[atom], parameterAngMom); + const auto &splineA = std::get<0>(bSplines); + const auto &splineB = std::get<1>(bSplines); + const auto &splineC = std::get<2>(bSplines); + const auto &aGridIterator = gridIteratorA_[splineA.startingGridPoint()]; + const auto &bGridIterator = gridIteratorB_[splineB.startingGridPoint()]; + const auto &cGridIterator = gridIteratorC_[splineC.startingGridPoint()]; + int numPointsA = static_cast(aGridIterator.size()); + int numPointsB = static_cast(bGridIterator.size()); + int numPointsC = static_cast(cGridIterator.size()); + const auto *iteratorDataA = aGridIterator.data(); + const auto *iteratorDataB = bGridIterator.data(); + const auto *iteratorDataC = cGridIterator.data(); + for (int component = 0; component < nComponents; ++component) { + const auto &quanta = angMomIterator_[component + cartesianOffset]; + Real param = fractionalParameters(atom, component); + const Real *splineValsA = splineA[quanta[0]]; + const Real *splineValsB = splineB[quanta[1]]; + const Real *splineValsC = splineC[quanta[2]]; + for (int pointC = 0; pointC < numPointsC; ++pointC) { + const auto &cPoint = iteratorDataC[pointC]; + Real cValP = param * splineValsC[cPoint.second]; + for (int pointB = 0; pointB < numPointsB; ++pointB) { + const auto &bPoint = iteratorDataB[pointB]; + Real cbValP = cValP * 
splineValsB[bPoint.second]; + Real *cbRow = &realGrid[cPoint.first * myGridDimensionB_ * myGridDimensionA_ + + bPoint.first * myGridDimensionA_]; + for (int pointA = 0; pointA < numPointsA; ++pointA) { + const auto &aPoint = iteratorDataA[pointA]; + cbRow[aPoint.first] += cbValP * splineValsA[aPoint.second]; + } + } + } + } + } + + Real *potentialGrid; + if (algorithmType_ == AlgorithmType::PME) { + auto gridAddress = forwardTransform(realGrid); + if (virial.nRows() == 0 && virial.nCols() == 0) { + convolveE(gridAddress); + } else { + convolveEV(gridAddress, virial); + } + potentialGrid = inverseTransform(gridAddress); + } else if (algorithmType_ == AlgorithmType::CompressedPME) { + auto gridAddress = compressedForwardTransform(realGrid); + if (virial.nRows() == 0 && virial.nCols() == 0) { + convolveE(gridAddress); + potentialGrid = compressedInverseTransform(gridAddress); + } else { + Real *convolvedGrid; + convolveEV(gridAddress, convolvedGrid, virial); + potentialGrid = compressedInverseTransform(convolvedGrid); + } + } else { + std::logic_error("Unknown algorithm in helpme::computePRec"); + } + + auto fracPotential = potential.clone(); + fracPotential.setZero(); + cartesianOffset = onlyOneShellForOutput ? 
nCartesian(derivativeLevel - 1) : 0; + int nPotentialComponents = nCartesian(derivativeLevel) - cartesianOffset; + size_t nPoints = gridPoints.nRows(); + for (size_t point = 0; point < nPoints; ++point) { + Real *phiPtr = fracPotential[point]; + auto bSplines = makeBSplines(gridPoints[point], derivativeLevel); + auto splineA = std::get<0>(bSplines); + auto splineB = std::get<1>(bSplines); + auto splineC = std::get<2>(bSplines); + const auto &aGridIterator = gridIteratorA_[splineA.startingGridPoint()]; + const auto &bGridIterator = gridIteratorB_[splineB.startingGridPoint()]; + const auto &cGridIterator = gridIteratorC_[splineC.startingGridPoint()]; + const Real *splineStartA = splineA[0]; + const Real *splineStartB = splineB[0]; + const Real *splineStartC = splineC[0]; + for (const auto &cPoint : cGridIterator) { + for (const auto &bPoint : bGridIterator) { + const Real *cbRow = potentialGrid + cPoint.first * myGridDimensionA_ * myGridDimensionB_ + + bPoint.first * myGridDimensionA_; + for (const auto &aPoint : aGridIterator) { + Real gridVal = cbRow[aPoint.first]; + for (int component = 0; component < nPotentialComponents; ++component) { + const auto &quanta = angMomIterator_[component + cartesianOffset]; + const Real *splineValsA = splineStartA + quanta[0] * splineOrder_; + const Real *splineValsB = splineStartB + quanta[1] * splineOrder_; + const Real *splineValsC = splineStartC + quanta[2] * splineOrder_; + phiPtr[component] += gridVal * splineValsA[aPoint.second] * splineValsB[bPoint.second] * + splineValsC[cPoint.second]; + } + } + } + } + } + potential += cartesianTransform(derivativeLevel, onlyOneShellForOutput, scaledRecVecs_, fracPotential); + } + /*! * \brief Spreads parameters onto the grid for a single atom * \param atom the absolute atom number. @@ -2929,9 +3101,11 @@ class PMEInstance { * \param splineB the BSpline object for the B direction. * \param splineC the BSpline object for the C direction. 
* \param phiPtr a scratch array of length nForceComponents, to store the fractional potential. - * \param parameters the list of parameters associated with each atom (charges, C6 coefficients, multipoles, - * etc...). For a parameter with angular momentum L, a matrix of dimension nAtoms x nL is expected, where nL = - * (L+1)*(L+2)*(L+3)/6 and the fast running index nL has the ordering + * \param fracParameters the list of parameters associated with the current atom, in + * the scaled fraction coordinate basis (charges, C6 coefficients, + * multipoles, etc...). For a parameter with angular momentum L, a matrix + * of dimension nAtoms x nL is expected, where + * nL = (L+1)*(L+2)*(L+3)/6 and the fast running index nL has the ordering * * 0 X Y Z XX XY YY XZ YZ ZZ XXX XXY XYY YYY XXZ XYZ YYZ XZZ YZZ ZZZ ... * @@ -2946,13 +3120,13 @@ class PMEInstance { */ void probeGridImpl(const int &atom, const Real *potentialGrid, const int &nComponents, const int &nForceComponents, const Spline &splineA, const Spline &splineB, const Spline &splineC, Real *phiPtr, - const RealMat ¶meters, Real *forces) { + const Real *fracParameters, Real *forces) { std::fill(phiPtr, phiPtr + nForceComponents, 0); probeGridImpl(potentialGrid, nForceComponents, splineA, splineB, splineC, phiPtr); Real fracForce[3] = {0, 0, 0}; for (int component = 0; component < nComponents; ++component) { - Real param = parameters(atom, component); + Real param = fracParameters[component]; const auto &quanta = angMomIterator_[component]; short lx = quanta[0]; short ly = quanta[1]; @@ -3788,6 +3962,14 @@ class PMEInstance { Real *realGrid = reinterpret_cast(workSpace1_.data()); updateAngMomIterator(parameterAngMom); + // We need to figure out whether the incoming parameters need to be transformed to scaled fractional + // coordinates or not, which is only needed for angular momentum higher than zero. 
+ RealMat tempParams; + if (parameterAngMom) { + tempParams = cartesianTransform(parameterAngMom, false, scaledRecVecs_.transpose(), parameters); + } + const auto &fractionalParameters = parameterAngMom ? tempParams : parameters; + int nComponents = nCartesian(parameterAngMom); size_t numBA = (size_t)myGridDimensionB_ * myGridDimensionA_; #ifdef _OPENMP @@ -3807,7 +3989,8 @@ class PMEInstance { const auto &splineA = cacheEntry.aSpline; const auto &splineB = cacheEntry.bSpline; const auto &splineC = cacheEntry.cSpline; - spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, parameters, threadID); + spreadParametersImpl(atom, realGrid, nComponents, splineA, splineB, splineC, fractionalParameters, + threadID); } } return realGrid; @@ -4634,19 +4817,33 @@ class PMEInstance { * Lx = L - Ly - Lz * \endcode * \param forces a Nx3 matrix of the forces, ordered in memory as {Fx1,Fy1,Fz1,Fx2,Fy2,Fz2,....FxN,FyN,FzN}. + * \param virial pointer to the virial vector if needed */ - void probeGrid(const Real *potentialGrid, int parameterAngMom, const RealMat ¶meters, RealMat &forces) { + void probeGrid(const Real *potentialGrid, int parameterAngMom, const RealMat ¶meters, RealMat &forces, + Real *virial = nullptr) { updateAngMomIterator(parameterAngMom + 1); int nComponents = nCartesian(parameterAngMom); int nForceComponents = nCartesian(parameterAngMom + 1); const Real *paramPtr = parameters[0]; // Find how many multiples of the cache line size are needed // to ensure that each thread hits a unique page. 
+ size_t nAtoms = std::accumulate(numAtomsPerThread_.begin(), numAtomsPerThread_.end(), 0); size_t rowSize = std::ceil(nForceComponents / cacheLineSizeInReals_) * cacheLineSizeInReals_; - if (fractionalPhis_.nRows() != nThreads_ || fractionalPhis_.nCols() != rowSize) { - fractionalPhis_ = RealMat(nThreads_, rowSize); + if (fractionalPhis_.nRows() < nAtoms || fractionalPhis_.nCols() < rowSize) { + fractionalPhis_ = RealMat(nAtoms, rowSize); + } + + RealMat fractionalParams; + Real cartPhi[3]; + if (parameterAngMom) { + fractionalParams = cartesianTransform(parameterAngMom, false, scaledRecVecs_.transpose(), parameters); + if (virial) { + if (parameterAngMom > 1) { + // The structure factor derivatives below are only implemented up to dipoles for now + throw std::runtime_error("Only multipoles up to L=1 are supported if the virial is requested"); + } + } } - size_t nAtoms = std::accumulate(numAtomsPerThread_.begin(), numAtomsPerThread_.end(), 0); #ifdef _OPENMP #pragma omp parallel num_threads(nThreads_) { @@ -4665,7 +4862,20 @@ class PMEInstance { if (parameterAngMom) { Real *myScratch = fractionalPhis_[threadID % nThreads_]; probeGridImpl(absAtom, potentialGrid, nComponents, nForceComponents, splineA, splineB, splineC, - myScratch, parameters, forces[absAtom]); + myScratch, fractionalParams[absAtom], forces[absAtom]); + // Add extra virial terms coming from the derivative of the structure factor. + // See eq. 
2.16 of https://doi.org/10.1063/1.1630791 for details + if (virial) { + // Get the potential in the Cartesian basis + matrixVectorProduct(scaledRecVecs_, &myScratch[1], &cartPhi[0]); + const Real *parm = parameters[absAtom]; + virial[0] += cartPhi[0] * parm[1]; + virial[1] += 0.5f * (cartPhi[0] * parm[2] + cartPhi[1] * parm[1]); + virial[2] += cartPhi[1] * parm[2]; + virial[3] += 0.5f * (cartPhi[0] * parm[3] + cartPhi[2] * parm[1]); + virial[4] += 0.5f * (cartPhi[1] * parm[3] + cartPhi[2] * parm[2]); + virial[5] += cartPhi[2] * parm[3]; + } } else { probeGridImpl(potentialGrid, splineA, splineB, splineC, paramPtr[absAtom], forces[absAtom]); } @@ -4725,7 +4935,7 @@ class PMEInstance { */ Real computeEDir(const Matrix &pairList, int parameterAngMom, const RealMat ¶meters, const RealMat &coordinates) { - if (parameterAngMom) throw std::runtime_error("Multipole self terms have not been coded yet."); + if (parameterAngMom) throw std::runtime_error("Multipole direct terms have not been coded yet."); sanityChecks(parameterAngMom, parameters, coordinates); Real energy = 0; @@ -5007,13 +5217,13 @@ class PMEInstance { /*! * \brief Computes the full electrostatic potential at atomic sites due to point charges located at those same - * sites. The site located at each probe location is neglected, to avoid the resulting singularity \param charges - * the list of point charges (in e) associated with each particle. \param coordinates the cartesian coordinates, - * ordered in memory as {x1,y1,z1,x2,y2,z2,....xN,yN,zN}. \param potential the array holding the potential. This is - * a matrix of dimensions nAtoms x 1 \param sphericalCutoff the cutoff (in A) applied to the real space summations, - * which must be no more than half of the box dimensions + * sites. The site located at each probe location is neglected, to avoid the resulting singularity. + * \param charges * the list of point charges (in e) associated with each particle. 
+ * \param coordinates the cartesian coordinates, ordered in memory as {x1,y1,z1,x2,y2,z2,....xN,yN,zN}. + * \param potential the array holding the potential. This is * a matrix of dimensions nAtoms x 1. + * \param sphericalCutoff the cutoff (in A) applied to the real space summations, + * which must be no more than half of the box dimensions. */ - void computePAtAtomicSites(const RealMat &charges, const RealMat &coordinates, RealMat &potential, Real sphericalCutoff) { sanityChecks(0, charges, coordinates); @@ -5075,6 +5285,47 @@ class PMEInstance { } } + /* + * \brief Runs a PME reciprocal space calculation, computing the potential and, optionally, its derivatives as + * well as the volume dependent part of the virial that comes from the structure factor. + * \param parameterAngMom the angular momentum of the parameters (0 for charges, C6 coefficients, 2 for + * quadrupoles, etc.). A negative value indicates that only the shell with |parameterAngMom| is to be considered, + * e.g. a value of -2 specifies that only quadrupoles (and not dipoles or charges) will be provided; the input + * matrix should have dimensions corresponding only to the number of terms in this shell. + * \param parameters the list of parameters associated with each atom (charges, C6 + * coefficients, multipoles, etc...). For a parameter with angular momentum L, a matrix of dimension nAtoms x nL + * is expected, where nL = (L+1)*(L+2)*(L+3)/6 and the fast running index nL has the ordering + * + * 0 X Y Z XX XY YY XZ YZ ZZ XXX XXY XYY YYY XXZ XYZ YYZ XZZ YZZ ZZZ ... + * + * i.e. generated by the python loops + * \code{.py} + * for L in range(maxAM+1): + * for Lz in range(0,L+1): + * for Ly in range(0, L - Lz + 1): + * Lx = L - Ly - Lz + * \endcode + * \param coordinates the cartesian coordinates, ordered in memory as {x1,y1,z1,x2,y2,z2,....xN,yN,zN}. + * \param energy pointer to the variable holding the energy; this is incremented, not assigned. 
+ * \param gridPoints the list of grid points at which the potential is needed; can be the same as the + * coordinates. + * \param derivativeLevel the order of the potential derivatives required; 0 is the potential, 1 is + * (minus) the field, etc. A negative value indicates that only the derivative with order |parameterAngMom| + * is to be generated, e.g. -2 specifies that only the second derivative (not the potential or its gradient) + * will be returned as output. The output matrix should have space for only these terms, accordingly. + * \param potential the array holding the potential. This is a matrix of dimensions + * nAtoms x nD, where nD is the derivative level requested. See the details fo the parameters argument for + * information about ordering of derivative components. N.B. this array is incremented with the potential, not + * assigned, so take care to zero it first if only the current results are desired. + * \param virial a vector of length 6 containing the unique virial elements, in the order XX XY YY XZ YZ ZZ. + * This vector is incremented, not assigned. + */ + + void computePVRec(int parameterAngMom, const RealMat ¶meters, const RealMat &coordinates, + const RealMat &gridPoints, int derivativeLevel, RealMat &potential, RealMat &virial) { + computePRecHelper(parameterAngMom, parameters, coordinates, gridPoints, derivativeLevel, potential, virial); + } + /*! * \brief Runs a PME reciprocal space calculation, computing the potential and, optionally, its derivatives. * \param parameterAngMom the angular momentum of the parameters (0 for charges, C6 coefficients, 2 for @@ -5107,115 +5358,11 @@ class PMEInstance { * information about ordering of derivative components. N.B. this array is incremented with the potential, not * assigned, so take care to zero it first if only the current results are desired. 
*/ - void computePRec(int parameterAngMom, const RealMat &parameters, const RealMat &coordinates, const RealMat &gridPoints, int derivativeLevel, RealMat &potential) { - bool onlyOneShellForInput = parameterAngMom < 0; - bool onlyOneShellForOutput = derivativeLevel < 0; - parameterAngMom = std::abs(parameterAngMom); - derivativeLevel = std::abs(derivativeLevel); - int cartesianOffset = onlyOneShellForInput ? nCartesian(parameterAngMom - 1) : 0; - sanityChecks(parameterAngMom, parameters, coordinates, cartesianOffset); - updateAngMomIterator(std::max(parameterAngMom, derivativeLevel)); - // Note: we're calling the version of spread parameters that computes its own splines here. - // This is quite inefficient, but allow the potential to be computed at arbitrary locations by - // simply regenerating splines on demand in the probing stage. If this becomes too slow, it's - // easy to write some logic to check whether gridPoints and coordinates are the same, and - // handle that special case using spline cacheing machinery for efficiency. - Real *realGrid = reinterpret_cast<Real *>(workSpace1_.data()); - std::fill(workSpace1_.begin(), workSpace1_.end(), 0); - updateAngMomIterator(parameterAngMom); - auto fractionalParameters = - cartesianTransform(parameterAngMom, onlyOneShellForInput, scaledRecVecs_, parameters); - int nComponents = nCartesian(parameterAngMom) - cartesianOffset; - size_t nAtoms = coordinates.nRows(); - for (size_t atom = 0; atom < nAtoms; ++atom) { - // Blindly reconstruct splines for this atom, assuming nothing about the validity of the cache. - // Note that this incurs a somewhat steep cost due to repeated memory allocations. 
- auto bSplines = makeBSplines(coordinates[atom], parameterAngMom); - const auto &splineA = std::get<0>(bSplines); - const auto &splineB = std::get<1>(bSplines); - const auto &splineC = std::get<2>(bSplines); - const auto &aGridIterator = gridIteratorA_[splineA.startingGridPoint()]; - const auto &bGridIterator = gridIteratorB_[splineB.startingGridPoint()]; - const auto &cGridIterator = gridIteratorC_[splineC.startingGridPoint()]; - int numPointsA = static_cast<int>(aGridIterator.size()); - int numPointsB = static_cast<int>(bGridIterator.size()); - int numPointsC = static_cast<int>(cGridIterator.size()); - const auto *iteratorDataA = aGridIterator.data(); - const auto *iteratorDataB = bGridIterator.data(); - const auto *iteratorDataC = cGridIterator.data(); - for (int component = 0; component < nComponents; ++component) { - const auto &quanta = angMomIterator_[component + cartesianOffset]; - Real param = fractionalParameters(atom, component); - const Real *splineValsA = splineA[quanta[0]]; - const Real *splineValsB = splineB[quanta[1]]; - const Real *splineValsC = splineC[quanta[2]]; - for (int pointC = 0; pointC < numPointsC; ++pointC) { - const auto &cPoint = iteratorDataC[pointC]; - Real cValP = param * splineValsC[cPoint.second]; - for (int pointB = 0; pointB < numPointsB; ++pointB) { - const auto &bPoint = iteratorDataB[pointB]; - Real cbValP = cValP * splineValsB[bPoint.second]; - Real *cbRow = &realGrid[cPoint.first * myGridDimensionB_ * myGridDimensionA_ + - bPoint.first * myGridDimensionA_]; - for (int pointA = 0; pointA < numPointsA; ++pointA) { - const auto &aPoint = iteratorDataA[pointA]; - cbRow[aPoint.first] += cbValP * splineValsA[aPoint.second]; - } - } - } - } - } - - Real *potentialGrid; - if (algorithmType_ == AlgorithmType::PME) { - auto gridAddress = forwardTransform(realGrid); - convolveE(gridAddress); - potentialGrid = inverseTransform(gridAddress); - } else if (algorithmType_ == AlgorithmType::CompressedPME) { - auto gridAddress = 
compressedForwardTransform(realGrid); - convolveE(gridAddress); - potentialGrid = compressedInverseTransform(gridAddress); - } else { - std::logic_error("Unknown algorithm in helpme::computePRec"); - } - - auto fracPotential = potential.clone(); - cartesianOffset = onlyOneShellForOutput ? nCartesian(derivativeLevel - 1) : 0; - int nPotentialComponents = nCartesian(derivativeLevel) - cartesianOffset; - size_t nPoints = gridPoints.nRows(); - for (size_t point = 0; point < nPoints; ++point) { - Real *phiPtr = fracPotential[point]; - auto bSplines = makeBSplines(gridPoints[point], derivativeLevel); - auto splineA = std::get<0>(bSplines); - auto splineB = std::get<1>(bSplines); - auto splineC = std::get<2>(bSplines); - const auto &aGridIterator = gridIteratorA_[splineA.startingGridPoint()]; - const auto &bGridIterator = gridIteratorB_[splineB.startingGridPoint()]; - const auto &cGridIterator = gridIteratorC_[splineC.startingGridPoint()]; - const Real *splineStartA = splineA[0]; - const Real *splineStartB = splineB[0]; - const Real *splineStartC = splineC[0]; - for (const auto &cPoint : cGridIterator) { - for (const auto &bPoint : bGridIterator) { - const Real *cbRow = potentialGrid + cPoint.first * myGridDimensionA_ * myGridDimensionB_ + - bPoint.first * myGridDimensionA_; - for (const auto &aPoint : aGridIterator) { - Real gridVal = cbRow[aPoint.first]; - for (int component = 0; component < nPotentialComponents; ++component) { - const auto &quanta = angMomIterator_[component + cartesianOffset]; - const Real *splineValsA = splineStartA + quanta[0] * splineOrder_; - const Real *splineValsB = splineStartB + quanta[1] * splineOrder_; - const Real *splineValsC = splineStartC + quanta[2] * splineOrder_; - phiPtr[component] += gridVal * splineValsA[aPoint.second] * splineValsB[bPoint.second] * - splineValsC[cPoint.second]; - } - } - } - } - } - potential += cartesianTransform(derivativeLevel, onlyOneShellForOutput, scaledRecVecs_, fracPotential); + RealMat emptyMatrix(0, 0); 
+ computePRecHelper(parameterAngMom, parameters, coordinates, gridPoints, derivativeLevel, potential, + emptyMatrix); } /*! @@ -5240,6 +5387,7 @@ class PMEInstance { */ Real computeERec(int parameterAngMom, const RealMat &parameters, const RealMat &coordinates) { sanityChecks(parameterAngMom, parameters, coordinates); + filterAtomsAndBuildSplineCache(parameterAngMom, coordinates); auto realGrid = spreadParameters(parameterAngMom, parameters); Real energy = 0; @@ -5340,7 +5488,7 @@ class PMEInstance { auto gridAddress = forwardTransform(realGrid); energy = convolveEV(gridAddress, virial); auto potentialGrid = inverseTransform(gridAddress); - probeGrid(potentialGrid, parameterAngMom, parameters, forces); + probeGrid(potentialGrid, parameterAngMom, parameters, forces, virial[0]); } else if (algorithmType_ == AlgorithmType::CompressedPME) { auto gridAddress = compressedForwardTransform(realGrid); Real *convolvedGrid; @@ -5354,6 +5502,98 @@ class PMEInstance { return energy; } + /*! + * \brief Runs a PME reciprocal space calculation, computing energies, forces and the virial. + * \param parameterAngMom the angular momentum of the parameters (0 for charges, C6 coefficients, 2 for + * quadrupoles, etc.). + * \param parameters the list of parameters associated with each atom (charges, C6 + * coefficients, multipoles, etc...). For a parameter with angular momentum L, a matrix of dimension nAtoms x nL + * is expected, where nL = (L+1)*(L+2)*(L+3)/6 and the fast running index nL has the ordering + * + * 0 X Y Z XX XY YY XZ YZ ZZ XXX XXY XYY YYY XXZ XYZ YYZ XZZ YZZ ZZZ ... + * + * i.e. generated by the python loops + * \code{.py} + * for L in range(maxAM+1): + * for Lz in range(0,L+1): + * for Ly in range(0, L - Lz + 1): + * Lx = L - Ly - Lz + * \endcode + * \param inducedDipoles the induced dipoles in the order {x1,y1,z1,x2,y2,z2,....xN,yN,zN}. + * \param polarizationType the method used to converge the induced dipoles. 
+ * \param coordinates the cartesian coordinates, ordered in memory as {x1,y1,z1,x2,y2,z2,....xN,yN,zN}. + * \param energy pointer to the variable holding the energy; this is incremented, not assigned. + * \param forces a Nx3 matrix of the forces, ordered in memory as {Fx1,Fy1,Fz1,Fx2,Fy2,Fz2,....FxN,FyN,FzN}. + * This matrix is incremented, not assigned. + * \param virial a vector of length 6 containing the unique virial elements, in the order XX XY YY XZ YZ ZZ. + * This vector is incremented, not assigned. + * \return the reciprocal space energy. + */ + Real computeEFVRecIsotropicInducedDipoles(int parameterAngMom, const RealMat &parameters, + const RealMat &inducedDipoles, PolarizationType polarizationType, + const RealMat &coordinates, RealMat &forces, RealMat &virial) { + sanityChecks(parameterAngMom, parameters, coordinates); + if (parameterAngMom) + throw std::runtime_error("Only point charges are allowed in computeEFVRecIsoPolarized() at the moment."); + if (polarizationType != PolarizationType::Mutual) + throw std::runtime_error("Only mutual (variation) optimized dipoles are supported at the moment."); + + size_t numAtoms = parameters.nRows(); + // Get the potential and field from the permanent moments + RealMat potential(numAtoms, 10); + RealMat combinedMultipoles(numAtoms, 4); + for (int atom = 0; atom < parameters.nRows(); ++atom) { + combinedMultipoles[atom][0] = parameters[atom][0]; + combinedMultipoles[atom][1] = inducedDipoles[atom][0]; + combinedMultipoles[atom][2] = inducedDipoles[atom][1]; + combinedMultipoles[atom][3] = inducedDipoles[atom][2]; + } + + computePVRec(1, combinedMultipoles, coordinates, coordinates, 2, potential, virial); + + double energy = 0; + Real *virialPtr = virial.begin(); + Real &Vxx = virialPtr[0]; + Real &Vxy = virialPtr[1]; + Real &Vyy = virialPtr[2]; + Real &Vxz = virialPtr[3]; + Real &Vyz = virialPtr[4]; + Real &Vzz = virialPtr[5]; + for (int atom = 0; atom < numAtoms; ++atom) { + const Real *dPhi = potential[atom]; + 
double charge = parameters[atom][0]; + Real *force = forces[atom]; + double phi = dPhi[0]; + double phiX = dPhi[1]; + double phiY = dPhi[2]; + double phiZ = dPhi[3]; + double phiXX = dPhi[4]; + double phiXY = dPhi[5]; + double phiYY = dPhi[6]; + double phiXZ = dPhi[7]; + double phiYZ = dPhi[8]; + double phiZZ = dPhi[9]; + const Real *mu = inducedDipoles[atom]; + energy += 0.5 * charge * phi; + + force[0] += phiXX * mu[0] + phiXY * mu[1] + phiXZ * mu[2]; + force[1] += phiXY * mu[0] + phiYY * mu[1] + phiYZ * mu[2]; + force[2] += phiXZ * mu[0] + phiYZ * mu[1] + phiZZ * mu[2]; + + force[0] += charge * phiX; + force[1] += charge * phiY; + force[2] += charge * phiZ; + + Vxx += phiX * mu[0]; + Vxy += 0.5 * (phiX * mu[1] + phiY * mu[0]); + Vyy += phiY * mu[1]; + Vxz += 0.5 * (phiX * mu[2] + phiZ * mu[0]); + Vyz += 0.5 * (phiY * mu[2] + phiZ * mu[1]); + Vzz += phiZ * mu[2]; + } + return energy; + } + /*! * \brief Runs a full (direct and reciprocal space) PME calculation, computing the energy. The direct space * implementation here is not totally optimal, so this routine should primarily be used for testing and @@ -5585,6 +5825,7 @@ using PMEInstanceF = helpme::PMEInstance<float>; typedef enum { Undefined = 0, XAligned = 1, ShapeMatrix = 2 } LatticeType; typedef enum { /* Undefined comes from the above scope */ ZYX = 1 } NodeOrder; +typedef enum { Mutual = 0 } PolarizationType; typedef struct PMEInstance PMEInstance; extern struct PMEInstance *helpme_createD(); From 72d5c952ef427743858233a19196a82c94d65ec8 Mon Sep 17 00:00:00 2001 From: "Daniel R. 
Roe" Date: Wed, 24 Nov 2021 09:40:21 -0500 Subject: [PATCH 25/26] Init uninitialized var --- src/helpme_standalone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index c95d3ae4b6..56edcd5594 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -2875,7 +2875,7 @@ class PMEInstance { } } - Real *potentialGrid; + Real *potentialGrid = 0; if (algorithmType_ == AlgorithmType::PME) { auto gridAddress = forwardTransform(realGrid); if (virial.nRows() == 0 && virial.nCols() == 0) { From 08acc1a69df7da3e8a87d16fd0468e5df22f553c Mon Sep 17 00:00:00 2001 From: "Daniel R. Roe" Date: Wed, 24 Nov 2021 10:47:37 -0500 Subject: [PATCH 26/26] Add linting error fixes --- src/helpme_standalone.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/helpme_standalone.h b/src/helpme_standalone.h index 56edcd5594..dd02231efc 100644 --- a/src/helpme_standalone.h +++ b/src/helpme_standalone.h @@ -211,7 +211,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re Real threshold; Real tan_phi, sin_phi, cos_phi, tan2_phi, sin2_phi, cos2_phi; Real sin_2phi, cot_2phi; - //Real cos_2phi; + // Real cos_2phi; Real dum1; Real dum2; Real dum3; @@ -264,7 +264,7 @@ void JacobiCyclicDiagonalization(Real *eigenvalues, Real *eigenvectors, const Re if (tan_phi < 0) sin_phi = -sin_phi; cos_phi = sqrt(cos2_phi); sin_2phi = 2 * sin_phi * cos_phi; - //cos_2phi = cos2_phi - sin2_phi; + // cos_2phi = cos2_phi - sin2_phi; // Rotate columns k and m for both the matrix A // and the matrix of eigenvectors. 
@@ -1036,7 +1036,8 @@ class Matrix { unsortedEigenVectors.transposeInPlace(); std::vector<std::pair<Real, const Real *>> eigenPairs; - for (size_t val = 0; val < nRows_; ++val) eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); + for (size_t val = 0; val < nRows_; ++val) + eigenPairs.push_back({eigenValues[val][0], unsortedEigenVectors[val]}); std::sort(eigenPairs.begin(), eigenPairs.end()); if (order == SortOrder::Descending) std::reverse(eigenPairs.begin(), eigenPairs.end()); for (size_t val = 0; val < nRows_; ++val) { @@ -2265,7 +2266,7 @@ class BSpline { // The +1 is to account for the fact that we need to store entries up to and including the max. if (derivativeLevel < 0 || order < 0) { - throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); + throw std::runtime_error("BSpline::update: derivativeLevel and/or order < 0."); } if (splines_.nRows() < (size_t)derivativeLevel + 1 || splines_.nCols() != (size_t)order) splines_ = Matrix<Real>(derivativeLevel + 1, order); @@ -3462,8 +3463,7 @@ class PMEInstance { throw std::runtime_error( "Inconsistent number of coordinates and parameters; there should be nAtoms of each."); int n_param_cols = nCartesian(parameterAngMom) - cartesianOffset; - if (n_param_cols < 0 || - parameters.nCols() != (size_t)n_param_cols) + if (n_param_cols < 0 || parameters.nCols() != (size_t)n_param_cols) throw std::runtime_error( "Mismatch in the number of parameters provided and the parameter angular momentum"); } @@ -4024,7 +4024,7 @@ class PMEInstance { gridAtomList_[row].clear(); } auto &mySplineList = splinesPerThread_[threadID]; - //const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused + // const auto &gridIteratorC = threadedGridIteratorC_[threadID]; FIXME currently unused mySplineList.clear(); size_t myNumAtoms = 0; for (size_t atom = 0; atom < nAtoms; ++atom) {