Iteration 0021 — 2599e1f9f4c8 (accepted)¶
GitHub commit: 2599e1f9f4c8 Published branch: fermilink-optimize/lammps-tip4p
Change summary¶
Use contiguous alias-friendly x/f/part2grid views in pair_lj_cut_tip4p_long and TIP4P PPPM particle_map/make_rho/fieldforce_ik without changing force, tally, or cache semantics
Acceptance rationale¶
Correctness passed and candidate 2599e1f9f4c8 reduces weighted_median_pair_plus_kspace_seconds from 78.12 to 76.248, a 2.40% improvement over incumbent ab55ea3e8508 that clears the 2% acceptance bar.
Guardrails & metrics¶
field |
value |
|---|---|
decision |
ACCEPTED |
correctness |
ok |
correctness mode |
field_tolerances |
hard reject |
no |
guardrail errors |
0 |
incumbent commit |
|
candidate commit |
|
incumbent metric |
78.12 |
candidate metric |
76.248 |
baseline metric |
94.717 |
Δ vs incumbent |
+2.396% (lower-is-better sign) |
changed files |
src/KSPACE/pair_lj_cut_tip4p_long.cpp, src/KSPACE/pppm_tip4p.cpp |
Diffstat¶
src/KSPACE/pair_lj_cut_tip4p_long.cpp | 143 ++++++++++++++++++----------------
src/KSPACE/pppm_tip4p.cpp | 128 +++++++++++++++++++-----------
2 files changed, 159 insertions(+), 112 deletions(-)
Diff¶
diff --git a/src/KSPACE/pair_lj_cut_tip4p_long.cpp b/src/KSPACE/pair_lj_cut_tip4p_long.cpp
index a070ca005d..7c4a239833 100644
--- a/src/KSPACE/pair_lj_cut_tip4p_long.cpp
+++ b/src/KSPACE/pair_lj_cut_tip4p_long.cpp
@@ -37,6 +37,10 @@
using namespace LAMMPS_NS;
using namespace EwaldConst;
+using dbl3_t = struct {
+ double x, y, z;
+};
+
/* ---------------------------------------------------------------------- */
PairLJCutTIP4PLong::PairLJCutTIP4PLong(LAMMPS *lmp) :
@@ -119,7 +123,8 @@ void PairLJCutTIP4PLong::compute(int eflag, int vflag)
template <int CTABLE, int EVFLAG, int EFLAG, int VFLAG>
void PairLJCutTIP4PLong::eval()
{
- double **x = atom->x;
+ const auto * _noalias const x = (dbl3_t *) atom->x[0];
+ double **x_d = atom->x;
double *q = atom->q;
tagint *tag = atom->tag;
int *type = atom->type;
@@ -133,16 +138,16 @@ void PairLJCutTIP4PLong::eval()
for (int ii = 0; ii < inum; ii++) {
const int i = ilist[ii];
const double qtmp = q[i];
- const double xtmp = x[i][0];
- const double ytmp = x[i][1];
- const double ztmp = x[i][2];
+ const double xtmp = x[i].x;
+ const double ytmp = x[i].y;
+ const double ztmp = x[i].z;
const int itype = type[i];
int *jlist = firstneigh[i];
const int jnum = numneigh[i];
if (itype == typeO) {
int iH1, iH2;
- double *x1 = tip4p_site(i,iH1,iH2,x,tag,type);
+ double *x1 = tip4p_site(i,iH1,iH2,x_d,tag,type);
const double site_delx = x1[0] - xtmp;
const double site_dely = x1[1] - ytmp;
const double site_delz = x1[2] - ztmp;
@@ -152,10 +157,10 @@ void PairLJCutTIP4PLong::eval()
eval_i<CTABLE,EVFLAG,EFLAG,VFLAG,1,0>(i,itype,qtmp,xtmp,ytmp,ztmp,iH1,iH2,x1,jlist,jnum,
nlocal,cut_coulsq_i_h,cut_coulsqplus);
} else if (itype == typeH) {
- eval_i<CTABLE,EVFLAG,EFLAG,VFLAG,0,1>(i,itype,qtmp,xtmp,ytmp,ztmp,-1,-1,x[i],jlist,jnum,
+ eval_i<CTABLE,EVFLAG,EFLAG,VFLAG,0,1>(i,itype,qtmp,xtmp,ytmp,ztmp,-1,-1,x_d[i],jlist,jnum,
nlocal,cut_coulsq,cut_coulsqplus);
} else {
- eval_i<CTABLE,EVFLAG,EFLAG,VFLAG,0,0>(i,itype,qtmp,xtmp,ytmp,ztmp,-1,-1,x[i],jlist,jnum,
+ eval_i<CTABLE,EVFLAG,EFLAG,VFLAG,0,0>(i,itype,qtmp,xtmp,ytmp,ztmp,-1,-1,x_d[i],jlist,jnum,
nlocal,cut_coulsq,cut_coulsqplus);
}
}
@@ -176,10 +181,12 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
double factor_coul;
double grij, expm2, prefactor, t, erfc;
double fO[3], fH[3], fd[3], v[6];
- double *x2, *xH1, *xH2;
+ double *x2;
+ const dbl3_t *xH1, *xH2;
- double **f = atom->f;
- double **x = atom->x;
+ auto * _noalias const f = (dbl3_t *) atom->f[0];
+ const auto * _noalias const x = (dbl3_t *) atom->x[0];
+ double **x_d = atom->x;
double *q = atom->q;
tagint *tag = atom->tag;
int *type = atom->type;
@@ -210,9 +217,9 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
factor_coul = special_coul[sb];
j &= NEIGHMASK;
- delx = xtmp - x[j][0];
- dely = ytmp - x[j][1];
- delz = ztmp - x[j][2];
+ delx = xtmp - x[j].x;
+ dely = ytmp - x[j].y;
+ delz = ztmp - x[j].z;
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
const bool j_is_oxygen = (jtype == typeO);
@@ -226,12 +233,12 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
forcelj = r6inv * (lj1_i[jtype]*r6inv - lj2_i[jtype]);
forcelj *= factor_lj * r2inv;
- f[i][0] += delx*forcelj;
- f[i][1] += dely*forcelj;
- f[i][2] += delz*forcelj;
- f[j][0] -= delx*forcelj;
- f[j][1] -= dely*forcelj;
- f[j][2] -= delz*forcelj;
+ f[i].x += delx*forcelj;
+ f[i].y += dely*forcelj;
+ f[i].z += delz*forcelj;
+ f[j].x -= delx*forcelj;
+ f[j].y -= dely*forcelj;
+ f[j].z -= delz*forcelj;
if constexpr (EFLAG) {
evdwl = r6inv*(lj3_i[jtype]*r6inv-lj4_i[jtype]) - offset_i[jtype];
@@ -248,15 +255,15 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
if (rsq < cut_coulsq_precheck) {
if constexpr (I_WATER) {
- if (j_is_oxygen) x2 = tip4p_site(j,jH1,jH2,x,tag,type);
- else x2 = x[j];
+ if (j_is_oxygen) x2 = tip4p_site(j,jH1,jH2,x_d,tag,type);
+ else x2 = x_d[j];
delx = x1[0] - x2[0];
dely = x1[1] - x2[1];
delz = x1[2] - x2[2];
rsq = delx*delx + dely*dely + delz*delz;
} else if (j_is_oxygen) {
- x2 = tip4p_site(j,jH1,jH2,x,tag,type);
+ x2 = tip4p_site(j,jH1,jH2,x_d,tag,type);
delx = x1[0] - x2[0];
dely = x1[1] - x2[1];
delz = x1[2] - x2[2];
@@ -299,9 +306,9 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
}
if constexpr (!I_WATER) {
- f[i][0] += delx * cforce;
- f[i][1] += dely * cforce;
- f[i][2] += delz * cforce;
+ f[i].x += delx * cforce;
+ f[i].y += dely * cforce;
+ f[i].z += delz * cforce;
if constexpr (VFLAG) {
v[0] = xtmp * delx * cforce;
@@ -326,27 +333,27 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
fH[1] = half_alpha * fd[1];
fH[2] = half_alpha * fd[2];
- f[i][0] += fO[0];
- f[i][1] += fO[1];
- f[i][2] += fO[2];
+ f[i].x += fO[0];
+ f[i].y += fO[1];
+ f[i].z += fO[2];
- f[iH1][0] += fH[0];
- f[iH1][1] += fH[1];
- f[iH1][2] += fH[2];
+ f[iH1].x += fH[0];
+ f[iH1].y += fH[1];
+ f[iH1].z += fH[2];
- f[iH2][0] += fH[0];
- f[iH2][1] += fH[1];
- f[iH2][2] += fH[2];
+ f[iH2].x += fH[0];
+ f[iH2].y += fH[1];
+ f[iH2].z += fH[2];
if constexpr (VFLAG) {
- xH1 = x[iH1];
- xH2 = x[iH2];
- v[0] = xtmp*fO[0] + xH1[0]*fH[0] + xH2[0]*fH[0];
- v[1] = ytmp*fO[1] + xH1[1]*fH[1] + xH2[1]*fH[1];
- v[2] = ztmp*fO[2] + xH1[2]*fH[2] + xH2[2]*fH[2];
- v[3] = xtmp*fO[1] + xH1[0]*fH[1] + xH2[0]*fH[1];
- v[4] = xtmp*fO[2] + xH1[0]*fH[2] + xH2[0]*fH[2];
- v[5] = ytmp*fO[2] + xH1[1]*fH[2] + xH2[1]*fH[2];
+ xH1 = &x[iH1];
+ xH2 = &x[iH2];
+ v[0] = xtmp*fO[0] + xH1->x*fH[0] + xH2->x*fH[0];
+ v[1] = ytmp*fO[1] + xH1->y*fH[1] + xH2->y*fH[1];
+ v[2] = ztmp*fO[2] + xH1->z*fH[2] + xH2->z*fH[2];
+ v[3] = xtmp*fO[1] + xH1->x*fH[1] + xH2->x*fH[1];
+ v[4] = xtmp*fO[2] + xH1->x*fH[2] + xH2->x*fH[2];
+ v[5] = ytmp*fO[2] + xH1->y*fH[2] + xH2->y*fH[2];
}
if constexpr (EVFLAG) {
vlist[n++] = i;
@@ -356,17 +363,17 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
}
if (!j_is_oxygen) {
- f[j][0] -= delx * cforce;
- f[j][1] -= dely * cforce;
- f[j][2] -= delz * cforce;
+ f[j].x -= delx * cforce;
+ f[j].y -= dely * cforce;
+ f[j].z -= delz * cforce;
if constexpr (VFLAG) {
- v[0] -= x[j][0] * delx * cforce;
- v[1] -= x[j][1] * dely * cforce;
- v[2] -= x[j][2] * delz * cforce;
- v[3] -= x[j][0] * dely * cforce;
- v[4] -= x[j][0] * delz * cforce;
- v[5] -= x[j][1] * delz * cforce;
+ v[0] -= x[j].x * delx * cforce;
+ v[1] -= x[j].y * dely * cforce;
+ v[2] -= x[j].z * delz * cforce;
+ v[3] -= x[j].x * dely * cforce;
+ v[4] -= x[j].x * delz * cforce;
+ v[5] -= x[j].y * delz * cforce;
}
if constexpr (EVFLAG) vlist[n++] = j;
@@ -385,27 +392,27 @@ void PairLJCutTIP4PLong::eval_i(int i, int itype, double qtmp, double xtmp, doub
fH[1] = half_alpha * fd[1];
fH[2] = half_alpha * fd[2];
- f[j][0] += fO[0];
- f[j][1] += fO[1];
- f[j][2] += fO[2];
+ f[j].x += fO[0];
+ f[j].y += fO[1];
+ f[j].z += fO[2];
- f[jH1][0] += fH[0];
- f[jH1][1] += fH[1];
- f[jH1][2] += fH[2];
+ f[jH1].x += fH[0];
+ f[jH1].y += fH[1];
+ f[jH1].z += fH[2];
- f[jH2][0] += fH[0];
- f[jH2][1] += fH[1];
- f[jH2][2] += fH[2];
+ f[jH2].x += fH[0];
+ f[jH2].y += fH[1];
+ f[jH2].z += fH[2];
if constexpr (VFLAG) {
- xH1 = x[jH1];
- xH2 = x[jH2];
- v[0] += x[j][0]*fO[0] + xH1[0]*fH[0] + xH2[0]*fH[0];
- v[1] += x[j][1]*fO[1] + xH1[1]*fH[1] + xH2[1]*fH[1];
- v[2] += x[j][2]*fO[2] + xH1[2]*fH[2] + xH2[2]*fH[2];
- v[3] += x[j][0]*fO[1] + xH1[0]*fH[1] + xH2[0]*fH[1];
- v[4] += x[j][0]*fO[2] + xH1[0]*fH[2] + xH2[0]*fH[2];
- v[5] += x[j][1]*fO[2] + xH1[1]*fH[2] + xH2[1]*fH[2];
+ xH1 = &x[jH1];
+ xH2 = &x[jH2];
+ v[0] += x[j].x*fO[0] + xH1->x*fH[0] + xH2->x*fH[0];
+ v[1] += x[j].y*fO[1] + xH1->y*fH[1] + xH2->y*fH[1];
+ v[2] += x[j].z*fO[2] + xH1->z*fH[2] + xH2->z*fH[2];
+ v[3] += x[j].x*fO[1] + xH1->x*fH[1] + xH2->x*fH[1];
+ v[4] += x[j].x*fO[2] + xH1->x*fH[2] + xH2->x*fH[2];
+ v[5] += x[j].y*fO[2] + xH1->y*fH[2] + xH2->y*fH[2];
}
if constexpr (EVFLAG) {
vlist[n++] = j;
diff --git a/src/KSPACE/pppm_tip4p.cpp b/src/KSPACE/pppm_tip4p.cpp
index dae2a023f0..ad04d3a89c 100644
--- a/src/KSPACE/pppm_tip4p.cpp
+++ b/src/KSPACE/pppm_tip4p.cpp
@@ -34,6 +34,14 @@ using namespace MathConst;
static constexpr FFT_SCALAR ZEROF = 0.0;
static constexpr int OFFSET = 16384;
+using dbl3_t = struct {
+ double x, y, z;
+};
+
+using int3_t = struct {
+ int a, b, t;
+};
+
/* ---------------------------------------------------------------------- */
PPPMTIP4P::PPPMTIP4P(LAMMPS *lmp) : PPPM(lmp)
@@ -113,11 +121,15 @@ void PPPMTIP4P::find_M_cached(int i, int &iH1, int &iH2, double *&xM)
void PPPMTIP4P::particle_map()
{
int nx,ny,nz,iH1,iH2;
- double *xi,*xM;
+ double *xM;
int *type = atom->type;
- double **x = atom->x;
+ const auto * _noalias const x = (dbl3_t *) atom->x[0];
+ auto * _noalias const p2g = (int3_t *) part2grid[0];
int nlocal = atom->nlocal;
+ const double boxlox = boxlo[0];
+ const double boxloy = boxlo[1];
+ const double boxloz = boxlo[2];
if (nlocal > 0) {
grow_tip4p_cache();
@@ -129,22 +141,29 @@ void PPPMTIP4P::particle_map()
int flag = 0;
for (int i = 0; i < nlocal; i++) {
+ double xix, xiy, xiz;
if (type[i] == typeO) {
find_M_cached(i,iH1,iH2,xM);
- xi = xM;
- } else xi = x[i];
+ xix = xM[0];
+ xiy = xM[1];
+ xiz = xM[2];
+ } else {
+ xix = x[i].x;
+ xiy = x[i].y;
+ xiz = x[i].z;
+ }
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// current particle coord can be outside global and local box
// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
- nx = static_cast<int> ((xi[0]-boxlo[0])*delxinv+shift) - OFFSET;
- ny = static_cast<int> ((xi[1]-boxlo[1])*delyinv+shift) - OFFSET;
- nz = static_cast<int> ((xi[2]-boxlo[2])*delzinv+shift) - OFFSET;
+ nx = static_cast<int> ((xix-boxlox)*delxinv+shift) - OFFSET;
+ ny = static_cast<int> ((xiy-boxloy)*delyinv+shift) - OFFSET;
+ nz = static_cast<int> ((xiz-boxloz)*delzinv+shift) - OFFSET;
- part2grid[i][0] = nx;
- part2grid[i][1] = ny;
- part2grid[i][2] = nz;
+ p2g[i].a = nx;
+ p2g[i].b = ny;
+ p2g[i].t = nz;
// check that entire stencil around nx,ny,nz will fit in my 3d brick
@@ -169,11 +188,11 @@ void PPPMTIP4P::make_rho()
{
int l,m,n,nx,ny,nz,iH1,iH2;
FFT_SCALAR dx,dy,dz,x0,y0,z0;
- double *xi,*xM;
+ double *xM;
// clear 3d density array
- FFT_SCALAR *density = &density_brick[nzlo_out][nylo_out][nxlo_out];
+ FFT_SCALAR * _noalias const density = &density_brick[nzlo_out][nylo_out][nxlo_out];
std::memset(density,0,ngrid*sizeof(FFT_SCALAR));
// loop over my charges, add their contribution to nearby grid points
@@ -183,24 +202,35 @@ void PPPMTIP4P::make_rho()
int *type = atom->type;
double *q = atom->q;
- double **x = atom->x;
+ const auto * _noalias const x = (dbl3_t *) atom->x[0];
+ const auto * _noalias const p2g = (int3_t *) part2grid[0];
int nlocal = atom->nlocal;
const int nx_out = nxhi_out - nxlo_out + 1;
const int ny_out = nyhi_out - nylo_out + 1;
const int nyz_out = nx_out * ny_out;
+ const double boxlox = boxlo[0];
+ const double boxloy = boxlo[1];
+ const double boxloz = boxlo[2];
for (int i = 0; i < nlocal; i++) {
+ double xix, xiy, xiz;
if (type[i] == typeO) {
find_M_cached(i,iH1,iH2,xM);
- xi = xM;
- } else xi = x[i];
+ xix = xM[0];
+ xiy = xM[1];
+ xiz = xM[2];
+ } else {
+ xix = x[i].x;
+ xiy = x[i].y;
+ xiz = x[i].z;
+ }
- nx = part2grid[i][0];
- ny = part2grid[i][1];
- nz = part2grid[i][2];
- dx = nx+shiftone - (xi[0]-boxlo[0])*delxinv;
- dy = ny+shiftone - (xi[1]-boxlo[1])*delyinv;
- dz = nz+shiftone - (xi[2]-boxlo[2])*delzinv;
+ nx = p2g[i].a;
+ ny = p2g[i].b;
+ nz = p2g[i].t;
+ dx = nx+shiftone - (xix-boxlox)*delxinv;
+ dy = ny+shiftone - (xiy-boxloy)*delyinv;
+ dz = nz+shiftone - (xiz-boxloz)*delzinv;
compute_rho1d(dx,dy,dz);
@@ -229,7 +259,6 @@ void PPPMTIP4P::fieldforce_ik()
int i,l,m,n,nx,ny,nz,iH1,iH2;
FFT_SCALAR dx,dy,dz,x0,y0,z0;
FFT_SCALAR ekx,eky,ekz;
- double *xi;
double *xM;
double fx,fy,fz;
@@ -239,8 +268,9 @@ void PPPMTIP4P::fieldforce_ik()
// (mx,my,mz) = global coords of moving stencil pt
// ek = 3 components of E-field on particle
double *q = atom->q;
- double **x = atom->x;
- double **f = atom->f;
+ const auto * _noalias const x = (dbl3_t *) atom->x[0];
+ auto * _noalias const f = (dbl3_t *) atom->f[0];
+ const auto * _noalias const p2g = (int3_t *) part2grid[0];
const double half_alpha = 0.5 * alpha;
const double one_minus_alpha = 1.0 - alpha;
const int zforce_active = (slabflag != 2);
@@ -253,19 +283,29 @@ void PPPMTIP4P::fieldforce_ik()
const int nx_out = nxhi_out - nxlo_out + 1;
const int ny_out = nyhi_out - nylo_out + 1;
const int nyz_out = nx_out * ny_out;
+ const double boxlox = boxlo[0];
+ const double boxloy = boxlo[1];
+ const double boxloz = boxlo[2];
for (i = 0; i < nlocal; i++) {
+ double xix, xiy, xiz;
if (type[i] == typeO) {
find_M_cached(i,iH1,iH2,xM);
- xi = xM;
- } else xi = x[i];
+ xix = xM[0];
+ xiy = xM[1];
+ xiz = xM[2];
+ } else {
+ xix = x[i].x;
+ xiy = x[i].y;
+ xiz = x[i].z;
+ }
- nx = part2grid[i][0];
- ny = part2grid[i][1];
- nz = part2grid[i][2];
- dx = nx+shiftone - (xi[0]-boxlo[0])*delxinv;
- dy = ny+shiftone - (xi[1]-boxlo[1])*delyinv;
- dz = nz+shiftone - (xi[2]-boxlo[2])*delzinv;
+ nx = p2g[i].a;
+ ny = p2g[i].b;
+ nz = p2g[i].t;
+ dx = nx+shiftone - (xix-boxlox)*delxinv;
+ dy = ny+shiftone - (xiy-boxloy)*delyinv;
+ dz = nz+shiftone - (xiz-boxloz)*delzinv;
compute_rho1d(dx,dy,dz);
@@ -290,26 +330,26 @@ void PPPMTIP4P::fieldforce_ik()
const double qfactor = qqrd2e * scale * q[i];
if (type[i] != typeO) {
- f[i][0] += qfactor*ekx;
- f[i][1] += qfactor*eky;
- if (zforce_active) f[i][2] += qfactor*ekz;
+ f[i].x += qfactor*ekx;
+ f[i].y += qfactor*eky;
+ if (zforce_active) f[i].z += qfactor*ekz;
} else {
fx = qfactor * ekx;
fy = qfactor * eky;
fz = qfactor * ekz;
- f[i][0] += fx * one_minus_alpha;
- f[i][1] += fy * one_minus_alpha;
- if (zforce_active) f[i][2] += fz * one_minus_alpha;
+ f[i].x += fx * one_minus_alpha;
+ f[i].y += fy * one_minus_alpha;
+ if (zforce_active) f[i].z += fz * one_minus_alpha;
- f[iH1][0] += half_alpha * fx;
- f[iH1][1] += half_alpha * fy;
- if (zforce_active) f[iH1][2] += half_alpha * fz;
+ f[iH1].x += half_alpha * fx;
+ f[iH1].y += half_alpha * fy;
+ if (zforce_active) f[iH1].z += half_alpha * fz;
- f[iH2][0] += half_alpha * fx;
- f[iH2][1] += half_alpha * fy;
- if (zforce_active) f[iH2][2] += half_alpha * fz;
+ f[iH2].x += half_alpha * fx;
+ f[iH2].y += half_alpha * fy;
+ if (zforce_active) f[iH2].z += half_alpha * fz;
}
}
}