Iteration 0010 — cf4ba8e7204d (accepted)¶
GitHub commit: cf4ba8e7204d Published branch: fermilink-optimize/lammps-bond
Change summary¶
Combine the correctness-safe bond_class2 one-type two-way unrolled hot kernel with the correctness-safe angle_harmonic one-type dispatch and constant hoisting on top of incumbent 4a79c588ea08.
Acceptance rationale¶
Correctness passed and weighted_median_bond_seconds improved from 0.27301 to 0.26621 (+2.49% vs incumbent), clearing the 2% promotion threshold.
Guardrails & metrics¶
field |
value |
|---|---|
decision |
ACCEPTED |
correctness |
ok |
correctness mode |
field_tolerances |
hard reject |
no |
guardrail errors |
0 |
incumbent commit |
|
candidate commit |
|
incumbent metric |
0.27301 |
candidate metric |
0.26621 |
baseline metric |
0.32627 |
Δ vs incumbent |
+2.491% (lower-is-better sign) |
changed files |
src/CLASS2/bond_class2.cpp, src/CLASS2/bond_class2.h, src/MOLECULE/angle_harmonic.cpp, src/MOLECULE/angle_harmonic.h |
Diffstat¶
src/CLASS2/bond_class2.cpp | 102 +++++++++++++++++++++++++++++++
src/CLASS2/bond_class2.h | 1 +
src/MOLECULE/angle_harmonic.cpp | 130 +++++++++++++++++++++++++++++++++++++---
src/MOLECULE/angle_harmonic.h | 1 +
4 files changed, 226 insertions(+), 8 deletions(-)
Diff¶
diff --git a/src/CLASS2/bond_class2.cpp b/src/CLASS2/bond_class2.cpp
index 4eb7c0fb39..0984b0a7e1 100644
--- a/src/CLASS2/bond_class2.cpp
+++ b/src/CLASS2/bond_class2.cpp
@@ -72,6 +72,11 @@ void BondClass2::compute(int eflag, int vflag)
if (nbondlist <= 0) return;
+ if (one_type && !evflag && newton_bond) {
+ eval_one_type_hot(nbondlist);
+ return;
+ }
+
if (one_type) {
if (evflag) {
if (eflag) {
@@ -101,6 +106,103 @@ void BondClass2::compute(int eflag, int vflag)
}
}
+void BondClass2::eval_one_type_hot(int nbondlist)
+{
+ const auto *_noalias const x = (dbl3_t *) atom->x[0];
+ auto *_noalias const f = (dbl3_t *) atom->f[0]; // NOLINT
+ const int3_t *_noalias const bondlist = (int3_t *) neighbor->bondlist[0];
+ const double coeff_r0 = r0[1];
+ const double coeff_2k2 = 2.0 * k2[1];
+ const double coeff_3k3 = 3.0 * k3[1];
+ const double coeff_4k4 = 4.0 * k4[1];
+
+ int n = 0;
+
+ // Two-way unrolling gives the compiler more independent exact work around the sqrt/div pair.
+ for (; n + 1 < nbondlist; n += 2) {
+ const int i1_0 = bondlist[n].a;
+ const int i2_0 = bondlist[n].b;
+ const int i1_1 = bondlist[n + 1].a;
+ const int i2_1 = bondlist[n + 1].b;
+
+ const double delx0 = x[i1_0].x - x[i2_0].x;
+ const double dely0 = x[i1_0].y - x[i2_0].y;
+ const double delz0 = x[i1_0].z - x[i2_0].z;
+ const double delx1 = x[i1_1].x - x[i2_1].x;
+ const double dely1 = x[i1_1].y - x[i2_1].y;
+ const double delz1 = x[i1_1].z - x[i2_1].z;
+
+ const double rsq0 = delx0 * delx0 + dely0 * dely0 + delz0 * delz0;
+ const double rsq1 = delx1 * delx1 + dely1 * dely1 + delz1 * delz1;
+ const double r0 = sqrt(rsq0);
+ const double r1 = sqrt(rsq1);
+ const double dr0 = r0 - coeff_r0;
+ const double dr1 = r1 - coeff_r0;
+ const double dr20 = dr0 * dr0;
+ const double dr21 = dr1 * dr1;
+ const double dr30 = dr20 * dr0;
+ const double dr31 = dr21 * dr1;
+
+ const double de_bond0 = coeff_2k2 * dr0 + coeff_3k3 * dr20 + coeff_4k4 * dr30;
+ const double de_bond1 = coeff_2k2 * dr1 + coeff_3k3 * dr21 + coeff_4k4 * dr31;
+ double fbond0, fbond1;
+ if (r0 > 0.0) fbond0 = -de_bond0 / r0;
+ else fbond0 = 0.0;
+ if (r1 > 0.0) fbond1 = -de_bond1 / r1;
+ else fbond1 = 0.0;
+
+ const double fx0 = delx0 * fbond0;
+ const double fy0 = dely0 * fbond0;
+ const double fz0 = delz0 * fbond0;
+ f[i1_0].x += fx0;
+ f[i1_0].y += fy0;
+ f[i1_0].z += fz0;
+ f[i2_0].x -= fx0;
+ f[i2_0].y -= fy0;
+ f[i2_0].z -= fz0;
+
+ const double fx1 = delx1 * fbond1;
+ const double fy1 = dely1 * fbond1;
+ const double fz1 = delz1 * fbond1;
+ f[i1_1].x += fx1;
+ f[i1_1].y += fy1;
+ f[i1_1].z += fz1;
+ f[i2_1].x -= fx1;
+ f[i2_1].y -= fy1;
+ f[i2_1].z -= fz1;
+ }
+
+ for (; n < nbondlist; n++) {
+ const int i1 = bondlist[n].a;
+ const int i2 = bondlist[n].b;
+
+ const double delx = x[i1].x - x[i2].x;
+ const double dely = x[i1].y - x[i2].y;
+ const double delz = x[i1].z - x[i2].z;
+
+ const double rsq = delx * delx + dely * dely + delz * delz;
+ const double r = sqrt(rsq);
+ const double dr = r - coeff_r0;
+ const double dr2 = dr * dr;
+ const double dr3 = dr2 * dr;
+
+ const double de_bond = coeff_2k2 * dr + coeff_3k3 * dr2 + coeff_4k4 * dr3;
+ double fbond;
+ if (r > 0.0) fbond = -de_bond / r;
+ else fbond = 0.0;
+
+ const double fx = delx * fbond;
+ const double fy = dely * fbond;
+ const double fz = delz * fbond;
+ f[i1].x += fx;
+ f[i1].y += fy;
+ f[i1].z += fz;
+ f[i2].x -= fx;
+ f[i2].y -= fy;
+ f[i2].z -= fz;
+ }
+}
+
template <int EVFLAG, int EFLAG, int NEWTON_BOND>
void BondClass2::eval(int nbondlist)
{
diff --git a/src/CLASS2/bond_class2.h b/src/CLASS2/bond_class2.h
index de72ebe03b..d8ae006de7 100644
--- a/src/CLASS2/bond_class2.h
+++ b/src/CLASS2/bond_class2.h
@@ -42,6 +42,7 @@ class BondClass2 : public Bond {
double *r0, *k2, *k3, *k4;
virtual void allocate();
+ void eval_one_type_hot(int);
template <int EVFLAG, int EFLAG, int NEWTON_BOND> void eval(int);
template <int EVFLAG, int EFLAG, int NEWTON_BOND> void eval_one_type(int);
};
diff --git a/src/MOLECULE/angle_harmonic.cpp b/src/MOLECULE/angle_harmonic.cpp
index 49f2849985..1c2eec9828 100644
--- a/src/MOLECULE/angle_harmonic.cpp
+++ b/src/MOLECULE/angle_harmonic.cpp
@@ -67,20 +67,36 @@ void AngleHarmonic::compute(int eflag, int vflag)
int nanglelist = neighbor->nanglelist;
int newton_bond = force->newton_bond;
+ const int one_type = atom->nangletypes == 1;
if (nanglelist <= 0) return;
- if (evflag) {
- if (eflag) {
- if (newton_bond) eval<1,1,1>(nanglelist);
- else eval<1,1,0>(nanglelist);
+ if (one_type) {
+ if (evflag) {
+ if (eflag) {
+ if (newton_bond) eval_one_type<1,1,1>(nanglelist);
+ else eval_one_type<1,1,0>(nanglelist);
+ } else {
+ if (newton_bond) eval_one_type<1,0,1>(nanglelist);
+ else eval_one_type<1,0,0>(nanglelist);
+ }
} else {
- if (newton_bond) eval<1,0,1>(nanglelist);
- else eval<1,0,0>(nanglelist);
+ if (newton_bond) eval_one_type<0,0,1>(nanglelist);
+ else eval_one_type<0,0,0>(nanglelist);
}
} else {
- if (newton_bond) eval<0,0,1>(nanglelist);
- else eval<0,0,0>(nanglelist);
+ if (evflag) {
+ if (eflag) {
+ if (newton_bond) eval<1,1,1>(nanglelist);
+ else eval<1,1,0>(nanglelist);
+ } else {
+ if (newton_bond) eval<1,0,1>(nanglelist);
+ else eval<1,0,0>(nanglelist);
+ }
+ } else {
+ if (newton_bond) eval<0,0,1>(nanglelist);
+ else eval<0,0,0>(nanglelist);
+ }
}
}
@@ -183,6 +199,104 @@ void AngleHarmonic::eval(int nanglelist)
}
}
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleHarmonic::eval_one_type(int nanglelist)
+{
+ int i1, i2, i3;
+ double delx1, dely1, delz1, delx2, dely2, delz2;
+ double eangle, f1[3], f3[3];
+ double dtheta, tk;
+ double rsq1, rsq2, r1, r2, c, s, a, a11, a12, a22;
+
+ const auto *_noalias const x = (dbl3_t *) atom->x[0];
+ auto *_noalias const f = (dbl3_t *) atom->f[0]; // NOLINT
+ const int4_t *_noalias const anglelist = (int4_t *) neighbor->anglelist[0];
+ const int nlocal = atom->nlocal;
+ const double coeff_k = k[1];
+ const double coeff_theta0 = theta0[1];
+
+ eangle = 0.0;
+
+ for (int n = 0; n < nanglelist; n++) {
+ i1 = anglelist[n].a;
+ i2 = anglelist[n].b;
+ i3 = anglelist[n].c;
+
+ // 1st bond
+
+ delx1 = x[i1].x - x[i2].x;
+ dely1 = x[i1].y - x[i2].y;
+ delz1 = x[i1].z - x[i2].z;
+
+ rsq1 = delx1 * delx1 + dely1 * dely1 + delz1 * delz1;
+ r1 = sqrt(rsq1);
+
+ // 2nd bond
+
+ delx2 = x[i3].x - x[i2].x;
+ dely2 = x[i3].y - x[i2].y;
+ delz2 = x[i3].z - x[i2].z;
+
+ rsq2 = delx2 * delx2 + dely2 * dely2 + delz2 * delz2;
+ r2 = sqrt(rsq2);
+
+ // angle (cos and sin)
+
+ c = delx1 * delx2 + dely1 * dely2 + delz1 * delz2;
+ c /= r1 * r2;
+
+ if (c > 1.0) c = 1.0;
+ if (c < -1.0) c = -1.0;
+
+ s = sqrt(1.0 - c * c);
+ if (s < SMALL) s = SMALL;
+ s = 1.0 / s;
+
+ // force & energy
+
+ dtheta = acos(c) - coeff_theta0;
+ tk = coeff_k * dtheta;
+
+ if (EFLAG) eangle = tk * dtheta;
+
+ a = -2.0 * tk * s;
+ a11 = a * c / rsq1;
+ a12 = -a / (r1 * r2);
+ a22 = a * c / rsq2;
+
+ f1[0] = a11 * delx1 + a12 * delx2;
+ f1[1] = a11 * dely1 + a12 * dely2;
+ f1[2] = a11 * delz1 + a12 * delz2;
+ f3[0] = a22 * delx2 + a12 * delx1;
+ f3[1] = a22 * dely2 + a12 * dely1;
+ f3[2] = a22 * delz2 + a12 * delz1;
+
+ // apply force to each of 3 atoms
+
+ if (NEWTON_BOND || i1 < nlocal) {
+ f[i1].x += f1[0];
+ f[i1].y += f1[1];
+ f[i1].z += f1[2];
+ }
+
+ if (NEWTON_BOND || i2 < nlocal) {
+ f[i2].x -= f1[0] + f3[0];
+ f[i2].y -= f1[1] + f3[1];
+ f[i2].z -= f1[2] + f3[2];
+ }
+
+ if (NEWTON_BOND || i3 < nlocal) {
+ f[i3].x += f3[0];
+ f[i3].y += f3[1];
+ f[i3].z += f3[2];
+ }
+
+ if (EVFLAG)
+ ev_tally(i1, i2, i3, nlocal, NEWTON_BOND, eangle, f1, f3, delx1, dely1, delz1, delx2, dely2,
+ delz2);
+ }
+}
+
/* ---------------------------------------------------------------------- */
void AngleHarmonic::allocate()
diff --git a/src/MOLECULE/angle_harmonic.h b/src/MOLECULE/angle_harmonic.h
index 0c58059539..104b1913ae 100644
--- a/src/MOLECULE/angle_harmonic.h
+++ b/src/MOLECULE/angle_harmonic.h
@@ -43,6 +43,7 @@ class AngleHarmonic : public Angle {
virtual void allocate();
template <int EVFLAG, int EFLAG, int NEWTON_BOND> void eval(int);
+ template <int EVFLAG, int EFLAG, int NEWTON_BOND> void eval_one_type(int);
};
} // namespace LAMMPS_NS