Iteration 0025 — bf07f28c4118 (accepted)

GitHub commit: bf07f28c4118 Published branch: fermilink-optimize/lammps-bond

Change summary

Compact the hot angle cache to exact r12 payloads and widen the one-type cached bond_class2 producer to four-way exact unrolling while preserving the incumbent angle_harmonic arithmetic order and fallback guard

Acceptance rationale

Correctness passed and weighted_median_bond_seconds improved from 0.22529 to 0.21875, a 2.90% gain over the incumbent that clears the 2% promotion threshold.

Guardrails & metrics

field

value

decision

ACCEPTED

correctness

ok

correctness mode

field_tolerances

hard reject

no

guardrail errors

0

incumbent commit

30584dbe2ab0

candidate commit

bf07f28c4118

incumbent metric

0.22529

candidate metric

0.21875

baseline metric

0.32627

Δ vs incumbent

+2.903% (lower-is-better sign)

changed files

src/CLASS2/bond_class2.cpp, src/CLASS2/bond_class2.h, src/MOLECULE/angle_harmonic.cpp

Diffstat

src/CLASS2/bond_class2.cpp      | 191 ++++++++++++++++++++++++++++++++--------
src/CLASS2/bond_class2.h        |   4 +-
src/MOLECULE/angle_harmonic.cpp |  21 ++---
3 files changed, 167 insertions(+), 49 deletions(-)

Diff

download full diff

diff --git a/src/CLASS2/bond_class2.cpp b/src/CLASS2/bond_class2.cpp
index 097547c488..4250318473 100644
--- a/src/CLASS2/bond_class2.cpp
+++ b/src/CLASS2/bond_class2.cpp
@@ -44,6 +44,44 @@ using int4_t = struct {
   int a, b, c, t;
 };

+static inline __attribute__((always_inline)) bool fill_hot_angle_cache_entry(
+    BondClass2AngleHotEntry &cache, const int4_t &angle, const int i1_0, const int i2_0,
+    const int i1_1, const int i2_1, const double delx0, const double dely0, const double delz0,
+    const double rsq0, const double r0, const double delx1, const double dely1,
+    const double delz1, const double rsq1, const double r1)
+{
+  if (i1_0 != angle.b || i1_1 != angle.b) return false;
+
+  const double r12 = r0 * r1;
+  if (i2_0 == angle.a && i2_1 == angle.c) {
+    cache.delx1 = -delx0;
+    cache.dely1 = -dely0;
+    cache.delz1 = -delz0;
+    cache.rsq1 = rsq0;
+    cache.delx2 = -delx1;
+    cache.dely2 = -dely1;
+    cache.delz2 = -delz1;
+    cache.rsq2 = rsq1;
+    cache.r12 = r12;
+    return true;
+  }
+
+  if (i2_0 == angle.c && i2_1 == angle.a) {
+    cache.delx1 = -delx1;
+    cache.dely1 = -dely1;
+    cache.delz1 = -delz1;
+    cache.rsq1 = rsq1;
+    cache.delx2 = -delx0;
+    cache.dely2 = -dely0;
+    cache.delz2 = -delz0;
+    cache.rsq2 = rsq0;
+    cache.r12 = r12;
+    return true;
+  }
+
+  return false;
+}
+
 namespace LAMMPS_NS {
 std::vector<BondClass2AngleHotEntry> bond_class2_hot_angle_cache;
 bigint bond_class2_hot_angle_timestep = -1;
@@ -145,7 +183,120 @@ void BondClass2::eval_one_type_hot(int nbondlist)

   int n = 0;

-  // Two-way unrolling gives the compiler more independent exact work around the sqrt/div pair.
+  // Four-way unrolling exposes more independent exact work around the sqrt/div pair.
+  for (; n + 3 < nbondlist; n += 4) {
+    const int i1_0 = bondlist[n].a;
+    const int i2_0 = bondlist[n].b;
+    const int i1_1 = bondlist[n + 1].a;
+    const int i2_1 = bondlist[n + 1].b;
+    const int i1_2 = bondlist[n + 2].a;
+    const int i2_2 = bondlist[n + 2].b;
+    const int i1_3 = bondlist[n + 3].a;
+    const int i2_3 = bondlist[n + 3].b;
+
+    const double delx0 = x[i1_0].x - x[i2_0].x;
+    const double dely0 = x[i1_0].y - x[i2_0].y;
+    const double delz0 = x[i1_0].z - x[i2_0].z;
+    const double delx1 = x[i1_1].x - x[i2_1].x;
+    const double dely1 = x[i1_1].y - x[i2_1].y;
+    const double delz1 = x[i1_1].z - x[i2_1].z;
+    const double delx2 = x[i1_2].x - x[i2_2].x;
+    const double dely2 = x[i1_2].y - x[i2_2].y;
+    const double delz2 = x[i1_2].z - x[i2_2].z;
+    const double delx3 = x[i1_3].x - x[i2_3].x;
+    const double dely3 = x[i1_3].y - x[i2_3].y;
+    const double delz3 = x[i1_3].z - x[i2_3].z;
+
+    const double rsq0 = delx0 * delx0 + dely0 * dely0 + delz0 * delz0;
+    const double rsq1 = delx1 * delx1 + dely1 * dely1 + delz1 * delz1;
+    const double rsq2 = delx2 * delx2 + dely2 * dely2 + delz2 * delz2;
+    const double rsq3 = delx3 * delx3 + dely3 * dely3 + delz3 * delz3;
+    const double r0 = sqrt(rsq0);
+    const double r1 = sqrt(rsq1);
+    const double r2 = sqrt(rsq2);
+    const double r3 = sqrt(rsq3);
+
+    if (angle_cache_valid) {
+      const int angle_index0 = n >> 1;
+      const int angle_index1 = angle_index0 + 1;
+      angle_cache_valid =
+          fill_hot_angle_cache_entry(bond_class2_hot_angle_cache[angle_index0], anglelist[angle_index0],
+                                     i1_0, i2_0, i1_1, i2_1, delx0, dely0, delz0, rsq0, r0, delx1,
+                                     dely1, delz1, rsq1, r1) &&
+          fill_hot_angle_cache_entry(bond_class2_hot_angle_cache[angle_index1], anglelist[angle_index1],
+                                     i1_2, i2_2, i1_3, i2_3, delx2, dely2, delz2, rsq2, r2, delx3,
+                                     dely3, delz3, rsq3, r3);
+    }
+
+    const double dr0 = r0 - coeff_r0;
+    const double dr1 = r1 - coeff_r0;
+    const double dr2 = r2 - coeff_r0;
+    const double dr3 = r3 - coeff_r0;
+    const double dr20 = dr0 * dr0;
+    const double dr21 = dr1 * dr1;
+    const double dr22 = dr2 * dr2;
+    const double dr23 = dr3 * dr3;
+    const double dr30 = dr20 * dr0;
+    const double dr31 = dr21 * dr1;
+    const double dr32 = dr22 * dr2;
+    const double dr33 = dr23 * dr3;
+
+    const double de_bond0 = coeff_2k2 * dr0 + coeff_3k3 * dr20 + coeff_4k4 * dr30;
+    const double de_bond1 = coeff_2k2 * dr1 + coeff_3k3 * dr21 + coeff_4k4 * dr31;
+    const double de_bond2 = coeff_2k2 * dr2 + coeff_3k3 * dr22 + coeff_4k4 * dr32;
+    const double de_bond3 = coeff_2k2 * dr3 + coeff_3k3 * dr23 + coeff_4k4 * dr33;
+    double fbond0, fbond1, fbond2, fbond3;
+    if (r0 > 0.0) fbond0 = -de_bond0 / r0;
+    else fbond0 = 0.0;
+    if (r1 > 0.0) fbond1 = -de_bond1 / r1;
+    else fbond1 = 0.0;
+    if (r2 > 0.0) fbond2 = -de_bond2 / r2;
+    else fbond2 = 0.0;
+    if (r3 > 0.0) fbond3 = -de_bond3 / r3;
+    else fbond3 = 0.0;
+
+    const double fx0 = delx0 * fbond0;
+    const double fy0 = dely0 * fbond0;
+    const double fz0 = delz0 * fbond0;
+    f[i1_0].x += fx0;
+    f[i1_0].y += fy0;
+    f[i1_0].z += fz0;
+    f[i2_0].x -= fx0;
+    f[i2_0].y -= fy0;
+    f[i2_0].z -= fz0;
+
+    const double fx1 = delx1 * fbond1;
+    const double fy1 = dely1 * fbond1;
+    const double fz1 = delz1 * fbond1;
+    f[i1_1].x += fx1;
+    f[i1_1].y += fy1;
+    f[i1_1].z += fz1;
+    f[i2_1].x -= fx1;
+    f[i2_1].y -= fy1;
+    f[i2_1].z -= fz1;
+
+    const double fx2 = delx2 * fbond2;
+    const double fy2 = dely2 * fbond2;
+    const double fz2 = delz2 * fbond2;
+    f[i1_2].x += fx2;
+    f[i1_2].y += fy2;
+    f[i1_2].z += fz2;
+    f[i2_2].x -= fx2;
+    f[i2_2].y -= fy2;
+    f[i2_2].z -= fz2;
+
+    const double fx3 = delx3 * fbond3;
+    const double fy3 = dely3 * fbond3;
+    const double fz3 = delz3 * fbond3;
+    f[i1_3].x += fx3;
+    f[i1_3].y += fy3;
+    f[i1_3].z += fz3;
+    f[i2_3].x -= fx3;
+    f[i2_3].y -= fy3;
+    f[i2_3].z -= fz3;
+  }
+
+  // Two-way unrolling still handles the final aligned bond pair exactly.
   for (; n + 1 < nbondlist; n += 2) {
     const int i1_0 = bondlist[n].a;
     const int i2_0 = bondlist[n].b;
@@ -166,40 +317,10 @@ void BondClass2::eval_one_type_hot(int nbondlist)

     if (angle_cache_valid) {
       const int angle_index = n >> 1;
-      const int angle_i1 = anglelist[angle_index].a;
-      const int angle_i2 = anglelist[angle_index].b;
-      const int angle_i3 = anglelist[angle_index].c;
-
-      if (i1_0 == angle_i2 && i1_1 == angle_i2) {
-        auto &cache = bond_class2_hot_angle_cache[angle_index];
-        if (i2_0 == angle_i1 && i2_1 == angle_i3) {
-          cache.delx1 = -delx0;
-          cache.dely1 = -dely0;
-          cache.delz1 = -delz0;
-          cache.rsq1 = rsq0;
-          cache.r1 = r0;
-          cache.delx2 = -delx1;
-          cache.dely2 = -dely1;
-          cache.delz2 = -delz1;
-          cache.rsq2 = rsq1;
-          cache.r2 = r1;
-        } else if (i2_0 == angle_i3 && i2_1 == angle_i1) {
-          cache.delx1 = -delx1;
-          cache.dely1 = -dely1;
-          cache.delz1 = -delz1;
-          cache.rsq1 = rsq1;
-          cache.r1 = r1;
-          cache.delx2 = -delx0;
-          cache.dely2 = -dely0;
-          cache.delz2 = -delz0;
-          cache.rsq2 = rsq0;
-          cache.r2 = r0;
-        } else {
-          angle_cache_valid = false;
-        }
-      } else {
-        angle_cache_valid = false;
-      }
+      angle_cache_valid =
+          fill_hot_angle_cache_entry(bond_class2_hot_angle_cache[angle_index], anglelist[angle_index],
+                                     i1_0, i2_0, i1_1, i2_1, delx0, dely0, delz0, rsq0, r0, delx1,
+                                     dely1, delz1, rsq1, r1);
     }

     const double dr0 = r0 - coeff_r0;
diff --git a/src/CLASS2/bond_class2.h b/src/CLASS2/bond_class2.h
index 6bcddc6e9e..3540a00c45 100644
--- a/src/CLASS2/bond_class2.h
+++ b/src/CLASS2/bond_class2.h
@@ -27,8 +27,8 @@ BondStyle(class2,BondClass2);
 namespace LAMMPS_NS {

 struct BondClass2AngleHotEntry {
-  double delx1, dely1, delz1, rsq1, r1;
-  double delx2, dely2, delz2, rsq2, r2;
+  double delx1, dely1, delz1, rsq1;
+  double delx2, dely2, delz2, rsq2, r12;
 };

 extern std::vector<BondClass2AngleHotEntry> bond_class2_hot_angle_cache;
diff --git a/src/MOLECULE/angle_harmonic.cpp b/src/MOLECULE/angle_harmonic.cpp
index 0e42dc22eb..2c82c27a3a 100644
--- a/src/MOLECULE/angle_harmonic.cpp
+++ b/src/MOLECULE/angle_harmonic.cpp
@@ -137,11 +137,10 @@ void AngleHarmonic::eval_one_type_hot(int nanglelist)
       const double delz2_0 = cache0.delz2;
       const double rsq1_0 = cache0.rsq1;
       const double rsq2_0 = cache0.rsq2;
-      const double r1_0 = cache0.r1;
-      const double r2_0 = cache0.r2;
+      const double r12_0 = cache0.r12;

       double c0 = delx1_0 * delx2_0 + dely1_0 * dely2_0 + delz1_0 * delz2_0;
-      c0 /= r1_0 * r2_0;
+      c0 /= r12_0;
       if (c0 > 1.0) c0 = 1.0;
       if (c0 < -1.0) c0 = -1.0;
       double s0 = sqrt(1.0 - c0 * c0);
@@ -151,7 +150,7 @@ void AngleHarmonic::eval_one_type_hot(int nanglelist)
       const double tk0 = coeff_k * dtheta0;
       const double a0 = -2.0 * tk0 * s0;
       const double a11_0 = a0 * c0 / rsq1_0;
-      const double a12_0 = -a0 / (r1_0 * r2_0);
+      const double a12_0 = -a0 / r12_0;
       const double a22_0 = a0 * c0 / rsq2_0;
       const double f1x0 = a11_0 * delx1_0 + a12_0 * delx2_0;
       const double f1y0 = a11_0 * dely1_0 + a12_0 * dely2_0;
@@ -177,11 +176,10 @@ void AngleHarmonic::eval_one_type_hot(int nanglelist)
       const double delz2_1 = cache1.delz2;
       const double rsq1_1 = cache1.rsq1;
       const double rsq2_1 = cache1.rsq2;
-      const double r1_1 = cache1.r1;
-      const double r2_1 = cache1.r2;
+      const double r12_1 = cache1.r12;

       double c1 = delx1_1 * delx2_1 + dely1_1 * dely2_1 + delz1_1 * delz2_1;
-      c1 /= r1_1 * r2_1;
+      c1 /= r12_1;
       if (c1 > 1.0) c1 = 1.0;
       if (c1 < -1.0) c1 = -1.0;
       double s1 = sqrt(1.0 - c1 * c1);
@@ -191,7 +189,7 @@ void AngleHarmonic::eval_one_type_hot(int nanglelist)
       const double tk1 = coeff_k * dtheta1;
       const double a1 = -2.0 * tk1 * s1;
       const double a11_1 = a1 * c1 / rsq1_1;
-      const double a12_1 = -a1 / (r1_1 * r2_1);
+      const double a12_1 = -a1 / r12_1;
       const double a22_1 = a1 * c1 / rsq2_1;
       const double f1x1 = a11_1 * delx1_1 + a12_1 * delx2_1;
       const double f1y1 = a11_1 * dely1_1 + a12_1 * dely2_1;
@@ -224,11 +222,10 @@ void AngleHarmonic::eval_one_type_hot(int nanglelist)
       const double delz2 = cache.delz2;
       const double rsq1 = cache.rsq1;
       const double rsq2 = cache.rsq2;
-      const double r1 = cache.r1;
-      const double r2 = cache.r2;
+      const double r12 = cache.r12;

       double c = delx1 * delx2 + dely1 * dely2 + delz1 * delz2;
-      c /= r1 * r2;
+      c /= r12;
       if (c > 1.0) c = 1.0;
       if (c < -1.0) c = -1.0;

@@ -240,7 +237,7 @@ void AngleHarmonic::eval_one_type_hot(int nanglelist)
       const double tk = coeff_k * dtheta;
       const double a = -2.0 * tk * s;
       const double a11 = a * c / rsq1;
-      const double a12 = -a / (r1 * r2);
+      const double a12 = -a / r12;
       const double a22 = a * c / rsq2;

       const double f1x = a11 * delx1 + a12 * delx2;