diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/NumericUtilsBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/NumericUtilsBenchmark.java
new file mode 100644
index 000000000000..d82834cebed6
--- /dev/null
+++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/NumericUtilsBenchmark.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import org.apache.lucene.util.NumericUtils;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Benchmark)
+// first iteration is complete garbage, so make sure we really warmup
+@Warmup(iterations = 4, time = 1)
+// real iterations. not useful to spend tons of time here, better to fork more
+@Measurement(iterations = 5, time = 1)
+// engage some noise reduction
+@Fork(
+    value = 3,
+    jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"})
+public class NumericUtilsBenchmark {
+  @Param({"1", "128", "207", "256", "300", "512", "702", "1024"})
+  int size;
+
+  private byte[] subA;
+  private byte[] subB;
+  private byte[] subResult;
+  private byte[] subExpected;
+
+  private byte[] addA;
+  private byte[] addB;
+  private byte[] addResult;
+  private byte[] addExpected;
+
+  @Setup(Level.Iteration)
+  public void subInit() {
+    ThreadLocalRandom random = ThreadLocalRandom.current();
+
+    subA = new byte[size];
+    subB = new byte[size];
+    subResult = new byte[size];
+    subExpected = new byte[size];
+
+    random.nextBytes(subA);
+    random.nextBytes(subB);
+
+    // Treat as unsigned integers
+    BigInteger aBig = new BigInteger(1, subA);
+    BigInteger bBig = new BigInteger(1, subB);
+
+    // Swap a <-> b if a < b
+    if (aBig.compareTo(bBig) < 0) {
+      byte[] temp = subA;
+      subA = subB;
+      subB = temp;
+
+      BigInteger tempBig = aBig;
+      aBig = bBig;
+      bBig = tempBig;
+    }
+
+    byte[] temp = aBig.subtract(bBig).toByteArray();
+    if (temp.length == size + 1) { // BigInteger pads with extra 0 if MSB is 1
+      assert temp[0] == 0;
+      System.arraycopy(temp, 1, subExpected, 0, size);
+    } else {
+      System.arraycopy(temp, 0, subExpected, size - temp.length, temp.length);
+    }
+  }
+
+  @Setup(Level.Iteration)
+  public void addInit() {
+    ThreadLocalRandom random = ThreadLocalRandom.current();
+
+    addA = new byte[size];
+    addB = new byte[size];
+    addResult = new byte[size];
+    addExpected = new byte[size];
+
+    random.nextBytes(addA);
+    random.nextBytes(addB);
+
+    // Treat as unsigned integers
+    BigInteger aBig = new BigInteger(1, addA);
+    BigInteger bBig = new BigInteger(1, addB);
+
+    byte[] temp = aBig.add(bBig).toByteArray();
+    if (temp.length == size + 1) { // BigInteger pads with extra 0 if MSB is 1
+      if (temp[0] != 0) { // overflow
+        addInit(); // re-init
+        return;
+      }
+      System.arraycopy(temp, 1, addExpected, 0, size);
+    } else {
+      System.arraycopy(temp, 0, addExpected, size - temp.length, temp.length);
+    }
+  }
+
+  @Benchmark
+  public void subtract() {
+    NumericUtils.subtract(size, 0, subA, subB, subResult);
+    assert Arrays.equals(subExpected, subResult);
+  }
+
+  @Benchmark
+  public void add() {
+    NumericUtils.add(size, 0, addA, addB, addResult);
+    assert Arrays.equals(addExpected, addResult);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/util/NumericUtils.java b/lucene/core/src/java/org/apache/lucene/util/NumericUtils.java
index 1249b8324e0a..6fd711a4ba43 100644
--- a/lucene/core/src/java/org/apache/lucene/util/NumericUtils.java
+++ b/lucene/core/src/java/org/apache/lucene/util/NumericUtils.java
@@ -94,17 +94,35 @@ public static int sortableFloatBits(int bits) {
   public static void subtract(int bytesPerDim, int dim, byte[] a, byte[] b, byte[] result) {
     int start = dim * bytesPerDim;
     int end = start + bytesPerDim;
+
     int borrow = 0;
-    for (int i = end - 1; i >= start; i--) {
-      int diff = (a[i] & 0xff) - (b[i] & 0xff) - borrow;
+    int i;
+
+    int limit = start + (bytesPerDim & ~3);
+    for (i = end - 1; i >= limit; i--) {
+      int diff = Byte.toUnsignedInt(a[i]) - Byte.toUnsignedInt(b[i]) - borrow;
       if (diff < 0) {
-        diff += 256;
         borrow = 1;
       } else {
         borrow = 0;
       }
       result[i - start] = (byte) diff;
     }
+
+    for (i -= 3; i >= start; i -= 4) {
+      int aInt = (int) BitUtil.VH_BE_INT.get(a, i);
+      int bInt = (int) BitUtil.VH_BE_INT.get(b, i);
+
+      long diff = Integer.toUnsignedLong(aInt) - Integer.toUnsignedLong(bInt) - borrow;
+      if (diff < 0) {
+        borrow = 1;
+      } else {
+        borrow = 0;
+      }
+
+      BitUtil.VH_BE_INT.set(result, i - start, (int) diff);
+    }
+
     if (borrow != 0) {
       throw new IllegalArgumentException("a < b");
     }
@@ -117,17 +135,35 @@ public static void subtract(int bytesPerDim, int dim, byte[] a, byte[] b, byte[]
   public static void add(int bytesPerDim, int dim, byte[] a, byte[] b, byte[] result) {
     int start = dim * bytesPerDim;
     int end = start + bytesPerDim;
+
     int carry = 0;
-    for (int i = end - 1; i >= start; i--) {
-      int digitSum = (a[i] & 0xff) + (b[i] & 0xff) + carry;
-      if (digitSum > 255) {
-        digitSum -= 256;
+    int i;
+
+    int limit = start + (bytesPerDim & ~3);
+    for (i = end - 1; i >= limit; i--) {
+      int digitSum = Byte.toUnsignedInt(a[i]) + Byte.toUnsignedInt(b[i]) + carry;
+      if (digitSum >= 256) {
         carry = 1;
       } else {
         carry = 0;
       }
       result[i - start] = (byte) digitSum;
     }
+
+    for (i -= 3; i >= start; i -= 4) {
+      int aInt = (int) BitUtil.VH_BE_INT.get(a, i);
+      int bInt = (int) BitUtil.VH_BE_INT.get(b, i);
+
+      long digitSum = Integer.toUnsignedLong(aInt) + Integer.toUnsignedLong(bInt) + carry;
+      if (digitSum >= 0x100000000L) {
+        carry = 1;
+      } else {
+        carry = 0;
+      }
+
+      BitUtil.VH_BE_INT.set(result, i - start, (int) digitSum);
+    }
+
     if (carry != 0) {
       throw new IllegalArgumentException("a + b overflows bytesPerDim=" + bytesPerDim);
     }