rename "offset" to "counter" and always increment it by 1

This is simpler than sometimes incrementing by CHUNK_LEN and other times incrementing by BLOCK_LEN.
author: Jack O'Connor <[email protected]> 2019-12-12 18:21:17 -0500
committer: Jack O'Connor <[email protected]> 2019-12-12 21:41:30 -0500
commit: b5f1e925f79589ade494289d730c1fb5c25a09b1 (patch)
tree: a8608132b339b9f2c55bec46bf18d297cd0216b6 /src
parent: a5cc3b28679d724020b8810ea9aecea9f54a058f (diff)
12 files changed, 376 insertions, 363 deletions
diff --git a/src/avx2.rs b/src/avx2.rs
index 471a2dc..3424a83 100644
--- a/src/avx2.rs
+++ b/src/avx2.rs
@@ -3,7 +3,9 @@ use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
-use crate::{offset_high, offset_low, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN};
+use crate::{
+    counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+};
 use arrayref::{array_mut_ref, mut_array_refs};
 
 pub const DEGREE: usize = 8;
@@ -270,27 +272,28 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize)
 }
 
 #[inline(always)]
-unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m256i, __m256i) {
+unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) {
+    let mask = if increment_counter.yes() { !0 } else { 0 };
     (
         set8(
-            offset_low(offset + offset_deltas[0]),
-            offset_low(offset + offset_deltas[1]),
-            offset_low(offset + offset_deltas[2]),
-            offset_low(offset + offset_deltas[3]),
-            offset_low(offset + offset_deltas[4]),
-            offset_low(offset + offset_deltas[5]),
-            offset_low(offset + offset_deltas[6]),
-            offset_low(offset + offset_deltas[7]),
+            counter_low(counter + (mask & 0)),
+            counter_low(counter + (mask & 1)),
+            counter_low(counter + (mask & 2)),
+            counter_low(counter + (mask & 3)),
+            counter_low(counter + (mask & 4)),
+            counter_low(counter + (mask & 5)),
+            counter_low(counter + (mask & 6)),
+            counter_low(counter + (mask & 7)),
         ),
         set8(
-            offset_high(offset + offset_deltas[0]),
-            offset_high(offset + offset_deltas[1]),
-            offset_high(offset + offset_deltas[2]),
-            offset_high(offset + offset_deltas[3]),
-            offset_high(offset + offset_deltas[4]),
-            offset_high(offset + offset_deltas[5]),
-            offset_high(offset + offset_deltas[6]),
-            offset_high(offset + offset_deltas[7]),
+            counter_high(counter + (mask & 0)),
+            counter_high(counter + (mask & 1)),
+            counter_high(counter + (mask & 2)),
+            counter_high(counter + (mask & 3)),
+            counter_high(counter + (mask & 4)),
+            counter_high(counter + (mask & 5)),
+            counter_high(counter + (mask & 6)),
+            counter_high(counter + (mask & 7)),
         ),
     )
 }
@@ -300,8 +303,8 @@ pub unsafe fn hash8(
     inputs: &[*const u8; DEGREE],
     blocks: usize,
     key: &CVWords,
-    offset: u64,
-    offset_deltas: &OffsetDeltas,
+    counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -317,7 +320,7 @@ pub unsafe fn hash8(
         set1(key[6]),
         set1(key[7]),
     ];
-    let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
+    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
     let mut block_flags = flags | flags_start;
 
     for block in 0..blocks {
@@ -345,8 +348,8 @@ pub unsafe fn hash8(
             set1(IV[1]),
             set1(IV[2]),
             set1(IV[3]),
-            offset_low_vec,
-            offset_high_vec,
+            counter_low_vec,
+            counter_high_vec,
             block_len_vec,
             block_flags_vec,
         ];
@@ -384,8 +387,8 @@ pub unsafe fn hash8(
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
     mut inputs: &[&A],
     key: &CVWords,
-    mut offset: u64,
-    offset_deltas: &OffsetDeltas,
+    mut counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -401,22 +404,24 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
             input_ptrs,
             blocks,
             key,
-            offset,
-            offset_deltas,
+            counter,
+            increment_counter,
             flags,
             flags_start,
             flags_end,
             array_mut_ref!(out, 0, DEGREE * OUT_LEN),
         );
+        if increment_counter.yes() {
+            counter += DEGREE as u64;
+        }
         inputs = &inputs[DEGREE..];
-        offset += DEGREE as u64 * offset_deltas[1];
         out = &mut out[DEGREE * OUT_LEN..];
     }
     crate::sse41::hash_many(
         inputs,
         key,
-        offset,
-        offset_deltas,
+        counter,
+        increment_counter,
         flags,
         flags_start,
         flags_end,
diff --git a/src/c/blake3.h b/src/c/blake3.h
index 4d4a409..4c5624c 100644
--- a/src/c/blake3.h
+++ b/src/c/blake3.h
@@ -11,10 +11,10 @@
 
 typedef struct {
   uint32_t cv[8];
-  uint64_t offset;
-  uint16_t count;
+  uint64_t chunk_counter;
   uint8_t buf[BLAKE3_BLOCK_LEN];
   uint8_t buf_len;
+  uint8_t blocks_compressed;
   uint8_t flags;
 } blake3_chunk_state;
 
diff --git a/src/c/blake3_avx512.c b/src/c/blake3_avx512.c
index f30e302..2c8657c 100644
--- a/src/c/blake3_avx512.c
+++ b/src/c/blake3_avx512.c
@@ -111,12 +111,12 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
 
 INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                          const uint8_t block[BLAKE3_BLOCK_LEN],
-                         uint8_t block_len, uint64_t offset, uint8_t flags) {
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
   rows[0] = loadu_128((uint8_t *)&cv[0]);
   rows[1] = loadu_128((uint8_t *)&cv[4]);
   rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
-  rows[3] = set4(offset_low(offset), offset_high(offset), (uint32_t)block_len,
-                 (uint32_t)flags);
+  rows[3] = set4(counter_low(counter), counter_high(counter),
+                 (uint32_t)block_len, (uint32_t)flags);
 
   __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
   __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
@@ -281,10 +281,10 @@ INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
 
 void blake3_compress_xof_avx512(const uint32_t cv[8],
                                 const uint8_t block[BLAKE3_BLOCK_LEN],
-                                uint8_t block_len, uint64_t offset,
+                                uint8_t block_len, uint64_t counter,
                                 uint8_t flags, uint8_t out[64]) {
   __m128i rows[4];
-  compress_pre(rows, cv, block, block_len, offset, flags);
+  compress_pre(rows, cv, block, block_len, counter, flags);
   storeu_128(xor_128(rows[0], rows[2]), &out[0]);
   storeu_128(xor_128(rows[1], rows[3]), &out[16]);
   storeu_128(xor_128(rows[2], loadu_128((uint8_t *)&cv[0])), &out[32]);
@@ -293,10 +293,10 @@ void blake3_compress_xof_avx512(const uint32_t cv[8],
 
 void blake3_compress_in_place_avx512(uint32_t cv[8],
                                      const uint8_t block[BLAKE3_BLOCK_LEN],
-                                     uint8_t block_len, uint64_t offset,
+                                     uint8_t block_len, uint64_t counter,
                                      uint8_t flags) {
   __m128i rows[4];
-  compress_pre(rows, cv, block, block_len, offset, flags);
+  compress_pre(rows, cv, block, block_len, counter, flags);
   storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
   storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
 }
@@ -468,24 +468,29 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
   transpose_vecs_128(&out[12]);
 }
 
-INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4],
-                          __m128i *out_lo, __m128i *out_hi) {
-  __m256i a = _mm256_add_epi64(_mm256_set1_epi64x((int64_t)offset),
-                               _mm256_loadu_si256((const __m256i *)deltas));
-  *out_lo = _mm256_cvtepi64_epi32(a);
-  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(a, 32));
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           __m128i *out_lo, __m128i *out_hi) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  __m256i mask_vec = _mm256_set1_epi64x(mask);
+  __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
+  deltas = _mm256_and_si256(mask_vec, deltas);
+  __m256i counters =
+      _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
+  *out_lo = _mm256_cvtepi64_epi32(counters);
+  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
 }
 
 void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
-                         const uint32_t key[8], uint64_t offset,
-                         offset_deltas_t offset_deltas, uint8_t flags,
+                         const uint32_t key[8], uint64_t counter,
+                         bool increment_counter, uint8_t flags,
                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
   __m128i h_vecs[8] = {
       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
   };
-  __m128i offset_low_vec, offset_high_vec;
-  load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+  __m128i counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
   uint8_t block_flags = flags | flags_start;
 
   for (size_t block = 0; block < blocks; block++) {
@@ -498,10 +503,10 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
     transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
 
     __m128i v[16] = {
-        h_vecs[0],       h_vecs[1],       h_vecs[2],       h_vecs[3],
-        h_vecs[4],       h_vecs[5],       h_vecs[6],       h_vecs[7],
-        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
-        offset_low_vec,  offset_high_vec, block_len_vec,   block_flags_vec,
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     };
     round_fn4(v, msg_vecs, 0);
     round_fn4(v, msg_vecs, 1);
@@ -714,24 +719,29 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
   transpose_vecs_256(&out[8]);
 }
 
-INLINE void load_offsets8(uint64_t offset, const uint64_t deltas[8],
-                          __m256i *out_lo, __m256i *out_hi) {
-  __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset),
-                               _mm512_loadu_si512((const __m512i *)deltas));
-  *out_lo = _mm512_cvtepi64_epi32(a);
-  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(a, 32));
+INLINE void load_counters8(uint64_t counter, bool increment_counter,
+                           __m256i *out_lo, __m256i *out_hi) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  __m512i mask_vec = _mm512_set1_epi64(mask);
+  __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  deltas = _mm512_and_si512(mask_vec, deltas);
+  __m512i counters =
+      _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
+  *out_lo = _mm512_cvtepi64_epi32(counters);
+  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
 }
 
 void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
-                         const uint32_t key[8], uint64_t offset,
-                         offset_deltas_t offset_deltas, uint8_t flags,
+                         const uint32_t key[8], uint64_t counter,
+                         bool increment_counter, uint8_t flags,
                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
   __m256i h_vecs[8] = {
       set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
       set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
   };
-  __m256i offset_low_vec, offset_high_vec;
-  load_offsets8(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters8(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
   uint8_t block_flags = flags | flags_start;
 
   for (size_t block = 0; block < blocks; block++) {
@@ -744,10 +754,10 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
     transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
 
     __m256i v[16] = {
-        h_vecs[0],       h_vecs[1],       h_vecs[2],       h_vecs[3],
-        h_vecs[4],       h_vecs[5],       h_vecs[6],       h_vecs[7],
-        set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
-        offset_low_vec,  offset_high_vec, block_len_vec,   block_flags_vec,
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     };
     round_fn8(v, msg_vecs, 0);
     round_fn8(v, msg_vecs, 1);
@@ -1018,12 +1028,16 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
   transpose_vecs_512(out);
 }
 
-INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16],
-                           __m512i *out_lo, __m512i *out_hi) {
-  __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset),
-                               _mm512_loadu_si512((const __m512i *)&deltas[0]));
-  __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)offset),
-                               _mm512_loadu_si512((const __m512i *)&deltas[8]));
+INLINE void load_counters16(uint64_t counter, bool increment_counter,
+                            __m512i *out_lo, __m512i *out_hi) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  __m512i mask_vec = _mm512_set1_epi64(mask);
+  __m512i deltas_a = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
+  deltas_a = _mm512_and_si512(mask_vec, deltas_a);
+  __m512i deltas_b = _mm512_setr_epi64(8, 9, 10, 11, 12, 13, 14, 15);
+  deltas_b = _mm512_and_si512(mask_vec, deltas_b);
+  __m512i a = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_a);
+  __m512i b = _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas_b);
   __m512i lo_indexes = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
                                          22, 24, 26, 28, 30);
   __m512i hi_indexes = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
@@ -1033,16 +1047,17 @@ INLINE void load_offsets16(uint64_t offset, const uint64_t deltas[16],
 }
 
 void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
-                          const uint32_t key[8], uint64_t offset,
-                          offset_deltas_t offset_deltas, uint8_t flags,
+                          const uint32_t key[8], uint64_t counter,
+                          bool increment_counter, uint8_t flags,
                           uint8_t flags_start, uint8_t flags_end,
                           uint8_t *out) {
   __m512i h_vecs[8] = {
       set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
       set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
   };
-  __m512i offset_low_vec, offset_high_vec;
-  load_offsets16(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+  __m512i counter_low_vec, counter_high_vec;
+  load_counters16(counter, increment_counter, &counter_low_vec,
+                  &counter_high_vec);
   uint8_t block_flags = flags | flags_start;
 
   for (size_t block = 0; block < blocks; block++) {
@@ -1055,10 +1070,10 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
     transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
 
     __m512i v[16] = {
-        h_vecs[0],       h_vecs[1],       h_vecs[2],       h_vecs[3],
-        h_vecs[4],       h_vecs[5],       h_vecs[6],       h_vecs[7],
-        set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
-        offset_low_vec,  offset_high_vec, block_len_vec,   block_flags_vec,
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     };
     round_fn16(v, msg_vecs, 0);
     round_fn16(v, msg_vecs, 1);
@@ -1114,7 +1129,7 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
  */
 
 INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
-                            const uint32_t key[8], uint64_t offset,
+                            const uint32_t key[8], uint64_t counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
   uint32_t cv[8];
@@ -1124,7 +1139,7 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
     if (blocks == 1) {
       block_flags |= flags_end;
     }
-    blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, offset,
+    blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
                                     block_flags);
     input = &input[BLAKE3_BLOCK_LEN];
     blocks -= 1;
@@ -1135,39 +1150,47 @@ INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
 
 void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                              size_t blocks, const uint32_t key[8],
-                             uint64_t offset, offset_deltas_t offset_deltas,
+                             uint64_t counter, bool increment_counter,
                              uint8_t flags, uint8_t flags_start,
                              uint8_t flags_end, uint8_t *out) {
   while (num_inputs >= 16) {
-    blake3_hash16_avx512(inputs, blocks, key, offset, offset_deltas, flags,
+    blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
                          flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 16;
+    }
     inputs += 16;
     num_inputs -= 16;
-    offset += offset_deltas[16];
     out = &out[16 * BLAKE3_OUT_LEN];
   }
   while (num_inputs >= 8) {
-    blake3_hash8_avx512(inputs, blocks, key, offset, offset_deltas, flags,
+    blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
                         flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 8;
+    }
     inputs += 8;
     num_inputs -= 8;
-    offset += offset_deltas[8];
     out = &out[8 * BLAKE3_OUT_LEN];
   }
   while (num_inputs >= 4) {
-    blake3_hash4_avx512(inputs, blocks, key, offset, offset_deltas, flags,
+    blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
                         flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
     inputs += 4;
     num_inputs -= 4;
-    offset += offset_deltas[4];
     out = &out[4 * BLAKE3_OUT_LEN];
   }
   while (num_inputs > 0) {
-    hash_one_avx512(inputs[0], blocks, key, offset, flags, flags_start,
+    hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
                     flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
     inputs += 1;
     num_inputs -= 1;
-    offset += offset_deltas[1];
     out = &out[BLAKE3_OUT_LEN];
   }
 }
diff --git a/src/c/blake3_impl.h b/src/c/blake3_impl.h
index af55d93..9a44391 100644
--- a/src/c/blake3_impl.h
+++ b/src/c/blake3_impl.h
@@ -42,19 +42,6 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
     {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
 };
 
-// 17 is 1 + the largest supported SIMD degree. Each hash_many() implementation
-// can thus do `offset += offset_deltas[DEGREE]` at the end of each batch.
-typedef const uint64_t offset_deltas_t[17];
-
-static offset_deltas_t CHUNK_OFFSET_DELTAS = {
-    BLAKE3_CHUNK_LEN * 0,  BLAKE3_CHUNK_LEN * 1,  BLAKE3_CHUNK_LEN * 2,
-    BLAKE3_CHUNK_LEN * 3,  BLAKE3_CHUNK_LEN * 4,  BLAKE3_CHUNK_LEN * 5,
-    BLAKE3_CHUNK_LEN * 6,  BLAKE3_CHUNK_LEN * 7,  BLAKE3_CHUNK_LEN * 8,
-    BLAKE3_CHUNK_LEN * 9,  BLAKE3_CHUNK_LEN * 10, BLAKE3_CHUNK_LEN * 11,
-    BLAKE3_CHUNK_LEN * 12, BLAKE3_CHUNK_LEN * 13, BLAKE3_CHUNK_LEN * 14,
-    BLAKE3_CHUNK_LEN * 15, BLAKE3_CHUNK_LEN * 16,
-};
-
 // Count the number of 1 bits.
 INLINE uint8_t popcnt(uint64_t x) {
 #if __POPCNT__
@@ -69,10 +56,10 @@ INLINE uint8_t popcnt(uint64_t x) {
 #endif
 }
 
-INLINE uint32_t offset_low(uint64_t offset) { return (uint32_t)offset; }
+INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
 
-INLINE uint32_t offset_high(uint64_t offset) {
-  return (uint32_t)(offset >> 32);
+INLINE uint32_t counter_high(uint64_t counter) {
+  return (uint32_t)(counter >> 32);
 }
 
 INLINE uint32_t load32(const void *src) {
@@ -96,50 +83,50 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
 // Declarations for implementation-specific functions.
 void blake3_compress_in_place_portable(uint32_t cv[8],
                                        const uint8_t block[BLAKE3_BLOCK_LEN],
-                                       uint8_t block_len, uint64_t offset,
+                                       uint8_t block_len, uint64_t counter,
                                        uint8_t flags);
 void blake3_compress_in_place_sse41(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
-                                    uint8_t block_len, uint64_t offset,
+                                    uint8_t block_len, uint64_t counter,
                                     uint8_t flags);
 void blake3_compress_in_place_avx512(uint32_t cv[8],
                                      const uint8_t block[BLAKE3_BLOCK_LEN],
-                                     uint8_t block_len, uint64_t offset,
+                                     uint8_t block_len, uint64_t counter,
                                      uint8_t flags);
 void blake3_compress_xof_portable(const uint32_t cv[8],
                                   const uint8_t block[BLAKE3_BLOCK_LEN],
-                                  uint8_t block_len, uint64_t offset,
+                                  uint8_t block_len, uint64_t counter,
                                   uint8_t flags, uint8_t out[64]);
 void blake3_compress_xof_sse41(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
-                               uint8_t block_len, uint64_t offset,
+                               uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);
 void blake3_compress_xof_avx512(const uint32_t cv[8],
                                 const uint8_t block[BLAKE3_BLOCK_LEN],
-                                uint8_t block_len, uint64_t offset,
+                                uint8_t block_len, uint64_t counter,
                                 uint8_t flags, uint8_t out[64]);
 void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                                size_t blocks, const uint32_t key[8],
-                               uint64_t offset, offset_deltas_t od,
+                               uint64_t counter, bool increment_counter,
                                uint8_t flags, uint8_t flags_start,
                                uint8_t flags_end, uint8_t *out);
 void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
-                            uint64_t offset, offset_deltas_t od, uint8_t flags,
-                            uint8_t flags_start, uint8_t flags_end,
-                            uint8_t *out);
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
 void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
-                           uint64_t offset, offset_deltas_t od, uint8_t flags,
-                           uint8_t flags_start, uint8_t flags_end,
-                           uint8_t *out);
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
 void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                              size_t blocks, const uint32_t key[8],
-                             uint64_t offset, offset_deltas_t od, uint8_t flags,
-                             uint8_t flags_start, uint8_t flags_end,
-                             uint8_t *out);
+                             uint64_t counter, bool increment_counter,
+                             uint8_t flags, uint8_t flags_start,
+                             uint8_t flags_end, uint8_t *out);
 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
-                           uint64_t offset, offset_deltas_t od, uint8_t flags,
-                           uint8_t flags_start, uint8_t flags_end,
-                           uint8_t *out);
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out);
diff --git a/src/c/blake3_neon.c b/src/c/blake3_neon.c
index 0ffa17e..7335c19 100644
--- a/src/c/blake3_neon.c
+++ b/src/c/blake3_neon.c
@@ -212,26 +212,28 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
   transpose_vecs_128(&out[12]);
 }
 
-INLINE void load_offsets4(uint64_t offset, const uint64_t deltas[4],
-                          uint32x4_t *out_lo, uint32x4_t *out_hi) {
-  *out_lo =
-      set4(offset_low(offset + deltas[0]), offset_low(offset + deltas[1]),
-           offset_low(offset + deltas[2]), offset_low(offset + deltas[3]));
-  *out_hi =
-      set4(offset_high(offset + deltas[0]), offset_high(offset + deltas[1]),
-           offset_high(offset + deltas[2]), offset_high(offset + deltas[3]));
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           uint32x4_t *out_low, uint32x4_t *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set4(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+  *out_high = set4(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
 }
 
 void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
-                       const uint32_t key[8], uint64_t offset,
-                       offset_deltas_t offset_deltas, uint8_t flags,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
                        uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
   uint32x4_t h_vecs[8] = {
       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
   };
-  uint32x4_t offset_low_vec, offset_high_vec;
-  load_offsets4(offset, offset_deltas, &offset_low_vec, &offset_high_vec);
+  uint32x4_t counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
   uint8_t block_flags = flags | flags_start;
 
   for (size_t block = 0; block < blocks; block++) {
@@ -244,10 +246,10 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
     transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
 
     uint32x4_t v[16] = {
-        h_vecs[0],       h_vecs[1],       h_vecs[2],       h_vecs[3],
-        h_vecs[4],       h_vecs[5],       h_vecs[6],       h_vecs[7],
-        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
-        offset_low_vec,  offset_high_vec, block_len_vec,   block_flags_vec,
+        h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
+        h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
     };
     round_fn4(v, msg_vecs, 0);
     round_fn4(v, msg_vecs, 1);
@@ -289,8 +291,8 @@ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
  */
 
 INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
-                          const uint32_t key[8], uint64_t offset, uint8_t flags,
-                          uint8_t flags_start, uint8_t flags_end,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
                           uint8_t out[BLAKE3_OUT_LEN]) {
   uint32_t cv[8];
   memcpy(cv, key, BLAKE3_KEY_LEN);
@@ -302,7 +304,7 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
     // TODO: Implement compress_neon. However note that according to
     // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
     // compress_neon might not be any faster than compress_portable.
-    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, offset,
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                       block_flags);
     input = &input[BLAKE3_BLOCK_LEN];
     blocks -= 1;
@@ -313,23 +315,27 @@ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
 
 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
-                           uint64_t offset, offset_deltas_t offset_deltas,
+                           uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out) {
   while (num_inputs >= 4) {
-    blake3_hash4_neon(inputs, blocks, key, offset, offset_deltas, flags,
+    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
                       flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
     inputs += 4;
     num_inputs -= 4;
-    offset += offset_deltas[4];
     out = &out[4 * BLAKE3_OUT_LEN];
   }
   while (num_inputs > 0) {
-    hash_one_neon(inputs[0], blocks, key, offset, flags, flags_start, flags_end,
-                  out);
+    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
     inputs += 1;
     num_inputs -= 1;
-    offset += offset_deltas[1];
     out = &out[BLAKE3_OUT_LEN];
   }
 }
diff --git a/src/c_avx512.rs b/src/c_avx512.rs
index e66f102..f20de2c 100644
--- a/src/c_avx512.rs
+++ b/src/c_avx512.rs
@@ -1,4 +1,4 @@
-use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN};
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
 
 pub const DEGREE: usize = 16;
 
@@ -7,10 +7,10 @@ pub unsafe fn compress_in_place(
     cv: &mut CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) {
-    ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, offset, flags)
+    ffi::blake3_compress_in_place_avx512(cv.as_mut_ptr(), block.as_ptr(), block_len, counter, flags)
 }
 
 // Unsafe because this may only be called on platforms supporting AVX-512.
@@ -18,7 +18,7 @@ pub unsafe fn compress_xof(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) -> [u8; 64] {
     let mut out = [0u8; 64];
@@ -26,7 +26,7 @@ pub unsafe fn compress_xof(
         cv.as_ptr(),
         block.as_ptr(),
         block_len,
-        offset,
+        counter,
         flags,
         out.as_mut_ptr(),
     );
@@ -37,8 +37,8 @@ pub unsafe fn compress_xof(
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
     inputs: &[&A],
     key: &CVWords,
-    offset: u64,
-    offset_deltas: &OffsetDeltas,
+    counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -53,8 +53,8 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
         inputs.len(),
         A::CAPACITY / BLOCK_LEN,
         key.as_ptr(),
-        offset,
-        offset_deltas.as_ptr(),
+        counter,
+        increment_counter.yes(),
         flags,
         flags_start,
         flags_end,
@@ -68,14 +68,14 @@ pub mod ffi {
             cv: *mut u32,
             block: *const u8,
             block_len: u8,
-            offset: u64,
+            counter: u64,
             flags: u8,
         );
         pub fn blake3_compress_xof_avx512(
             cv: *const u32,
             block: *const u8,
             block_len: u8,
-            offset: u64,
+            counter: u64,
             flags: u8,
             out: *mut u8,
         );
@@ -84,13 +84,14 @@ pub mod ffi {
             num_inputs: usize,
             blocks: usize,
             key: *const u32,
-            offset: u64,
-            offset_deltas: *const u64,
+            counter: u64,
+            increment_counter: bool,
             flags: u8,
             flags_start: u8,
             flags_end: u8,
             out: *mut u8,
         );
+
     }
 }
 
diff --git a/src/c_neon.rs b/src/c_neon.rs
index 029bf7e..34ef074 100644
--- a/src/c_neon.rs
+++ b/src/c_neon.rs
@@ -1,4 +1,4 @@
-use crate::{CVWords, OffsetDeltas, BLOCK_LEN, OUT_LEN};
+use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
 
 pub const DEGREE: usize = 4;
 
@@ -6,8 +6,8 @@ pub const DEGREE: usize = 4;
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
     inputs: &[&A],
     key: &CVWords,
-    offset: u64,
-    offset_deltas: &OffsetDeltas,
+    counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -22,8 +22,8 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
         inputs.len(),
         A::CAPACITY / BLOCK_LEN,
         key.as_ptr(),
-        offset,
-        offset_deltas.as_ptr(),
+        counter,
+        increment_counter.yes(),
         flags,
         flags_start,
         flags_end,
@@ -40,7 +40,7 @@ pub extern "C" fn blake3_compress_in_place_portable(
     cv: *mut u32,
     block: *const u8,
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) {
     unsafe {
@@ -48,7 +48,7 @@ pub extern "C" fn blake3_compress_in_place_portable(
             &mut *(cv as *mut [u32; 8]),
             &*(block as *const [u8; 64]),
             block_len,
-            offset,
+            counter,
             flags,
         )
     }
@@ -61,8 +61,8 @@ pub mod ffi {
             num_inputs: usize,
             blocks: usize,
             key: *const u32,
-            offset: u64,
-            offset_deltas: *const u64,
+            counter: u64,
+            increment_counter: bool,
             flags: u8,
             flags_start: u8,
             flags_end: u8,
diff --git a/src/lib.rs b/src/lib.rs
index 6489c04..55ac42b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -63,33 +63,6 @@ const MSG_SCHEDULE: [[usize; 16]; 7] = [
     [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
 ];
 
-// 17 is 1 + the largest supported SIMD degree (including AVX-512, currently in C).
-// Each hash_many() implementation can thus do `offset += offset_deltas[DEGREE]`
-// at the end of each batch.
-type OffsetDeltas = [u64; 17];
-
-const CHUNK_OFFSET_DELTAS: &OffsetDeltas = &[
-    CHUNK_LEN as u64 * 0,
-    CHUNK_LEN as u64 * 1,
-    CHUNK_LEN as u64 * 2,
-    CHUNK_LEN as u64 * 3,
-    CHUNK_LEN as u64 * 4,
-    CHUNK_LEN as u64 * 5,
-    CHUNK_LEN as u64 * 6,
-    CHUNK_LEN as u64 * 7,
-    CHUNK_LEN as u64 * 8,
-    CHUNK_LEN as u64 * 9,
-    CHUNK_LEN as u64 * 10,
-    CHUNK_LEN as u64 * 11,
-    CHUNK_LEN as u64 * 12,
-    CHUNK_LEN as u64 * 13,
-    CHUNK_LEN as u64 * 14,
-    CHUNK_LEN as u64 * 15,
-    CHUNK_LEN as u64 * 16,
-];
-
-const PARENT_OFFSET_DELTAS: &OffsetDeltas = &[0; 17];
-
 // These are the internal flags that we use to domain separate root/non-root,
 // chunk/parent, and chunk beginning/middle/end. These get set at the high end
 // of the block flags word in the compression function, so their values start
@@ -101,12 +74,12 @@ const ROOT: u8 = 1 << 3;
 const KEYED_HASH: u8 = 1 << 4;
 const DERIVE_KEY: u8 = 1 << 5;
 
-fn offset_low(offset: u64) -> u32 {
-    offset as u32
+fn counter_low(counter: u64) -> u32 {
+    counter as u32
 }
 
-fn offset_high(offset: u64) -> u32 {
-    (offset >> 32) as u32
+fn counter_high(counter: u64) -> u32 {
+    (counter >> 32) as u32
 }
 
 /// A BLAKE3 output of the default size, 32 bytes, which implements
@@ -181,7 +154,7 @@ struct Output {
     input_chaining_value: CVWords,
     block: [u8; 64],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
     platform: Platform,
 }
@@ -193,14 +166,14 @@ impl Output {
             &mut cv,
             &self.block,
             self.block_len,
-            self.offset,
+            self.counter,
             self.flags,
         );
         platform::le_bytes_from_words_32(&cv)
     }
 
     fn root_hash(&self) -> Hash {
-        debug_assert_eq!(self.offset, 0);
+        debug_assert_eq!(self.counter, 0);
         let mut cv = self.input_chaining_value;
         self.platform
             .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT);
@@ -212,7 +185,7 @@ impl Output {
             &self.input_chaining_value,
             &self.block,
             self.block_len,
-            self.offset,
+            self.counter,
             self.flags | ROOT,
         )
     }
@@ -221,7 +194,7 @@ impl Output {
 #[derive(Clone)]
 struct ChunkState {
     cv: CVWords,
-    offset: u64,
+    chunk_counter: u64,
     buf: [u8; BLOCK_LEN],
     buf_len: u8,
     blocks_compressed: u8,
@@ -230,10 +203,10 @@ struct ChunkState {
 }
 
 impl ChunkState {
-    fn new(key: &CVWords, offset: u64, flags: u8, platform: Platform) -> Self {
+    fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self {
         Self {
             cv: *key,
-            offset,
+            chunk_counter,
             buf: [0; BLOCK_LEN],
             buf_len: 0,
             blocks_compressed: 0,
@@ -242,15 +215,6 @@ impl ChunkState {
         }
     }
 
-    fn reset(&mut self, key: &CVWords, new_offset: u64) {
-        debug_assert_eq!(new_offset % CHUNK_LEN as u64, 0);
-        self.cv = *key;
-        self.offset = new_offset;
-        self.buf = [0; BLOCK_LEN];
-        self.buf_len = 0;
-        self.blocks_compressed = 0;
-    }
-
     fn len(&self) -> usize {
         BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize
     }
@@ -283,7 +247,7 @@ impl ChunkState {
                     &mut self.cv,
                     &self.buf,
                     BLOCK_LEN as u8,
-                    self.offset,
+                    self.chunk_counter,
                     block_flags,
                 );
                 self.buf_len = 0;
@@ -299,7 +263,7 @@ impl ChunkState {
                 &mut self.cv,
                 array_ref!(input, 0, BLOCK_LEN),
                 BLOCK_LEN as u8,
-                self.offset,
+                self.chunk_counter,
                 block_flags,
             );
             self.blocks_compressed += 1;
@@ -318,7 +282,7 @@ impl ChunkState {
             input_chaining_value: self.cv,
             block: self.buf,
             block_len: self.buf_len,
-            offset: self.offset,
+            counter: self.chunk_counter,
             flags: block_flags,
             platform: self.platform,
         }
@@ -330,9 +294,9 @@ impl fmt::Debug for ChunkState {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
             f,
-            "ChunkState {{ len: {}, offset: {}, flags: {:?}, platform: {:?} }}",
+            "ChunkState {{ len: {}, chunk_counter: {}, flags: {:?}, platform: {:?} }}",
             self.len(),
-            self.offset,
+            self.chunk_counter,
             self.flags,
             self.platform
         )
@@ -354,6 +318,23 @@ impl fmt::Debug for ChunkState {
 //   use full-width SIMD vectors for parent hashing. Without parallel parent
 //   hashing, we lose about 10% of overall throughput on AVX2 and AVX-512.
 
+// pub for benchmarks
+#[doc(hidden)]
+#[derive(Clone, Copy)]
+pub enum IncrementCounter {
+    Yes,
+    No,
+}
+
+impl IncrementCounter {
+    fn yes(&self) -> bool {
+        match self {
+            IncrementCounter::Yes => true,
+            IncrementCounter::No => false,
+        }
+    }
+}
+
 // The largest power of two less than or equal to `n`, used for left_len()
 // immediately below, and also directly in Hasher::update().
 fn largest_power_of_two_leq(n: usize) -> usize {
@@ -395,14 +376,13 @@ where
 fn compress_chunks_parallel(
     input: &[u8],
     key: &CVWords,
-    offset: u64,
+    chunk_counter: u64,
     flags: u8,
     platform: Platform,
     out: &mut [u8],
 ) -> usize {
     debug_assert!(!input.is_empty(), "empty chunks below the root");
     debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN);
-    debug_assert_eq!(offset % CHUNK_LEN as u64, 0, "invalid offset");
 
     let mut chunks_exact = input.chunks_exact(CHUNK_LEN);
     let mut chunks_array = ArrayVec::<[&[u8; CHUNK_LEN]; MAX_SIMD_DEGREE]>::new();
@@ -412,8 +392,8 @@ fn compress_chunks_parallel(
     platform.hash_many(
         &chunks_array,
         key,
-        offset,
-        CHUNK_OFFSET_DELTAS,
+        chunk_counter,
+        IncrementCounter::Yes,
         flags,
         CHUNK_START,
         CHUNK_END,
@@ -424,8 +404,8 @@ fn compress_chunks_parallel(
     // chunk (meaning the empty message) is a different codepath.
     let chunks_so_far = chunks_array.len();
     if !chunks_exact.remainder().is_empty() {
-        let chunk_offset = offset + (chunks_so_far * CHUNK_LEN) as u64;
-        let mut chunk_state = ChunkState::new(key, chunk_offset, flags, platform);
+        let counter = chunk_counter + chunks_so_far as u64;
+        let mut chunk_state = ChunkState::new(key, counter, flags, platform);
         chunk_state.update(chunks_exact.remainder());
         *array_mut_ref!(out, chunks_so_far * OUT_LEN, OUT_LEN) =
             chunk_state.output().chaining_value();
@@ -462,8 +442,8 @@ fn compress_parents_parallel(
     platform.hash_many(
         &parents_array,
         key,
-        0, // Parents always use offset 0.
-        PARENT_OFFSET_DELTAS,
+        0, // Parents always use counter 0.
+        IncrementCounter::No,
         flags | PARENT,
         0, // Parents have no start flags.
         0, // Parents have no end flags.
@@ -496,7 +476,7 @@ fn compress_parents_parallel(
 fn compress_subtree_wide(
     input: &[u8],
     key: &CVWords,
-    offset: u64,
+    chunk_counter: u64,
     flags: u8,
     platform: Platform,
     out: &mut [u8],
@@ -505,7 +485,7 @@ fn compress_subtree_wide(
     // when it is 1. This allows Rayon the option of multi-threading even the
     // 2-chunk case, which can help performance on smaller platforms.
     if input.len() <= platform.simd_degree() * CHUNK_LEN {
-        return compress_chunks_parallel(input, key, offset, flags, platform, out);
+        return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out);
     }
 
     // With more than simd_degree chunks, we need to recurse. Start by dividing
@@ -514,7 +494,7 @@ fn compress_subtree_wide(
     // of 3 or something, we'll need a more complicated strategy.)
     debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2");
     let (left, right) = input.split_at(left_len(input.len()));
-    let right_offset = offset + left.len() as u64;
+    let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64;
 
     // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
     // account for the special case of returning 2 outputs when the SIMD degree
@@ -531,8 +511,8 @@ fn compress_subtree_wide(
 
     // Recurse! This uses multiple threads if the "rayon" feature is enabled.
     let (left_n, right_n) = join(
-        || compress_subtree_wide(left, key, offset, flags, platform, left_out),
-        || compress_subtree_wide(right, key, right_offset, flags, platform, right_out),
+        || compress_subtree_wide(left, key, chunk_counter, flags, platform, left_out),
+        || compress_subtree_wide(right, key, right_chunk_counter, flags, platform, right_out),
     );
 
     // The special case again. If simd_degree=1, then we'll have left_n=1 and
@@ -568,13 +548,14 @@ fn compress_subtree_wide(
 fn compress_subtree_to_parent_node(
     input: &[u8],
     key: &CVWords,
-    offset: u64,
+    chunk_counter: u64,
     flags: u8,
     platform: Platform,
 ) -> [u8; BLOCK_LEN] {
     debug_assert!(input.len() > CHUNK_LEN);
     let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN];
-    let mut num_cvs = compress_subtree_wide(input, &key, offset, flags, platform, &mut cv_array);
+    let mut num_cvs =
+        compress_subtree_wide(input, &key, chunk_counter, flags, platform, &mut cv_array);
     debug_assert!(num_cvs >= 2);
 
     // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
@@ -607,7 +588,7 @@ fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output {
         input_chaining_value: *key,
         block: compress_subtree_to_parent_node(input, key, 0, flags, platform),
         block_len: BLOCK_LEN as u8,
-        offset: 0,
+        counter: 0,
         flags: flags | PARENT,
         platform,
     }
@@ -646,7 +627,7 @@ fn parent_node_output(
         input_chaining_value: *key,
         block,
         block_len: BLOCK_LEN as u8,
-        offset: 0,
+        counter: 0,
         flags: flags | PARENT,
         platform,
     }
@@ -694,11 +675,6 @@ impl Hasher {
         Self::new_internal(&key_words, DERIVE_KEY)
     }
 
-    /// The total number of input bytes so far.
-    pub fn count(&self) -> u64 {
-        self.chunk_state.offset + self.chunk_state.len() as u64
-    }
-
     // See comment in push_cv.
     fn merge_cv_stack(&mut self, total_len: u64) {
         let post_merge_stack_len = total_len.count_ones() as usize;
@@ -716,7 +692,7 @@ impl Hasher {
         }
     }
 
-    fn push_cv(&mut self, new_cv: &CVBytes, offset: u64) {
+    fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) {
         // In reference_impl.rs, we merge the new CV with existing CVs from the
         // stack before pushing it. We can do that because we know more input
         // is coming, so we know none of the merges are root.
@@ -739,10 +715,10 @@ impl Hasher {
         // merging with the new CV itself.
         //
         // We still use the "count the 1 bits" algorithm, adjusted slightly for
-        // this setting, using the offset (the start of the new CV's bytes)
-        // rather than the final total (the end of the new CV's bytes). That
+        // this setting, using the new chunk's counter numer (the previous
+        // total number of chunks) rather than new total number of chunks. That
         // algorithm is explained in detail in the spec.
-        self.merge_cv_stack(offset);
+        self.merge_cv_stack(chunk_counter);
         self.cv_stack.push(*new_cv);
     }
 
@@ -762,9 +738,13 @@ impl Hasher {
                 // Then we'll proceed to hashing whole chunks below.
                 debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN);
                 let chunk_cv = self.chunk_state.output().chaining_value();
-                self.push_cv(&chunk_cv, self.chunk_state.offset);
-                let new_offset = self.chunk_state.offset + CHUNK_LEN as u64;
-                self.chunk_state.reset(&self.key, new_offset);
+                self.push_cv(&chunk_cv, self.chunk_state.chunk_counter);
+                self.chunk_state = ChunkState::new(
+                    &self.key,
+                    self.chunk_state.chunk_counter + 1,
+                    self.chunk_state.flags,
+                    self.chunk_state.platform,
+                );
             } else {
                 return self;
             }
@@ -786,32 +766,33 @@ impl Hasher {
         while input.len() > CHUNK_LEN {
             debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data");
             debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len");
-            debug_assert_eq!(self.chunk_state.offset % CHUNK_LEN as u64, 0);
             let mut subtree_len = largest_power_of_two_leq(input.len());
+            let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64;
             // Shrink the subtree_len until it evenly divides the count so far.
             // We know it's a power of 2, so we can use a bitmask rather than
             // the more expensive modulus operation. Note that if the caller
             // consistently passes power-of-2 inputs of the same size (as is
             // hopefully typical), we'll always skip over this loop.
-            while (subtree_len - 1) as u64 & self.chunk_state.offset != 0 {
+            while (subtree_len - 1) as u64 & count_so_far != 0 {
                 subtree_len /= 2;
             }
             // The shrunken subtree_len might now be 1 chunk long. If so, hash
             // that one chunk by itself. Otherwise, compress the subtree into a
             // pair of CVs.
+            let subtree_chunks = (subtree_len / CHUNK_LEN) as u64;
             if subtree_len <= CHUNK_LEN {
                 debug_assert_eq!(subtree_len, CHUNK_LEN);
                 self.push_cv(
                     &ChunkState::new(
                         &self.key,
-                        self.chunk_state.offset,
+                        self.chunk_state.chunk_counter,
                         self.chunk_state.flags,
                         self.chunk_state.platform,
                     )
                     .update(&input[..subtree_len])
                     .output()
                     .chaining_value(),
-                    self.chunk_state.offset,
+                    self.chunk_state.chunk_counter,
                 );
             } else {
                 // This is the high-performance happy path, though getting here
@@ -819,7 +800,7 @@ impl Hasher {
                 let cv_pair = compress_subtree_to_parent_node(
                     &input[..subtree_len],
                     &self.key,
-                    self.chunk_state.offset,
+                    self.chunk_state.chunk_counter,
                     self.chunk_state.flags,
                     self.chunk_state.platform,
                 );
@@ -828,10 +809,13 @@ impl Hasher {
                 // Push the two CVs we received into the CV stack in order. Because
                 // the stack merges lazily, this guarantees we aren't merging the
                 // root.
-                self.push_cv(left_cv, self.chunk_state.offset);
-                self.push_cv(right_cv, self.chunk_state.offset + (subtree_len as u64 / 2));
+                self.push_cv(left_cv, self.chunk_state.chunk_counter);
+                self.push_cv(
+                    right_cv,
+                    self.chunk_state.chunk_counter + (subtree_chunks / 2),
+                );
             }
-            self.chunk_state.offset += subtree_len as u64;
+            self.chunk_state.chunk_counter += subtree_chunks;
             input = &input[subtree_len..];
         }
 
@@ -842,7 +826,7 @@ impl Hasher {
             // Having added some input to the chunk_state, we know what's in
             // the CV stack won't become the root node, and we can do an extra
             // merge. This simplifies finalize().
-            self.merge_cv_stack(self.chunk_state.offset);
+            self.merge_cv_stack(self.chunk_state.chunk_counter);
         }
 
         self
@@ -853,7 +837,7 @@ impl Hasher {
         // also. Convert it directly into an Output. Otherwise, we need to
         // merge subtrees below.
         if self.cv_stack.is_empty() {
-            debug_assert_eq!(self.chunk_state.offset, 0);
+            debug_assert_eq!(self.chunk_state.chunk_counter, 0);
             return self.chunk_state.output();
         }
 
@@ -874,7 +858,7 @@ impl Hasher {
         if self.chunk_state.len() > 0 {
             debug_assert_eq!(
                 self.cv_stack.len(),
-                self.chunk_state.offset.count_ones() as usize,
+                self.chunk_state.chunk_counter.count_ones() as usize,
                 "cv stack does not need a merge"
             );
             output = self.chunk_state.output();
@@ -930,10 +914,8 @@ impl fmt::Debug for Hasher {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         write!(
             f,
-            "Hasher {{ count: {}, flags: {:?}, platform: {:?} }}",
-            self.count(),
-            self.chunk_state.flags,
-            self.chunk_state.platform
+            "Hasher {{ flags: {:?}, platform: {:?} }}",
+            self.chunk_state.flags, self.chunk_state.platform
         )
     }
 }
@@ -982,19 +964,19 @@ impl OutputReader {
             buf = &mut buf[take..];
             self.position_within_block += take as u8;
             if self.position_within_block == BLOCK_LEN as u8 {
-                self.inner.offset += BLOCK_LEN as u64;
+                self.inner.counter += 1;
                 self.position_within_block = 0;
             }
         }
     }
 
     pub fn position(&self) -> u64 {
-        self.inner.offset + self.position_within_block as u64
+        self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
     }
 
     pub fn set_position(&mut self, position: u64) {
         self.position_within_block = (position % BLOCK_LEN as u64) as u8;
-        self.inner.offset = position - self.position_within_block as u64;
+        self.inner.counter = position / BLOCK_LEN as u64;
     }
 }
 
diff --git a/src/platform.rs b/src/platform.rs
index 66d70f6..abf9d30 100644
--- a/src/platform.rs
+++ b/src/platform.rs
@@ -1,4 +1,4 @@
-use crate::{portable, CVWords, OffsetDeltas, BLOCK_LEN};
+use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
 use arrayref::{array_mut_ref, array_ref};
 
 #[cfg(feature = "c_avx512")]
@@ -108,25 +108,25 @@ impl Platform {
         cv: &mut CVWords,
         block: &[u8; BLOCK_LEN],
         block_len: u8,
-        offset: u64,
+        counter: u64,
         flags: u8,
     ) {
         match self {
-            Platform::Portable => portable::compress_in_place(cv, block, block_len, offset, flags),
+            Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
             // Safe because detect() checked for platform support.
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::SSE41 | Platform::AVX2 => unsafe {
-                sse41::compress_in_place(cv, block, block_len, offset, flags)
+                sse41::compress_in_place(cv, block, block_len, counter, flags)
             },
             // Safe because detect() checked for platform support.
             #[cfg(feature = "c_avx512")]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => unsafe {
-                c_avx512::compress_in_place(cv, block, block_len, offset, flags)
+                c_avx512::compress_in_place(cv, block, block_len, counter, flags)
             },
             // No NEON compress_in_place() implementation yet.
             #[cfg(feature = "c_neon")]
-            Platform::NEON => portable::compress_in_place(cv, block, block_len, offset, flags),
+            Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
         }
     }
 
@@ -135,25 +135,25 @@ impl Platform {
         cv: &CVWords,
         block: &[u8; BLOCK_LEN],
         block_len: u8,
-        offset: u64,
+        counter: u64,
         flags: u8,
     ) -> [u8; 64] {
         match self {
-            Platform::Portable => portable::compress_xof(cv, block, block_len, offset, flags),
+            Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
             // Safe because detect() checked for platform support.
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::SSE41 | Platform::AVX2 => unsafe {
-                sse41::compress_xof(cv, block, block_len, offset, flags)
+                sse41::compress_xof(cv, block, block_len, counter, flags)
             },
             // Safe because detect() checked for platform support.
             #[cfg(feature = "c_avx512")]
             #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
             Platform::AVX512 => unsafe {
-                c_avx512::compress_xof(cv, block, block_len, offset, flags)
+                c_avx512::compress_xof(cv, block, block_len, counter, flags)
             },
             // No NEON compress_xof() implementation yet.
             #[cfg(feature = "c_neon")]
-            Platform::NEON => portable::compress_xof(cv, block, block_len, offset, flags),
+            Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
         }
     }
 
@@ -171,8 +171,8 @@ impl Platform {
         &self,
         inputs: &[&A],
         key: &CVWords,
-        offset: u64,
-        offset_deltas: &OffsetDeltas,
+        counter: u64,
+        increment_counter: IncrementCounter,
         flags: u8,
         flags_start: u8,
         flags_end: u8,
@@ -182,8 +182,8 @@ impl Platform {
             Platform::Portable => portable::hash_many(
                 inputs,
                 key,
-                offset,
-                offset_deltas,
+                counter,
+                increment_counter,
                 flags,
                 flags_start,
                 flags_end,
@@ -195,8 +195,8 @@ impl Platform {
                 sse41::hash_many(
                     inputs,
                     key,
-                    offset,
-                    offset_deltas,
+                    counter,
+                    increment_counter,
                     flags,
                     flags_start,
                     flags_end,
@@ -209,8 +209,8 @@ impl Platform {
                 avx2::hash_many(
                     inputs,
                     key,
-                    offset,
-                    offset_deltas,
+                    counter,
+                    increment_counter,
                     flags,
                     flags_start,
                     flags_end,
@@ -224,8 +224,8 @@ impl Platform {
                 c_avx512::hash_many(
                     inputs,
                     key,
-                    offset,
-                    offset_deltas,
+                    counter,
+                    increment_counter,
                     flags,
                     flags_start,
                     flags_end,
@@ -238,8 +238,8 @@ impl Platform {
                 c_neon::hash_many(
                     inputs,
                     key,
-                    offset,
-                    offset_deltas,
+                    counter,
+                    increment_counter,
                     flags,
                     flags_start,
                     flags_end,
diff --git a/src/portable.rs b/src/portable.rs
index fa0e17d..0a569ce 100644
--- a/src/portable.rs
+++ b/src/portable.rs
@@ -1,5 +1,6 @@
 use crate::{
-    offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
+    OUT_LEN,
 };
 use arrayref::{array_mut_ref, array_ref};
 
@@ -38,7 +39,7 @@ fn compress_pre(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) -> [u32; 16] {
     let block_words = crate::platform::words_from_le_bytes_64(block);
@@ -56,8 +57,8 @@ fn compress_pre(
         IV[1],
         IV[2],
         IV[3],
-        offset_low(offset),
-        offset_high(offset),
+        counter_low(counter),
+        counter_high(counter),
         block_len as u32,
         flags as u32,
     ];
@@ -77,10 +78,10 @@ pub fn compress_in_place(
     cv: &mut CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) {
-    let state = compress_pre(cv, block, block_len, offset, flags);
+    let state = compress_pre(cv, block, block_len, counter, flags);
 
     cv[0] = state[0] ^ state[8];
     cv[1] = state[1] ^ state[9];
@@ -96,10 +97,10 @@ pub fn compress_xof(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) -> [u8; 64] {
-    let mut state = compress_pre(cv, block, block_len, offset, flags);
+    let mut state = compress_pre(cv, block, block_len, counter, flags);
     state[0] ^= state[8];
     state[1] ^= state[9];
     state[2] ^= state[10];
@@ -122,7 +123,7 @@ pub fn compress_xof(
 pub fn hash1<A: arrayvec::Array<Item = u8>>(
     input: &A,
     key: &CVWords,
-    offset: u64,
+    counter: u64,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -140,7 +141,7 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>(
             &mut cv,
             array_ref!(slice, 0, BLOCK_LEN),
             BLOCK_LEN as u8,
-            offset,
+            counter,
             block_flags,
         );
         block_flags = flags;
@@ -152,8 +153,8 @@ pub fn hash1<A: arrayvec::Array<Item = u8>>(
 pub fn hash_many<A: arrayvec::Array<Item = u8>>(
     inputs: &[&A],
     key: &CVWords,
-    mut offset: u64,
-    offset_deltas: &OffsetDeltas,
+    mut counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -164,13 +165,15 @@ pub fn hash_many<A: arrayvec::Array<Item = u8>>(
         hash1(
             input,
             key,
-            offset,
+            counter,
             flags,
             flags_start,
             flags_end,
             array_mut_ref!(output, 0, OUT_LEN),
         );
-        offset += offset_deltas[1];
+        if increment_counter.yes() {
+            counter += 1;
+        }
     }
 }
 
diff --git a/src/sse41.rs b/src/sse41.rs
index e45b22e..77cb27e 100644
--- a/src/sse41.rs
+++ b/src/sse41.rs
@@ -4,7 +4,8 @@ use core::arch::x86::*;
 use core::arch::x86_64::*;
 
 use crate::{
-    offset_high, offset_low, CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
+    counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
+    OUT_LEN,
 };
 use arrayref::{array_mut_ref, array_ref, mut_array_refs};
 
@@ -129,15 +130,15 @@ unsafe fn compress_pre(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) -> [__m128i; 4] {
     let row1 = &mut loadu(cv.as_ptr().add(0) as *const u8);
     let row2 = &mut loadu(cv.as_ptr().add(4) as *const u8);
     let row3 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
     let row4 = &mut set4(
-        offset_low(offset),
-        offset_high(offset),
+        counter_low(counter),
+        counter_high(counter),
         block_len as u32,
         flags as u32,
     );
@@ -313,10 +314,10 @@ pub unsafe fn compress_in_place(
     cv: &mut CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) {
-    let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, offset, flags);
+    let [row1, row2, row3, row4] = compress_pre(cv, block, block_len, counter, flags);
     storeu(xor(row1, row3), cv.as_mut_ptr().add(0) as *mut u8);
     storeu(xor(row2, row4), cv.as_mut_ptr().add(4) as *mut u8);
 }
@@ -326,11 +327,11 @@ pub unsafe fn compress_xof(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) -> [u8; 64] {
     let [mut row1, mut row2, mut row3, mut row4] =
-        compress_pre(cv, block, block_len, offset, flags);
+        compress_pre(cv, block, block_len, counter, flags);
     row1 = xor(row1, row3);
     row2 = xor(row2, row4);
     row3 = xor(row3, loadu(cv.as_ptr().add(0) as *const u8));
@@ -506,19 +507,20 @@ unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize)
 }
 
 #[inline(always)]
-unsafe fn load_offsets(offset: u64, offset_deltas: &OffsetDeltas) -> (__m128i, __m128i) {
+unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
+    let mask = if increment_counter.yes() { !0 } else { 0 };
     (
         set4(
-            offset_low(offset + offset_deltas[0]),
-            offset_low(offset + offset_deltas[1]),
-            offset_low(offset + offset_deltas[2]),
-            offset_low(offset + offset_deltas[3]),
+            counter_low(counter + (mask & 0)),
+            counter_low(counter + (mask & 1)),
+            counter_low(counter + (mask & 2)),
+            counter_low(counter + (mask & 3)),
         ),
         set4(
-            offset_high(offset + offset_deltas[0]),
-            offset_high(offset + offset_deltas[1]),
-            offset_high(offset + offset_deltas[2]),
-            offset_high(offset + offset_deltas[3]),
+            counter_high(counter + (mask & 0)),
+            counter_high(counter + (mask & 1)),
+            counter_high(counter + (mask & 2)),
+            counter_high(counter + (mask & 3)),
         ),
     )
 }
@@ -528,8 +530,8 @@ pub unsafe fn hash4(
     inputs: &[*const u8; DEGREE],
     blocks: usize,
     key: &CVWords,
-    offset: u64,
-    offset_deltas: &OffsetDeltas,
+    counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -545,7 +547,7 @@ pub unsafe fn hash4(
         set1(key[6]),
         set1(key[7]),
     ];
-    let (offset_low_vec, offset_high_vec) = load_offsets(offset, offset_deltas);
+    let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
     let mut block_flags = flags | flags_start;
 
     for block in 0..blocks {
@@ -573,8 +575,8 @@ pub unsafe fn hash4(
             set1(IV[1]),
             set1(IV[2]),
             set1(IV[3]),
-            offset_low_vec,
-            offset_high_vec,
+            counter_low_vec,
+            counter_high_vec,
             block_len_vec,
             block_flags_vec,
         ];
@@ -616,7 +618,7 @@ pub unsafe fn hash4(
 unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
     input: &A,
     key: &CVWords,
-    offset: u64,
+    counter: u64,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -634,7 +636,7 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
             &mut cv,
             array_ref!(slice, 0, BLOCK_LEN),
             BLOCK_LEN as u8,
-            offset,
+            counter,
             block_flags,
         );
         block_flags = flags;
@@ -647,8 +649,8 @@ unsafe fn hash1<A: arrayvec::Array<Item = u8>>(
 pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
     mut inputs: &[&A],
     key: &CVWords,
-    mut offset: u64,
-    offset_deltas: &OffsetDeltas,
+    mut counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -664,28 +666,32 @@ pub unsafe fn hash_many<A: arrayvec::Array<Item = u8>>(
             input_ptrs,
             blocks,
             key,
-            offset,
-            offset_deltas,
+            counter,
+            increment_counter,
             flags,
             flags_start,
             flags_end,
             array_mut_ref!(out, 0, DEGREE * OUT_LEN),
         );
+        if increment_counter.yes() {
+            counter += DEGREE as u64;
+        }
         inputs = &inputs[DEGREE..];
-        offset += DEGREE as u64 * offset_deltas[1];
         out = &mut out[DEGREE * OUT_LEN..];
     }
     for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
         hash1(
             input,
             key,
-            offset,
+            counter,
             flags,
             flags_start,
             flags_end,
             array_mut_ref!(output, 0, OUT_LEN),
         );
-        offset += offset_deltas[1];
+        if increment_counter.yes() {
+            counter += 1;
+        }
     }
 }
 
diff --git a/src/test.rs b/src/test.rs
index 404079c..e7fe96e 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1,4 +1,4 @@
-use crate::{CVBytes, CVWords, OffsetDeltas, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
+use crate::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN};
 use arrayref::array_ref;
 use arrayvec::ArrayVec;
 use core::usize;
@@ -48,13 +48,13 @@ pub fn paint_test_input(buf: &mut [u8]) {
 }
 
 type CompressInPlaceFn =
-    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, offset: u64, flags: u8);
+    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
 
 type CompressXofFn = unsafe fn(
     cv: &CVWords,
     block: &[u8; BLOCK_LEN],
     block_len: u8,
-    offset: u64,
+    counter: u64,
     flags: u8,
 ) -> [u8; 64];
 
@@ -64,18 +64,18 @@ pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn
     let block_len: u8 = 61;
     let mut block = [0; BLOCK_LEN];
     paint_test_input(&mut block[..block_len as usize]);
-    // Use an offset with set bits in both 32-bit words.
-    let offset = ((5 * CHUNK_LEN as u64) << 32) + 6 * CHUNK_LEN as u64;
+    // Use a counter with set bits in both 32-bit words.
+    let counter = (5u64 << 32) + 6;
     let flags = crate::CHUNK_END | crate::ROOT | crate::KEYED_HASH;
 
     let portable_out =
-        crate::portable::compress_xof(&initial_state, &block, block_len, offset as u64, flags);
+        crate::portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
 
     let mut test_state = initial_state;
-    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, offset as u64, flags) };
+    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
     let test_state_bytes = crate::platform::le_bytes_from_words_32(&test_state);
     let test_xof =
-        unsafe { compress_xof_fn(&initial_state, &block, block_len, offset as u64, flags) };
+        unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
 
     assert_eq!(&portable_out[..32], &test_state_bytes[..]);
     assert_eq!(&portable_out[..], &test_xof[..]);
@@ -84,8 +84,8 @@ pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn
 type HashManyFn<A> = unsafe fn(
     inputs: &[&A],
     key: &CVWords,
-    offset: u64,
-    offset_deltas: &OffsetDeltas,
+    counter: u64,
+    increment_counter: IncrementCounter,
     flags: u8,
     flags_start: u8,
     flags_end: u8,
@@ -101,8 +101,8 @@ pub fn test_hash_many_fn(
     const NUM_INPUTS: usize = 31;
     let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
     crate::test::paint_test_input(&mut input_buf);
-    // An offset just prior to u32::MAX.
-    let offset = (1 << 32) - CHUNK_LEN as u64;
+    // A counter just prior to u32::MAX.
+    let counter = (1u64 << 32) - 1;
 
     // First hash chunks.
     let mut chunks = ArrayVec::<[&[u8; CHUNK_LEN]; NUM_INPUTS]>::new();
@@ -113,8 +113,8 @@ pub fn test_hash_many_fn(
     crate::portable::hash_many(
         &chunks,
         &TEST_KEY_WORDS,
-        offset,
-        crate::CHUNK_OFFSET_DELTAS,
+        counter,
+        IncrementCounter::Yes,
         crate::DERIVE_KEY,
         crate::CHUNK_START,
         crate::CHUNK_END,
@@ -126,8 +126,8 @@ pub fn test_hash_many_fn(
         hash_many_chunks_fn(
             &chunks[..],
             &TEST_KEY_WORDS,
-            offset,
-            crate::CHUNK_OFFSET_DELTAS,
+            counter,
+            IncrementCounter::Yes,
             crate::DERIVE_KEY,
             crate::CHUNK_START,
             crate::CHUNK_END,
@@ -153,7 +153,7 @@ pub fn test_hash_many_fn(
         &parents,
         &TEST_KEY_WORDS,
         0,
-        crate::PARENT_OFFSET_DELTAS,
+        IncrementCounter::No,
         crate::DERIVE_KEY | crate::PARENT,
         0,
         0,
@@ -166,7 +166,7 @@ pub fn test_hash_many_fn(
             &parents[..],
             &TEST_KEY_WORDS,
             0,
-            crate::PARENT_OFFSET_DELTAS,
+            IncrementCounter::No,
             crate::DERIVE_KEY | crate::PARENT,
             0,
             0,
@@ -202,10 +202,10 @@ fn test_reference_impl_size() {
 }
 
 #[test]
-fn test_offset_words() {
-    let offset: u64 = (1 << 32) + 2;
-    assert_eq!(crate::offset_low(offset), 2);
-    assert_eq!(crate::offset_high(offset), 1);
+fn test_counter_words() {
+    let counter: u64 = (1 << 32) + 2;
+    assert_eq!(crate::counter_low(counter), 2);
+    assert_eq!(crate::counter_high(counter), 1);
 }
 
 #[test]
author	Jack O'Connor <[email protected]>	2019-12-12 18:21:17 -0500
committer	Jack O'Connor <[email protected]>	2019-12-12 21:41:30 -0500
commit	b5f1e925f79589ade494289d730c1fb5c25a09b1 (patch)
tree	a8608132b339b9f2c55bec46bf18d297cd0216b6 /src
parent	a5cc3b28679d724020b8810ea9aecea9f54a058f (diff)