Commit fc219f4

Hasher::update_with_join
This is a new interface that allows the caller to provide a multi-threading implementation. It's defined in terms of a new `Join` trait, for which we provide two implementations, `SerialJoin` and `RayonJoin`. This lets the caller control when multi-threading is used, rather than the previous all-or-nothing design of the "rayon" feature. Existing callers should keep compiling, but this is still a compatibility break: callers who were relying on automatic multi-threading before will now be single-threaded unless they switch to `update_with_join`. Thus the next release of this crate will need to be version 0.2. See #25 and #54.
1 parent: 24071db
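
Not part of the commit itself, but for illustration: a minimal sketch of the new interface from the caller's side, assuming the crate is built with the `rayon` feature. `update_with_join::<SerialJoin>` behaves like the existing `update`, while `RayonJoin` opts in to Rayon-based multi-threading; both produce the same hash.

use blake3::join::{RayonJoin, SerialJoin};

fn main() {
    // Arbitrary example input; multi-threading only pays off on large inputs.
    let input = vec![0u8; 1 << 20];

    // Single-threaded, equivalent to Hasher::update.
    let serial = blake3::Hasher::new()
        .update_with_join::<SerialJoin>(&input)
        .finalize();

    // Multi-threaded on the Rayon thread pool (requires the "rayon" feature).
    let threaded = blake3::Hasher::new()
        .update_with_join::<RayonJoin>(&input)
        .finalize();

    // The join strategy never changes the output.
    assert_eq!(serial, threaded);
}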

File tree

6 files changed: +377, -50 lines


Cargo.toml

Lines changed: 4 additions & 0 deletions
@@ -21,6 +21,10 @@ c_avx512 = []
 c_neon = []
 std = ["digest/std"]
 
+[package.metadata.docs.rs]
+# Document blake3::join::RayonJoin on docs.rs.
+features = ["rayon"]
+
 [dependencies]
 arrayref = "0.3.5"
 arrayvec = { version = "0.5.1", default-features = false, features = ["array-sizes-33-128"] }

b3sum/src/main.rs

Lines changed: 38 additions & 14 deletions
@@ -3,6 +3,7 @@ use clap::{App, Arg};
 use std::cmp;
 use std::convert::TryInto;
 use std::fs::File;
+use std::io;
 use std::io::prelude::*;
 
 const FILE_ARG: &str = "file";
@@ -57,21 +58,37 @@ fn clap_parse_argv() -> clap::ArgMatches<'static> {
         .get_matches()
 }
 
+// A 16 KiB buffer is enough to take advantage of all the SIMD instruction sets
+// that we support, but `std::io::copy` currently uses 8 KiB. Most platforms
+// can support at least 64 KiB, and there's some performance benefit to using
+// bigger reads, so that's what we use here.
+fn copy_wide(mut reader: impl Read, hasher: &mut blake3::Hasher) -> io::Result<u64> {
+    let mut buffer = [0; 65536];
+    let mut total = 0;
+    loop {
+        match reader.read(&mut buffer) {
+            Ok(0) => return Ok(total),
+            Ok(n) => {
+                hasher.update(&buffer[..n]);
+                total += n as u64;
+            }
+            Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
+            Err(e) => return Err(e),
+        }
+    }
+}
+
 // The slow path, for inputs that we can't memmap.
-fn hash_reader(
-    base_hasher: &blake3::Hasher,
-    mut reader: impl Read,
-) -> Result<blake3::OutputReader> {
+fn hash_reader(base_hasher: &blake3::Hasher, reader: impl Read) -> Result<blake3::OutputReader> {
     let mut hasher = base_hasher.clone();
-    // TODO: This is a narrow copy, so it might not take advantage of SIMD or
-    // threads. With a larger buffer size, most of that performance can be
-    // recovered. However, this requires some platform-specific tuning, based
-    // on both the SIMD degree and the number of cores. A double-buffering
-    // strategy is also helpful, where a dedicated background thread reads
-    // input into one buffer while another thread is calling update() on a
-    // second buffer. Since this is the slow path anyway, do the simple thing
-    // for now.
-    std::io::copy(&mut reader, &mut hasher)?;
+    // This is currently all single-threaded. Doing multi-threaded hashing
+    // without memory mapping is tricky, since all your worker threads have to
+    // stop every time you refill the buffer, and that ends up being a lot of
+    // overhead. To solve that, we need a more complicated double-buffering
+    // strategy where a background thread fills one buffer while the worker
+    // threads are hashing the other one. We might implement that in the
+    // future, but since this is the slow path anyway, it's not high priority.
+    copy_wide(reader, &mut hasher)?;
     Ok(hasher.finalize_xof())
 }
 
@@ -114,7 +131,14 @@ fn maybe_hash_memmap(
     #[cfg(feature = "memmap")]
     {
         if let Some(map) = maybe_memmap_file(_file)? {
-            return Ok(Some(_base_hasher.clone().update(&map).finalize_xof()));
+            // Memory mapping worked. Use Rayon-based multi-threading to split
+            // up the whole file across many worker threads.
+            return Ok(Some(
+                _base_hasher
+                    .clone()
+                    .update_with_join::<blake3::join::RayonJoin>(&map)
+                    .finalize_xof(),
+            ));
         }
     }
     Ok(None)

benches/bench.rs

Lines changed: 82 additions & 0 deletions
@@ -417,3 +417,85 @@ fn bench_reference_0512_kib(b: &mut Bencher) {
 fn bench_reference_1024_kib(b: &mut Bencher) {
     bench_reference(b, 1024 * KIB);
 }
+
+#[cfg(feature = "rayon")]
+fn bench_rayon(b: &mut Bencher, len: usize) {
+    let mut input = RandomInput::new(b, len);
+    b.iter(|| {
+        blake3::Hasher::new()
+            .update_with_join::<blake3::join::RayonJoin>(input.get())
+            .finalize()
+    });
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0001_block(b: &mut Bencher) {
+    bench_rayon(b, BLOCK_LEN);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0001_kib(b: &mut Bencher) {
+    bench_rayon(b, 1 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0002_kib(b: &mut Bencher) {
+    bench_rayon(b, 2 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0004_kib(b: &mut Bencher) {
+    bench_rayon(b, 4 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0008_kib(b: &mut Bencher) {
+    bench_rayon(b, 8 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0016_kib(b: &mut Bencher) {
+    bench_rayon(b, 16 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0032_kib(b: &mut Bencher) {
+    bench_rayon(b, 32 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0064_kib(b: &mut Bencher) {
+    bench_rayon(b, 64 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0128_kib(b: &mut Bencher) {
+    bench_rayon(b, 128 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0256_kib(b: &mut Bencher) {
+    bench_rayon(b, 256 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_0512_kib(b: &mut Bencher) {
+    bench_rayon(b, 512 * KIB);
+}
+
+#[bench]
+#[cfg(feature = "rayon")]
+fn bench_rayon_1024_kib(b: &mut Bencher) {
+    bench_rayon(b, 1024 * KIB);
+}

src/join.rs

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+//! The multi-threading abstractions used by [`Hasher::update_with_join`].
+//!
+//! Different implementations of the `Join` trait determine whether
+//! [`Hasher::update_with_join`] performs multi-threading on sufficiently large
+//! inputs. The `SerialJoin` implementation is single-threaded, and the
+//! `RayonJoin` implementation (gated by the `rayon` feature) is
+//! multi-threaded. Interfaces other than [`Hasher::update_with_join`], like
+//! [`hash`] and [`Hasher::update`], always use `SerialJoin` internally.
+//!
+//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and
+//! `RayonJoin` is the only non-trivial implementation provided. The only
+//! difference between the function signature in the `Join` trait and the
+//! underlying one in Rayon, is that the trait method includes two length
+//! parameters. This gives an implementation the option of e.g. setting a
+//! subtree size threshold below which it keeps splits on the same thread.
+//! However, neither of the two provided implementations currently makes use of
+//! those parameters. Note that in Rayon, the very first `join` call is more
+//! expensive than subsequent calls, because it moves work from the calling
+//! thread into the thread pool. That makes a coarse-grained input length
+//! threshold in the caller more effective than a fine-grained subtree size
+//! threshold after the implementation has already started recursing.
+//!
+//! # Example
+//!
+//! ```
+//! // Hash a large input using multi-threading. Note that multi-threading
+//! // comes with some overhead, and it can actually hurt performance for small
+//! // inputs. The meaning of "small" varies, however, depending on the
+//! // platform and the number of threads. (On x86_64, the cutoff tends to be
+//! // around 128 KiB.) You should benchmark your own use case to see whether
+//! // multi-threading helps.
+//! # #[cfg(feature = "rayon")]
+//! # {
+//! # fn some_large_input() -> &'static [u8] { b"foo" }
+//! let input: &[u8] = some_large_input();
+//! let mut hasher = blake3::Hasher::new();
+//! hasher.update_with_join::<blake3::join::RayonJoin>(input);
+//! let hash = hasher.finalize();
+//! # }
+//! ```
+//!
+//! [`Hasher::update_with_join`]: ../struct.Hasher.html#method.update_with_join
+//! [`Hasher::update`]: ../struct.Hasher.html#method.update
+//! [`hash`]: ../fn.hash.html
+//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html
+
+/// The trait that abstracts over single-threaded and multi-threaded recursion.
+pub trait Join {
+    fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB)
+    where
+        A: FnOnce() -> RA + Send,
+        B: FnOnce() -> RB + Send,
+        RA: Send,
+        RB: Send;
+}
+
+/// The trivial, serial implementation of `Join`. The left and right sides are
+/// executed one after the other, on the calling thread. The standalone hashing
+/// functions and the `Hasher::update` method use this implementation
+/// internally.
+pub enum SerialJoin {}
+
+impl Join for SerialJoin {
+    #[inline]
+    fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB)
+    where
+        A: FnOnce() -> RA + Send,
+        B: FnOnce() -> RB + Send,
+        RA: Send,
+        RB: Send,
+    {
+        (oper_a(), oper_b())
+    }
+}
+
+/// The Rayon-based implementation of `Join`. The left and right sides are
+/// executed on the Rayon thread pool, potentially in parallel. This
+/// implementation is gated by the `rayon` feature, which is off by default.
+#[cfg(feature = "rayon")]
+pub enum RayonJoin {}
+
+#[cfg(feature = "rayon")]
+impl Join for RayonJoin {
+    #[inline]
+    fn join<A, B, RA, RB>(oper_a: A, oper_b: B, _len_a: usize, _len_b: usize) -> (RA, RB)
+    where
+        A: FnOnce() -> RA + Send,
+        B: FnOnce() -> RB + Send,
+        RA: Send,
+        RB: Send,
+    {
+        rayon::join(oper_a, oper_b)
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_serial_join() {
+        let oper_a = || 1 + 1;
+        let oper_b = || 2 + 2;
+        assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b, 3, 4));
+    }
+
+    #[test]
+    #[cfg(feature = "rayon")]
+    fn test_rayon_join() {
+        let oper_a = || 1 + 1;
+        let oper_b = || 2 + 2;
+        assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b, 3, 4));
+    }
+}
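
The `len_a`/`len_b` parameters above exist so that a `Join` implementation can apply its own splitting policy. No such implementation ships in this commit; the following is only a sketch of what those parameters allow, with a hypothetical `ThresholdRayonJoin` type, an arbitrary 128 KiB cutoff, and a direct `rayon` dependency assumed in the caller's crate:

use blake3::join::Join;

/// Hypothetical example: only hand work to Rayon when either subtree is large.
pub enum ThresholdRayonJoin {}

impl Join for ThresholdRayonJoin {
    #[inline]
    fn join<A, B, RA, RB>(oper_a: A, oper_b: B, len_a: usize, len_b: usize) -> (RA, RB)
    where
        A: FnOnce() -> RA + Send,
        B: FnOnce() -> RB + Send,
        RA: Send,
        RB: Send,
    {
        // Arbitrary example threshold; a real value would need benchmarking.
        const THRESHOLD: usize = 128 * 1024;
        if len_a < THRESHOLD && len_b < THRESHOLD {
            // Small subtrees: stay on the calling thread, like SerialJoin.
            (oper_a(), oper_b())
        } else {
            rayon::join(oper_a, oper_b)
        }
    }
}

As the module docs above note, a coarse input-length check in the caller (choosing `RayonJoin` or `SerialJoin` up front) is usually more effective than this kind of fine-grained threshold, so this sketch is illustrative only.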
