From 31f7b60bdffeb3b2181be0bc2b8879093e79088d Mon Sep 17 00:00:00 2001
From: Trevor Gross <tmgross@umich.edu>
Date: Thu, 13 Feb 2025 03:27:00 +0000
Subject: [PATCH] Make a subset of `libm` symbols weakly available on all
 platforms

018616e78b ("Always have math functions but with `weak` linking
attribute if we can") made all math symbols available on platforms that
support weak linkage. This caused some unexpected regressions, however,
because our less accurate and sometimes slow routines were being
selected over the system `libm`, which also tends to be weak [1]. Thus,
0fab77e8d7 ("Don't include `math` for `unix` and `wasi` targets") was
applied to undo these changes on many platforms.

Now that some improvements have been made to `libm`, add back a subset
of these functions:

* cbrt
* ceil
* copysign
* fabs
* fdim
* floor
* fma
* fmax
* fmaximum
* fmin
* fminimum
* fmod
* rint
* round
* roundeven
* sqrt
* trunc

This list includes only functions that produce exact results (verified
with exhaustive / extensive tests, and also required by IEEE in most
cases), and for which benchmarks indicate performance similar to or
better than Musl's soft float math routines [^1]. All except `cbrt` also
have `f16` and `f128` implementations. Once more routines meet these
criteria, we can move them from platform-specific availability to always
available.

Once this change makes it to rust-lang/rust, we will also be able to
move the relevant functions from `std` to `core`.

[^1]: We still rely on the backend to provide optimized assembly
      routines when available.

[1]: https://github.com/rust-lang/rust/issues/128386
---
 src/lib.rs  |  29 +-----
 src/math.rs | 259 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 173 insertions(+), 115 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 533878137..6f5bd8598 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -41,40 +41,13 @@ mod macros;
 
 pub mod float;
 pub mod int;
-
-// Disable for any of the following:
-// - x86 without sse2 due to ABI issues
-//   - <https://github.com/rust-lang/rust/issues/114479>
-//   - but exclude UEFI since it is a soft-float target
-//     - <https://github.com/rust-lang/rust/issues/128533>
-// - All unix targets (linux, macos, freebsd, android, etc)
-// - wasm with known target_os
-#[cfg(not(any(
-    all(
-        target_arch = "x86",
-        not(target_feature = "sse2"),
-        not(target_os = "uefi"),
-    ),
-    unix,
-    all(target_family = "wasm", not(target_os = "unknown"))
-)))]
 pub mod math;
+pub mod mem;
 
 // `libm` expects its `support` module to be available in the crate root. This config can be
 // cleaned up once `libm` is made always available.
-#[cfg(not(any(
-    all(
-        target_arch = "x86",
-        not(target_feature = "sse2"),
-        not(target_os = "uefi"),
-    ),
-    unix,
-    all(target_family = "wasm", not(target_os = "unknown"))
-)))]
 use math::libm::support;
 
-pub mod mem;
-
 #[cfg(target_arch = "arm")]
 pub mod arm;
 
diff --git a/src/math.rs b/src/math.rs
index fef5358e3..ccd9c5421 100644
--- a/src/math.rs
+++ b/src/math.rs
@@ -5,110 +5,195 @@
 #[path = "../libm/src/math/mod.rs"]
 pub(crate) mod libm;
 
-#[allow(unused_macros)]
-macro_rules! no_mangle {
+macro_rules! libm_intrinsics {
     ($(fn $fun:ident($($iid:ident : $ity:ty),+) -> $oty:ty;)+) => {
         intrinsics! {
             $(
                 pub extern "C" fn $fun($($iid: $ity),+) -> $oty {
-                    self::libm::$fun($($iid),+)
+                    $crate::math::libm::$fun($($iid),+)
                 }
             )+
         }
     }
 }
 
-#[cfg(not(windows))]
-no_mangle! {
-    fn acos(x: f64) -> f64;
-    fn asin(x: f64) -> f64;
-    fn cbrt(x: f64) -> f64;
-    fn expm1(x: f64) -> f64;
-    fn hypot(x: f64, y: f64) -> f64;
-    fn tan(x: f64) -> f64;
-    fn cos(x: f64) -> f64;
-    fn expf(x: f32) -> f32;
-    fn log2(x: f64) -> f64;
-    fn log2f(x: f32) -> f32;
-    fn log10(x: f64) -> f64;
-    fn log10f(x: f32) -> f32;
-    fn log(x: f64) -> f64;
-    fn logf(x: f32) -> f32;
-    fn round(x: f64) -> f64;
-    fn roundf(x: f32) -> f32;
-    fn rint(x: f64) -> f64;
-    fn rintf(x: f32) -> f32;
-    fn sin(x: f64) -> f64;
-    fn pow(x: f64, y: f64) -> f64;
-    fn powf(x: f32, y: f32) -> f32;
-    fn acosf(n: f32) -> f32;
-    fn atan2f(a: f32, b: f32) -> f32;
-    fn atanf(n: f32) -> f32;
-    fn coshf(n: f32) -> f32;
-    fn expm1f(n: f32) -> f32;
-    fn fdim(a: f64, b: f64) -> f64;
-    fn fdimf(a: f32, b: f32) -> f32;
-    fn log1pf(n: f32) -> f32;
-    fn sinhf(n: f32) -> f32;
-    fn tanhf(n: f32) -> f32;
-    fn ldexp(f: f64, n: i32) -> f64;
-    fn ldexpf(f: f32, n: i32) -> f32;
-    fn tgamma(x: f64) -> f64;
-    fn tgammaf(x: f32) -> f32;
-    fn atan(x: f64) -> f64;
-    fn atan2(x: f64, y: f64) -> f64;
-    fn cosh(x: f64) -> f64;
-    fn log1p(x: f64) -> f64;
-    fn sinh(x: f64) -> f64;
-    fn tanh(x: f64) -> f64;
-    fn cosf(x: f32) -> f32;
-    fn exp(x: f64) -> f64;
-    fn sinf(x: f32) -> f32;
-    fn exp2(x: f64) -> f64;
-    fn exp2f(x: f32) -> f32;
-    fn fma(x: f64, y: f64, z: f64) -> f64;
-    fn fmaf(x: f32, y: f32, z: f32) -> f32;
-    fn asinf(n: f32) -> f32;
-    fn cbrtf(n: f32) -> f32;
-    fn hypotf(x: f32, y: f32) -> f32;
-    fn tanf(n: f32) -> f32;
+/// This set of functions is well tested in `libm` and known to provide similar performance to
+/// system `libm`, as well as the same or better accuracy.
+pub mod full_availability {
+    #[cfg(f16_enabled)]
+    libm_intrinsics! {
+        fn ceilf16(x: f16) -> f16;
+        fn copysignf16(x: f16, y: f16) -> f16;
+        fn fabsf16(x: f16) -> f16;
+        fn fdimf16(x: f16, y: f16) -> f16;
+        fn floorf16(x: f16) -> f16;
+        fn fmaxf16(x: f16, y: f16) -> f16;
+        fn fmaximumf16(x: f16, y: f16) -> f16;
+        fn fminf16(x: f16, y: f16) -> f16;
+        fn fminimumf16(x: f16, y: f16) -> f16;
+        fn fmodf16(x: f16, y: f16) -> f16;
+        fn rintf16(x: f16) -> f16;
+        fn roundevenf16(x: f16) -> f16;
+        fn roundf16(x: f16) -> f16;
+        fn sqrtf16(x: f16) -> f16;
+        fn truncf16(x: f16) -> f16;
+    }
+
+    /* Weak linkage is unreliable on Windows and Apple, so we don't expose symbols that we know
+     * the system libc provides in order to avoid conflicts. */
 
-    fn sqrtf(x: f32) -> f32;
-    fn sqrt(x: f64) -> f64;
+    #[cfg(all(not(windows), not(target_vendor = "apple")))]
+    libm_intrinsics! {
+        /* f32 */
+        fn cbrtf(n: f32) -> f32;
+        fn ceilf(x: f32) -> f32;
+        fn copysignf(x: f32, y: f32) -> f32;
+        fn fabsf(x: f32) -> f32;
+        fn fdimf(a: f32, b: f32) -> f32;
+        fn floorf(x: f32) -> f32;
+        fn fmaf(x: f32, y: f32, z: f32) -> f32;
+        fn fmaxf(x: f32, y: f32) -> f32;
+        fn fminf(x: f32, y: f32) -> f32;
+        fn fmodf(x: f32, y: f32) -> f32;
+        fn rintf(x: f32) -> f32;
+        fn roundf(x: f32) -> f32;
+        fn sqrtf(x: f32) -> f32;
+        fn truncf(x: f32) -> f32;
 
-    fn ceil(x: f64) -> f64;
-    fn ceilf(x: f32) -> f32;
-    fn floor(x: f64) -> f64;
-    fn floorf(x: f32) -> f32;
-    fn trunc(x: f64) -> f64;
-    fn truncf(x: f32) -> f32;
+        /* f64 */
+        fn cbrt(x: f64) -> f64;
+        fn ceil(x: f64) -> f64;
+        fn copysign(x: f64, y: f64) -> f64;
+        fn fabs(x: f64) -> f64;
+        fn fdim(a: f64, b: f64) -> f64;
+        fn floor(x: f64) -> f64;
+        fn fma(x: f64, y: f64, z: f64) -> f64;
+        fn fmax(x: f64, y: f64) -> f64;
+        fn fmin(x: f64, y: f64) -> f64;
+        fn fmod(x: f64, y: f64) -> f64;
+        fn rint(x: f64) -> f64;
+        fn round(x: f64) -> f64;
+        fn sqrt(x: f64) -> f64;
+        fn trunc(x: f64) -> f64;
+    }
 
-    fn fmin(x: f64, y: f64) -> f64;
-    fn fminf(x: f32, y: f32) -> f32;
-    fn fmax(x: f64, y: f64) -> f64;
-    fn fmaxf(x: f32, y: f32) -> f32;
-    // `f64 % f64`
-    fn fmod(x: f64, y: f64) -> f64;
-    // `f32 % f32`
-    fn fmodf(x: f32, y: f32) -> f32;
+    // Windows and macOS do not yet expose `roundeven` or the IEEE 754-2019 `maximum` / `minimum`
+    // operations, so we still provide a fallback for them.
+    libm_intrinsics! {
+        fn fmaximum(x: f64, y: f64) -> f64;
+        fn fmaximumf(x: f32, y: f32) -> f32;
+        fn fminimum(x: f64, y: f64) -> f64;
+        fn fminimumf(x: f32, y: f32) -> f32;
+        fn roundeven(x: f64) -> f64;
+        fn roundevenf(x: f32) -> f32;
+    }
 
-    fn erf(x: f64) -> f64;
-    fn erff(x: f32) -> f32;
-    fn erfc(x: f64) -> f64;
-    fn erfcf(x: f32) -> f32;
+    #[cfg(f128_enabled)]
+    libm_intrinsics! {
+        fn ceilf128(x: f128) -> f128;
+        fn copysignf128(x: f128, y: f128) -> f128;
+        fn fabsf128(x: f128) -> f128;
+        fn fdimf128(x: f128, y: f128) -> f128;
+        fn floorf128(x: f128) -> f128;
+        fn fmaf128(x: f128, y: f128, z: f128) -> f128;
+        fn fmaxf128(x: f128, y: f128) -> f128;
+        fn fmaximumf128(x: f128, y: f128) -> f128;
+        fn fminf128(x: f128, y: f128) -> f128;
+        fn fminimumf128(x: f128, y: f128) -> f128;
+        fn fmodf128(x: f128, y: f128) -> f128;
+        fn rintf128(x: f128) -> f128;
+        fn roundevenf128(x: f128) -> f128;
+        fn roundf128(x: f128) -> f128;
+        fn sqrtf128(x: f128) -> f128;
+        fn truncf128(x: f128) -> f128;
+    }
 }
 
-// allow for windows (and other targets)
-intrinsics! {
-    pub extern "C" fn lgamma_r(x: f64, s: &mut i32) -> f64 {
-        let r = self::libm::lgamma_r(x);
-        *s = r.1;
-        r.0
+/// This group of functions has more performance or precision issues than system versions, or
+/// are otherwise less well tested. Provide them only on platforms that have problems with the
+/// system `libm`.
+///
+/// As `libm` improves, more functions will be moved from this group to the first group.
+///
+/// Do not supply for any of the following:
+/// - x86 without sse2 due to ABI issues
+///   - <https://github.com/rust-lang/rust/issues/114479>
+///   - but exclude UEFI since it is a soft-float target
+///     - <https://github.com/rust-lang/rust/issues/128533>
+/// - All unix targets (linux, macos, freebsd, android, etc)
+/// - wasm with known target_os
+#[cfg(not(any(
+    all(
+        target_arch = "x86",
+        not(target_feature = "sse2"),
+        not(target_os = "uefi"),
+    ),
+    unix,
+    all(target_family = "wasm", not(target_os = "unknown"))
+)))]
+pub mod partial_availability {
+    #[cfg(not(windows))]
+    libm_intrinsics! {
+        fn acos(x: f64) -> f64;
+        fn acosf(n: f32) -> f32;
+        fn asin(x: f64) -> f64;
+        fn asinf(n: f32) -> f32;
+        fn atan(x: f64) -> f64;
+        fn atan2(x: f64, y: f64) -> f64;
+        fn atan2f(a: f32, b: f32) -> f32;
+        fn atanf(n: f32) -> f32;
+        fn cos(x: f64) -> f64;
+        fn cosf(x: f32) -> f32;
+        fn cosh(x: f64) -> f64;
+        fn coshf(n: f32) -> f32;
+        fn erf(x: f64) -> f64;
+        fn erfc(x: f64) -> f64;
+        fn erfcf(x: f32) -> f32;
+        fn erff(x: f32) -> f32;
+        fn exp(x: f64) -> f64;
+        fn exp2(x: f64) -> f64;
+        fn exp2f(x: f32) -> f32;
+        fn expf(x: f32) -> f32;
+        fn expm1(x: f64) -> f64;
+        fn expm1f(n: f32) -> f32;
+        fn hypot(x: f64, y: f64) -> f64;
+        fn hypotf(x: f32, y: f32) -> f32;
+        fn ldexp(f: f64, n: i32) -> f64;
+        fn ldexpf(f: f32, n: i32) -> f32;
+        fn log(x: f64) -> f64;
+        fn log10(x: f64) -> f64;
+        fn log10f(x: f32) -> f32;
+        fn log1p(x: f64) -> f64;
+        fn log1pf(n: f32) -> f32;
+        fn log2(x: f64) -> f64;
+        fn log2f(x: f32) -> f32;
+        fn logf(x: f32) -> f32;
+        fn pow(x: f64, y: f64) -> f64;
+        fn powf(x: f32, y: f32) -> f32;
+        fn sin(x: f64) -> f64;
+        fn sinf(x: f32) -> f32;
+        fn sinh(x: f64) -> f64;
+        fn sinhf(n: f32) -> f32;
+        fn tan(x: f64) -> f64;
+        fn tanf(n: f32) -> f32;
+        fn tanh(x: f64) -> f64;
+        fn tanhf(n: f32) -> f32;
+        fn tgamma(x: f64) -> f64;
+        fn tgammaf(x: f32) -> f32;
     }
 
-    pub extern "C" fn lgammaf_r(x: f32, s: &mut i32) -> f32 {
-        let r = self::libm::lgammaf_r(x);
-        *s = r.1;
-        r.0
+    // allow for windows (and other targets)
+    intrinsics! {
+        pub extern "C" fn lgamma_r(x: f64, s: &mut i32) -> f64 {
+            let r = super::libm::lgamma_r(x);
+            *s = r.1;
+            r.0
+        }
+
+        pub extern "C" fn lgammaf_r(x: f32, s: &mut i32) -> f32 {
+            let r = super::libm::lgammaf_r(x);
+            *s = r.1;
+            r.0
+        }
     }
 }