From bbe3aefd8ca32b071953dcfd56a2560bccf45a7f Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Thu, 11 Apr 2024 05:38:46 -0400 Subject: [PATCH] Work on bigint Try splitting part of 'Int' into 'MinInt' so we don't need to implement everything on u256/i256 Add addsub test Add mul/div/rem tests Add cmp test Remove 32-bit div implementation formatting updates disable div tests for now Bigint updates Big update Fix widen mul wrapping add disable duplicate symbols in builtins Apply temporary unord fix from @beetrees #593 tests add lowerhex display errors by ref tests fix-test Update big tests Fix core calls Disable widen_mul for signed Test adding symbols in build.rs Add a feature to compile intrinsics that are missing on the system for testing update Disable f128 tests on platforms without system support add missing build.rs file pull cas file from master testgs print more div values Add a benchmark Work on fixing bit widths Update benchmark --- build.rs | 12 -- src/float/add.rs | 22 +-- src/float/cmp.rs | 2 +- src/float/div.rs | 95 ++++++---- src/float/extend.rs | 2 +- src/float/mod.rs | 2 +- src/float/mul.rs | 2 +- src/float/trunc.rs | 2 +- src/int/addsub.rs | 10 +- src/int/big.rs | 364 +++++++++++++++++++++++++++++++++++++ src/int/mod.rs | 233 +++++++++++++----------- src/int/mul.rs | 4 +- src/int/shift.rs | 2 +- src/lib.rs | 57 ++++++ testcrate/Cargo.toml | 2 + testcrate/benches/float.rs | 90 +++++++++ testcrate/build.rs | 15 ++ testcrate/src/lib.rs | 18 +- testcrate/tests/addsub.rs | 16 +- testcrate/tests/big.rs | 104 +++++++++++ testcrate/tests/cmp.rs | 27 ++- testcrate/tests/div_rem.rs | 29 ++- testcrate/tests/mul.rs | 14 +- 23 files changed, 933 insertions(+), 191 deletions(-) create mode 100644 src/int/big.rs create mode 100644 testcrate/benches/float.rs create mode 100644 testcrate/build.rs create mode 100644 testcrate/tests/big.rs diff --git a/build.rs b/build.rs index bafbf75d..1229fb2a 100644 --- a/build.rs +++ b/build.rs @@ -479,10 +479,6 @@ mod c { ("__floatsitf", "floatsitf.c"), ("__floatunditf", "floatunditf.c"), ("__floatunsitf", "floatunsitf.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), - ("__divtf3", "divtf3.c"), ("__powitf2", "powitf2.c"), ("__fe_getround", "fp_mode.c"), ("__fe_raise_inexact", "fp_mode.c"), @@ -500,30 +496,22 @@ mod c { if target_arch == "mips64" { sources.extend(&[ ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), ]); } if target_arch == "loongarch64" { sources.extend(&[ ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), ]); } diff --git a/src/float/add.rs b/src/float/add.rs index e2fb8407..8fa9dac5 100644 --- a/src/float/add.rs +++ b/src/float/add.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; /// Returns `a + b` fn add(a: F, b: F) -> F @@ -57,9 +57,9 @@ where } // zero + anything = anything - if a_abs == Int::ZERO { + if a_abs == MinInt::ZERO { // but we need to get the sign right for zero + zero - if b_abs == Int::ZERO { + if b_abs == 
MinInt::ZERO { return F::from_repr(a.repr() & b.repr()); } else { return b; @@ -67,7 +67,7 @@ where } // anything + zero = anything - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return a; } } @@ -113,10 +113,10 @@ where // Shift the significand of b by the difference in exponents, with a sticky // bottom bit to get rounding correct. let align = a_exponent.wrapping_sub(b_exponent).cast(); - if align != Int::ZERO { + if align != MinInt::ZERO { if align < bits { let sticky = - F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != Int::ZERO); + F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO); b_significand = (b_significand >> align.cast()) | sticky; } else { b_significand = one; // sticky; b is known to be non-zero. @@ -125,8 +125,8 @@ where if subtraction { a_significand = a_significand.wrapping_sub(b_significand); // If a == -b, return +zero. - if a_significand == Int::ZERO { - return F::from_repr(Int::ZERO); + if a_significand == MinInt::ZERO { + return F::from_repr(MinInt::ZERO); } // If partial cancellation occured, we need to left-shift the result @@ -143,8 +143,8 @@ where // If the addition carried up, we need to right-shift the result and // adjust the exponent: - if a_significand & implicit_bit << 4 != Int::ZERO { - let sticky = F::Int::from_bool(a_significand & one != Int::ZERO); + if a_significand & implicit_bit << 4 != MinInt::ZERO { + let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO); a_significand = a_significand >> 1 | sticky; a_exponent += 1; } @@ -160,7 +160,7 @@ where // need to shift the significand. let shift = (1 - a_exponent).cast(); let sticky = - F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != Int::ZERO); + F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO); a_significand = a_significand >> shift.cast() | sticky; a_exponent = 0; } diff --git a/src/float/cmp.rs b/src/float/cmp.rs index 46e903dc..ae05a3a6 100644 --- a/src/float/cmp.rs +++ b/src/float/cmp.rs @@ -1,7 +1,7 @@ #![allow(unreachable_code)] use crate::float::Float; -use crate::int::Int; +use crate::int::MinInt; #[derive(Clone, Copy)] enum Result { diff --git a/src/float/div.rs b/src/float/div.rs index 9038f6b9..6f64dfae 100644 --- a/src/float/div.rs +++ b/src/float/div.rs @@ -3,7 +3,9 @@ #![allow(clippy::needless_return)] use crate::float::Float; -use crate::int::{CastInto, DInt, HInt, Int}; +use crate::int::{CastInto, DInt, HInt, Int, MinInt}; + +use super::HalfRep; fn div32(a: F, b: F) -> F where @@ -37,6 +39,11 @@ where let quiet_bit = implicit_bit >> 1; let qnan_rep = exponent_mask | quiet_bit; + // #[inline(always)] + // fn negate(a: T) -> T { + // T::wrapping_neg(a.signe) + // } + #[inline(always)] fn negate_u32(a: u32) -> u32 { (::wrapping_neg(a as i32)) as u32 @@ -459,10 +466,14 @@ where i32: CastInto, F::Int: CastInto, u64: CastInto, + u64: CastInto>, + F::Int: CastInto>, + F::Int: From>, + F::Int: From, F::Int: CastInto, i64: CastInto, F::Int: CastInto, - F::Int: HInt, + F::Int: HInt + DInt, { const NUMBER_OF_HALF_ITERATIONS: usize = 3; const NUMBER_OF_FULL_ITERATIONS: usize = 1; @@ -471,7 +482,7 @@ where let one = F::Int::ONE; let zero = F::Int::ZERO; let hw = F::BITS / 2; - let lo_mask = u64::MAX >> hw; + let lo_mask = F::Int::MAX >> hw; let significand_bits = F::SIGNIFICAND_BITS; let max_exponent = F::EXPONENT_MAX; @@ -616,8 +627,9 @@ where let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 { // Starting with (n-1) half-width iterations - let b_uq1_hw: u32 = - 
(CastInto::::cast(b_significand) >> (significand_bits + 1 - hw)) as u32; + let b_uq1_hw: HalfRep = CastInto::>::cast( + CastInto::::cast(b_significand) >> (significand_bits + 1 - hw), + ); // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW // with W0 being either 16 or 32 and W0 <= HW. @@ -625,12 +637,13 @@ where // b/2 is subtracted to obtain x0) wrapped to [0, 1) range. // HW is at least 32. Shifting into the highest bits if needed. - let c_hw = (0x7504F333_u64 as u32).wrapping_shl(hw.wrapping_sub(32)); + let c_hw = (CastInto::>::cast(0x7504F333_u64)).wrapping_shl(hw.wrapping_sub(32)); // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572, // so x0 fits to UQ0.HW without wrapping. - let x_uq0_hw: u32 = { - let mut x_uq0_hw: u32 = c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); + let x_uq0_hw: HalfRep = { + let mut x_uq0_hw: HalfRep = + c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); // dbg!(x_uq0_hw); // An e_0 error is comprised of errors due to // * x0 being an inherently imprecise first approximation of 1/b_hw @@ -661,8 +674,9 @@ where // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is // expected to be strictly positive because b_UQ1_hw has its highest bit set // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1). - let corr_uq1_hw: u32 = - 0.wrapping_sub(((x_uq0_hw as u64).wrapping_mul(b_uq1_hw as u64)) >> hw) as u32; + let corr_uq1_hw: HalfRep = CastInto::>::cast(zero.wrapping_sub( + ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(b_uq1_hw))) >> hw, + )); // dbg!(corr_uq1_hw); // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally @@ -677,7 +691,9 @@ where // The fact corr_UQ1_hw was virtually round up (due to result of // multiplication being **first** truncated, then negated - to improve // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw. - x_uq0_hw = ((x_uq0_hw as u64).wrapping_mul(corr_uq1_hw as u64) >> (hw - 1)) as u32; + x_uq0_hw = ((F::Int::from(x_uq0_hw)).wrapping_mul(F::Int::from(corr_uq1_hw)) + >> (hw - 1)) + .cast(); // dbg!(x_uq0_hw); // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after @@ -707,7 +723,7 @@ where // be not below that value (see g(x) above), so it is safe to decrement just // once after the final iteration. On the other hand, an effective value of // divisor changes after this point (from b_hw to b), so adjust here. - x_uq0_hw.wrapping_sub(1_u32) + x_uq0_hw.wrapping_sub(HalfRep::::ONE) }; // Error estimations for full-precision iterations are calculated just @@ -717,7 +733,7 @@ where // Simulating operations on a twice_rep_t to perform a single final full-width // iteration. Using ad-hoc multiplication implementations to take advantage // of particular structure of operands. - let blo: u64 = (CastInto::::cast(b_uq1)) & lo_mask; + let blo: F::Int = b_uq1 & lo_mask; // x_UQ0 = x_UQ0_hw * 2^HW - 1 // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1 // @@ -726,19 +742,20 @@ where // + [ x_UQ0_hw * blo ] // - [ b_UQ1 ] // = [ result ][.... discarded ...] 
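+        // In other words: `x_UQ0_hw * b_UQ1_hw` lands entirely in the retained
+        // high half, while only the upper half of `x_UQ0_hw * blo` reaches it
+        // (hence the `>> hw` in the expression below), and everything to the
+        // right of the binary point of the result is discarded.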
- let corr_uq1 = negate_u64( - (x_uq0_hw as u64) * (b_uq1_hw as u64) + (((x_uq0_hw as u64) * (blo)) >> hw) - 1, - ); // account for *possible* carry - let lo_corr = corr_uq1 & lo_mask; - let hi_corr = corr_uq1 >> hw; + let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw) + + ((F::Int::from(x_uq0_hw) * blo) >> hw)) + .wrapping_sub(one) + .wrapping_neg(); // account for *possible* carry + let lo_corr: F::Int = corr_uq1 & lo_mask; + let hi_corr: F::Int = corr_uq1 >> hw; // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1 - let mut x_uq0: ::Int = ((((x_uq0_hw as u64) * hi_corr) << 1) - .wrapping_add(((x_uq0_hw as u64) * lo_corr) >> (hw - 1)) - .wrapping_sub(2)) - .cast(); // 1 to account for the highest bit of corr_UQ1 can be 1 - // 1 to account for possible carry - // Just like the case of half-width iterations but with possibility - // of overflowing by one extra Ulp of x_UQ0. + let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1) + .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1)) + .wrapping_sub(F::Int::from(2u8)); + // 1 to account for the highest bit of corr_UQ1 can be 1 + // 1 to account for possible carry + // Just like the case of half-width iterations but with possibility + // of overflowing by one extra Ulp of x_UQ0. x_uq0 -= one; // ... and then traditional fixup by 2 should work @@ -755,8 +772,8 @@ where x_uq0 } else { // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n - let c: ::Int = (0x7504F333 << (F::BITS - 32)).cast(); - let x_uq0: ::Int = c.wrapping_sub(b_uq1); + let c: F::Int = (0x7504F333 << (F::BITS - 32)).cast(); + let x_uq0: F::Int = c.wrapping_sub(b_uq1); // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64 x_uq0 }; @@ -799,14 +816,27 @@ where // Add 2 to U_N due to final decrement. - let reciprocal_precision: ::Int = 220.cast(); + let reciprocal_precision: F::Int = if F::BITS == 32 + && NUMBER_OF_HALF_ITERATIONS == 2 + && NUMBER_OF_FULL_ITERATIONS == 1 + { + 74.cast() + } else if F::BITS == 32 && NUMBER_OF_HALF_ITERATIONS == 0 && NUMBER_OF_FULL_ITERATIONS == 3 { + 10.cast() + } else if F::BITS == 64 && NUMBER_OF_HALF_ITERATIONS == 3 && NUMBER_OF_FULL_ITERATIONS == 1 { + 220.cast() + } else if F::BITS == 128 && NUMBER_OF_HALF_ITERATIONS == 4 && NUMBER_OF_FULL_ITERATIONS == 1 { + 13922.cast() + } else { + panic!("invalid iterations for the specified bits"); + }; // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W let x_uq0 = x_uq0 - reciprocal_precision; // Now 1/b - (2*P) * 2^-W < x < 1/b // FIXME Is x_UQ0 still >= 0.5? - let mut quotient: ::Int = x_uq0.widen_mul(a_significand << 1).hi(); + let mut quotient: F::Int = x_uq0.widen_mul(a_significand << 1).hi(); // Now, a/b - 4*P * 2^-W < q < a/b for q= in UQ1.(SB+1+W). // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1), @@ -914,13 +944,8 @@ intrinsics! { div64(a, b) } - // TODO: how should `HInt` be handled? 
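+    // `u128` implements `HInt` (widening into the new `u256` in `int/big.rs`),
+    // so the generic `div64` path can serve `f128` directly.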
pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 { - if cfg!(target_pointer_width = "64") { - div32(a, b) - } else { - div64(a, b) - } + div64(a, b) } #[cfg(target_arch = "arm")] diff --git a/src/float/extend.rs b/src/float/extend.rs index 7c244660..5b0c0d97 100644 --- a/src/float/extend.rs +++ b/src/float/extend.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; /// Generic conversion from a narrower to a wider IEEE-754 floating-point type fn extend(a: F) -> R diff --git a/src/float/mod.rs b/src/float/mod.rs index 02d291ed..a82dd7d2 100644 --- a/src/float/mod.rs +++ b/src/float/mod.rs @@ -59,7 +59,7 @@ pub(crate) trait Float: /// A mask for the significand const SIGNIFICAND_MASK: Self::Int; - /// The implicit bit of the float format + // The implicit bit of the float format const IMPLICIT_BIT: Self::Int; /// A mask for the exponent diff --git a/src/float/mul.rs b/src/float/mul.rs index eed29527..e3e5708e 100644 --- a/src/float/mul.rs +++ b/src/float/mul.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, DInt, HInt, Int}; +use crate::int::{CastInto, DInt, HInt, Int, MinInt}; fn mul(a: F, b: F) -> F where diff --git a/src/float/trunc.rs b/src/float/trunc.rs index 6de446c1..b607a654 100644 --- a/src/float/trunc.rs +++ b/src/float/trunc.rs @@ -1,5 +1,5 @@ use crate::float::Float; -use crate::int::{CastInto, Int}; +use crate::int::{CastInto, Int, MinInt}; fn trunc(a: F) -> R where diff --git a/src/int/addsub.rs b/src/int/addsub.rs index f31eff4b..e95590d8 100644 --- a/src/int/addsub.rs +++ b/src/int/addsub.rs @@ -1,6 +1,6 @@ -use crate::int::{DInt, Int}; +use crate::int::{DInt, Int, MinInt}; -trait UAddSub: DInt { +trait UAddSub: DInt + Int { fn uadd(self, other: Self) -> Self { let (lo, carry) = self.lo().overflowing_add(other.lo()); let hi = self.hi().wrapping_add(other.hi()); @@ -22,7 +22,7 @@ impl UAddSub for u128 {} trait AddSub: Int where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn add(self, other: Self) -> Self { Self::from_unsigned(self.unsigned().uadd(other.unsigned())) @@ -37,7 +37,7 @@ impl AddSub for i128 {} trait Addo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn addo(self, other: Self) -> (Self, bool) { let sum = AddSub::add(self, other); @@ -50,7 +50,7 @@ impl Addo for u128 {} trait Subo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn subo(self, other: Self) -> (Self, bool) { let sum = AddSub::sub(self, other); diff --git a/src/int/big.rs b/src/int/big.rs new file mode 100644 index 00000000..a54d6259 --- /dev/null +++ b/src/int/big.rs @@ -0,0 +1,364 @@ +//! Integers used for wide operations, larger than `u128`. + +#![allow(unused)] + +use crate::int::{DInt, HInt, Int, MinInt}; +use core::{fmt, ops}; + +const WORD_LO_MASK: u64 = 0x00000000ffffffff; +const WORD_HI_MASK: u64 = 0xffffffff00000000; +const WORD_FULL_MASK: u64 = 0xffffffffffffffff; +const U128_LO_MASK: u128 = u64::MAX as u128; +const U128_HI_MASK: u128 = (u64::MAX as u128) << 64; + +/// A 256-bit unsigned integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. 
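+///
+/// For illustration: `u256::ONE` is `u256([1, 0, 0, 0])` and the value 2^64 is
+/// `u256([0, 1, 0, 0])`; index 0 always holds the least significant limb.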
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct u256(pub [u64; 4]);
+
+impl u256 {
+    pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]);
+
+    /// Reinterpret as a signed integer
+    pub fn signed(self) -> i256 {
+        i256(self.0)
+    }
+}
+
+/// A 256-bit signed integer represented as 4 64-bit limbs.
+///
+/// Each limb is a native-endian number, but the array is little-limb-endian.
+#[allow(non_camel_case_types)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+pub struct i256(pub [u64; 4]);
+
+impl i256 {
+    /// Reinterpret as an unsigned integer
+    pub fn unsigned(self) -> u256 {
+        u256(self.0)
+    }
+}
+
+impl MinInt for u256 {
+    type OtherSign = i256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = false;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0u64; 4]);
+    const MAX: Self = Self([u64::MAX; 4]);
+}
+
+impl MinInt for i256 {
+    type OtherSign = u256;
+
+    type UnsignedInt = u256;
+
+    const SIGNED: bool = true;
+    const BITS: u32 = 256;
+    const ZERO: Self = Self([0u64; 4]);
+    const ONE: Self = Self([1, 0, 0, 0]);
+    const MIN: Self = Self([0, 0, 0, 1 << 63]);
+    const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]);
+}
+
+// impl Int for i256 {
+//     fn is_zero(self) -> bool {
+//         self == Self::ZERO
+//     }
+
+//     fn wrapping_neg(self) -> Self {
+//         Self::ZERO.wrapping_sub(self)
+//     }
+
+//     fn wrapping_add(self, other: Self) -> Self {
+//         self.overflowing_add(other).0
+//     }
+//
+//     fn overflowing_add(self, other: Self) -> (Self, bool) {
+//         let x0 = (u128::from(self.0[0])).wrapping_add(u128::from(other.0[0]));
+//         let v0 = x0 as u64;
+//         let c0 = x0 >> 64;
+
+//         let x1 = (u128::from(self.0[1]))
+//             .wrapping_add(u128::from(other.0[1]))
+//             .wrapping_add(c0);
+//         let v1 = x1 as u64;
+//         let c1 = x1 >> 64;
+
+//         let x2 = (u128::from(self.0[2]))
+//             .wrapping_add(u128::from(other.0[2]))
+//             .wrapping_add(c1);
+//         let v2 = x2 as u64;
+//         let c2 = x2 >> 64;
+
+//         let x3 = (u128::from(self.0[3]))
+//             .wrapping_add(u128::from(other.0[3]))
+//             .wrapping_add(c2);
+//         let v3 = x3 as u64;
+//         let c3 = x3 >> 64;
+
+//         (Self([v0, v1, v2, v3]), c3 > 0)
+//     }
+// }
+
+macro_rules!
impl_common { + ($ty:ty) => { + // impl ops::Add for $ty { + // type Output = Self; + + // fn add(self, rhs: Self) -> Self::Output { + // let (val, wrapped) = self.overflowing_add(rhs); + // debug_assert!(!wrapped, "attempted to add with overflow"); + // val + // } + // } + + // impl ops::AddAssign for $ty { + // fn add_assign(&mut self, rhs: Self) { + // *self = *self + rhs + // } + // } + + // impl ops::BitAnd for $ty { + // type Output = Self; + + // fn bitand(self, rhs: Self) -> Self::Output { + // Self([ + // self.0[0] & rhs.0[0], + // self.0[1] & rhs.0[1], + // self.0[2] & rhs.0[2], + // self.0[3] & rhs.0[3], + // ]) + // } + // } + + // impl ops::BitAndAssign for $ty { + // fn bitand_assign(&mut self, rhs: Self) { + // *self = *self & rhs + // } + // } + + impl ops::BitOr for $ty { + type Output = Self; + + fn bitor(mut self, rhs: Self) -> Self::Output { + self.0[0] |= rhs.0[0]; + self.0[1] |= rhs.0[1]; + self.0[2] |= rhs.0[2]; + self.0[3] |= rhs.0[3]; + self + } + } + + // impl ops::BitOrAssign for $ty { + // fn bitor_assign(&mut self, rhs: Self) { + // *self = *self | rhs + // } + // } + + // impl ops::BitXor for $ty { + // type Output = Self; + + // fn bitxor(self, rhs: Self) -> Self::Output { + // Self([ + // self.0[0] ^ rhs.0[0], + // self.0[1] ^ rhs.0[1], + // self.0[2] ^ rhs.0[2], + // self.0[3] ^ rhs.0[3], + // ]) + // } + // } + + // impl ops::BitXorAssign for $ty { + // fn bitxor_assign(&mut self, rhs: Self) { + // *self = *self ^ rhs + // } + // } + + impl ops::Not for $ty { + type Output = Self; + + fn not(self) -> Self::Output { + Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]]) + } + } + + impl ops::Shl for $ty { + type Output = Self; + + fn shl(self, rhs: u32) -> Self::Output { + todo!() + } + } + }; +} + +impl_common!(i256); +impl_common!(u256); + +macro_rules! 
word { + (1, $val:expr) => { + (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64 + }; + (2, $val:expr) => { + (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64 + }; + (3, $val:expr) => { + (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64 + }; + (4, $val:expr) => { + (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64 + }; +} + +impl HInt for u128 { + type D = u256; + + fn widen(self) -> Self::D { + let w0 = self & u128::from(u64::MAX); + let w1 = (self >> u64::BITS) & u128::from(u64::MAX); + u256([w0 as u64, w1 as u64, 0, 0]) + } + + fn zero_widen(self) -> Self::D { + self.widen() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + let product11: u64 = word!(1, self) * word!(1, rhs); + let product12: u64 = word!(1, self) * word!(2, rhs); + let product13: u64 = word!(1, self) * word!(3, rhs); + let product14: u64 = word!(1, self) * word!(4, rhs); + let product21: u64 = word!(2, self) * word!(1, rhs); + let product22: u64 = word!(2, self) * word!(2, rhs); + let product23: u64 = word!(2, self) * word!(3, rhs); + let product24: u64 = word!(2, self) * word!(4, rhs); + let product31: u64 = word!(3, self) * word!(1, rhs); + let product32: u64 = word!(3, self) * word!(2, rhs); + let product33: u64 = word!(3, self) * word!(3, rhs); + let product34: u64 = word!(3, self) * word!(4, rhs); + let product41: u64 = word!(4, self) * word!(1, rhs); + let product42: u64 = word!(4, self) * word!(2, rhs); + let product43: u64 = word!(4, self) * word!(3, rhs); + let product44: u64 = word!(4, self) * word!(4, rhs); + + let sum0: u128 = u128::from(product44); + let sum1: u128 = u128::from(product34) + u128::from(product43); + let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42); + let sum3: u128 = u128::from(product14) + + u128::from(product23) + + u128::from(product32) + + u128::from(product41); + let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31); + let sum5: u128 = u128::from(product12) + u128::from(product21); + let sum6: u128 = u128::from(product11); + + let r0: u128 = + (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32); + let r1: u128 = (sum0 >> 64) + + ((sum1 >> 32) & u128::from(WORD_FULL_MASK)) + + (sum2 & u128::from(WORD_FULL_MASK)) + + ((sum3 << 32) & u128::from(WORD_HI_MASK)); + + let lo = r0.wrapping_add(r1 << 64); + let hi = (r1 >> 64) + + (sum1 >> 96) + + (sum2 >> 64) + + (sum3 >> 32) + + sum4 + + (sum5 << 32) + + (sum6 << 64); + + u256([ + (lo & U128_LO_MASK) as u64, + ((lo >> 64) & U128_LO_MASK) as u64, + (hi & U128_LO_MASK) as u64, + ((hi >> 64) & U128_LO_MASK) as u64, + ]) + } + + fn widen_mul(self, rhs: Self) -> Self::D { + self.zero_widen_mul(rhs) + } +} + +impl HInt for i128 { + type D = i256; + + fn widen(self) -> Self::D { + let mut ret = self.unsigned().zero_widen().signed(); + if self.is_negative() { + ret.0[2] = u64::MAX; + ret.0[3] = u64::MAX; + } + ret + } + + fn zero_widen(self) -> Self::D { + self.unsigned().zero_widen().signed() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + self.unsigned().zero_widen_mul(rhs.unsigned()).signed() + } + + fn widen_mul(self, rhs: Self) -> Self::D { + unimplemented!() + // let mut res = self.zero_widen_mul(rhs); + // if self.is_negative() ^ rhs.is_negative() { + // // Sign extend as needed + // // for word in res.0.iter_mut().rev() { + // // let zeroes = word.leading_zeros(); + // // let leading = u64::MAX << (64 - zeroes); + // // *word |= leading; + // // if zeroes != 64 { + // // break; + // // } + // // } + // } + 
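+        // An alternative to the bitwise sign extension sketched above: in
+        // two's complement, the signed product equals the unsigned product of
+        // the bit patterns minus `(rhs_u << 128)` when `self` is negative and
+        // minus `(self_u << 128)` when `rhs` is negative (mod 2^256), so two
+        // conditional subtractions of the widened operands would also work
+        // once `u256` subtraction is implemented.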
+ // res + } +} + +impl DInt for u256 { + type H = u128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + u128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + u128::from_le_bytes(tmp) + } +} + +impl DInt for i256 { + type H = i128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + i128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + i128::from_le_bytes(tmp) + } +} diff --git a/src/int/mod.rs b/src/int/mod.rs index 509f9fda..bb343d79 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -3,42 +3,29 @@ use core::ops; mod specialized_div_rem; pub mod addsub; +mod big; pub mod leading_zeros; pub mod mul; pub mod sdiv; pub mod shift; pub mod udiv; -pub use self::leading_zeros::__clzsi2; +pub use big::{i256, u256}; +pub use leading_zeros::__clzsi2; public_test_dep! { -/// Trait for some basic operations on integers -pub(crate) trait Int: - Copy +/// Minimal integer implementations needed on all integer types, including wide integers. +pub(crate) trait MinInt: Copy + core::fmt::Debug - + PartialEq - + PartialOrd - + ops::AddAssign - + ops::SubAssign - + ops::BitAndAssign - + ops::BitOrAssign - + ops::BitXorAssign - + ops::ShlAssign - + ops::ShrAssign - + ops::Add - + ops::Sub - + ops::Div - + ops::Shl - + ops::Shr + ops::BitOr - + ops::BitXor - + ops::BitAnd + ops::Not + + ops::Shl { + /// Type with the same width but other signedness - type OtherSign: Int; + type OtherSign: MinInt; /// Unsigned version of Self - type UnsignedInt: Int; + type UnsignedInt: MinInt; /// If `Self` is a signed integer const SIGNED: bool; @@ -50,13 +37,46 @@ pub(crate) trait Int: const ONE: Self; const MIN: Self; const MAX: Self; +} +} +public_test_dep! { +/// Trait for some basic operations on integers +pub(crate) trait Int: MinInt + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::SubAssign + + ops::BitAndAssign + + ops::BitOrAssign + + ops::BitXorAssign + + ops::ShlAssign + + ops::ShrAssign + + ops::Add + + ops::Sub + + ops::Mul + + ops::Div + + ops::Shr + + ops::BitXor + + ops::BitAnd +{ /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111, /// 112,119,120,125,126,127]. - const FUZZ_LENGTHS: [u8; 20]; + const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(::BITS); + /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128. - const FUZZ_NUM: usize; + const FUZZ_NUM: usize = { + let log2 = (::BITS - 1).count_ones() as usize; + if log2 == 3 { + // case for u8 + 6 + } else { + // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate + // boundaries. 
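+            // For example, `u32` has `log2 == 5`, giving 8 + 4 = 12 entries,
+            // while `u128` has `log2 == 7`, giving 8 + 12 = 20 (the full
+            // `FUZZ_LENGTHS` array).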
+ 8 + (4 * (log2 - 4)) + } + }; fn unsigned(self) -> Self::UnsignedInt; fn from_unsigned(unsigned: Self::UnsignedInt) -> Self; @@ -83,74 +103,54 @@ pub(crate) trait Int: } } +pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] { + let mut v = [0u8; 20]; + v[0] = 0; + v[1] = 1; + v[2] = 2; // important for parity and the iX::MIN case when reversed + let mut i = 3; + + // No need for any more until the byte boundary, because there should be no algorithms + // that are sensitive to anything not next to byte boundaries after 2. We also scale + // in powers of two, which is important to prevent u128 corner tests from getting too + // big. + let mut l = 8; + loop { + if l >= ((bits / 2) as u8) { + break; + } + // get both sides of the byte boundary + v[i] = l - 1; + i += 1; + v[i] = l; + i += 1; + l *= 2; + } + + if bits != 8 { + // add the lower side of the middle boundary + v[i] = ((bits / 2) - 1) as u8; + i += 1; + } + + // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS + // boundary because of algorithms that split the high part up. We reverse the scaling + // as we go to Self::BITS. + let mid = i; + let mut j = 1; + loop { + v[i] = (bits as u8) - (v[mid - j]) - 1; + if j == mid { + break; + } + i += 1; + j += 1; + } + v +} + macro_rules! int_impl_common { ($ty:ty) => { - const BITS: u32 = ::ZERO.count_zeros(); - const SIGNED: bool = Self::MIN != Self::ZERO; - - const ZERO: Self = 0; - const ONE: Self = 1; - const MIN: Self = ::MIN; - const MAX: Self = ::MAX; - - const FUZZ_LENGTHS: [u8; 20] = { - let bits = ::BITS; - let mut v = [0u8; 20]; - v[0] = 0; - v[1] = 1; - v[2] = 2; // important for parity and the iX::MIN case when reversed - let mut i = 3; - // No need for any more until the byte boundary, because there should be no algorithms - // that are sensitive to anything not next to byte boundaries after 2. We also scale - // in powers of two, which is important to prevent u128 corner tests from getting too - // big. - let mut l = 8; - loop { - if l >= ((bits / 2) as u8) { - break; - } - // get both sides of the byte boundary - v[i] = l - 1; - i += 1; - v[i] = l; - i += 1; - l *= 2; - } - - if bits != 8 { - // add the lower side of the middle boundary - v[i] = ((bits / 2) - 1) as u8; - i += 1; - } - - // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS - // boundary because of algorithms that split the high part up. We reverse the scaling - // as we go to Self::BITS. - let mid = i; - let mut j = 1; - loop { - v[i] = (bits as u8) - (v[mid - j]) - 1; - if j == mid { - break; - } - i += 1; - j += 1; - } - v - }; - - const FUZZ_NUM: usize = { - let log2 = (::BITS - 1).count_ones() as usize; - if log2 == 3 { - // case for u8 - 6 - } else { - // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate - // boundaries. - 8 + (4 * (log2 - 4)) - } - }; - fn from_bool(b: bool) -> Self { b as $ty } @@ -203,10 +203,20 @@ macro_rules! int_impl_common { macro_rules! int_impl { ($ity:ty, $uty:ty) => { - impl Int for $uty { + impl MinInt for $uty { type OtherSign = $ity; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $uty { fn unsigned(self) -> $uty { self } @@ -228,10 +238,20 @@ macro_rules! 
int_impl { int_impl_common!($uty); } - impl Int for $ity { + impl MinInt for $ity { type OtherSign = $uty; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $ity { fn unsigned(self) -> $uty { self as $uty } @@ -259,18 +279,22 @@ int_impl!(i128, u128); public_test_dep! { /// Trait for integers twice the bit width of another integer. This is implemented for all /// primitives except for `u8`, because there is not a smaller primitive. -pub(crate) trait DInt: Int { +pub(crate) trait DInt: MinInt { /// Integer that is half the bit width of the integer this trait is implemented for - type H: HInt + Int; + type H: HInt; /// Returns the low half of `self` fn lo(self) -> Self::H; /// Returns the high half of `self` fn hi(self) -> Self::H; /// Returns the low and high halves of `self` as a tuple - fn lo_hi(self) -> (Self::H, Self::H); + fn lo_hi(self) -> (Self::H, Self::H) { + (self.lo(), self.hi()) + } /// Constructs an integer using lower and higher half parts - fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self; + fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { + lo.zero_widen() | hi.widen_hi() + } } } @@ -279,7 +303,7 @@ public_test_dep! { /// primitives except for `u128`, because it there is not a larger primitive. pub(crate) trait HInt: Int { /// Integer that is double the bit width of the integer this trait is implemented for - type D: DInt + Int; + type D: DInt + MinInt; /// Widens (using default extension) the integer to have double bit width fn widen(self) -> Self::D; @@ -287,7 +311,9 @@ pub(crate) trait HInt: Int { /// around problems with associated type bounds (such as `Int`) being unstable fn zero_widen(self) -> Self::D; /// Widens the integer to have double bit width and shifts the integer into the higher bits - fn widen_hi(self) -> Self::D; + fn widen_hi(self) -> Self::D { + self.widen() << ::BITS + } /// Widening multiplication with zero widening. This cannot overflow. fn zero_widen_mul(self, rhs: Self) -> Self::D; /// Widening multiplication. This cannot overflow. @@ -305,13 +331,7 @@ macro_rules! impl_d_int { self as $X } fn hi(self) -> Self::H { - (self >> <$X as Int>::BITS) as $X - } - fn lo_hi(self) -> (Self::H, Self::H) { - (self.lo(), self.hi()) - } - fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self { - lo.zero_widen() | hi.widen_hi() + (self >> <$X as MinInt>::BITS) as $X } } )* @@ -330,9 +350,6 @@ macro_rules! 
impl_h_int { fn zero_widen(self) -> Self::D { (self as $uH) as $X } - fn widen_hi(self) -> Self::D { - (self as $X) << <$H as Int>::BITS - } fn zero_widen_mul(self, rhs: Self) -> Self::D { self.zero_widen().wrapping_mul(rhs.zero_widen()) } diff --git a/src/int/mul.rs b/src/int/mul.rs index 2538e2f4..e0093a72 100644 --- a/src/int/mul.rs +++ b/src/int/mul.rs @@ -1,6 +1,6 @@ use crate::int::{DInt, HInt, Int}; -trait Mul: DInt +trait Mul: DInt + Int where Self::H: DInt, { @@ -30,7 +30,7 @@ where impl Mul for u64 {} impl Mul for i128 {} -pub(crate) trait UMulo: Int + DInt { +pub(crate) trait UMulo: DInt + Int { fn mulo(self, rhs: Self) -> (Self, bool) { match (self.hi().is_zero(), rhs.hi().is_zero()) { // overflow is guaranteed diff --git a/src/int/shift.rs b/src/int/shift.rs index dbd04018..31727298 100644 --- a/src/int/shift.rs +++ b/src/int/shift.rs @@ -1,4 +1,4 @@ -use crate::int::{DInt, HInt, Int}; +use crate::int::{DInt, HInt, Int, MinInt}; trait Ashl: DInt { /// Returns `a << b`, requires `b < Self::BITS` diff --git a/src/lib.rs b/src/lib.rs index ea376631..7c0b5072 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -44,6 +44,21 @@ extern crate core; #[macro_use] mod macros; +macro_rules! vdbg { + ($val:expr $(,)?) => { + // Use of `match` here is intentional because it affects the lifetimes + // of temporaries - https://stackoverflow.com/a/48732525/1063961 + match $val { + tmp => { + $crate::write_val( + tmp, + concat!("[", file!(), ":", line!(), "] ", stringify!($val), " = "), + ); + tmp + } + } + }; +} pub mod float; pub mod int; @@ -80,3 +95,45 @@ pub mod x86; pub mod x86_64; pub mod probestack; + +// Hacky way to print values since we don't have `std` for the crate +mod val_print { + extern "C" { + fn print_callback(val_ptr: *const u8, val_sz: usize, name_ptr: *const u8, name_len: usize); + } + + pub fn write_val(val: T, name: &str) { + unsafe { + print_callback( + core::ptr::addr_of!(val).cast(), + core::mem::size_of::(), + name.as_ptr(), + name.len(), + ) + }; + } +} + +pub use val_print::write_val; + +#[macro_export] +macro_rules! set_val_callback { + () => { + #[no_mangle] + unsafe extern "C" fn print_callback( + val_ptr: *const u8, + val_sz: usize, + name_ptr: *const u8, + name_len: usize, + ) { + let val = unsafe { core::slice::from_raw_parts(val_ptr, val_sz) }; + let name_slice = unsafe { core::slice::from_raw_parts(name_ptr, name_len) }; + let name = core::str::from_utf8(name_slice).unwrap(); + print!("{}: 0x", name); + for byte in val.iter().rev() { + print!("{:02x}", byte); + } + println!(); + } + }; +} diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml index 6ff3fde1..6f771181 100644 --- a/testcrate/Cargo.toml +++ b/testcrate/Cargo.toml @@ -33,3 +33,5 @@ no-asm = ["compiler_builtins/no-asm"] no-f16-f128 = ["compiler_builtins/no-f16-f128"] mem = ["compiler_builtins/mem"] mangled-names = ["compiler_builtins/mangled-names"] +# Skip tests that rely on f128 symbols being available on the system +no-sys-f128 = [] diff --git a/testcrate/benches/float.rs b/testcrate/benches/float.rs new file mode 100644 index 00000000..a12300b3 --- /dev/null +++ b/testcrate/benches/float.rs @@ -0,0 +1,90 @@ +#![feature(test, f16, f128)] + +extern crate test; +use core::hint::black_box; +use test::Bencher; + +extern crate compiler_builtins; + +macro_rules! 
test_values { + ($ty:ty) => { + &[ + <$ty>::MIN, + <$ty>::MAX, + <$ty>::NAN, + <$ty>::INFINITY, + <$ty>::NEG_INFINITY, + <$ty>::MIN_POSITIVE, + 0.0, + 1.0, + -1.0, + ] + }; +} + +fn combine2(vals: &[T]) -> Vec<(T, T)> { + let mut ret = Vec::new(); + for x in vals.iter().copied() { + for y in vals.iter().copied() { + ret.push((x, y)); + } + } + ret +} + +macro_rules! test_iter { + ($b:ident, $ty:ty, $fn:path) => {{ + let vals = combine2(test_values!($ty)); + let iter_loop = || { + for (a, b) in vals.iter().copied() { + black_box($fn(black_box(a), black_box(b))); + } + }; + + // Warmup + for _ in 0..1000 { + iter_loop(); + } + + $b.iter(iter_loop); + }}; +} + +macro_rules! foobar { + ($($ty:ty, $rust_fn:ident, $builtin_fn:ident, $mod:ident::$sym:ident);* $(;)?) => { + $( + #[bench] + fn $rust_fn(b: &mut Bencher) { + // Equalize with the builtin function which is called separately + #[inline(never)] + fn inline_wrapper(a: $ty, b: $ty) -> $ty { + compiler_builtins::float::$mod::$sym(black_box(a), black_box(b)) + } + + test_iter!(b, $ty, inline_wrapper); + } + + #[bench] + fn $builtin_fn(b: &mut Bencher) { + extern "C" { + fn $sym(a: $ty, b: $ty) -> $ty; + } + + unsafe { + test_iter!(b, $ty, $sym); + } + } + )* + }; +} + +foobar! { + f32, addsf3_rust, addsf3_builtin, add::__addsf3; + f32, subsf3_rust, subsf3_builtin, sub::__subsf3; + f32, mulsf3_rust, mulsf3_builtin, mul::__mulsf3; + f32, divsf3_rust, divsf3_builtin, div::__divsf3; + f64, adddf3_rust, adddf3_builtin, add::__adddf3; + f64, subdf3_rust, subdf3_builtin, sub::__subdf3; + f64, muldf3_rust, muldf3_builtin, mul::__muldf3; + f64, divdf3_rust, divdf3_builtin, div::__divdf3; +} diff --git a/testcrate/build.rs b/testcrate/build.rs new file mode 100644 index 00000000..86c97af1 --- /dev/null +++ b/testcrate/build.rs @@ -0,0 +1,15 @@ +use std::env; + +fn main() { + let target = env::var("TARGET").unwrap(); + + // These platforms do not have f128 symbols available in their system libraries, so + // skip related tests. + if target.starts_with("arm-") + || target.contains("apple-darwin") + || target.contains("windows-msvc") + { + println!("cargo:warning=skipping `f128` tests; system does not have relevant symbols"); + println!("cargo:rustc-cfg=feature=\"no-sys-f128\""); + } +} diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs index 9bd155f6..13abf459 100644 --- a/testcrate/src/lib.rs +++ b/testcrate/src/lib.rs @@ -15,7 +15,7 @@ #![no_std] use compiler_builtins::float::Float; -use compiler_builtins::int::Int; +use compiler_builtins::int::{Int, MinInt}; use rand_xoshiro::rand_core::{RngCore, SeedableRng}; use rand_xoshiro::Xoshiro128StarStar; @@ -101,7 +101,10 @@ macro_rules! edge_cases { /// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find /// edge cases, followed by a more random fuzzer that runs `n` times. -pub fn fuzz(n: u32, mut f: F) { +pub fn fuzz(n: u32, mut f: F) +where + ::UnsignedInt: Int, +{ // edge case tester. Calls `f` 210 times for u128. // zero gets skipped by the loop f(I::ZERO); @@ -111,7 +114,7 @@ pub fn fuzz(n: u32, mut f: F) { // random fuzzer let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut x: I = Int::ZERO; + let mut x: I = MinInt::ZERO; for _ in 0..n { fuzz_step(&mut rng, &mut x); f(x) @@ -119,7 +122,10 @@ pub fn fuzz(n: u32, mut f: F) { } /// The same as `fuzz`, except `f` has two inputs. -pub fn fuzz_2(n: u32, f: F) { +pub fn fuzz_2(n: u32, f: F) +where + ::UnsignedInt: Int, +{ // Check cases where the first and second inputs are zero. 
Both call `f` 210 times for `u128`. edge_cases!(I, case, { f(I::ZERO, case); @@ -150,10 +156,10 @@ pub fn fuzz_shift(f: F) { // Shift functions are very simple and do not need anything other than shifting a small // set of random patterns for every fuzz length. let mut rng = Xoshiro128StarStar::seed_from_u64(0); - let mut x: I = Int::ZERO; + let mut x: I = MinInt::ZERO; for i in 0..I::FUZZ_NUM { fuzz_step(&mut rng, &mut x); - f(x, Int::ZERO); + f(x, MinInt::ZERO); f(x, I::FUZZ_LENGTHS[i] as u32); } } diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs index da7684ec..343e47ae 100644 --- a/testcrate/tests/addsub.rs +++ b/testcrate/tests/addsub.rs @@ -1,4 +1,6 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; @@ -80,13 +82,13 @@ macro_rules! float_sum { let sub1: $f = $fn_sub(x, y); if !Float::eq_repr(add0, add1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn_add), x, y, add0, add1 ); } if !Float::eq_repr(sub0, sub1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{:?}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn_sub), x, y, sub0, sub1 ); } @@ -110,6 +112,16 @@ fn float_addsub() { ); } +#[test] +#[cfg(not(feature = "no-sys-f128"))] +fn float_addsub_f128() { + use compiler_builtins::float::{add::__addtf3, sub::__subtf3, Float}; + + float_sum!( + f128, __addtf3, __subtf3; + ); +} + #[cfg(target_arch = "arm")] #[test] fn float_addsub_arm() { diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs new file mode 100644 index 00000000..abf7d77c --- /dev/null +++ b/testcrate/tests/big.rs @@ -0,0 +1,104 @@ +use compiler_builtins::int::{i256, u256, HInt, Int, MinInt}; + +const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff; + +/// Print a `u256` as hex since we can't add format implementations +fn hexu(v: u256) -> String { + format!( + "0x{:016x}{:016x}{:016x}{:016x}", + v.0[3], v.0[2], v.0[1], v.0[0] + ) +} + +fn hexi(v: i256) -> String { + hexu(v.unsigned()) +} + +#[test] +fn widen_u128() { + assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0])); + assert_eq!( + LOHI_SPLIT.widen(), + u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0]) + ); +} + +#[test] +fn widen_i128() { + assert_eq!((-1i128).widen(), u256::MAX.signed()); + assert_eq!( + (LOHI_SPLIT as i128).widen(), + i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX]) + ); + assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen()); +} + +#[test] +fn widen_mul_u128() { + let tests = [ + (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])), + (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])), + // TODO: /~https://github.com/rust-lang/compiler-builtins/pull/587#issuecomment-2060543566 + // (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])), + (u128::MIN, u128::MIN, u256::ZERO), + (1234, 0, u256::ZERO), + (0, 1234, u256::ZERO), + ]; + + let mut errors = Vec::new(); + for (i, (a, b, exp)) in tests.iter().copied().enumerate() { + let res = a.widen_mul(b); + let res_z = a.zero_widen_mul(b); + assert_eq!(res, res_z); + if res != exp { + errors.push((i, a, b, exp, res)); + } + } + + for (i, a, b, exp, res) in &errors { + eprintln!( + "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", + hexu(*exp), + hexu(*res) + ); + } + assert!(errors.is_empty()); +} + +// #[test] +// fn widen_mul_i128() { +// let tests = [ +// ( +// i128::MAX / 2, +// 2_i128, +// i256([u64::MAX - 1, u64::MAX >> 1, 0, 0]), +// ), +// (i128::MAX, 2_i128, i256([u64::MAX - 1, u64::MAX, 0, 0])), +// (i128::MIN, 
2_i128, i256([0, 0, u64::MAX, u64::MAX])), +// ( +// i128::MAX, +// i128::MAX, +// i256([1, 0, u64::MAX - 1, u64::MAX >> 2]), +// ), +// (i128::MAX, i128::MIN, i256([0, 0, 0, 0b11 << 62])), +// (i128::MIN, i128::MIN, i256([0, 0, 0, 0])), +// (1234, 0, i256::ZERO), +// (0, 1234, i256::ZERO), +// (-1234, 0, i256::ZERO), +// (0, -1234, i256::ZERO), +// ]; + +// let mut errors = Vec::new(); +// for (i, (a, b, exp)) in tests.iter().copied().enumerate() { +// let res = a.widen_mul(b); +// // TODO check zero widen mul +// if res != exp { +// errors.push((i, a, b, exp, res)); +// } +// } + +// for (i, a, b, exp, res) in &errors { +// eprintln!("FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", hexi(*exp), hexi(*res)); +// } +// assert!(errors.is_empty()); +// } diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs index 5c10a560..7ad90a7c 100644 --- a/testcrate/tests/cmp.rs +++ b/testcrate/tests/cmp.rs @@ -1,4 +1,6 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; @@ -16,7 +18,10 @@ macro_rules! cmp { }; let cmp1 = $fn($x, $y); if cmp0 != cmp1 { - panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); + panic!( + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn_builtins), $x, $y, cmp0, cmp1 + ); } )* }; @@ -55,6 +60,26 @@ fn float_comparisons() { }); } +#[cfg(not(feature = "no-sys-f128"))] +#[test] +fn float_comparisons_f128() { + use compiler_builtins::float::cmp::{ + __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2, + }; + + fuzz_float_2(N, |x: f128, y: f128| { + assert_eq!(__unordtf2(x, y) != 0, x.is_nan() || y.is_nan()); + cmp!(x, y, + 1, __lttf2; + 1, __letf2; + 1, __eqtf2; + -1, __getf2; + -1, __gttf2; + 1, __netf2; + ); + }); +} + macro_rules! cmp2 { ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => { $( diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs index de3bd9be..07bd233c 100644 --- a/testcrate/tests/div_rem.rs +++ b/testcrate/tests/div_rem.rs @@ -1,9 +1,13 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4}; use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc}; use testcrate::*; +compiler_builtins::set_val_callback!(); + // Division algorithms have by far the nastiest and largest number of edge cases, and experience shows // that sometimes 100_000 iterations of the random fuzzer is needed. @@ -107,12 +111,15 @@ macro_rules! float { ($($i:ty, $fn:ident);*;) => { $( fuzz_float_2(N, |x: $i, y: $i| { + dbg!(x, y); let quo0 = x / y; + dbg!(quo0); let quo1: $i = $fn(x, y); + dbg!(quo1); #[cfg(not(target_arch = "arm"))] if !Float::eq_repr(quo0, quo1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, quo0, quo1 ); } @@ -122,7 +129,7 @@ macro_rules! 
float { if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) { if !Float::eq_repr(quo0, quo1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, quo0, quo1 ); } @@ -146,6 +153,24 @@ fn float_div() { ); } +#[cfg(not(feature = "no-sys-f128"))] +#[test] +fn float_div_f128() { + use compiler_builtins::float::{div::__divtf3, Float}; + + float!( + f128, __divtf3; + ); +} + +#[test] +fn div_failures() { + use compiler_builtins::float::{div::__divtf3, Float}; + let a = f128::from_bits(0x1); + let b = f128::from_bits(0x1); + dbg!(__divtf3(a, b)); +} + #[cfg(target_arch = "arm")] #[test] fn float_div_arm() { diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs index 819f06ca..446d5c46 100644 --- a/testcrate/tests/mul.rs +++ b/testcrate/tests/mul.rs @@ -1,4 +1,6 @@ #![allow(unused_macros)] +#![feature(f128)] +#![feature(f16)] use testcrate::*; @@ -91,7 +93,7 @@ macro_rules! float_mul { if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) { if !Float::eq_repr(mul0, mul1) { panic!( - "{}({}, {}): std: {}, builtins: {}", + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", stringify!($fn), x, y, mul0, mul1 ); } @@ -115,6 +117,16 @@ fn float_mul() { ); } +#[test] +#[cfg(not(feature = "no-sys-f128"))] +fn float_mul_f128() { + use compiler_builtins::float::{mul::__multf3, Float}; + + float_mul!( + f128, __multf3; + ); +} + #[cfg(target_arch = "arm")] #[test] fn float_mul_arm() {