From 7fd038bb2e8ba92762a8d1ffd86a87c7cbd637da Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Tue, 3 Apr 2018 09:44:32 -0700 Subject: [PATCH] Stabilize x86/x86_64 intrinsics This commit stabilizes all intrinsics in the `x86` and `x86_64` modules, namely allowing stabilization of the `arch::x86` and `arch::x86_64` module in libstd. Stabilizations here were applied in an automated fashion using [this script][scr], and notably everything related to `__m64` was omitted from this round of stabilization [scr]: https://gist.github.com/alexcrichton/5b456d495d6fe1df46a158754565c7a5 --- coresimd/arm/mod.rs | 6 +- coresimd/mod.rs | 12 +- coresimd/x86/abm.rs | 6 + coresimd/x86/aes.rs | 18 + coresimd/x86/avx.rs | 584 ++++++++++++++++++++++++++++ coresimd/x86/avx2.rs | 582 ++++++++++++++++++++++++++++ coresimd/x86/{bmi.rs => bmi1.rs} | 24 ++ coresimd/x86/bmi2.rs | 12 + coresimd/x86/bswap.rs | 3 + coresimd/x86/cpuid.rs | 8 + coresimd/x86/eflags.rs | 12 + coresimd/x86/fxsr.rs | 6 + coresimd/x86/mod.rs | 12 +- coresimd/x86/pclmulqdq.rs | 3 + coresimd/x86/rdrand.rs | 12 + coresimd/x86/rdtsc.rs | 6 + coresimd/x86/sha.rs | 21 ++ coresimd/x86/sse.rs | 341 +++++++++++++++++ coresimd/x86/sse2.rs | 630 +++++++++++++++++++++++++++++++ coresimd/x86/sse3.rs | 33 ++ coresimd/x86/sse41.rs | 187 +++++++++ coresimd/x86/sse42.rs | 70 ++++ coresimd/x86/sse4a.rs | 4 + coresimd/x86/ssse3.rs | 48 +++ coresimd/x86/tbm.rs | 18 + coresimd/x86/xsave.rs | 25 ++ coresimd/x86_64/abm.rs | 6 + coresimd/x86_64/avx.rs | 3 + coresimd/x86_64/avx2.rs | 3 + coresimd/x86_64/bmi.rs | 24 ++ coresimd/x86_64/bmi2.rs | 12 + coresimd/x86_64/bswap.rs | 3 + coresimd/x86_64/fxsr.rs | 6 + coresimd/x86_64/rdrand.rs | 6 + coresimd/x86_64/sse.rs | 9 + coresimd/x86_64/sse2.rs | 33 ++ coresimd/x86_64/sse41.rs | 6 + coresimd/x86_64/sse42.rs | 3 + coresimd/x86_64/xsave.rs | 18 + crates/stdsimd/src/lib.rs | 2 +- stdsimd/arch/detect/arch/x86.rs | 64 +++- stdsimd/mod.rs | 16 +- 42 files changed, 2889 insertions(+), 8 deletions(-) rename coresimd/x86/{bmi.rs => bmi1.rs} (78%) diff --git a/coresimd/arm/mod.rs b/coresimd/arm/mod.rs index d778ed6f14..9798db59bc 100644 --- a/coresimd/arm/mod.rs +++ b/coresimd/arm/mod.rs @@ -20,8 +20,10 @@ pub use self::v7::*; // NEON is supported on AArch64, and on ARM when built with the v7 and neon // features. Building ARM without neon produces incorrect codegen. #[cfg(any(target_arch = "aarch64", - all(target_feature = "v7", target_feature = "neon")))] + all(target_feature = "v7", target_feature = "neon"), + dox))] mod neon; #[cfg(any(target_arch = "aarch64", - all(target_feature = "v7", target_feature = "neon")))] + all(target_feature = "v7", target_feature = "neon"), + dox))] pub use self::neon::*; diff --git a/coresimd/mod.rs b/coresimd/mod.rs index 5768ca9b34..9cf63d14af 100644 --- a/coresimd/mod.rs +++ b/coresimd/mod.rs @@ -41,14 +41,16 @@ pub mod simd { /// [`aarch64`]: https://rust-lang-nursery.github.io/stdsimd/aarch64/stdsimd/arch/index.html /// [`mips`]: https://rust-lang-nursery.github.io/stdsimd/mips/stdsimd/arch/index.html /// [`mips64`]: https://rust-lang-nursery.github.io/stdsimd/mips64/stdsimd/arch/index.html -#[unstable(feature = "stdsimd", issue = "0")] +#[stable(feature = "simd_arch", since = "1.27.0")] pub mod arch { /// Platform-specific intrinsics for the `x86` platform. /// /// See the [module documentation](../index.html) for more details. 
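// A minimal usage sketch of the intrinsics this patch stabilizes, assuming the
// re-exports are reachable downstream as `std::arch::x86_64` and that std's
// `is_x86_feature_detected!` macro is available; the function names here are
// illustrative, not part of the patch.
#[cfg(target_arch = "x86_64")]
fn add_f64x4(a: &[f64; 4], b: &[f64; 4]) -> [f64; 4] {
    #[target_feature(enable = "avx")]
    unsafe fn add_avx(a: &[f64; 4], b: &[f64; 4]) -> [f64; 4] {
        use std::arch::x86_64::*;
        // Unaligned loads/stores, so plain arrays are fine as inputs.
        let sum = _mm256_add_pd(_mm256_loadu_pd(a.as_ptr()), _mm256_loadu_pd(b.as_ptr()));
        let mut out = [0.0f64; 4];
        _mm256_storeu_pd(out.as_mut_ptr(), sum);
        out
    }
    if is_x86_feature_detected!("avx") {
        // Safe: the AVX requirement was just verified at runtime.
        unsafe { add_avx(a, b) }
    } else {
        let mut out = [0.0f64; 4];
        for i in 0..4 {
            out[i] = a[i] + b[i];
        }
        out
    }
}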
#[cfg(any(target_arch = "x86", dox))] #[doc(cfg(target_arch = "x86"))] + #[stable(feature = "simd_x86", since = "1.27.0")] pub mod x86 { + #[stable(feature = "simd_x86", since = "1.27.0")] pub use coresimd::x86::*; } @@ -57,8 +59,11 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "x86_64", dox))] #[doc(cfg(target_arch = "x86_64"))] + #[stable(feature = "simd_x86", since = "1.27.0")] pub mod x86_64 { + #[stable(feature = "simd_x86", since = "1.27.0")] pub use coresimd::x86::*; + #[stable(feature = "simd_x86", since = "1.27.0")] pub use coresimd::x86_64::*; } @@ -67,6 +72,7 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "arm", dox))] #[doc(cfg(target_arch = "arm"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod arm { pub use coresimd::arm::*; } @@ -76,6 +82,7 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "aarch64", dox))] #[doc(cfg(target_arch = "aarch64"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod aarch64 { pub use coresimd::aarch64::*; pub use coresimd::arm::*; @@ -85,6 +92,7 @@ pub mod arch { /// /// See the [module documentation](../index.html) for more details. #[cfg(target_arch = "wasm32")] + #[unstable(feature = "stdsimd", issue = "0")] pub mod wasm32 { pub use coresimd::wasm32::*; } @@ -94,6 +102,7 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "mips", dox))] #[doc(cfg(target_arch = "mips"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod mips { pub use coresimd::mips::*; } @@ -103,6 +112,7 @@ pub mod arch { /// See the [module documentation](../index.html) for more details. #[cfg(any(target_arch = "mips64", dox))] #[doc(cfg(target_arch = "mips64"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod mips64 { pub use coresimd::mips::*; } diff --git a/coresimd/x86/abm.rs b/coresimd/x86/abm.rs index 1576c0a750..b8d336b03e 100644 --- a/coresimd/x86/abm.rs +++ b/coresimd/x86/abm.rs @@ -23,17 +23,23 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u32) #[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _lzcnt_u32(x: u32) -> u32 { x.leading_zeros() } /// Counts the bits that are set. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt32) #[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _popcnt32(x: i32) -> i32 { x.count_ones() as i32 } diff --git a/coresimd/x86/aes.rs b/coresimd/x86/aes.rs index 6fade3d453..7a44d4ae5c 100644 --- a/coresimd/x86/aes.rs +++ b/coresimd/x86/aes.rs @@ -29,41 +29,56 @@ extern "C" { } /// Perform one round of an AES decryption flow on data (state) in `a`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdec_si128) #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesdec))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_aesdec_si128(a: __m128i, round_key: __m128i) -> __m128i { aesdec(a, round_key) } /// Perform the last round of an AES decryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesdeclast_si128) #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesdeclast))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_aesdeclast_si128(a: __m128i, round_key: __m128i) -> __m128i { aesdeclast(a, round_key) } /// Perform one round of an AES encryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenc_si128) #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesenc))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_aesenc_si128(a: __m128i, round_key: __m128i) -> __m128i { aesenc(a, round_key) } /// Perform the last round of an AES encryption flow on data (state) in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128) #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesenclast))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_aesenclast_si128(a: __m128i, round_key: __m128i) -> __m128i { aesenclast(a, round_key) } /// Perform the `InvMixColumns` transformation on `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesimc_si128) #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aesimc))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i { aesimc(a) } @@ -73,10 +88,13 @@ pub unsafe fn _mm_aesimc_si128(a: __m128i) -> __m128i { /// Assist in expanding the AES cipher key by computing steps towards /// generating a round key for encryption cipher using data from `a` and an /// 8-bit round constant `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aeskeygenassist_si128) #[inline] #[target_feature(enable = "aes")] #[cfg_attr(test, assert_instr(aeskeygenassist, imm8 = 0))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_aeskeygenassist_si128(a: __m128i, imm8: i32) -> __m128i { macro_rules! call { ($imm8:expr) => { diff --git a/coresimd/x86/avx.rs b/coresimd/x86/avx.rs index 16fa5ce217..cd4e65e562 100644 --- a/coresimd/x86/avx.rs +++ b/coresimd/x86/avx.rs @@ -25,18 +25,24 @@ use stdsimd_test::assert_instr; /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { simd_add(a, b) } /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. 
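// A sketch of how the AES intrinsics above compose into an AES-128 block
// encryption, assuming `std::arch::x86_64` is in scope and that `round_keys`
// already holds the 11 expanded round keys (key expansion via
// `_mm_aeskeygenassist_si128` is omitted; the function name is illustrative).
#[target_feature(enable = "aes")]
unsafe fn aes128_encrypt_block(block: __m128i, round_keys: &[__m128i; 11]) -> __m128i {
    // Initial AddRoundKey, nine full rounds, then a final round without MixColumns.
    let mut state = _mm_xor_si128(block, round_keys[0]);
    for rk in &round_keys[1..10] {
        state = _mm_aesenc_si128(state, *rk);
    }
    _mm_aesenclast_si128(state, round_keys[10])
}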
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { simd_add(a, b) } @@ -44,11 +50,14 @@ pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise AND of a packed double-precision (64-bit) /// floating-point elements /// in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_pd) #[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandpd' instuction. // See /~https://github.com/rust-lang-nursery/stdsimd/issues/71 #[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { let a: u64x4 = mem::transmute(a); let b: u64x4 = mem::transmute(b); @@ -57,9 +66,12 @@ pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise AND of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { let a: u32x8 = mem::transmute(a); let b: u32x8 = mem::transmute(b); @@ -68,11 +80,14 @@ pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise OR packed double-precision (64-bit) floating-point /// elements in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_pd) #[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vorpd' instuction. // See /~https://github.com/rust-lang-nursery/stdsimd/issues/71 #[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { let a: u64x4 = mem::transmute(a); let b: u64x4 = mem::transmute(b); @@ -81,9 +96,12 @@ pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise OR packed single-precision (32-bit) floating-point /// elements in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { let a: u32x8 = mem::transmute(a); let b: u32x8 = mem::transmute(b); @@ -92,10 +110,13 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { /// Shuffle double-precision (64-bit) floating-point elements within 128-bit /// lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { @@ -135,10 +156,13 @@ pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Shuffle single-precision (32-bit) floating-point elements in `a` within /// 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { @@ -196,10 +220,13 @@ pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point /// elements in `a` /// and then AND with `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_pd) #[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandnpd' instruction. #[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { let a: u64x4 = mem::transmute(a); let b: u64x4 = mem::transmute(b); @@ -209,9 +236,12 @@ pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point /// elements in `a` /// and then AND with `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { let a: u32x8 = mem::transmute(a); let b: u32x8 = mem::transmute(b); @@ -220,108 +250,144 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed maximum values +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { maxpd256(a, b) } /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed maximum values +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { maxps256(a, b) } /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed minimum values +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vminpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { minpd256(a, b) } /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed minimum values +/// +/// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vminps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { minps256(a, b) } /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { simd_mul(a, b) } /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { simd_mul(a, b) } /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { addsubpd256(a, b) } /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_addsub_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { addsubps256(a, b) } /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from packed elements in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { simd_sub(a, b) } /// Subtract packed single-precision (32-bit) floating-point elements in `b` /// from packed elements in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { simd_sub(a, b) } /// Compute the division of each of the 8 packed 32-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { simd_div(a, b) } /// Compute the division of each of the 4 packed 64-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { simd_div(a, b) } @@ -337,10 +403,13 @@ pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. /// /// [llvm_docs]: /~https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd, b = 0x3))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { macro_rules! call { ($imm8:expr) => { @@ -352,18 +421,24 @@ pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward positive infinity. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { roundpd256(a, 0x02) } /// Round packed double-precision (64-bit) floating point elements in `a` /// toward negative infinity. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { roundpd256(a, 0x01) } @@ -379,10 +454,13 @@ pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. /// /// [llvm_docs]: /~https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_round_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps, b = 0x00))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { macro_rules! call { ($imm8:expr) => { @@ -394,46 +472,61 @@ pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward positive infinity. 
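// A sketch of the rounding family, assuming `std::arch::x86_64` is in scope;
// the rounding-control constants come from the SSE4.1 module and the wrapper
// name is illustrative.
use std::arch::x86_64::*;

#[target_feature(enable = "avx")]
unsafe fn round_three_ways(a: __m256d) -> (__m256d, __m256d, __m256d) {
    let up = _mm256_ceil_pd(a);    // toward positive infinity
    let down = _mm256_floor_pd(a); // toward negative infinity
    let nearest = _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    (up, down, nearest)
}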
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ceil_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { roundps256(a, 0x02) } /// Round packed single-precision (32-bit) floating point elements in `a` /// toward negative infinity. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_floor_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { roundps256(a, 0x01) } /// Return the square root of packed single-precision (32-bit) floating point /// elements in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { sqrtps256(a) } /// Return the square root of packed double-precision (64-bit) floating point /// elements in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { sqrtpd256(a) } /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { let imm8 = (imm8 & 0xFF) as u8; macro_rules! blend4 { @@ -473,10 +566,13 @@ pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { let imm8 = (imm8 & 0xFF) as u8; macro_rules! blend4 { @@ -533,18 +629,24 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using `c` as a mask. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { vblendvpd(a, b, c) } /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. 
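// A sketch contrasting the immediate and variable blends, assuming
// `std::arch::x86_64` is in scope; the function name is illustrative.
use std::arch::x86_64::*;

#[target_feature(enable = "avx")]
unsafe fn blend_examples(a: __m256d, b: __m256d, mask: __m256d) -> (__m256d, __m256d) {
    // Immediate form: bit i of `imm8` selects lane i from `b` (lanes 0 and 3 here).
    let fixed = _mm256_blend_pd(a, b, 0b1001);
    // Variable form: the sign bit of each `mask` lane selects from `b`.
    let chosen = _mm256_blendv_pd(a, b, mask);
    (fixed, chosen)
}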
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { vblendvps(a, b, c) } @@ -553,10 +655,13 @@ pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// elements in `a` and `b` using the high 4 bits in `imm8`, /// sum the four products, and conditionally return the sum /// using the low 4 bits of `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dp_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { macro_rules! call { ($imm8:expr) => { @@ -570,9 +675,12 @@ pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { vhaddpd(a, b) } @@ -582,9 +690,12 @@ pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { vhaddps(a, b) } @@ -593,9 +704,12 @@ pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { vhsubpd(a, b) } @@ -605,19 +719,25 @@ pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { vhsubps(a, b) } /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_pd) #[inline] #[target_feature(enable = "avx")] // FIXME Should be 'vxorpd' instruction. #[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { let a: u64x4 = mem::transmute(a); let b: u64x4 = mem::transmute(b); @@ -626,9 +746,12 @@ pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { let a: u32x8 = mem::transmute(a); let b: u32x8 = mem::transmute(b); @@ -636,77 +759,112 @@ pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { } /// Equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_EQ_OQ: i32 = 0x00; /// Less-than (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_LT_OS: i32 = 0x01; /// Less-than-or-equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_LE_OS: i32 = 0x02; /// Unordered (non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_UNORD_Q: i32 = 0x03; /// Not-equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NEQ_UQ: i32 = 0x04; /// Not-less-than (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NLT_US: i32 = 0x05; /// Not-less-than-or-equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NLE_US: i32 = 0x06; /// Ordered (non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_ORD_Q: i32 = 0x07; /// Equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_EQ_UQ: i32 = 0x08; /// Not-greater-than-or-equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NGE_US: i32 = 0x09; /// Not-greater-than (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NGT_US: i32 = 0x0a; /// False (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_FALSE_OQ: i32 = 0x0b; /// Not-equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NEQ_OQ: i32 = 0x0c; /// Greater-than-or-equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_GE_OS: i32 = 0x0d; /// Greater-than (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_GT_OS: i32 = 0x0e; /// True (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_TRUE_UQ: i32 = 0x0f; /// Equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_EQ_OS: i32 = 0x10; /// Less-than (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_LT_OQ: i32 = 0x11; /// Less-than-or-equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_LE_OQ: i32 = 0x12; /// Unordered (signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const 
_CMP_UNORD_S: i32 = 0x13; /// Not-equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NEQ_US: i32 = 0x14; /// Not-less-than (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NLT_UQ: i32 = 0x15; /// Not-less-than-or-equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NLE_UQ: i32 = 0x16; /// Ordered (signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_ORD_S: i32 = 0x17; /// Equal (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_EQ_US: i32 = 0x18; /// Not-greater-than-or-equal (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NGE_UQ: i32 = 0x19; /// Not-greater-than (unordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NGT_UQ: i32 = 0x1a; /// False (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_FALSE_OS: i32 = 0x1b; /// Not-equal (ordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_NEQ_OS: i32 = 0x1c; /// Greater-than-or-equal (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_GE_OQ: i32 = 0x1d; /// Greater-than (ordered, non-signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_GT_OQ: i32 = 0x1e; /// True (unordered, signaling) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _CMP_TRUE_US: i32 = 0x1f; /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd) #[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { macro_rules! call { ($imm8:expr) => { @@ -719,10 +877,13 @@ pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { macro_rules! call { ($imm8:expr) => { @@ -735,10 +896,13 @@ pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps) #[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { macro_rules! 
call { ($imm8:expr) => { @@ -751,10 +915,13 @@ pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { macro_rules! call { ($imm8:expr) => { @@ -769,10 +936,13 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// store the result in the lower element of returned vector, /// and copy the upper element from `a` to the upper element of returned /// vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd) #[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { macro_rules! call { ($imm8:expr) => { @@ -787,10 +957,13 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// store the result in the lower element of returned vector, /// and copy the upper 3 packed elements from `a` to the upper elements of /// returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss) #[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { macro_rules! call { ($imm8:expr) => { @@ -802,82 +975,109 @@ pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { simd_cast(a.as_i32x4()) } /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { vcvtdq2ps(a.as_i32x8()) } /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed single-precision (32-bit) floating-point elements. 
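// A sketch of the predicate-based compares, assuming `std::arch::x86_64` is in
// scope; result lanes are all-ones where the predicate holds and all-zeros
// otherwise, and the function name is illustrative.
use std::arch::x86_64::*;

#[target_feature(enable = "avx")]
unsafe fn count_lanes_less_than(a: __m256d, b: __m256d) -> u32 {
    let lt = _mm256_cmp_pd(a, b, _CMP_LT_OQ);
    // _mm256_movemask_pd packs the four lane sign bits into the low bits of an i32.
    _mm256_movemask_pd(lt).count_ones()
}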
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { vcvtpd2ps(a) } /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epi32) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { mem::transmute(vcvtps2dq(a)) } /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { simd_cast(a) } /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epi32) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { mem::transmute(vcvttpd2dq(a)) } /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epi32) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { mem::transmute(vcvtpd2dq(a)) } /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epi32) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { mem::transmute(vcvttps2dq(a)) } /// Extract 128 bits (composed of 4 packed single-precision (32-bit) /// floating-point elements) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { match imm8 & 1 { 0 => simd_shuffle4(a, _mm256_undefined_ps(), [0, 1, 2, 3]), @@ -887,10 +1087,13 @@ pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { /// Extract 128 bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a`, selected with `imm8`. 
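// A sketch of 128-bit lane extraction, assuming `std::arch::x86_64` is in
// scope; `imm8 & 1 == 0` selects the low half and `1` the high half, and the
// function name is illustrative.
use std::arch::x86_64::*;

#[target_feature(enable = "avx")]
unsafe fn split_halves(v: __m256) -> (__m128, __m128) {
    let lo = _mm256_extractf128_ps(v, 0);
    let hi = _mm256_extractf128_ps(v, 1);
    (lo, hi)
}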
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { match imm8 & 1 { 0 => simd_shuffle2(a, _mm256_undefined_pd(), [0, 1]), @@ -899,10 +1102,13 @@ pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { } /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf128_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { let b = _mm256_undefined_si256().as_i64x4(); let dst: i64x2 = match imm8 & 1 { @@ -913,46 +1119,61 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Zero the contents of all XMM or YMM registers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroall) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroall))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_zeroall() { vzeroall() } /// Zero the upper 128 bits of all YMM registers; /// the lower 128-bits of the registers are unmodified. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zeroupper) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroupper))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_zeroupper() { vzeroupper() } /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { vpermilps256(a, b.as_i32x8()) } /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { vpermilps(a, b.as_i32x4()) } /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { let imm8 = (imm8 & 0xFF) as u8; macro_rules! 
shuffle4 { @@ -1004,10 +1225,13 @@ pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_ps) #[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { @@ -1055,28 +1279,37 @@ pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 256-bit lanes using the control in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { vpermilpd256(a, b.as_i64x4()) } /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutevar_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { vpermilpd(a, b.as_i64x2()) } /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle4 { @@ -1116,10 +1349,13 @@ pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permute_pd) #[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { let imm8 = (imm8 & 0xFF) as u8; macro_rules! shuffle2 { @@ -1143,10 +1379,13 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { /// Shuffle 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2f128_ps( a: __m256, b: __m256, imm8: i32 ) -> __m256 { @@ -1160,10 +1399,13 @@ pub unsafe fn _mm256_permute2f128_ps( /// Shuffle 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2f128_pd( a: __m256d, b: __m256d, imm8: i32 ) -> __m256d { @@ -1177,10 +1419,13 @@ pub unsafe fn _mm256_permute2f128_pd( /// Shuffle 258-bits (composed of integer data) selected by `imm8` /// from `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2f128_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2f128_si256( a: __m256i, b: __m256i, imm8: i32 ) -> __m256i { @@ -1197,45 +1442,60 @@ pub unsafe fn _mm256_permute2f128_si256( /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ss) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { _mm256_set1_ps(*f) } /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcast_ss) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { _mm_set1_ps(*f) } /// Broadcast a double-precision (64-bit) floating-point element from memory /// to all elements of the returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_sd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { _mm256_set1_pd(*f) } /// Broadcast 128 bits from memory (composed of 4 packed single-precision /// (32-bit) floating-point elements) to all elements of the returned vector. 
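// A sketch of the broadcast loads, assuming `std::arch::x86_64` is in scope;
// note that these intrinsics take references rather than raw pointers, and the
// function name is illustrative.
use std::arch::x86_64::*;

#[target_feature(enable = "avx")]
unsafe fn splat_examples(x: f32, v: __m128d) -> (__m256, __m256d) {
    let all_x = _mm256_broadcast_ss(&x); // `x` repeated across all 8 lanes
    let two_v = _mm256_broadcast_pd(&v); // `v` repeated in both 128-bit halves
    (all_x, two_v)
}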
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { vbroadcastf128ps256(a) } /// Broadcast 128 bits from memory (composed of 2 packed double-precision /// (64-bit) floating-point elements) to all elements of the returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { vbroadcastf128pd256(a) } @@ -1243,10 +1503,13 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// Copy `a` to result, then insert 128 bits (composed of 4 packed /// single-precision (32-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { let b = _mm256_castps128_ps256(b); match imm8 & 1 { @@ -1258,10 +1521,13 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { /// Copy `a` to result, then insert 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insertf128_pd( a: __m256d, b: __m128d, imm8: i32 ) -> __m256d { @@ -1273,10 +1539,13 @@ pub unsafe fn _mm256_insertf128_pd( /// Copy `a` to result, then insert 128 bits from `b` into result /// at the location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf128_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insertf128_si256( a: __m256i, b: __m128i, imm8: i32 ) -> __m256i { @@ -1290,10 +1559,13 @@ pub unsafe fn _mm256_insertf128_si256( /// Copy `a` to result, and insert the 8-bit integer `i` into result /// at the location specified by `index`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi8) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
#[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { mem::transmute(simd_insert( a.as_i8x32(), @@ -1304,10 +1576,13 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { /// Copy `a` to result, and insert the 16-bit integer `i` into result /// at the location specified by `index`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi16) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { mem::transmute(simd_insert( a.as_i16x16(), @@ -1318,10 +1593,13 @@ pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { /// Copy `a` to result, and insert the 32-bit integer `i` into result /// at the location specified by `index`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi32) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { mem::transmute(simd_insert(a.as_i32x8(), (index as u32) & 7, i)) } @@ -1330,9 +1608,12 @@ pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { *(mem_addr as *const __m256d) } @@ -1341,9 +1622,12 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { /// floating-point elements) from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { *(mem_addr as *mut __m256d) = a; } @@ -1352,9 +1636,12 @@ pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { *(mem_addr as *const __m256) } @@ -1363,9 +1650,12 @@ pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { /// floating-point elements) from `a` into memory. 
/// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { *(mem_addr as *mut __m256) = a; } @@ -1373,9 +1663,12 @@ pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { /// Load 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { let mut dst = _mm256_undefined_pd(); ptr::copy_nonoverlapping( @@ -1389,9 +1682,12 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { /// Store 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { storeupd256(mem_addr, a); } @@ -1399,9 +1695,12 @@ pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { /// Load 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { let mut dst = _mm256_undefined_ps(); ptr::copy_nonoverlapping( @@ -1415,9 +1714,12 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { /// Store 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { storeups256(mem_addr, a); } @@ -1425,9 +1727,12 @@ pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. 
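// Illustrative sketch, not part of the patch: `scale_f32x8` is a made-up
// helper. The unaligned `_mm256_loadu_*`/`_mm256_storeu_*` forms above accept
// any pointer, so they are the safe default for arbitrary slices; the aligned
// `load`/`store` forms fault unless the address is 32-byte aligned.
// `_mm256_set1_ps` and `_mm256_mul_ps` are other intrinsics from the same
// stabilized AVX set, defined elsewhere in `avx.rs`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn scale_f32x8(data: &mut [f32; 8], factor: f32) {
    use std::arch::x86_64::*;
    let v = _mm256_loadu_ps(data.as_ptr()); // no alignment requirement
    let scaled = _mm256_mul_ps(v, _mm256_set1_ps(factor));
    _mm256_storeu_ps(data.as_mut_ptr(), scaled);
}
// End of sketch.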
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { *mem_addr } @@ -1435,18 +1740,24 @@ pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { *mem_addr = a; } /// Load 256-bits of integer data from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { let mut dst = _mm256_undefined_si256(); ptr::copy_nonoverlapping( @@ -1459,9 +1770,12 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { storeudq256(mem_addr as *mut i8, a.as_i8x32()); } @@ -1469,9 +1783,12 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_pd( mem_addr: *const f64, mask: __m256i ) -> __m256d { @@ -1480,9 +1797,12 @@ pub unsafe fn _mm256_maskload_pd( /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_pd( mem_addr: *mut f64, mask: __m256i, a: __m256d ) { @@ -1492,18 +1812,24 @@ pub unsafe fn _mm256_maskstore_pd( /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { maskloadpd(mem_addr as *const i8, mask.as_i64x2()) } /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a); } @@ -1511,9 +1837,12 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_ps( mem_addr: *const f32, mask: __m256i ) -> __m256 { @@ -1522,9 +1851,12 @@ pub unsafe fn _mm256_maskload_ps( /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_ps( mem_addr: *mut f32, mask: __m256i, a: __m256 ) { @@ -1534,45 +1866,60 @@ pub unsafe fn _mm256_maskstore_ps( /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { maskloadps(mem_addr as *const i8, mask.as_i32x4()) } /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. 
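// Illustrative sketch, not part of the patch: `load_first_three` is a made-up
// helper. The mask used by `_mm256_maskload_pd`/`_mm256_maskstore_pd` is taken
// from the sign bit of each 64-bit element: lanes whose bit is clear are left
// untouched in memory (store) or come back as zero (load), and only the
// selected lanes are read. `_mm256_setr_epi64x` is defined further down in
// this file.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn load_first_three(p: *const f64) -> std::arch::x86_64::__m256d {
    use std::arch::x86_64::*;
    // -1 has the sign bit set (load this lane); 0 leaves the lane zeroed and
    // unread, so only three doubles behind `p` need to be valid.
    let mask = _mm256_setr_epi64x(-1, -1, -1, 0);
    _mm256_maskload_pd(p, mask)
}
// End of sketch.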
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a); } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movehdup_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovshdup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { simd_shuffle8(a, a, [1, 1, 3, 3, 5, 5, 7, 7]) } /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_moveldup_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovsldup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { simd_shuffle8(a, a, [0, 0, 2, 2, 4, 4, 6, 6]) } /// Duplicate even-indexed double-precision (64-bit) floating-point elements /// from "a", and return the results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movedup_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { simd_shuffle4(a, a, [0, 0, 2, 2]) } @@ -1580,9 +1927,12 @@ pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// Load 256-bits of integer data from unaligned memory into result. /// This intrinsic may perform better than `_mm256_loadu_si256` when the /// data crosses a cache line boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_lddqu_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vlddqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { mem::transmute(vlddqu(mem_addr as *const i8)) } @@ -1590,9 +1940,12 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { /// Moves integer data from a 256-bit integer vector to a 32-byte /// aligned memory location. To minimize caching, the data is flagged as /// non-temporal (unlikely to be used again soon) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { intrinsics::nontemporal_store(mem::transmute(mem_addr), a); } @@ -1600,9 +1953,12 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { /// Moves double-precision values from a 256-bit vector of [4 x double] /// to a 32-byte aligned memory location. To minimize caching, the data is /// flagged as non-temporal (unlikely to be used again soon). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { intrinsics::nontemporal_store(mem::transmute(mem_addr), a); } @@ -1611,9 +1967,12 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { /// of [8 x float] to a 32-byte aligned memory location. To minimize /// caching, the data is flagged as non-temporal (unlikely to be used again /// soon). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_stream_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { intrinsics::nontemporal_store(mem::transmute(mem_addr), a); } @@ -1621,9 +1980,12 @@ pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { /// Compute the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`, and return the results. The maximum /// relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrcpps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { vrcpps(a) } @@ -1631,45 +1993,60 @@ pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { /// Compute the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`, and return the results. /// The maximum relative error for this approximation is less than 1.5*2^-12. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { vrsqrtps(a) } /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { simd_shuffle4(a, b, [1, 5, 3, 7]) } /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { simd_shuffle8(a, b, [2, 10, 3, 11, 6, 14, 7, 15]) } /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { simd_shuffle4(a, b, [0, 4, 2, 6]) } /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { simd_shuffle8(a, b, [0, 8, 1, 9, 4, 12, 5, 13]) } @@ -1678,9 +2055,12 @@ pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { ptestz256(a.as_i64x4(), b.as_i64x4()) } @@ -1689,9 +2069,12 @@ pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { ptestc256(a.as_i64x4(), b.as_i64x4()) } @@ -1701,9 +2084,12 @@ pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and /// `CF` values are zero, otherwise return 0. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { ptestnzc256(a.as_i64x4(), b.as_i64x4()) } @@ -1715,9 +2101,12 @@ pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
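// Illustrative sketch, not part of the patch: `is_all_zero` is a made-up
// helper. `_mm256_testz_si256(a, b)` returns `ZF`, i.e. 1 exactly when
// `a AND b` is all zeros, so testing a vector against itself answers
// "is this vector zero?" with a single `vptest`.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn is_all_zero(v: std::arch::x86_64::__m256i) -> bool {
    use std::arch::x86_64::*;
    _mm256_testz_si256(v, v) == 1
}
// End of sketch.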
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { vtestzpd256(a, b) } @@ -1729,9 +2118,12 @@ pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { vtestcpd256(a, b) } @@ -1744,9 +2136,12 @@ pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { vtestnzcpd256(a, b) } @@ -1758,9 +2153,12 @@ pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { vtestzpd(a, b) } @@ -1772,9 +2170,12 @@ pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { vtestcpd(a, b) } @@ -1787,9 +2188,12 @@ pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { vtestnzcpd(a, b) } @@ -1801,9 +2205,12 @@ pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testz_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { vtestzps256(a, b) } @@ -1815,9 +2222,12 @@ pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testc_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { vtestcps256(a, b) } @@ -1830,9 +2240,12 @@ pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testnzc_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { vtestnzcps256(a, b) } @@ -1844,9 +2257,12 @@ pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { vtestzps(a, b) } @@ -1858,9 +2274,12 @@ pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { vtestcps(a, b) } @@ -1873,9 +2292,12 @@ pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { vtestnzcps(a, b) } @@ -1883,9 +2305,12 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed double-precision (64-bit) floating-point element in /// `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { movmskpd256(a) } @@ -1893,52 +2318,70 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed single-precision (32-bit) floating-point element in /// `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { movmskps256(a) } /// Return vector of type __m256d with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_pd) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setzero_pd() -> __m256d { _mm256_set1_pd(0.0) } /// Return vector of type __m256 with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ps) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setzero_ps() -> __m256 { _mm256_set1_ps(0.0) } /// Return vector of type __m256i with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_si256) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxor))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setzero_si256() -> __m256i { _mm256_set1_epi8(0) } /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values. 
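// Illustrative sketch, not part of the patch: a made-up helper showing
// `_mm256_movemask_ps`, which packs the sign bit of each of the eight lanes
// into the low 8 bits of the returned integer.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn count_sign_set_lanes(v: std::arch::x86_64::__m256) -> u32 {
    use std::arch::x86_64::*;
    // One bit per lane; counting the set bits counts lanes whose sign bit is set.
    (_mm256_movemask_ps(v) as u32).count_ones()
}
// End of sketch.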
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_pd) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { _mm256_setr_pd(d, c, b, a) } /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ps) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_ps( a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32 ) -> __m256 { @@ -1947,9 +2390,12 @@ pub unsafe fn _mm256_set_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi8) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_epi8( e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8, e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8, @@ -1966,9 +2412,12 @@ pub unsafe fn _mm256_set_epi8( } /// Set packed 16-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi16) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_epi16( e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16, e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16, @@ -1984,9 +2433,12 @@ pub unsafe fn _mm256_set_epi16( } /// Set packed 32-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi32) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_epi32( e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32 ) -> __m256i { @@ -1994,27 +2446,36 @@ pub unsafe fn _mm256_set_epi32( } /// Set packed 64-bit integers in returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_epi64x) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { _mm256_setr_epi64x(d, c, b, a) } /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_pd) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
+#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { __m256d(a, b, c, d) } /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ps) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_ps( a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32 ) -> __m256 { @@ -2023,9 +2484,12 @@ pub unsafe fn _mm256_setr_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi8) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_epi8( e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8, e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8, @@ -2043,9 +2507,12 @@ pub unsafe fn _mm256_setr_epi8( /// Set packed 16-bit integers in returned vector with the supplied values in /// reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi16) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_epi16( e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16, e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16, @@ -2062,9 +2529,12 @@ pub unsafe fn _mm256_setr_epi16( /// Set packed 32-bit integers in returned vector with the supplied values in /// reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi32) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_epi32( e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32 ) -> __m256i { @@ -2073,38 +2543,50 @@ pub unsafe fn _mm256_setr_epi32( /// Set packed 64-bit integers in returned vector with the supplied values in /// reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_epi64x) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { mem::transmute(i64x4::new(a, b, c, d)) } /// Broadcast double-precision (64-bit) floating-point value `a` to all /// elements of returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_pd) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { _mm256_setr_pd(a, a, a, a) } /// Broadcast single-precision (32-bit) floating-point value `a` to all /// elements of returned vector. 
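// Illustrative sketch, not part of the patch: `same_vector` is a made-up
// helper illustrating the argument order of the two constructor families
// above. `_mm256_set_*` lists elements from the highest index down, while
// `_mm256_setr_*` ("reversed") lists them in memory order, so these two calls
// build identical vectors.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn same_vector() -> bool {
    use std::arch::x86_64::*;
    let a = _mm256_set_epi64x(3, 2, 1, 0); // element 3 = 3, ..., element 0 = 0
    let b = _mm256_setr_epi64x(0, 1, 2, 3); // element 0 = 0, ..., element 3 = 3
    let (mut x, mut y) = ([0i64; 4], [0i64; 4]);
    _mm256_storeu_si256(x.as_mut_ptr() as *mut __m256i, a);
    _mm256_storeu_si256(y.as_mut_ptr() as *mut __m256i, b);
    x == y // true
}
// End of sketch.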
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ps) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { _mm256_setr_ps(a, a, a, a, a, a, a, a) } /// Broadcast 8-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastb`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi8) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i { #[cfg_attr(rustfmt, rustfmt_skip)] _mm256_setr_epi8( @@ -2117,112 +2599,148 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i { /// Broadcast 16-bit integer `a` to all all elements of returned vector. /// This intrinsic may generate the `vpbroadcastw`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi16) #[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i { _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } /// Broadcast 32-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastd`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi32) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { _mm256_setr_epi32(a, a, a, a, a, a, a, a) } /// Broadcast 64-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastq`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_epi64x) #[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vmovddup))] #[cfg_attr(test, assert_instr(vinsertf128))] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i { _mm256_setr_epi64x(a, a, a, a) } /// Cast vector of type __m256d to type __m256. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ps) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 { mem::transmute(a) } /// Cast vector of type __m256 to type __m256d. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_pd) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
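// Illustrative sketch, not part of the patch: `add_bias` is a made-up helper.
// `_mm256_set1_epi32` splats one scalar into all eight lanes (the compiler may
// emit `vpbroadcastd`, per the note above); the addition uses
// `_mm256_add_epi32` from the AVX2 module stabilized later in this patch,
// hence the `avx2` feature requirement.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn add_bias(
    v: std::arch::x86_64::__m256i, bias: i32,
) -> std::arch::x86_64::__m256i {
    use std::arch::x86_64::*;
    _mm256_add_epi32(v, _mm256_set1_epi32(bias))
}
// End of sketch.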
+#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d { mem::transmute(a) } /// Casts vector of type __m256 to type __m256i. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_si256) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i { mem::transmute(a) } /// Casts vector of type __m256i to type __m256. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ps) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 { mem::transmute(a) } /// Casts vector of type __m256d to type __m256i. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_si256) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i { mem::transmute(a) } /// Casts vector of type __m256i to type __m256d. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_pd) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d { mem::transmute(a) } /// Casts vector of type __m256 to type __m128. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps256_ps128) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 { simd_shuffle4(a, a, [0, 1, 2, 3]) } /// Casts vector of type __m256d to type __m128d. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd256_pd128) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { simd_shuffle2(a, a, [0, 1]) } /// Casts vector of type __m256i to type __m128i. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_si128) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
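// Illustrative sketch, not part of the patch: a made-up helper. The
// `_mm256_cast*` intrinsics above are pure reinterpretations: the same 256
// bits viewed under a different type (exactly like the `mem::transmute` calls
// in their bodies), so they cost nothing at run time and do not convert lane
// values; the `cvt` intrinsics do that.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn bits_as_ints(v: std::arch::x86_64::__m256) -> std::arch::x86_64::__m256i {
    use std::arch::x86_64::*;
    _mm256_castps_si256(v) // reinterpret the float lanes as raw integer data
}
// End of sketch.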
+#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i { let a = a.as_i64x4(); let dst: i64x2 = simd_shuffle2(a, a, [0, 1]); @@ -2231,10 +2749,13 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps128_ps256) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 { // FIXME simd_shuffle8(a, a, [0, 1, 2, 3, -1, -1, -1, -1]) simd_shuffle8(a, a, [0, 1, 2, 3, 0, 0, 0, 0]) @@ -2242,10 +2763,13 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 { /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd128_pd256) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) simd_shuffle4(a, a, [0, 1, 0, 0]) @@ -2253,10 +2777,13 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { /// Casts vector of type __m128i to type __m256i; /// the upper 128 bits of the result are undefined. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi128_si256) #[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { let a = a.as_i64x2(); // FIXME simd_shuffle4(a, a, [0, 1, -1, -1]) @@ -2267,10 +2794,13 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// Constructs a 256-bit floating-point vector of [8 x float] from a /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain /// the value of the source vector. The upper 128 bits are set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextps128_ps256) #[inline] #[target_feature(enable = "avx,sse")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { simd_shuffle8(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7]) } @@ -2278,10 +2808,13 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { /// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 128 bits are set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextsi128_si256) #[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
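// Illustrative sketch, not part of the patch: a made-up helper.
// `_mm256_castps128_ps256` leaves the upper 128 bits undefined, while
// `_mm256_zextps128_ps256` zeroes them, so the `zext` form is the one to use
// whenever the upper lane will be observed later.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx,sse")]
unsafe fn widen_with_zero_upper(
    a: std::arch::x86_64::__m128,
) -> std::arch::x86_64::__m256 {
    use std::arch::x86_64::*;
    _mm256_zextps128_ps256(a) // low half = `a`, high half = all zeros
}
// End of sketch.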
+#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { let b = _mm_setzero_si128().as_i64x2(); let dst: i64x4 = simd_shuffle4(a.as_i64x2(), b, [0, 1, 2, 3]); @@ -2292,50 +2825,68 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// 128-bit floating-point vector of [2 x double]. The lower 128 bits /// contain the value of the source vector. The upper 128 bits are set /// to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextpd128_pd256) #[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { simd_shuffle4(a, _mm_setzero_pd(), [0, 1, 2, 3]) } /// Return vector of type `__m256` with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ps) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_undefined_ps() -> __m256 { _mm256_set1_ps(mem::uninitialized()) } /// Return vector of type `__m256d` with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_pd) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_undefined_pd() -> __m256d { _mm256_set1_pd(mem::uninitialized()) } /// Return vector of type __m256i with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_si256) #[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_undefined_si256() -> __m256i { _mm256_set1_epi8(mem::uninitialized()) } /// Set packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { simd_shuffle8(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7]) } /// Set packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128d) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { let hi: __m128 = mem::transmute(hi); let lo: __m128 = mem::transmute(lo); @@ -2343,9 +2894,12 @@ pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_m128i) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { let hi: __m128 = mem::transmute(hi); let lo: __m128 = mem::transmute(lo); @@ -2353,25 +2907,34 @@ pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { } /// Set packed __m256 returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { _mm256_set_m128(hi, lo) } /// Set packed __m256d returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128d) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { _mm256_set_m128d(hi, lo) } /// Set packed __m256i returned vector with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_m128i) #[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { _mm256_set_m128i(hi, lo) } @@ -2380,9 +2943,12 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128) #[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128( hiaddr: *const f32, loaddr: *const f32 ) -> __m256 { @@ -2394,9 +2960,12 @@ pub unsafe fn _mm256_loadu2_m128( /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128d) #[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128d( hiaddr: *const f64, loaddr: *const f64 ) -> __m256d { @@ -2407,9 +2976,12 @@ pub unsafe fn _mm256_loadu2_m128d( /// Load two 128-bit values (composed of integer data) from memory, and combine /// them into a 256-bit value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu2_m128i) #[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. 
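// Illustrative sketch, not part of the patch: `gather_two_rows` is a made-up
// helper. `_mm256_loadu2_m128` stitches two unrelated, possibly unaligned
// 128-bit locations into one 256-bit register (low half from `loaddr`, high
// half from `hiaddr`), e.g. two rows of a small matrix.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx,sse")]
unsafe fn gather_two_rows(
    row_lo: &[f32; 4], row_hi: &[f32; 4],
) -> std::arch::x86_64::__m256 {
    use std::arch::x86_64::*;
    _mm256_loadu2_m128(row_hi.as_ptr(), row_lo.as_ptr()) // (hiaddr, loaddr)
}
// End of sketch.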
+#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_loadu2_m128i( hiaddr: *const __m128i, loaddr: *const __m128i ) -> __m256i { @@ -2421,9 +2993,12 @@ pub unsafe fn _mm256_loadu2_m128i( /// single-precision (32-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128) #[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128( hiaddr: *mut f32, loaddr: *mut f32, a: __m256 ) { @@ -2437,9 +3012,12 @@ pub unsafe fn _mm256_storeu2_m128( /// double-precision (64-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128d) #[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128d( hiaddr: *mut f64, loaddr: *mut f64, a: __m256d ) { @@ -2452,9 +3030,12 @@ pub unsafe fn _mm256_storeu2_m128d( /// Store the high and low 128-bit halves (each composed of integer data) from /// `a` into memory two different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu2_m128i) #[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_storeu2_m128i( hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i ) { @@ -2465,9 +3046,12 @@ pub unsafe fn _mm256_storeu2_m128i( } /// Returns the first element of the input vector of [8 x float]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtss_f32) #[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(movss))] FIXME +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { simd_extract(a, 0) } diff --git a/coresimd/x86/avx2.rs b/coresimd/x86/avx2.rs index 66683f630d..a1e9e1f2a6 100644 --- a/coresimd/x86/avx2.rs +++ b/coresimd/x86/avx2.rs @@ -27,99 +27,135 @@ use mem; use stdsimd_test::assert_instr; /// Computes the absolute values of packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { mem::transmute(pabsd(a.as_i32x8())) } /// Computes the absolute values of packed 16-bit integers in `a`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { mem::transmute(pabsw(a.as_i16x16())) } /// Computes the absolute values of packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { mem::transmute(pabsb(a.as_i8x32())) } /// Add packed 64-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_add(a.as_i64x4(), b.as_i64x4())) } /// Add packed 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_add(a.as_i32x8(), b.as_i32x8())) } /// Add packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_add(a.as_i16x16(), b.as_i16x16())) } /// Add packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_add(a.as_i8x32(), b.as_i8x32())) } /// Add packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(paddsb(a.as_i8x32(), b.as_i8x32())) } /// Add packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(paddsw(a.as_i16x16(), b.as_i16x16())) } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. 
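// Illustrative sketch, not part of the patch: a made-up helper contrasting the
// plain and saturating adds above. `_mm256_add_epi8` wraps on overflow, while
// `_mm256_adds_epi8` clamps to the type's range.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn wrap_vs_saturate() -> (i8, i8) {
    use std::arch::x86_64::*;
    let a = _mm256_set1_epi8(127);
    let b = _mm256_set1_epi8(1);
    let wrapped = _mm256_add_epi8(a, b); // every lane wraps to -128
    let clamped = _mm256_adds_epi8(a, b); // every lane saturates at 127
    let (mut w, mut c) = ([0i8; 32], [0i8; 32]);
    _mm256_storeu_si256(w.as_mut_ptr() as *mut __m256i, wrapped);
    _mm256_storeu_si256(c.as_mut_ptr() as *mut __m256i, clamped);
    (w[0], c[0]) // (-128, 127)
}
// End of sketch.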
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(paddusb(a.as_u8x32(), b.as_u8x32())) } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_adds_epu16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(paddusw(a.as_u16x16(), b.as_u16x16())) } /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary /// result, shift the result right by `n` bytes, and return the low 16 bytes. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpalignr, n = 7))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { let n = n as u32; // If palignr is shifting the pair of vectors more than the size of two @@ -284,18 +320,24 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_and_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_and(a.as_i64x4(), b.as_i64x4())) } /// Compute the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_andnot_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { let all_ones = _mm256_set1_epi8(-1); mem::transmute(simd_and( @@ -305,26 +347,35 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pavgw(a.as_u16x16(), b.as_u16x16())) } /// Average packed unsigned 8-bit integers in `a` and `b`. 
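A sketch of the AND-NOT intrinsic (hypothetical `demo_andnot` helper, same assumptions as the earlier sketches):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_andnot() {
        use std::arch::x86_64::*;
        // Computes (!a) & b per bit: !0b1100 & 0b1010 == 0b0010.
        let a = _mm256_set1_epi32(0b1100);
        let b = _mm256_set1_epi32(0b1010);
        let r = _mm256_andnot_si256(a, b);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [0b0010; 8]);
    }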
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_avg_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pavgb(a.as_u8x32(), b.as_u8x32())) } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i32x4(); @@ -354,10 +405,13 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blend_epi32( a: __m256i, b: __m256i, imm8: i32 ) -> __m256i { @@ -418,10 +472,13 @@ pub unsafe fn _mm256_blend_epi32( } /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blend_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blend_epi16( a: __m256i, b: __m256i, imm8: i32 ) -> __m256i { @@ -639,9 +696,12 @@ pub unsafe fn _mm256_blend_epi16( } /// Blend packed 8-bit integers from `a` and `b` using `mask`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_blendv_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendvb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_blendv_epi8( a: __m256i, b: __m256i, mask: __m256i ) -> __m256i { @@ -654,9 +714,12 @@ pub unsafe fn _mm256_blendv_epi8( /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 128-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastb_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { let zero = _mm_setzero_si128(); let ret = simd_shuffle16(a.as_i8x16(), zero.as_i8x16(), [0_u32; 16]); @@ -665,9 +728,12 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 256-bit returned value. 
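A sketch of the immediate-controlled blend (the `demo_blend` helper is hypothetical; the mask must be a literal because of `rustc_args_required_const`):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_blend() {
        use std::arch::x86_64::*;
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        // Bit i of the compile-time mask selects lane i from `b`, otherwise from `a`.
        let r = _mm256_blend_epi32(a, b, 0b0000_1111);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [10, 11, 12, 13, 4, 5, 6, 7]);
    }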
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastb_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); let ret = simd_shuffle32(a.as_i8x16(), zero.as_i8x16(), [0_u32; 32]); @@ -678,9 +744,12 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 128-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastd_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { let zero = _mm_setzero_si128(); let ret = simd_shuffle4(a.as_i32x4(), zero.as_i32x4(), [0_u32; 4]); @@ -691,9 +760,12 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 256-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastd_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); let ret = simd_shuffle8(a.as_i32x4(), zero.as_i32x4(), [0_u32; 8]); @@ -702,9 +774,12 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 128-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastq_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { let zero = _mm_setzero_si128().as_i64x2(); let ret = simd_shuffle2(a.as_i64x2(), zero, [0_u32; 2]); @@ -715,9 +790,12 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { // often compiled to vbroadcastsd. /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 256-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastq_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0_u32; 4]); @@ -726,18 +804,24 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. 
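A sketch of the 32-bit broadcast (hypothetical `demo_broadcast` helper, same assumptions as above):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_broadcast() {
        use std::arch::x86_64::*;
        // The lowest 32-bit lane of `a` (7) is replicated into all eight lanes.
        let a = _mm_setr_epi32(7, 1, 2, 3);
        let r = _mm256_broadcastd_epi32(a);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [7; 8]);
    }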
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastsd_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmovddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { simd_shuffle2(a, _mm_setzero_pd(), [0_u32; 2]) } /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsd_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { simd_shuffle4(a, _mm_setzero_pd(), [0_u32; 4]) } @@ -746,8 +830,11 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { // vbroadcastf128. /// Broadcast 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastsi128_si256) #[inline] #[target_feature(enable = "avx2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); let ret = simd_shuffle4(a.as_i64x2(), zero.as_i64x2(), [0, 1, 0, 1]); @@ -756,27 +843,36 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastss_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { simd_shuffle4(a, _mm_setzero_ps(), [0_u32; 4]) } /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastss_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { simd_shuffle8(a, _mm_setzero_ps(), [0_u32; 8]) } /// Broadcast the low packed 16-bit integer from a to all elements of /// the 128-bit returned value +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_broadcastw_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { let zero = _mm_setzero_si128(); let ret = simd_shuffle8(a.as_i16x8(), zero.as_i16x8(), [0_u32; 8]); @@ -785,9 +881,12 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 256-bit returned value +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcastw_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); let ret = simd_shuffle16(a.as_i16x8(), zero.as_i16x8(), [0_u32; 16]); @@ -795,81 +894,111 @@ pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_eq(a.as_i64x4(), b.as_i64x4())) } /// Compare packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_eq(a.as_i32x8(), b.as_i32x8())) } /// Compare packed 16-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_eq(a.as_i16x16(), b.as_i16x16())) } /// Compare packed 8-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_eq(a.as_i8x32(), b.as_i8x32())) } /// Compare packed 64-bit integers in `a` and `b` for greater-than. 
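A sketch of packed equality comparison (hypothetical `demo_cmpeq` helper; equal lanes come back as all-ones masks):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_cmpeq() {
        use std::arch::x86_64::*;
        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_setr_epi32(0, 9, 2, 9, 4, 9, 6, 9);
        // Equal lanes become all-ones (-1), unequal lanes become zero.
        let r = _mm256_cmpeq_epi32(a, b);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [-1, 0, -1, 0, -1, 0, -1, 0]);
    }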
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_gt(a.as_i64x4(), b.as_i64x4())) } /// Compare packed 32-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_gt(a.as_i32x8(), b.as_i32x8())) } /// Compare packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_gt(a.as_i16x16(), b.as_i16x16())) } /// Compare packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute::(simd_gt(a.as_i8x32(), b.as_i8x32())) } /// Sign-extend 16-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { mem::transmute::(simd_cast(a.as_i16x8())) } /// Sign-extend 16-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { let a = a.as_i16x8(); let v64: i16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]); @@ -877,25 +1006,34 @@ pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { } /// Sign-extend 32-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { mem::transmute::(simd_cast(a.as_i32x4())) } /// Sign-extend 8-bit integers to 16-bit integers. 
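A sketch of sign extension from 16-bit to 32-bit lanes (hypothetical `demo_sign_extend` helper):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_sign_extend() {
        use std::arch::x86_64::*;
        // Eight 16-bit lanes are sign-extended into eight 32-bit lanes.
        let a = _mm_setr_epi16(-1, 2, -3, 4, -5, 6, -7, 8);
        let r = _mm256_cvtepi16_epi32(a);
        let mut out = [0i32; 8];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert_eq!(out, [-1, 2, -3, 4, -5, 6, -7, 8]);
    }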
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { mem::transmute::(simd_cast(a.as_i8x16())) } /// Sign-extend 8-bit integers to 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { let a = a.as_i8x16(); let v64: i8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); @@ -903,9 +1041,12 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 64-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi8_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { let a = a.as_i8x16(); let v32: i8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]); @@ -914,18 +1055,24 @@ pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit /// integers, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { mem::transmute::(simd_cast(a.as_u16x8())) } /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit /// integers. The upper four elements of `a` are unused. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { let a = a.as_u16x8(); let v64: u16x4 = simd_shuffle4(a, a, [0, 1, 2, 3]); @@ -933,26 +1080,35 @@ pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { mem::transmute::(simd_cast(a.as_u32x4())) } /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { mem::transmute::(simd_cast(a.as_u8x16())) } /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit /// integers. The upper eight elements of `a` are unused. 
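A sketch of zero extension, which differs from the sign-extending variants above (hypothetical `demo_zero_extend` helper):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_zero_extend() {
        use std::arch::x86_64::*;
        // 0xFF bytes are zero-extended, so each 16-bit lane holds 255, not -1.
        let a = _mm_set1_epi8(-1);
        let r = _mm256_cvtepu8_epi16(a);
        let mut out = [0i16; 16];
        _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
        assert!(out.iter().all(|&x| x == 255));
    }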
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { let a = a.as_u8x16(); let v64: u8x8 = simd_shuffle8(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); @@ -961,9 +1117,12 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit /// integers. The upper twelve elements of `a` are unused. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu8_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { let a = a.as_u8x16(); let v32: u8x4 = simd_shuffle4(a, a, [0, 1, 2, 3]); @@ -971,10 +1130,13 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { } /// Extract 128 bits (of integer data) from `a` selected with `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti128_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { let a = a.as_i64x4(); let b = _mm256_undefined_si256().as_i64x4(); @@ -986,51 +1148,69 @@ pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(phaddw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadd_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(phaddd(a.as_i32x8(), b.as_i32x8())) } /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hadds_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. 
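A sketch of extracting one 128-bit half of a 256-bit vector (hypothetical `demo_extract` helper; the lane index must be a compile-time constant):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_extract() {
        use std::arch::x86_64::*;
        let a = _mm256_setr_epi64x(0, 1, 2, 3);
        // imm8 = 1 selects the upper 128-bit half (64-bit lanes 2 and 3).
        let hi = _mm256_extracti128_si256(a, 1);
        let mut out = [0i64; 2];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, hi);
        assert_eq!(out, [2, 3]);
    }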
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(phsubw(a.as_i16x16(), b.as_i16x16())) } /// Horizontally subtract adjacent pairs of 32-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsub_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(phsubd(a.as_i32x8(), b.as_i32x8())) } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_hsubs_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) } @@ -1038,10 +1218,13 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_epi32( slice: *const i32, offsets: __m128i, scale: i32 ) -> __m128i { @@ -1062,10 +1245,13 @@ pub unsafe fn _mm_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i32gather_epi32( src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, scale: i32, @@ -1086,10 +1272,13 @@ pub unsafe fn _mm_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_epi32( slice: *const i32, offsets: __m256i, scale: i32 ) -> __m256i { @@ -1110,10 +1299,13 @@ pub unsafe fn _mm256_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
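A sketch of a plain (unmasked) 32-bit gather (hypothetical `demo_gather` helper; `scale` must be a compile-time constant of 1, 2, 4 or 8):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_gather() {
        use std::arch::x86_64::*;
        let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
        // With scale = 4, each offset is an element index into the i32 slice.
        let offsets = _mm_setr_epi32(0, 2, 4, 6);
        let r = _mm_i32gather_epi32(data.as_ptr(), offsets, 4);
        let mut out = [0i32; 4];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(out, [10, 12, 14, 16]);
    }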
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i32gather_epi32( src: __m256i, slice: *const i32, offsets: __m256i, mask: __m256i, scale: i32, @@ -1134,10 +1326,13 @@ pub unsafe fn _mm256_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_ps( slice: *const f32, offsets: __m128i, scale: i32 ) -> __m128 { @@ -1157,10 +1352,13 @@ pub unsafe fn _mm_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i32gather_ps( src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32 ) -> __m128 { @@ -1177,10 +1375,13 @@ pub unsafe fn _mm_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_ps( slice: *const f32, offsets: __m256i, scale: i32 ) -> __m256 { @@ -1200,10 +1401,13 @@ pub unsafe fn _mm256_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i32gather_ps( src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, scale: i32 ) -> __m256 { @@ -1220,10 +1424,13 @@ pub unsafe fn _mm256_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_epi64( slice: *const i64, offsets: __m128i, scale: i32 ) -> __m128i { @@ -1244,10 +1451,13 @@ pub unsafe fn _mm_i32gather_epi64( /// where /// `scale` is between 1 and 8. 
If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i32gather_epi64( src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, scale: i32, @@ -1268,10 +1478,13 @@ pub unsafe fn _mm_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_epi64( slice: *const i64, offsets: __m128i, scale: i32 ) -> __m256i { @@ -1292,10 +1505,13 @@ pub unsafe fn _mm256_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i32gather_epi64( src: __m256i, slice: *const i64, offsets: __m128i, mask: __m256i, scale: i32, @@ -1316,10 +1532,13 @@ pub unsafe fn _mm256_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i32gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i32gather_pd( slice: *const f64, offsets: __m128i, scale: i32 ) -> __m128d { @@ -1339,10 +1558,13 @@ pub unsafe fn _mm_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i32gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i32gather_pd( src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, scale: i32, @@ -1360,10 +1582,13 @@ pub unsafe fn _mm_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i32gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i32gather_pd( slice: *const f64, offsets: __m128i, scale: i32 ) -> __m256d { @@ -1383,10 +1608,13 @@ pub unsafe fn _mm256_i32gather_pd( /// where /// `scale` is between 1 and 8. 
If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i32gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i32gather_pd( src: __m256d, slice: *const f64, offsets: __m128i, mask: __m256d, scale: i32, @@ -1404,10 +1632,13 @@ pub unsafe fn _mm256_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_epi32( slice: *const i32, offsets: __m128i, scale: i32 ) -> __m128i { @@ -1428,10 +1659,13 @@ pub unsafe fn _mm_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i64gather_epi32( src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, scale: i32, @@ -1452,10 +1686,13 @@ pub unsafe fn _mm_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_epi32( slice: *const i32, offsets: __m256i, scale: i32 ) -> __m128i { @@ -1476,10 +1713,13 @@ pub unsafe fn _mm256_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i64gather_epi32( src: __m128i, slice: *const i32, offsets: __m256i, mask: __m128i, scale: i32, @@ -1500,10 +1740,13 @@ pub unsafe fn _mm256_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_ps( slice: *const f32, offsets: __m128i, scale: i32 ) -> __m128 { @@ -1523,10 +1766,13 @@ pub unsafe fn _mm_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i64gather_ps( src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, scale: i32 ) -> __m128 { @@ -1543,10 +1789,13 @@ pub unsafe fn _mm_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_ps( slice: *const f32, offsets: __m256i, scale: i32 ) -> __m128 { @@ -1566,10 +1815,13 @@ pub unsafe fn _mm256_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i64gather_ps( src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, scale: i32 ) -> __m128 { @@ -1586,10 +1838,13 @@ pub unsafe fn _mm256_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_epi64( slice: *const i64, offsets: __m128i, scale: i32 ) -> __m128i { @@ -1610,10 +1865,13 @@ pub unsafe fn _mm_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i64gather_epi64( src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, scale: i32, @@ -1634,10 +1892,13 @@ pub unsafe fn _mm_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_epi64( slice: *const i64, offsets: __m256i, scale: i32 ) -> __m256i { @@ -1658,10 +1919,13 @@ pub unsafe fn _mm256_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i64gather_epi64( src: __m256i, slice: *const i64, offsets: __m256i, mask: __m256i, scale: i32, @@ -1682,10 +1946,13 @@ pub unsafe fn _mm256_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_i64gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_i64gather_pd( slice: *const f64, offsets: __m128i, scale: i32 ) -> __m128d { @@ -1705,10 +1972,13 @@ pub unsafe fn _mm_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_i64gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mask_i64gather_pd( src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, scale: i32, @@ -1726,10 +1996,13 @@ pub unsafe fn _mm_mask_i64gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_i64gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_i64gather_pd( slice: *const f64, offsets: __m256i, scale: i32 ) -> __m256d { @@ -1749,10 +2022,13 @@ pub unsafe fn _mm256_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. 
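A sketch of a masked 64-bit gather, showing the fallback to `src` for masked-off lanes (hypothetical `demo_masked_gather` helper):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_masked_gather() {
        use std::arch::x86_64::*;
        let data: [i64; 4] = [100, 101, 102, 103];
        let src = _mm_set1_epi64x(-1);
        let offsets = _mm_set_epi64x(3, 0); // lane 0 reads index 0, lane 1 index 3
        let mask = _mm_set_epi64x(0, -1);   // only lane 0 has its high bit set
        let r = _mm_mask_i64gather_epi64(src, data.as_ptr(), offsets, mask, 8);
        let mut out = [0i64; 2];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
        assert_eq!(out, [100, -1]); // masked-off lane falls back to `src`
    }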
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_i64gather_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mask_i64gather_pd( src: __m256d, slice: *const f64, offsets: __m256i, mask: __m256d, scale: i32, @@ -1769,10 +2045,13 @@ pub unsafe fn _mm256_mask_i64gather_pd( /// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the /// location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti128_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_inserti128_si256( a: __m256i, b: __m128i, imm8: i32 ) -> __m256i { @@ -1788,9 +2067,12 @@ pub unsafe fn _mm256_inserti128_si256( /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) } @@ -1799,9 +2081,12 @@ pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { /// corresponding signed 8-bit integer from `b`, producing intermediate /// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maddubs_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaddubsw(a.as_u8x32(), b.as_u8x32())) } @@ -1809,9 +2094,12 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_epi32( mem_addr: *const i32, mask: __m128i ) -> __m128i { @@ -1824,9 +2112,12 @@ pub unsafe fn _mm_maskload_epi32( /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_epi32( mem_addr: *const i32, mask: __m256i ) -> __m256i { @@ -1839,9 +2130,12 @@ pub unsafe fn _mm256_maskload_epi32( /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskload_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskload_epi64( mem_addr: *const i64, mask: __m128i ) -> __m128i { @@ -1854,9 +2148,12 @@ pub unsafe fn _mm_maskload_epi64( /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskload_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskload_epi64( mem_addr: *const i64, mask: __m256i ) -> __m256i { @@ -1869,9 +2166,12 @@ pub unsafe fn _mm256_maskload_epi64( /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_epi32( mem_addr: *mut i32, mask: __m128i, a: __m128i ) { @@ -1885,9 +2185,12 @@ pub unsafe fn _mm_maskstore_epi32( /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_epi32( mem_addr: *mut i32, mask: __m256i, a: __m256i ) { @@ -1901,9 +2204,12 @@ pub unsafe fn _mm256_maskstore_epi32( /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskstore_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskstore_epi64( mem_addr: *mut i64, mask: __m128i, a: __m128i ) { @@ -1917,9 +2223,12 @@ pub unsafe fn _mm_maskstore_epi64( /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). 
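A sketch of a masked store (hypothetical `demo_maskstore` helper; unselected lanes keep whatever was already in memory):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_maskstore() {
        use std::arch::x86_64::*;
        let mut out = [9i32; 8];
        let vals = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        // Only lanes whose mask element has its high bit set are written back.
        let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
        _mm256_maskstore_epi32(out.as_mut_ptr(), mask, vals);
        assert_eq!(out, [1, 9, 3, 9, 5, 9, 7, 9]);
    }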
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskstore_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_maskstore_epi64( mem_addr: *mut i64, mask: __m256i, a: __m256i ) { @@ -1932,117 +2241,156 @@ pub unsafe fn _mm256_maskstore_epi64( /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaxsw(a.as_i16x16(), b.as_i16x16())) } /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaxsd(a.as_i32x8(), b.as_i32x8())) } /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaxsb(a.as_i8x32(), b.as_i8x32())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaxuw(a.as_u16x16(), b.as_u16x16())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxud))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaxud(a.as_u32x8(), b.as_u32x8())) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxub))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmaxub(a.as_u8x32(), b.as_u8x32())) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pminsw(a.as_i16x16(), b.as_i16x16())) } /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pminsd(a.as_i32x8(), b.as_i32x8())) } /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pminsb(a.as_i8x32(), b.as_i8x32())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pminuw(a.as_u16x16(), b.as_u16x16())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminud))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pminud(a.as_u32x8(), b.as_u32x8())) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminub))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pminub(a.as_u8x32(), b.as_u8x32())) } /// Create mask from the most significant bit of each 8-bit element in `a`, /// return the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_movemask_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovmskb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { pmovmskb(a.as_i8x32()) } @@ -2054,10 +2402,13 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// selected from `b` starting at on the offset specified in `imm8`. Eight /// quadruplets are formed from sequential 8-bit integers selected from `a` /// starting at the offset specified in `imm8`. 
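A sketch combining a greater-than comparison with `_mm256_movemask_epi8` to obtain a scalar bitmask (hypothetical `demo_movemask` helper):

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "avx2")]
    unsafe fn demo_movemask() {
        use std::arch::x86_64::*;
        let a = _mm256_setr_epi32(1, -1, 2, -2, 3, -3, 4, -4);
        let zero = _mm256_setzero_si256();
        // Lanes of `a` greater than zero become all-ones; movemask then packs
        // one bit per byte of the comparison result into a 32-bit integer.
        let gt = _mm256_cmpgt_epi32(a, zero);
        let bits = _mm256_movemask_epi8(gt);
        assert_eq!(bits as u32, 0x0F0F_0F0F);
    }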
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mpsadbw_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mpsadbw_epu8( a: __m256i, b: __m256i, imm8: i32 ) -> __m256i { @@ -2076,9 +2427,12 @@ pub unsafe fn _mm256_mpsadbw_epu8( /// `a` and `b` /// /// Return the 64-bit results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuldq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmuldq(a.as_i32x8(), b.as_i32x8())) } @@ -2087,9 +2441,12 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { /// element in `a` and `b` /// /// Return the unsigned 64-bit results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_epu32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuludq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmuludq(a.as_u32x8(), b.as_u32x8())) } @@ -2097,9 +2454,12 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmulhw(a.as_i16x16(), b.as_i16x16())) } @@ -2107,9 +2467,12 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhi_epu16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmulhuw(a.as_u16x16(), b.as_u16x16())) } @@ -2117,9 +2480,12 @@ pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers, and return the low 16 bits of the /// intermediate integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmullw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) } @@ -2127,9 +2493,12 @@ pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 32-bit integers in `a` and `b`, producing /// intermediate 64-bit integers, and return the low 16 bits of the /// intermediate integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mullo_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) } @@ -2138,54 +2507,72 @@ pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { /// intermediate signed 32-bit integers. 
Truncate each intermediate /// integer to the 18 most significant bits, round by adding 1, and /// return bits [16:1] +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mulhrs_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhrsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) } /// Compute the bitwise OR of 256 bits (representing integer data) in `a` /// and `b` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_or(a.as_i32x8(), b.as_i32x8())) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpacksswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(packsswb(a.as_i16x16(), b.as_i16x16())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packs_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackssdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(packssdw(a.as_i32x8(), b.as_i32x8())) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackuswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(packuswb(a.as_i16x16(), b.as_i16x16())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_packus_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackusdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(packusdw(a.as_i32x8(), b.as_i32x8())) } @@ -2194,18 +2581,24 @@ pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { /// /// The last 3 bits of each integer of `b` are used as addresses into the 8 /// integers of `a`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(permd(a.as_u32x8(), b.as_u32x8())) } /// Permutes 64-bit integers from `a` using control mask `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermpd, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let zero = _mm256_setzero_si256().as_i64x4(); @@ -2255,10 +2648,13 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute2x128_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute2x128_si256( a: __m256i, b: __m256i, imm8: i32 ) -> __m256i { @@ -2274,10 +2670,13 @@ pub unsafe fn _mm256_permute2x128_si256( /// Shuffle 64-bit floating-point elements in `a` across lanes using the /// control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permute4x64_pd) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { let imm8 = (imm8 & 0xFF) as u8; let undef = _mm256_undefined_pd(); @@ -2326,9 +2725,12 @@ pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle eight 32-bit foating-point elements in `a` across lanes using /// the corresponding 32-bit integer index in `idx`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutevar8x32_ps) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { permps(a, idx.as_i32x8()) } @@ -2337,9 +2739,12 @@ pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { /// and `b`, then horizontally sum each consecutive 8 differences to /// produce four unsigned 16-bit integers, and pack these unsigned 16-bit /// integers in the low 16 bits of the 64-bit return value +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sad_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsadbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psadbw(a.as_u8x32(), b.as_u8x32())) } @@ -2373,9 +2778,12 @@ pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// r /// } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(pshufb(a.as_u8x32(), b.as_u8x32())) } @@ -2416,10 +2824,13 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { // simd_shuffleX requires that its selector parameter be made up of // constant values, but we can't enforce that here. In spirit, we need @@ -2491,10 +2902,13 @@ pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied /// to the output. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflehi_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i16x16(); @@ -2549,10 +2963,13 @@ pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied /// to the output. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shufflelo_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { let imm8 = (imm8 & 0xFF) as u8; let a = a.as_i16x16(); @@ -2607,9 +3024,12 @@ pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { /// Negate packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psignw(a.as_i16x16(), b.as_i16x16())) } @@ -2617,9 +3037,12 @@ pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 32-bit integers in `a` when the corresponding signed /// 32-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psignd(a.as_i32x8(), b.as_i32x8())) } @@ -2627,72 +3050,96 @@ pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 8-bit integers in `a` when the corresponding signed /// 8-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sign_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psignb(a.as_i8x32(), b.as_i8x32())) } /// Shift packed 16-bit integers in `a` left by `count` while /// shifting in zeros, and return the result +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psllw(a.as_i16x16(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` left by `count` while /// shifting in zeros, and return the result +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { mem::transmute(pslld(a.as_i32x8(), count.as_i32x4())) } /// Shift packed 64-bit integers in `a` left by `count` while /// shifting in zeros, and return the result +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sll_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psllq(a.as_i64x4(), count.as_i64x2())) } /// Shift packed 16-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { mem::transmute(pslliw(a.as_i16x16(), imm8)) } /// Shift packed 32-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { mem::transmute(psllid(a.as_i32x8(), imm8)) } /// Shift packed 64-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { mem::transmute(pslliq(a.as_i64x4(), imm8)) } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_slli_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { let a = a.as_i64x4(); macro_rules! call { @@ -2704,10 +3151,13 @@ pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bslli_epi128) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { let a = a.as_i64x4(); macro_rules! call { @@ -2721,9 +3171,12 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psllvd(a.as_i32x4(), count.as_i32x4())) } @@ -2731,9 +3184,12 @@ pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { mem::transmute(psllvd256(a.as_i32x8(), count.as_i32x8())) } @@ -2741,9 +3197,12 @@ pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sllv_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psllvq(a.as_i64x2(), count.as_i64x2())) } @@ -2751,72 +3210,96 @@ pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sllv_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { mem::transmute(psllvq256(a.as_i64x4(), count.as_i64x4())) } /// Shift packed 16-bit integers in `a` right by `count` while /// shifting in sign bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psraw(a.as_i16x16(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` right by `count` while /// shifting in sign bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psrad(a.as_i32x8(), count.as_i32x4())) } /// Shift packed 16-bit integers in `a` right by `imm8` while /// shifting in sign bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { mem::transmute(psraiw(a.as_i16x16(), imm8)) } /// Shift packed 32-bit integers in `a` right by `imm8` while /// shifting in sign bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { mem::transmute(psraid(a.as_i32x8(), imm8)) } /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psravd(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { mem::transmute(psravd256(a.as_i32x8(), count.as_i32x8())) } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { let a = a.as_i64x4(); macro_rules! call { @@ -2828,10 +3311,13 @@ pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bsrli_epi128) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { let a = a.as_i64x4(); macro_rules! call { @@ -2844,90 +3330,120 @@ pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psrlw(a.as_i16x16(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psrld(a.as_i32x8(), count.as_i32x4())) } /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srl_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { mem::transmute(psrlq(a.as_i64x4(), count.as_i64x2())) } /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { mem::transmute(psrliw(a.as_i16x16(), imm8)) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { mem::transmute(psrlid(a.as_i32x8(), imm8)) } /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srli_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { mem::transmute(psrliq(a.as_i64x4(), imm8)) } /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psrlvd(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { mem::transmute(psrlvd256(a.as_i32x8(), count.as_i32x8())) } /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srlv_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psrlvq(a.as_i64x2(), count.as_i64x2())) } /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, +/// +/// [Intel's 
documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srlv_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { mem::transmute(psrlvq256(a.as_i64x4(), count.as_i64x4())) } @@ -2935,69 +3451,93 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { // TODO _mm256_stream_load_si256 (__m256i const* mem_addr) /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) } /// Subtract packed 32-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) } /// Subtract packed 64-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) } /// Subtract packed 8-bit integers in `b` from packed 16-bit integers in `a` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in /// `a` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psubsw(a.as_i16x16(), b.as_i16x16())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in /// `a` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psubsb(a.as_i8x32(), b.as_i8x32())) } /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit /// integers in `a` using saturation. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psubusw(a.as_u16x16(), b.as_u16x16())) } /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit /// integers in `a` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_subs_epu8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { mem::transmute(psubusb(a.as_u8x32(), b.as_u8x32())) } @@ -3040,9 +3580,12 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(rustfmt, rustfmt_skip)] let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [ @@ -3091,9 +3634,12 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi8) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(rustfmt, rustfmt_skip)] let r: i8x32 = simd_shuffle32(a.as_i8x32(), b.as_i8x32(), [ @@ -3140,9 +3686,12 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { let r: i16x16 = simd_shuffle16( a.as_i16x16(), @@ -3190,9 +3739,12 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi16) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { let r: i16x16 = simd_shuffle16( a.as_i16x16(), @@ -3238,9 +3790,12 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { let r: i32x8 = simd_shuffle8( a.as_i32x8(), @@ -3284,9 +3839,12 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) 
-> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi32) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { let r: i32x8 = simd_shuffle8( a.as_i32x8(), @@ -3330,9 +3888,12 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpackhi_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]); mem::transmute(r) @@ -3372,9 +3933,12 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_unpacklo_epi64) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vunpcklpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { let r: i64x4 = simd_shuffle4(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]); mem::transmute(r) @@ -3382,9 +3946,12 @@ pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise XOR of 256 bits (representing integer data) /// in `a` and `b` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_si256) #[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vxorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { mem::transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) } @@ -3393,10 +3960,13 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi8) #[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { let imm8 = (imm8 & 31) as u32; simd_extract(a.as_i8x32(), imm8) @@ -3406,37 +3976,49 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi16) #[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { let imm8 = (imm8 & 15) as u32; simd_extract(a.as_i16x16(), imm8) } /// Extract a 32-bit integer from `a`, selected with `imm8`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi32) #[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { let imm8 = (imm8 & 7) as u32; simd_extract(a.as_i32x8(), imm8) } /// Returns the first element of the input vector of [4 x double]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsd_f64) #[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movsd))] FIXME +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { simd_extract(a, 0) } /// Returns the first element of the input vector of [8 x i32]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsi256_si32) #[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movd))] FIXME +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { simd_extract(a.as_i32x8(), 0) } diff --git a/coresimd/x86/bmi.rs b/coresimd/x86/bmi1.rs similarity index 78% rename from coresimd/x86/bmi.rs rename to coresimd/x86/bmi1.rs index aac578cb45..dc27fb2b7e 100644 --- a/coresimd/x86/bmi.rs +++ b/coresimd/x86/bmi1.rs @@ -14,9 +14,12 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(bextr))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { _bextr2_u32( a, @@ -29,33 +32,45 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(bextr))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { x86_bmi_bextr_32(a, control) } /// Bitwise logical `AND` of inverted `a` with `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(andn))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { !a & b } /// Extract lowest set isolated bit. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(blsi))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsi_u32(x: u32) -> u32 { x & x.wrapping_neg() } /// Get mask up to lowest set bit. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(blsmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsmsk_u32(x: u32) -> u32 { x ^ (x.wrapping_sub(1_u32)) } @@ -63,9 +78,12 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(blsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsr_u32(x: u32) -> u32 { x & (x.wrapping_sub(1)) } @@ -73,9 +91,12 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _tzcnt_u32(x: u32) -> u32 { x.trailing_zeros() } @@ -83,9 +104,12 @@ pub unsafe fn _tzcnt_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_32) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 { x.trailing_zeros() as i32 } diff --git a/coresimd/x86/bmi2.rs b/coresimd/x86/bmi2.rs index 686576ce42..adb1b52fee 100644 --- a/coresimd/x86/bmi2.rs +++ b/coresimd/x86/bmi2.rs @@ -17,11 +17,14 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u32) #[inline] // LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))] #[target_feature(enable = "bmi2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { let result: u64 = (a as u64) * (b as u64); *hi = (result >> 32) as u32; @@ -29,27 +32,36 @@ pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { } /// Zero higher bits of `a` >= `index`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u32) #[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { x86_bmi2_bzhi_32(a, index) } /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u32) #[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { x86_bmi2_pdep_32(a, mask) } /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u32) #[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 { x86_bmi2_pext_32(a, mask) } diff --git a/coresimd/x86/bswap.rs b/coresimd/x86/bswap.rs index 72b981e3ba..d7152562c7 100644 --- a/coresimd/x86/bswap.rs +++ b/coresimd/x86/bswap.rs @@ -6,8 +6,11 @@ use stdsimd_test::assert_instr; /// Return an integer with the reversed byte order of x +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap) #[inline] #[cfg_attr(test, assert_instr(bswap))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bswap(x: i32) -> i32 { bswap_i32(x) } diff --git a/coresimd/x86/cpuid.rs b/coresimd/x86/cpuid.rs index 9f72e6b53e..7e000625ce 100644 --- a/coresimd/x86/cpuid.rs +++ b/coresimd/x86/cpuid.rs @@ -10,14 +10,19 @@ use stdsimd_test::assert_instr; /// Result of the `cpuid` instruction. #[derive(Copy, Clone, Eq, Ord, PartialEq, PartialOrd)] #[cfg_attr(feature = "cargo-clippy", allow(stutter))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub struct CpuidResult { /// EAX register. + #[stable(feature = "simd_x86", since = "1.27.0")] pub eax: u32, /// EBX register. + #[stable(feature = "simd_x86", since = "1.27.0")] pub ebx: u32, /// ECX register. + #[stable(feature = "simd_x86", since = "1.27.0")] pub ecx: u32, /// EDX register. + #[stable(feature = "simd_x86", since = "1.27.0")] pub edx: u32, } @@ -46,6 +51,7 @@ pub struct CpuidResult { /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf #[inline] #[cfg_attr(test, assert_instr(cpuid))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { let mut r = mem::uninitialized::(); if cfg!(target_arch = "x86") { @@ -66,6 +72,7 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { /// See [`__cpuid_count`](fn.__cpuid_count.html). #[inline] #[cfg_attr(test, assert_instr(cpuid))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { __cpuid_count(leaf, 0) } @@ -114,6 +121,7 @@ pub fn has_cpuid() -> bool { /// See also [`__cpuid`](fn.__cpuid.html) and /// [`__cpuid_count`](fn.__cpuid_count.html). #[inline] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { let CpuidResult { eax, ebx, .. } = __cpuid(leaf); (eax, ebx) diff --git a/coresimd/x86/eflags.rs b/coresimd/x86/eflags.rs index 8d925ecdd0..0a7ba919a7 100644 --- a/coresimd/x86/eflags.rs +++ b/coresimd/x86/eflags.rs @@ -1,8 +1,11 @@ //! `i386` intrinsics /// Reads EFLAGS. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags) #[cfg(target_arch = "x86")] #[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __readeflags() -> u32 { let eflags: u32; asm!("pushfd; popl $0" : "=r"(eflags) : : : "volatile"); @@ -10,8 +13,11 @@ pub unsafe fn __readeflags() -> u32 { } /// Reads EFLAGS. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__readeflags) #[cfg(target_arch = "x86_64")] #[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __readeflags() -> u64 { let eflags: u64; asm!("pushfq; popq $0" : "=r"(eflags) : : : "volatile"); @@ -19,15 +25,21 @@ pub unsafe fn __readeflags() -> u64 { } /// Write EFLAGS. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags) #[cfg(target_arch = "x86")] #[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __writeeflags(eflags: u32) { asm!("pushl $0; popfd" : : "r"(eflags) : "cc", "flags" : "volatile"); } /// Write EFLAGS. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__writeeflags) #[cfg(target_arch = "x86_64")] #[inline(always)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __writeeflags(eflags: u64) { asm!("pushq $0; popfq" : : "r"(eflags) : "cc", "flags" : "volatile"); } diff --git a/coresimd/x86/fxsr.rs b/coresimd/x86/fxsr.rs index 2fa2685157..c8da47f3ca 100644 --- a/coresimd/x86/fxsr.rs +++ b/coresimd/x86/fxsr.rs @@ -21,9 +21,12 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave) #[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _fxsave(mem_addr: *mut u8) { fxsave(mem_addr) } @@ -42,9 +45,12 @@ pub unsafe fn _fxsave(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor) #[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _fxrstor(mem_addr: *const u8) { fxrstor(mem_addr) } diff --git a/coresimd/x86/mod.rs b/coresimd/x86/mod.rs index 16ef3eead5..521b98e84e 100644 --- a/coresimd/x86/mod.rs +++ b/coresimd/x86/mod.rs @@ -105,6 +105,7 @@ types! { /// # if is_x86_feature_detected!("sse2") { unsafe { foo() } } /// # } /// ``` + #[stable(feature = "simd_x86", since = "1.27.0")] pub struct __m128i(i64, i64); /// 128-bit wide set of four `f32` types, x86-specific @@ -148,6 +149,7 @@ types! { /// # if is_x86_feature_detected!("sse") { unsafe { foo() } } /// # } /// ``` + #[stable(feature = "simd_x86", since = "1.27.0")] pub struct __m128(f32, f32, f32, f32); /// 128-bit wide set of two `f64` types, x86-specific @@ -191,6 +193,7 @@ types! { /// # if is_x86_feature_detected!("sse") { unsafe { foo() } } /// # } /// ``` + #[stable(feature = "simd_x86", since = "1.27.0")] pub struct __m128d(f64, f64); /// 256-bit wide integer vector type, x86-specific @@ -238,6 +241,7 @@ types! 
{ /// # if is_x86_feature_detected!("avx") { unsafe { foo() } } /// # } /// ``` + #[stable(feature = "simd_x86", since = "1.27.0")] pub struct __m256i(i64, i64, i64, i64); /// 256-bit wide set of eight `f32` types, x86-specific @@ -281,6 +285,7 @@ types! { /// # if is_x86_feature_detected!("sse") { unsafe { foo() } } /// # } /// ``` + #[stable(feature = "simd_x86", since = "1.27.0")] pub struct __m256(f32, f32, f32, f32, f32, f32, f32, f32); /// 256-bit wide set of four `f64` types, x86-specific @@ -324,6 +329,7 @@ types! { /// # if is_x86_feature_detected!("avx") { unsafe { foo() } } /// # } /// ``` + #[stable(feature = "simd_x86", since = "1.27.0")] pub struct __m256d(f64, f64, f64, f64); } @@ -334,6 +340,7 @@ pub use self::test::*; #[doc(hidden)] #[allow(non_camel_case_types)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub(crate) trait m128iExt: Sized { fn as_m128i(self) -> __m128i; @@ -387,6 +394,7 @@ impl m128iExt for __m128i { #[doc(hidden)] #[allow(non_camel_case_types)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub(crate) trait m256iExt: Sized { fn as_m256i(self) -> __m256i; @@ -570,8 +578,8 @@ pub use self::avx2::*; mod abm; pub use self::abm::*; -mod bmi; -pub use self::bmi::*; +mod bmi1; +pub use self::bmi1::*; mod bmi2; pub use self::bmi2::*; diff --git a/coresimd/x86/pclmulqdq.rs b/coresimd/x86/pclmulqdq.rs index b5b7465ca4..4cef220295 100644 --- a/coresimd/x86/pclmulqdq.rs +++ b/coresimd/x86/pclmulqdq.rs @@ -21,6 +21,8 @@ extern "C" { /// /// The immediate byte is used for determining which halves of `a` and `b` /// should be used. Immediate bits other than 0 and 4 are ignored. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128) #[inline] #[target_feature(enable = "pclmulqdq")] #[cfg_attr(all(test, not(target_os = "linux")), @@ -34,6 +36,7 @@ extern "C" { #[cfg_attr(all(test, target_os = "linux"), assert_instr(pclmulhqhqdq, imm8 = 17))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_clmulepi64_si128( a: __m128i, b: __m128i, imm8: i32 ) -> __m128i { diff --git a/coresimd/x86/rdrand.rs b/coresimd/x86/rdrand.rs index 9877125851..2b900837fd 100644 --- a/coresimd/x86/rdrand.rs +++ b/coresimd/x86/rdrand.rs @@ -14,10 +14,13 @@ use stdsimd_test::assert_instr; /// Read a hardware generated 16-bit random value and store the result in val. /// Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand16_step) #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] #[cfg_attr(feature = "cargo-clippy", allow(stutter))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { let (v, flag) = x86_rdrand16_step(); *val = v; @@ -26,10 +29,13 @@ pub unsafe fn _rdrand16_step(val: &mut u16) -> i32 { /// Read a hardware generated 32-bit random value and store the result in val. /// Return 1 if a random value was generated, and 0 otherwise. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand32_step) #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] #[cfg_attr(feature = "cargo-clippy", allow(stutter))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { let (v, flag) = x86_rdrand32_step(); *val = v; @@ -38,9 +44,12 @@ pub unsafe fn _rdrand32_step(val: &mut u32) -> i32 { /// Read a 16-bit NIST SP800-90B and SP800-90C compliant random value and store /// in val. Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed16_step) #[inline] #[target_feature(enable = "rdseed")] #[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 { let (v, flag) = x86_rdseed16_step(); *val = v; @@ -49,9 +58,12 @@ pub unsafe fn _rdseed16_step(val: &mut u16) -> i32 { /// Read a 32-bit NIST SP800-90B and SP800-90C compliant random value and store /// in val. Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed32_step) #[inline] #[target_feature(enable = "rdseed")] #[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdseed32_step(val: &mut u32) -> i32 { let (v, flag) = x86_rdseed32_step(); *val = v; diff --git a/coresimd/x86/rdtsc.rs b/coresimd/x86/rdtsc.rs index 468fa09bc8..79e32a7943 100644 --- a/coresimd/x86/rdtsc.rs +++ b/coresimd/x86/rdtsc.rs @@ -17,8 +17,11 @@ use stdsimd_test::assert_instr; /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX and RDX are cleared. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdtsc) #[inline] #[cfg_attr(test, assert_instr(rdtsc))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdtsc() -> i64 { rdtsc() } @@ -37,8 +40,11 @@ pub unsafe fn _rdtsc() -> i64 { /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX, RDX, and RCX are cleared. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=__rdtscp) #[inline] #[cfg_attr(test, assert_instr(rdtscp))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn __rdtscp(aux: *mut u32) -> u64 { rdtscp(aux as *mut _) } diff --git a/coresimd/x86/sha.rs b/coresimd/x86/sha.rs index d574e7d9d7..5135165cf6 100644 --- a/coresimd/x86/sha.rs +++ b/coresimd/x86/sha.rs @@ -26,9 +26,12 @@ use stdsimd_test::assert_instr; /// Perform an intermediate calculation for the next four SHA1 message values /// (unsigned 32-bit integers) using previous message values from `a` and `b`, /// and returning the result. 
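The RDRAND/RDSEED step intrinsics above only report success through their return value, so callers are expected to retry. A minimal, hypothetical usage sketch follows (illustrative only, not part of the patch itself; it assumes an x86_64 target, the `std::arch::x86_64` module this commit stabilizes, and runtime detection via `is_x86_feature_detected!`):

```rust
// Hypothetical usage sketch for the RDRAND intrinsic stabilized above; not
// part of this patch.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::_rdrand32_step;
    if is_x86_feature_detected!("rdrand") {
        let mut value: u32 = 0;
        // The intrinsic returns 1 on success and 0 otherwise; the hardware
        // can transiently fail, so retry a bounded number of times.
        for _ in 0..10 {
            if unsafe { _rdrand32_step(&mut value) } == 1 {
                println!("hardware random value: {:#010x}", value);
                return;
            }
        }
        println!("RDRAND returned no value after 10 attempts");
    } else {
        println!("rdrand not detected on this CPU");
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```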
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg1_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha1msg1))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(sha1msg1(a.as_i32x4(), b.as_i32x4())) } @@ -36,9 +39,12 @@ pub unsafe fn _mm_sha1msg1_epu32(a: __m128i, b: __m128i) -> __m128i { /// Perform the final calculation for the next four SHA1 message values /// (unsigned 32-bit integers) using the intermediate result in `a` and the /// previous message values in `b`, and returns the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1msg2_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha1msg2))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(sha1msg2(a.as_i32x4(), b.as_i32x4())) } @@ -46,9 +52,12 @@ pub unsafe fn _mm_sha1msg2_epu32(a: __m128i, b: __m128i) -> __m128i { /// Calculate SHA1 state variable E after four rounds of operation from the /// current SHA1 state variable `a`, add that value to the scheduled values /// (unsigned 32-bit integers) in `b`, and returns the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1nexte_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha1nexte))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(sha1nexte(a.as_i32x4(), b.as_i32x4())) } @@ -58,10 +67,13 @@ pub unsafe fn _mm_sha1nexte_epu32(a: __m128i, b: __m128i) -> __m128i { /// (unsigned 32-bit integers), and state variable E from `b`, and return the /// updated SHA1 state (A,B,C,D). `func` contains the logic functions and round /// constants. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha1rnds4_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha1rnds4, func = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha1rnds4_epu32( a: __m128i, b: __m128i, func: i32 ) -> __m128i { @@ -79,9 +91,12 @@ pub unsafe fn _mm_sha1rnds4_epu32( /// Perform an intermediate calculation for the next four SHA256 message values /// (unsigned 32-bit integers) using previous message values from `a` and `b`, /// and return the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg1_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha256msg1))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(sha256msg1(a.as_i32x4(), b.as_i32x4())) } @@ -89,9 +104,12 @@ pub unsafe fn _mm_sha256msg1_epu32(a: __m128i, b: __m128i) -> __m128i { /// Perform the final calculation for the next four SHA256 message values /// (unsigned 32-bit integers) using previous message values from `a` and `b`, /// and return the result. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256msg2_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha256msg2))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(sha256msg2(a.as_i32x4(), b.as_i32x4())) } @@ -101,9 +119,12 @@ pub unsafe fn _mm_sha256msg2_epu32(a: __m128i, b: __m128i) -> __m128i { /// pre-computed sum of the next 2 round message values (unsigned 32-bit /// integers) and the corresponding round constants from `k`, and store the /// updated SHA256 state (A,B,E,F) in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sha256rnds2_epu32) #[inline] #[target_feature(enable = "sha")] #[cfg_attr(test, assert_instr(sha256rnds2))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sha256rnds2_epu32( a: __m128i, b: __m128i, k: __m128i ) -> __m128i { diff --git a/coresimd/x86/sse.rs b/coresimd/x86/sse.rs index dbdb727902..aa24c28891 100644 --- a/coresimd/x86/sse.rs +++ b/coresimd/x86/sse.rs @@ -12,122 +12,164 @@ use stdsimd_test::assert_instr; /// Adds the first component of `a` and `b`, the other components are copied /// from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { addss(a, b) } /// Adds __m128 vectors. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { simd_add(a, b) } /// Subtracts the first component of `b` from `a`, the other components are /// copied from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { subss(a, b) } /// Subtracts __m128 vectors. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { simd_sub(a, b) } /// Multiplies the first component of `a` and `b`, the other components are /// copied from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { mulss(a, b) } /// Multiplies __m128 vectors. 
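For reviewers who want to see the newly stable surface in use, here is a minimal sketch of the packed SSE arithmetic above (illustrative only, not part of the patch; assumes an x86_64 target and uses `_mm_set_ps`/`_mm_set1_ps`, which are stabilized later in this file):

```rust
// Hypothetical sketch (not part of this patch): packed single-precision
// add and multiply with the intrinsics stabilized above.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if is_x86_feature_detected!("sse") {
        unsafe {
            // `_mm_set_ps` takes its arguments in "reverse" order, so the
            // lanes in memory order are [1.0, 2.0, 3.0, 4.0].
            let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
            let b = _mm_set1_ps(10.0);
            let sum: [f32; 4] = std::mem::transmute(_mm_add_ps(a, b));
            let prod: [f32; 4] = std::mem::transmute(_mm_mul_ps(a, b));
            assert_eq!(sum, [11.0, 12.0, 13.0, 14.0]);
            assert_eq!(prod, [10.0, 20.0, 30.0, 40.0]);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```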
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { simd_mul(a, b) } /// Divides the first component of `a` by the first component of `b`, the /// other components are copied from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { divss(a, b) } /// Divides __m128 vectors. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { simd_div(a, b) } /// Return the square root of the first single-precision (32-bit) /// floating-point element in `a`, the other elements are unchanged. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { sqrtss(a) } /// Return the square root of packed single-precision (32-bit) floating-point /// elements in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { sqrtps(a) } /// Return the approximate reciprocal of the first single-precision /// (32-bit) floating-point element in `a`, the other elements are unchanged. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { rcpss(a) } /// Return the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { rcpps(a) } /// Return the approximate reciprocal square root of the first single-precision /// (32-bit) floating-point element in `a`, the other elements are unchanged. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { rsqrtss(a) } /// Return the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { rsqrtps(a) } @@ -135,18 +177,24 @@ pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the minimum value in the first element of the return /// value, the other elements are copied from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { minss(a, b) } /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { minps(a, b) } @@ -154,28 +202,37 @@ pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the maximum value in the first element of the return /// value, the other elements are copied from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { maxss(a, b) } /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { maxps(a, b) } /// Bitwise AND of packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_ps) #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `and` instructions, so ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), assert_instr(andps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -186,12 +243,15 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { /// elements. /// /// Computes `!a & b` for each bit in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_ps) #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `not` and `and` instructions, so ignore // it. 
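One common pattern built from the min/max intrinsics above is a per-lane clamp. A hypothetical sketch (not part of the patch; assumes an x86_64 target, and the lane values are illustrative only):

```rust
// Hypothetical sketch (not part of this patch): per-lane clamp to [0.0, 1.0]
// built from _mm_min_ps / _mm_max_ps.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if is_x86_feature_detected!("sse") {
        unsafe {
            let x = _mm_set_ps(9.0, -3.0, 0.5, 2.0); // lanes: [2.0, 0.5, -3.0, 9.0]
            let lo = _mm_set1_ps(0.0);
            let hi = _mm_set1_ps(1.0);
            // min() caps each lane at 1.0, max() then lifts it to at least 0.0.
            let clamped = _mm_max_ps(_mm_min_ps(x, hi), lo);
            let lanes: [f32; 4] = std::mem::transmute(clamped);
            assert_eq!(lanes, [1.0, 0.5, 0.0, 1.0]);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```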
#[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), assert_instr(andnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -200,11 +260,14 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise OR of packed single-precision (32-bit) floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_ps) #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `or` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), assert_instr(orps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -213,11 +276,14 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point /// elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_ps) #[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `xor` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -227,9 +293,12 @@ pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { /// Compare the lowest `f32` of both inputs for equality. The lowest 32 bits of /// the result will be `0xffffffff` if the two inputs are equal, or `0` /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 0) } @@ -238,9 +307,12 @@ pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 1) } @@ -249,9 +321,12 @@ pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than /// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 2) } @@ -260,9 +335,12 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, cmpss(b, a, 1), [4, 1, 2, 3]) } @@ -271,9 +349,12 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, cmpss(b, a, 2), [4, 1, 2, 3]) } @@ -282,9 +363,12 @@ pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 4) } @@ -293,9 +377,12 @@ pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 5) } @@ -304,9 +391,12 @@ pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 6) } @@ -315,9 +405,12 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are /// the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, cmpss(b, a, 5), [4, 1, 2, 3]) } @@ -326,9 +419,12 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 /// bits of the result are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, cmpss(b, a, 6), [4, 1, 2, 3]) } @@ -337,9 +433,12 @@ pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { /// the result will be `0xffffffff` if neither of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 7) } @@ -348,9 +447,12 @@ pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if any of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { cmpss(a, b, 3) } @@ -358,9 +460,12 @@ pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// were equal, or `0` otherwise. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { cmpps(a, b, 0) } @@ -368,9 +473,12 @@ pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than the corresponding element in `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { cmpps(a, b, 1) } @@ -379,9 +487,12 @@ pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than or equal to the corresponding element in `b`, or `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { cmpps(a, b, 2) } @@ -389,9 +500,12 @@ pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than the corresponding element in `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { cmpps(b, a, 1) } @@ -400,9 +514,12 @@ pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than or equal to the corresponding element in `b`, or `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { cmpps(b, a, 2) } @@ -410,9 +527,12 @@ pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// are *not* equal, or `0` otherwise. 
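The packed comparisons above produce all-ones/all-zeros lane masks; combined with `_mm_movemask_ps` (stabilized further down in this file) they collapse into a small integer bitmask. A hypothetical sketch (not part of the patch; assumes an x86_64 target):

```rust
// Hypothetical sketch (not part of this patch): packed compare producing
// per-lane masks, collapsed to a 4-bit integer with _mm_movemask_ps.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if is_x86_feature_detected!("sse") {
        unsafe {
            let a = _mm_set_ps(7.0, 3.0, 5.0, 1.0); // lanes: [1.0, 5.0, 3.0, 7.0]
            let b = _mm_set_ps(8.0, 2.0, 2.0, 2.0); // lanes: [2.0, 2.0, 2.0, 8.0]
            // Lane i of `lt` is all ones where a[i] < b[i], all zeros otherwise.
            let lt = _mm_cmplt_ps(a, b);
            // Bit i of the result is the sign bit of lane i: lanes 0 and 3 here.
            let mask = _mm_movemask_ps(lt);
            assert_eq!(mask, 0b1001);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```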
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { cmpps(a, b, 4) } @@ -421,9 +541,12 @@ pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than the corresponding element in `b`, or `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { cmpps(a, b, 5) } @@ -432,9 +555,12 @@ pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than or equal to the corresponding element in `b`, or /// `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { cmpps(a, b, 6) } @@ -443,9 +569,12 @@ pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than the corresponding element in `b`, or `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { cmpps(b, a, 5) } @@ -454,9 +583,12 @@ pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than or equal to the corresponding element in `b`, /// or `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { cmpps(b, a, 6) } @@ -465,9 +597,12 @@ pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { cmpps(b, a, 7) } @@ -476,27 +611,36 @@ pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. 
The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { cmpps(b, a, 3) } /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { comieq_ss(a, b) } /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { comilt_ss(a, b) } @@ -504,9 +648,12 @@ pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { comile_ss(a, b) } @@ -514,9 +661,12 @@ pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { comigt_ss(a, b) } @@ -524,18 +674,24 @@ pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { comige_ss(a, b) } /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are *not* equal, or `0` otherwise.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { comineq_ss(a, b) } @@ -543,9 +699,12 @@ pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. This instruction will not signal /// an exception if either argument is a quiet NaN. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { ucomieq_ss(a, b) } @@ -554,9 +713,12 @@ pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. /// This instruction will not signal an exception if either argument is a quiet /// NaN. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { ucomilt_ss(a, b) } @@ -565,9 +727,12 @@ pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { ucomile_ss(a, b) } @@ -576,9 +741,12 @@ pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { ucomigt_ss(a, b) } @@ -587,9 +755,12 @@ pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. This instruction will not signal an exception if either /// argument is a quiet NaN. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { ucomige_ss(a, b) } @@ -597,9 +768,12 @@ pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are *not* equal, or `0` otherwise. 
This instruction will not /// signal an exception if either argument is a quiet NaN. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { ucomineq_ss(a, b) } @@ -612,17 +786,23 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { cvtss2si(a) } /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { _mm_cvtss_si32(a) } @@ -637,26 +817,35 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { cvttss2si(a) } /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { _mm_cvttss_si32(a) } /// Extract the lowest 32 bit float from the input vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32) #[inline] #[target_feature(enable = "sse")] // No point in using assert_instrs. In Unix x86_64 calling convention this is a // no-op, and on Windows it's just a `mov`. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { simd_extract(a, 0) } @@ -666,42 +855,57 @@ pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit /// input). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { cvtsi2ss(a, b) } /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). 
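A small sketch contrasting the rounding and truncating conversions above (illustrative only, not part of the patch; assumes an x86_64 target and that MXCSR is still in its default round-to-nearest mode):

```rust
// Hypothetical sketch (not part of this patch): rounding vs. truncating
// float-to-int conversion with the intrinsics above.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if is_x86_feature_detected!("sse") {
        unsafe {
            let a = _mm_set_ss(1.7);
            // _mm_cvtss_si32 rounds according to MXCSR (round-to-nearest by
            // default); _mm_cvttss_si32 always truncates toward zero.
            assert_eq!(_mm_cvtss_si32(a), 2);
            assert_eq!(_mm_cvttss_si32(a), 1);
            // _mm_cvtss_f32 extracts the lowest lane back out as an f32.
            assert_eq!(_mm_cvtss_f32(a), 1.7);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```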
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { _mm_cvtsi32_ss(a, b) } /// Construct a `__m128` with the lowest element set to `a` and the rest set to /// zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_ss(a: f32) -> __m128 { __m128(a, 0.0, 0.0, 0.0) } /// Construct a `__m128` with all elements set to `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { __m128(a, a, a, a) } /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps1) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { _mm_set1_ps(a) } @@ -723,9 +927,12 @@ pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { /// ```text /// let v = _mm_set_ps(d, c, b, a); /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { __m128(d, c, b, a) } @@ -738,19 +945,25 @@ pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { /// ```text /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))] // On a 32-bit architecture it just copies the operands from the stack. #[cfg_attr(all(test, target_arch = "x86"), assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { __m128(a, b, c, d) } /// Construct a `__m128` with all elements initialized to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setzero_ps() -> __m128 { __m128(0.0, 0.0, 0.0, 0.0) } @@ -760,10 +973,13 @@ pub unsafe fn _mm_setzero_ps() -> __m128 { /// /// The lower half of the result takes values from `a` and the higher half /// from `b`. The mask is split into four 2-bit control fields, each of which /// selects one element from the inputs.
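A hypothetical sketch of the shuffle behaviour just described, using the signature as it appears in this file (the mask is a required-const third argument; not part of the patch; assumes an x86_64 target):

```rust
// Hypothetical sketch (not part of this patch): _mm_shuffle_ps picks two
// lanes from `a` (low half of the result) and two lanes from `b` (high half),
// selected by the constant mask two bits at a time.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    if is_x86_feature_detected!("sse") {
        unsafe {
            let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lanes: [1.0, 2.0, 3.0, 4.0]
            let b = _mm_set_ps(8.0, 7.0, 6.0, 5.0); // lanes: [5.0, 6.0, 7.0, 8.0]
            // Mask 0b01_00_11_10: result = [a[2], a[3], b[0], b[1]].
            let r = _mm_shuffle_ps(a, b, 0b01_00_11_10);
            let lanes: [f32; 4] = std::mem::transmute(r);
            assert_eq!(lanes, [3.0, 4.0, 5.0, 6.0]);
        }
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```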
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps, mask = 3))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { let mask = (mask & 0xFF) as u8; @@ -812,27 +1028,36 @@ pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the higher half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, b, [2, 6, 3, 7]) } /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the lower half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, b, [0, 4, 1, 5]) } /// Combine higher half of `a` and `b`. The higher half of `b` occupies the /// lower half of result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehl_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movhlps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { // TODO: figure out why this is a different instruction on Windows? simd_shuffle4(a, b, [6, 7, 2, 3]) @@ -840,9 +1065,12 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { /// Combine lower half of `a` and `b`. The lower half of `b` occupies the /// higher half of result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movelh_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movlhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, b, [0, 1, 4, 5]) } @@ -851,9 +1079,12 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { /// /// The mask is stored in the 4 least significant bits of the return value. /// All other bits are set to `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movmskps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { movmskps(a) } @@ -973,9 +1204,12 @@ pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 { /// elements set to zero. /// /// This corresponds to instructions `VMOVSS` / `MOVSS`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { __m128(*p, 0.0, 0.0, 0.0) } @@ -985,18 +1219,24 @@ pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some /// shuffling. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { let a = *p; __m128(a, a, a, a) } /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { _mm_load1_ps(p) } @@ -1009,9 +1249,12 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { *(p as *const __m128) } @@ -1023,9 +1266,12 @@ pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { /// may be faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { // Note: Using `*p` would require `f32` alignment, but `movups` has no // alignment restrictions. @@ -1057,9 +1303,12 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some /// shuffling. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { let a = _mm_load_ps(p); simd_shuffle4(a, a, [3, 2, 1, 0]) @@ -1124,9 +1373,12 @@ pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) { /// Store the lowest 32 bit float of `a` into memory. /// /// This intrinsic corresponds to the `MOVSS` instruction. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { *p = simd_extract(a, 0); } @@ -1147,18 +1399,24 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { /// *p.offset(2) = x; /// *p.offset(3) = x; /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { let b: __m128 = simd_shuffle4(a, a, [0, 0, 0, 0]); *(p as *mut __m128) = b; } /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { _mm_store1_ps(p, a); } @@ -1172,9 +1430,12 @@ pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { *(p as *mut __m128) = a; } @@ -1184,9 +1445,12 @@ pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { /// faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { ptr::copy_nonoverlapping( &a as *const __m128 as *const u8, @@ -1209,9 +1473,12 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { /// *p.offset(2) = a.extract(1); /// *p.offset(3) = a.extract(0); /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { let b: __m128 = simd_shuffle4(a, a, [3, 2, 1, 0]); *(p as *mut __m128) = b; @@ -1224,9 +1491,12 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { /// ```text /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { simd_shuffle4(a, b, [4, 1, 2, 3]) } @@ -1237,9 +1507,12 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { /// Guarantees that every store instruction that precedes, in program order, is /// globally visible before any store instruction which follows the fence in /// program order. 
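A short, hypothetical round-trip through the unaligned load/store intrinsics above (illustrative only, not part of the patch; assumes an x86_64 target):

```rust
// Hypothetical sketch (not part of this patch): unaligned load, packed
// arithmetic, unaligned store.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::*;
    let mut data = [1.0_f32, 2.0, 3.0, 4.0];
    if is_x86_feature_detected!("sse") {
        unsafe {
            // _mm_loadu_ps / _mm_storeu_ps have no alignment requirement,
            // unlike _mm_load_ps / _mm_store_ps which need 16-byte alignment.
            let v = _mm_loadu_ps(data.as_ptr());
            let doubled = _mm_add_ps(v, v);
            _mm_storeu_ps(data.as_mut_ptr(), doubled);
        }
        assert_eq!(data, [2.0, 4.0, 6.0, 8.0]);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```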
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sfence) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sfence))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sfence() { sfence() } @@ -1247,9 +1520,12 @@ pub unsafe fn _mm_sfence() { /// Get the unsigned 32-bit value of the MXCSR control and status register. /// /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getcsr) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(stmxcsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_getcsr() -> u32 { let mut result = 0_i32; stmxcsr((&mut result) as *mut _ as *mut i8); @@ -1381,114 +1657,160 @@ pub unsafe fn _mm_getcsr() -> u32 { /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on /// ``` /// +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setcsr) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ldmxcsr))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setcsr(val: u32) { ldmxcsr(&val as *const _ as *const i8); } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_INVALID: u32 = 0x0001; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_DENORM: u32 = 0x0002; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_INEXACT: u32 = 0x0020; /// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_EXCEPT_MASK: u32 = 0x003f; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_INVALID: u32 = 0x0080; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_DENORM: u32 = 0x0100; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_DIV_ZERO: u32 = 0x0200; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_OVERFLOW: u32 = 0x0400; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_UNDERFLOW: u32 = 0x0800; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_INEXACT: u32 = 0x1000; /// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_MASK_MASK: u32 = 0x1f80; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_ROUND_NEAREST: u32 = 0x0000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_ROUND_DOWN: u32 = 0x2000; /// See 
[`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_ROUND_UP: u32 = 0x4000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000; /// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_ROUND_MASK: u32 = 0x6000; /// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_MASK) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { _mm_getcsr() & _MM_MASK_MASK } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_EXCEPTION_STATE) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { _mm_getcsr() & _MM_EXCEPT_MASK } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_FLUSH_ZERO_MODE) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { _mm_getcsr() & _MM_FLUSH_ZERO_MASK } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_GET_ROUNDING_MODE) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { _mm_getcsr() & _MM_ROUND_MASK } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_MASK) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x) } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_EXCEPTION_STATE) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x) } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_FLUSH_ZERO_MODE) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x; // println!("setting csr={:x}", val); @@ -1496,23 
+1818,30 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x) } /// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_HINT_T0: i32 = 3; /// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_HINT_T1: i32 = 2; /// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_HINT_T2: i32 = 1; /// See [`_mm_prefetch`](fn._mm_prefetch.html). +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_HINT_NTA: i32 = 0; /// Fetch the cache line that contains address `p` using the given `strategy`. @@ -1551,6 +1880,8 @@ pub const _MM_HINT_NTA: i32 = 0; /// * Prefetching may also fail if there are not enough memory-subsystem /// resources (e.g., request buffers). /// +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_prefetch) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))] @@ -1558,6 +1889,7 @@ pub const _MM_HINT_NTA: i32 = 0; #[cfg_attr(test, assert_instr(prefetcht2, strategy = _MM_HINT_T2))] #[cfg_attr(test, assert_instr(prefetchnta, strategy = _MM_HINT_NTA))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_prefetch(p: *const i8, strategy: i32) { // The `strategy` must be a compile-time constant, so we use a short form // of `constify_imm8!` for now. @@ -1577,8 +1909,11 @@ pub unsafe fn _mm_prefetch(p: *const i8, strategy: i32) { } /// Return vector of type __m128 with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps) #[inline] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_undefined_ps() -> __m128 { __m128( mem::uninitialized(), @@ -1589,9 +1924,12 @@ pub unsafe fn _mm_undefined_ps() -> __m128 { } /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_TRANSPOSE4_PS) #[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _MM_TRANSPOSE4_PS( row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128 ) { @@ -1720,9 +2058,12 @@ extern "C" { /// /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception _may_ be generated. 
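+///
+/// A minimal usage sketch (assumes an x86_64 target with SSE support and a
+/// 16-byte-aligned destination):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// #[repr(align(16))]
+/// struct Aligned([f32; 4]);
+///
+/// let mut out = Aligned([0.0; 4]);
+/// unsafe {
+///     // Non-temporal store of four 1.0 values; minimizes cache pollution.
+///     _mm_stream_ps(out.0.as_mut_ptr(), _mm_set1_ps(1.0));
+///     // Order the streaming store before any stores that follow.
+///     _mm_sfence();
+/// }
+/// assert_eq!(out.0, [1.0; 4]);
+/// ```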
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movntps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { intrinsics::nontemporal_store(mem::transmute(mem_addr), a); } diff --git a/coresimd/x86/sse2.rs b/coresimd/x86/sse2.rs index 6d1b6de109..d443bbe96e 100644 --- a/coresimd/x86/sse2.rs +++ b/coresimd/x86/sse2.rs @@ -14,18 +14,24 @@ use ptr; /// /// This can help improve the performance and power consumption of spin-wait /// loops. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_pause) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pause))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_pause() { pause() } /// Invalidate and flush the cache line that contains `p` from all levels of /// the cache hierarchy. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clflush) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(clflush))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_clflush(p: *mut u8) { clflush(p) } @@ -36,9 +42,12 @@ pub unsafe fn _mm_clflush(p: *mut u8) { /// Guarantees that every load instruction that precedes, in program order, is /// globally visible before any load instruction which follows the fence in /// program order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lfence) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(lfence))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_lfence() { lfence() } @@ -49,89 +58,122 @@ pub unsafe fn _mm_lfence() { /// Guarantees that every memory access that precedes, in program order, the /// memory fence instruction is globally visible before any memory instruction /// which follows the fence in program order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mfence) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mfence))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mfence() { mfence() } /// Add packed 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_add(a.as_i8x16(), b.as_i8x16())) } /// Add packed 16-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_add(a.as_i16x8(), b.as_i16x8())) } /// Add packed 32-bit integers in `a` and `b`. 
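+///
+/// A minimal usage sketch (assumes an x86_64 target with SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set_epi32(3, 2, 1, 0);    // lanes, low to high: 0, 1, 2, 3
+///     let b = _mm_set_epi32(30, 20, 10, 0); // lanes, low to high: 0, 10, 20, 30
+///     let sum = _mm_add_epi32(a, b);
+///     let mut out = [0i32; 4];
+///     _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, sum);
+///     assert_eq!(out, [0, 11, 22, 33]);
+/// }
+/// ```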
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_add(a.as_i32x4(), b.as_i32x4())) } /// Add packed 64-bit integers in `a` and "b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_add(a.as_i64x2(), b.as_i64x2())) } /// Add packed 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(paddsb(a.as_i8x16(), b.as_i8x16())) } /// Add packed 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(paddsw(a.as_i16x8(), b.as_i16x8())) } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(paddsub(a.as_u8x16(), b.as_u8x16())) } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(paddsuw(a.as_u16x8(), b.as_u16x8())) } /// Average packed unsigned 8-bit integers in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pavgb(a.as_u8x16(), b.as_u8x16())) } /// Average packed unsigned 16-bit integers in `a` and `b`. 
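+///
+/// The average is rounded up: each lane computes `(a + b + 1) >> 1` on the
+/// zero-extended values. A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi16(1);
+///     let b = _mm_set1_epi16(2);
+///     let avg = _mm_avg_epu16(a, b); // every u16 lane is (1 + 2 + 1) >> 1 == 2
+/// }
+/// ```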
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_epu16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pavgw(a.as_u16x8(), b.as_u16x8())) } @@ -141,45 +183,60 @@ pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of /// intermediate 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaddwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaxsw(a.as_i16x8(), b.as_i16x8())) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxub))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaxub(a.as_u8x16(), b.as_u8x16())) } /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pminsw(a.as_i16x8(), b.as_i16x8())) } /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed minimum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminub))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pminub(a.as_u8x16(), b.as_u8x16())) } @@ -188,9 +245,12 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. 
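+///
+/// A minimal usage sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi16(16_384);  // 0x4000
+///     let b = _mm_set1_epi16(4);
+///     // 16_384 * 4 = 0x1_0000; keeping the high 16 bits gives 1 in every lane.
+///     let hi = _mm_mulhi_epi16(a, b);
+/// }
+/// ```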
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmulhw(a.as_i16x8(), b.as_i16x8())) } @@ -199,9 +259,12 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmulhuw(a.as_u16x8(), b.as_u16x8())) } @@ -210,9 +273,12 @@ pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// low 16 bits of the intermediate integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmullw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_mul(a.as_i16x8(), b.as_i16x8())) } @@ -221,9 +287,12 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { /// in `a` and `b`. /// /// Return the unsigned 64-bit results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epu32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmuludq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmuludq(a.as_u32x4(), b.as_u32x4())) } @@ -234,86 +303,116 @@ pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// and `b`, then horizontally sum each consecutive 8 differences to produce /// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in /// the low 16 bits of 64-bit elements returned. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psadbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psadbw(a.as_u8x16(), b.as_u8x16())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_sub(a.as_i8x16(), b.as_i8x16())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_sub(a.as_i16x8(), b.as_i16x8())) } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_sub(a.as_i32x4(), b.as_i32x4())) } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_sub(a.as_i64x2(), b.as_i64x2())) } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psubsb(a.as_i8x16(), b.as_i8x16())) } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psubsw(a.as_i16x8(), b.as_i16x8())) } /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psubusb(a.as_u8x16(), b.as_u8x16())) } /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_subs_epu16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psubusw(a.as_u16x8(), b.as_u16x8())) } /// Shift `a` left by `imm8` bytes while shifting in zeros. 
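+///
+/// The shift amount is a byte count and must be a compile-time constant. A
+/// minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi8(1);
+///     // Every byte moves four positions toward the most significant end;
+///     // the four lowest bytes are filled with zeros.
+///     let shifted = _mm_slli_si128(a, 4);
+/// }
+/// ```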
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_slli_si128_impl(a, imm8) } @@ -374,120 +473,159 @@ unsafe fn _mm_slli_si128_impl(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_slli_si128_impl(a, imm8) } /// Shift `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_srli_si128_impl(a, imm8) } /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw, imm8 = 7))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { mem::transmute(pslliw(a.as_i16x8(), imm8)) } /// Shift packed 16-bit integers in `a` left by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psllw(a.as_i16x8(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld, imm8 = 7))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { mem::transmute(psllid(a.as_i32x4(), imm8)) } /// Shift packed 32-bit integers in `a` left by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { mem::transmute(pslld(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_slli_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq, imm8 = 7))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { mem::transmute(pslliq(a.as_i64x2(), imm8)) } /// Shift packed 64-bit integers in `a` left by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sll_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psllq(a.as_i64x2(), count.as_i64x2())) } /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign /// bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { mem::transmute(psraiw(a.as_i16x8(), imm8)) } /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psraw(a.as_i16x8(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign /// bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { mem::transmute(psraid(a.as_i32x4(), imm8)) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psrad(a.as_i32x4(), count.as_i32x4())) } /// Shift `a` right by `imm8` bytes while shifting in zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { _mm_srli_si128_impl(a, imm8) } @@ -549,174 +687,234 @@ unsafe fn _mm_srli_si128_impl(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros. 
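+///
+/// This is a logical shift, so sign bits are not preserved (contrast with
+/// `_mm_srai_epi16`). A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi16(-1);   // 0xFFFF in every lane
+///     let r = _mm_srli_epi16(a, 1); // zeros shifted in: every lane becomes 0x7FFF
+/// }
+/// ```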
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { mem::transmute(psrliw(a.as_i16x8(), imm8)) } /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psrlw(a.as_i16x8(), count.as_i16x8())) } /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld, imm8 = 8))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { mem::transmute(psrlid(a.as_i32x4(), imm8)) } /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psrld(a.as_i32x4(), count.as_i32x4())) } /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { mem::transmute(psrliq(a.as_i64x2(), imm8)) } /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srl_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { mem::transmute(psrlq(a.as_i64x2(), count.as_i64x2())) } /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { simd_and(a, b) } /// Compute the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`. 
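+///
+/// Note the operand order: the first argument is the one that is inverted,
+/// i.e. the result is `(!a) & b`. A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi32(0b1100);
+///     let b = _mm_set1_epi32(0b1010);
+///     let r = _mm_andnot_si128(a, b); // (!0b1100) & 0b1010 == 0b0010 in every lane
+/// }
+/// ```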
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { simd_and(simd_xor(_mm_set1_epi8(-1), a), b) } /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { simd_or(a, b) } /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { simd_xor(a, b) } /// Compare packed 8-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_eq(a.as_i8x16(), b.as_i8x16())) } /// Compare packed 16-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_eq(a.as_i16x8(), b.as_i16x8())) } /// Compare packed 32-bit integers in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_eq(a.as_i32x4(), b.as_i32x4())) } /// Compare packed 8-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_gt(a.as_i8x16(), b.as_i8x16())) } /// Compare packed 16-bit integers in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_gt(a.as_i16x8(), b.as_i16x8())) } /// Compare packed 32-bit integers in `a` and `b` for greater-than. 
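+///
+/// The comparison is signed and produces a lane mask: all ones where the
+/// comparison holds, all zeros otherwise. A minimal sketch (assumes SSE2
+/// support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set_epi32(5, -1, 7, 0);
+///     let b = _mm_set_epi32(4, 0, 7, -3);
+///     // Lanes where a > b become -1 (all bits set), the others become 0.
+///     let mask = _mm_cmpgt_epi32(a, b);
+/// }
+/// ```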
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_gt(a.as_i32x4(), b.as_i32x4())) } /// Compare packed 8-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_lt(a.as_i8x16(), b.as_i8x16())) } /// Compare packed 16-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_lt(a.as_i16x8(), b.as_i16x8())) } /// Compare packed 32-bit integers in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_lt(a.as_i32x4(), b.as_i32x4())) } /// Convert the lower two packed 32-bit integers in `a` to packed /// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { let a = a.as_i32x4(); simd_cast::(simd_shuffle2(a, a, [0, 1])) @@ -724,69 +922,93 @@ pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { simd_insert(a, 0, b as f64) } /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ps) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { cvtdq2ps(a.as_i32x4()) } /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. 
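+///
+/// Conversion uses the rounding mode currently set in `MXCSR`, which is
+/// round-to-nearest-even by default. A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let x = _mm_set_ps(3.7, 2.5, 1.5, 0.4);
+///     // With the default rounding mode the lanes become, low to high: 0, 2, 2, 4
+///     // (both 1.5 and 2.5 round to the nearest even integer, 2).
+///     let i = _mm_cvtps_epi32(x);
+/// }
+/// ```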
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { mem::transmute(cvtps2dq(a)) } /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { mem::transmute(i32x4::new(a, 0, 0, 0)) } /// Return the lowest element of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { simd_extract(a.as_i32x4(), 0) } /// Set packed 64-bit integers with the supplied values, from highest to /// lowest. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi64x) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { mem::transmute(i64x2::new(e0, e1)) } /// Set packed 32-bit integers with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi32) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { mem::transmute(i32x4::new(e0, e1, e2, e3)) } /// Set packed 16-bit integers with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi16) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_epi16( e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16 ) -> __m128i { @@ -794,9 +1016,12 @@ pub unsafe fn _mm_set_epi16( } /// Set packed 8-bit integers with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_epi8) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, @@ -808,49 +1033,67 @@ pub unsafe fn _mm_set_epi8( } /// Broadcast 64-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { _mm_set_epi64x(a, a) } /// Broadcast 32-bit integer `a` to all elements. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi32) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { _mm_set_epi32(a, a, a, a) } /// Broadcast 16-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi16) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { _mm_set_epi16(a, a, a, a, a, a, a, a) } /// Broadcast 8-bit integer `a` to all elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi8) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { _mm_set_epi8(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a) } /// Set packed 32-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi32) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { _mm_set_epi32(e0, e1, e2, e3) } /// Set packed 16-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi16) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setr_epi16( e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16 ) -> __m128i { @@ -858,9 +1101,12 @@ pub unsafe fn _mm_setr_epi16( } /// Set packed 8-bit integers with the supplied values in reverse order. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi8) #[inline] #[target_feature(enable = "sse2")] // no particular instruction to test +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setr_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, @@ -872,14 +1118,19 @@ pub unsafe fn _mm_setr_epi8( } /// Returns a vector with all elements set to zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setzero_si128() -> __m128i { _mm_set1_epi64x(0) } /// Load 64-bit integer from memory into first element of returned vector. 
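+///
+/// Only the low 64 bits of the source are used; the upper 64 bits of the
+/// result are zeroed. A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let src = _mm_set_epi64x(-1, 0x1122_3344_5566_7788);
+///     // The low lane of `src` is copied; the high lane of the result is 0.
+///     let v = _mm_loadl_epi64(&src);
+/// }
+/// ```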
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64) #[inline] #[target_feature(enable = "sse2")] // FIXME movsd on windows @@ -887,6 +1138,7 @@ pub unsafe fn _mm_setzero_si128() -> __m128i { not(all(target_os = "linux", target_arch = "x86_64")), target_arch = "x86_64"), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { _mm_set_epi64x(0, simd_extract((*mem_addr).as_i64x2(), 0)) } @@ -894,9 +1146,12 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` must be aligned on a 16-byte boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { *mem_addr } @@ -904,9 +1159,12 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { let mut dst: __m128i = _mm_undefined_si128(); ptr::copy_nonoverlapping( @@ -925,9 +1183,12 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { /// /// `mem_addr` should correspond to a 128-bit memory location and does not need /// to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maskmovdqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maskmoveu_si128( a: __m128i, mask: __m128i, mem_addr: *mut i8 ) { @@ -937,9 +1198,12 @@ pub unsafe fn _mm_maskmoveu_si128( /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` must be aligned on a 16-byte boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { *mem_addr = a; } @@ -947,9 +1211,12 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { storeudq(mem_addr as *mut i8, a); } @@ -957,6 +1224,8 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store the lower 64-bit integer `a` to a memory location. 
/// /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_epi64) #[inline] #[target_feature(enable = "sse2")] // FIXME mov on windows, movlps on i686 @@ -964,6 +1233,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { not(all(target_os = "linux", target_arch = "x86_64")), target_arch = "x86_64"), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { ptr::copy_nonoverlapping( &a as *const _ as *const u8, @@ -975,9 +1245,12 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 128-bit integer vector to a 128-bit aligned memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { ::intrinsics::nontemporal_store(mem_addr, a); } @@ -985,20 +1258,26 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 32-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { ::intrinsics::nontemporal_store(mem_addr, a); } /// Return a vector where the low element is extracted from `a` and its upper /// element is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64) #[inline] #[target_feature(enable = "sse2")] // FIXME movd on windows, movd on i686 #[cfg_attr(all(test, not(windows), target_arch = "x86_64"), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i { let zero = _mm_setzero_si128(); let r: i64x2 = simd_shuffle2(a.as_i64x2(), zero.as_i64x2(), [0, 2]); @@ -1007,45 +1286,60 @@ pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packsswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(packsswb(a.as_i16x8(), b.as_i16x8())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. 
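+///
+/// Values outside the `i16` range saturate to `i16::MAX` or `i16::MIN`; the
+/// four lanes from `a` form the low half of the result and the four lanes
+/// from `b` form the high half. A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi32(100_000);   // above i16::MAX
+///     let b = _mm_set1_epi32(-100_000);  // below i16::MIN
+///     // Low four i16 lanes: 32767; high four i16 lanes: -32768.
+///     let packed = _mm_packs_epi32(a, b);
+/// }
+/// ```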
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packs_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packssdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(packssdw(a.as_i32x4(), b.as_i32x4())) } /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packuswb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(packuswb(a.as_i16x8(), b.as_i16x8())) } /// Return the `imm8` element of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pextrw, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { simd_extract::<_, i16>(a.as_i16x8(), (imm8 & 7) as u32) as i32 } /// Return a new vector where the `imm8` element of `a` is replaced with `i`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pinsrw, imm8 = 9))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { mem::transmute(simd_insert( a.as_i16x8(), @@ -1055,18 +1349,24 @@ pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { } /// Return a mask of the most significant bit of each element in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmovmskb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { pmovmskb(a.as_i8x16()) } /// Shuffle 32-bit integers in `a` using the control in `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufd, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { // simd_shuffleX requires that its selector parameter be made up of // constant values, but we can't enforce that here. In spirit, we need @@ -1127,10 +1427,13 @@ pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the high 64 bits of the returned vector, with the low 64 /// bits being copied from from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflehi_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufhw, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { // See _mm_shuffle_epi32. 
let imm8 = (imm8 & 0xFF) as u8; @@ -1197,10 +1500,13 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the low 64 bits of the returned vector, with the high 64 /// bits being copied from from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shufflelo_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshuflw, imm8 = 9))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { // See _mm_shuffle_epi32. let imm8 = (imm8 & 0xFF) as u8; @@ -1251,9 +1557,12 @@ pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { } /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle16( a.as_i8x16(), @@ -1265,9 +1574,12 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { let x = simd_shuffle8( a.as_i16x8(), @@ -1278,9 +1590,12 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpckhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle4( a.as_i32x4(), @@ -1290,9 +1605,12 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle2( a.as_i64x2(), @@ -1302,9 +1620,12 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
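+///
+/// The result alternates bytes from the low halves of the two inputs:
+/// `a0, b0, a1, b1, ..., a7, b7`. A minimal sketch (assumes SSE2 support):
+///
+/// ```
+/// use std::arch::x86_64::*;
+///
+/// unsafe {
+///     let a = _mm_set1_epi8(1);
+///     let b = _mm_set1_epi8(2);
+///     let lo = _mm_unpacklo_epi8(a, b); // bytes: 1, 2, 1, 2, ... (sixteen in total)
+/// }
+/// ```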
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi8) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle16( a.as_i8x16(), @@ -1316,9 +1637,12 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi16) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { let x = simd_shuffle8( a.as_i16x8(), @@ -1329,9 +1653,12 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpcklps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle4( a.as_i32x4(), @@ -1341,9 +1668,12 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_epi64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { mem::transmute::(simd_shuffle2( a.as_i64x2(), @@ -1354,134 +1684,179 @@ pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// Return a new vector with the low element of `a` replaced by the sum of the /// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) } /// Add packed double-precision (64-bit) floating-point elements in `a` and /// `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { simd_add(a, b) } /// Return a new vector with the low element of `a` replaced by the result of /// dividing the lower element of `a` by the lower element of `b`.
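// Illustrative usage sketch, not part of the patch above: exercising the
// stabilized SSE2 pack/unpack intrinsics. `_mm_set_epi32`, `_mm_setr_epi8`,
// `_mm_storeu_si128` and `is_x86_feature_detected!` are assumed from the same
// stable `std::arch`/`std` surface but sit outside this hunk.
#[cfg(target_arch = "x86_64")]
pub fn pack_and_interleave() {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("sse2") {
        return;
    }
    unsafe {
        // Saturate eight 32-bit integers (four from each input) down to
        // eight 16-bit integers; out-of-range lanes clamp to i16 bounds.
        let a = _mm_set_epi32(70_000, 2, -70_000, -3);
        let b = _mm_set_epi32(4, 5, 6, 7);
        let packed = _mm_packs_epi32(a, b);
        let mut words = [0i16; 8];
        _mm_storeu_si128(words.as_mut_ptr() as *mut __m128i, packed);
        assert_eq!(words[1], -32768); // -70_000 saturated
        assert_eq!(words[3], 32767); // 70_000 saturated

        // Interleave the low eight bytes of two byte vectors.
        let x = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let y = _mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
        let lo = _mm_unpacklo_epi8(x, y);
        let mut bytes = [0i8; 16];
        _mm_storeu_si128(bytes.as_mut_ptr() as *mut __m128i, lo);
        assert_eq!(bytes[0], 0);
        assert_eq!(bytes[1], 16); // lanes alternate x0, y0, x1, y1, ...
    }
}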
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) } /// Divide packed double-precision (64-bit) floating-point elements in `a` by /// packed elements in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { simd_div(a, b) } /// Return a new vector with the low element of `a` replaced by the maximum /// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { maxsd(a, b) } /// Return a new vector with the maximum values from corresponding elements in /// `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { maxpd(a, b) } /// Return a new vector with the low element of `a` replaced by the minimum /// of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { minsd(a, b) } /// Return a new vector with the minimum values from corresponding elements in /// `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { minpd(a, b) } /// Return a new vector with the low element of `a` replaced by multiplying the /// low elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) } /// Multiply packed double-precision (64-bit) floating-point elements in `a` /// and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { simd_mul(a, b) } /// Return a new vector with the low element of `a` replaced by the square /// root of the lower element of `b`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert(a, 0, _mm_cvtsd_f64(sqrtsd(b))) } /// Return a new vector with the square root of each of the values in `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { sqrtpd(a) } /// Return a new vector with the low element of `a` replaced by subtracting the /// low element of `b` from the low element of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) } /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { simd_sub(a, b) } /// Compute the bitwise AND of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -1489,9 +1864,12 @@ pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise NOT of `a` and then AND with `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -1499,9 +1877,12 @@ pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise OR of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -1509,9 +1890,12 @@ pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise XOR of `a` and `b`.
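// Illustrative usage sketch, not part of the patch above: the stabilized
// `__m128d` arithmetic intrinsics, guarded by runtime detection.
// `_mm_set_pd` and `_mm_storeu_pd` (both stabilized further below in this
// file) and `is_x86_feature_detected!` are assumed to be available.
#[cfg(target_arch = "x86_64")]
pub fn double_math() {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("sse2") {
        return;
    }
    unsafe {
        // `_mm_set_pd(high, low)`: the second argument is the low lane.
        let a = _mm_set_pd(3.0, 4.0);
        let b = _mm_set_pd(1.5, 0.5);
        let sum = _mm_add_pd(a, b);  // low 4.5, high 4.5
        let prod = _mm_mul_pd(a, b); // low 2.0, high 4.5
        let root = _mm_sqrt_pd(sum); // element-wise square root
        let mut out = [0.0f64; 2];
        _mm_storeu_pd(out.as_mut_ptr(), root);
        assert_eq!(out, [4.5f64.sqrt(); 2]);
        let _ = prod; // kept only to show the packed multiply
    }
}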
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { let a: __m128i = mem::transmute(a); let b: __m128i = mem::transmute(b); @@ -1520,36 +1904,48 @@ pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the equality /// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 0) } /// Return a new vector with the low element of `a` replaced by the less-than /// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 1) } /// Return a new vector with the low element of `a` replaced by the /// less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 2) } /// Return a new vector with the low element of `a` replaced by the /// greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert( _mm_cmplt_sd(b, a), @@ -1560,9 +1956,12 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert( _mm_cmple_sd(b, a), @@ -1575,9 +1974,12 @@ pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { /// of comparing both of the lower elements of `a` and `b` to `NaN`. If /// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` /// otherwise. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 7) } @@ -1585,45 +1987,60 @@ pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// comparing both of the lower elements of `a` and `b` to `NaN`. If either is /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 3) } /// Return a new vector with the low element of `a` replaced by the not-equal /// comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 4) } /// Return a new vector with the low element of `a` replaced by the /// not-less-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 5) } /// Return a new vector with the low element of `a` replaced by the /// not-less-than-or-equal comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { cmpsd(a, b, 6) } /// Return a new vector with the low element of `a` replaced by the /// not-greater-than comparison of the lower elements of `a` and `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert( _mm_cmpnlt_sd(b, a), @@ -1634,9 +2051,12 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { simd_insert( _mm_cmpnle_sd(b, a), @@ -1646,203 +2066,278 @@ pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 0) } /// Compare corresponding elements in `a` and `b` for less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 1) } /// Compare corresponding elements in `a` and `b` for less-than-or-equal +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 2) } /// Compare corresponding elements in `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { _mm_cmplt_pd(b, a) } /// Compare corresponding elements in `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { _mm_cmple_pd(b, a) } /// Compare corresponding elements in `a` and `b` to see if neither is `NaN`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpord_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 7) } /// Compare corresponding elements in `a` and `b` to see if either is `NaN`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpunord_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 3) } /// Compare corresponding elements in `a` and `b` for not-equal. 
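// Illustrative usage sketch, not part of the patch above: packed comparisons
// yield all-ones/all-zeros lanes, and `_mm_movemask_pd` (stabilized further
// below) condenses those lanes into the low two bits of an `i32`.
// `_mm_set_pd` and `is_x86_feature_detected!` are assumed.
#[cfg(target_arch = "x86_64")]
pub fn compare_doubles() {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("sse2") {
        return;
    }
    unsafe {
        let a = _mm_set_pd(1.0, 5.0); // high 1.0, low 5.0
        let b = _mm_set_pd(2.0, 3.0); // high 2.0, low 3.0
        let lt = _mm_cmplt_pd(a, b);  // low: 5.0 < 3.0 is false; high: 1.0 < 2.0 is true
        let mask = _mm_movemask_pd(lt);
        assert_eq!(mask, 0b10); // only the high lane compared true
    }
}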
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 4) } /// Compare corresponding elements in `a` and `b` for not-less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnlt_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 5) } /// Compare corresponding elements in `a` and `b` for not-less-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnle_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { cmppd(a, b, 6) } /// Compare corresponding elements in `a` and `b` for not-greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpngt_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { _mm_cmpnlt_pd(b, a) } /// Compare corresponding elements in `a` and `b` for /// not-greater-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { _mm_cmpnle_pd(b, a) } /// Compare the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32 { comieqsd(a, b) } /// Compare the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32 { comiltsd(a, b) } /// Compare the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32 { comilesd(a, b) } /// Compare the lower element of `a` and `b` for greater-than. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32 { comigtsd(a, b) } /// Compare the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32 { comigesd(a, b) } /// Compare the lower element of `a` and `b` for not-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32 { comineqsd(a, b) } /// Compare the lower element of `a` and `b` for equality. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32 { ucomieqsd(a, b) } /// Compare the lower element of `a` and `b` for less-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32 { ucomiltsd(a, b) } /// Compare the lower element of `a` and `b` for less-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32 { ucomilesd(a, b) } /// Compare the lower element of `a` and `b` for greater-than. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32 { ucomigtsd(a, b) } /// Compare the lower element of `a` and `b` for greater-than-or-equal. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32 { ucomigesd(a, b) } /// Compare the lower element of `a` and `b` for not-equal. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32 { ucomineqsd(a, b) } /// Convert packed double-precision (64-bit) floating-point elements in "a" to /// packed single-precision (32-bit) floating-point elements +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2ps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { cvtpd2ps(a) } @@ -1850,27 +2345,36 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed /// double-precision (64-bit) floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2pd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { cvtps2pd(a) } /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { mem::transmute(cvtpd2dq(a)) } /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 32-bit integer. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { cvtsd2si(a) } @@ -1879,16 +2383,22 @@ pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { /// to a single-precision (32-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_ss) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { cvtsd2ss(a, b) } /// Return the lower double-precision (64-bit) floating-point element of "a". +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { simd_extract(a, 0) } @@ -1897,85 +2407,115 @@ pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { /// to a double-precision (64-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtss2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { cvtss2sd(a, b) } /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttpd2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { mem::transmute(cvttpd2dq(a)) } /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 32-bit integer with truncation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { cvttsd2si(a) } /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epi32) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttps2dq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { mem::transmute(cvttps2dq(a)) } /// Copy double-precision (64-bit) floating-point element `a` to the lower /// element of the packed 64-bit return value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_sd(a: f64) -> __m128d { _mm_set_pd(0.0, a) } /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { _mm_set_pd(a, a) } /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { _mm_set_pd(a, a) } /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { __m128d(b, a) } /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values in reverse order. 
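// Illustrative usage sketch, not part of the patch above: the conversion
// intrinsics stabilized here. `_mm_cvtsi128_si32` (an SSE2 intrinsic outside
// this hunk) is used only to read back the low integer lane;
// `is_x86_feature_detected!` is assumed.
#[cfg(target_arch = "x86_64")]
pub fn convert_doubles() {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("sse2") {
        return;
    }
    unsafe {
        let a = _mm_set_pd(2.75, -1.25); // high 2.75, low -1.25

        // Truncating conversion: -1.25 -> -1 (the upper two i32 lanes are zeroed).
        let ints = _mm_cvttpd_epi32(a);
        assert_eq!(_mm_cvtsi128_si32(ints), -1);

        // The low f64 lane can be read back directly.
        assert_eq!(_mm_cvtsd_f64(a), -1.25);

        // Narrow to f32 and widen again; both values are exactly representable.
        let narrow = _mm_cvtpd_ps(a);
        let wide = _mm_cvtps_pd(narrow);
        assert_eq!(_mm_cvtsd_f64(wide), -1.25);
    }
}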
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { _mm_set_pd(b, a) } /// Returns packed double-precision (64-bit) floating-point elements with all /// zeros. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_setzero_pd() -> __m128d { _mm_set_pd(0.0, 0.0) } @@ -1984,9 +2524,12 @@ pub unsafe fn _mm_setzero_pd() -> __m128d { /// /// The mask is stored in the 2 least significant bits of the return value. /// All other bits are set to `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movmskpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { movmskpd(a) } @@ -1995,18 +2538,24 @@ pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { /// floating-point elements) from memory into the returned vector. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { *(mem_addr as *const __m128d) } /// Loads a 64-bit double-precision value to the low element of a /// 128-bit integer vector and clears the upper element. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { _mm_setr_pd(*mem_addr, 0.) } @@ -2014,9 +2563,12 @@ pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the high-order bits of a 128-bit /// vector of [2 x double]. The low-order bits are copied from the low-order /// bits of the first operand. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { _mm_setr_pd(simd_extract(a, 0), *mem_addr) } @@ -2024,9 +2576,12 @@ pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the low-order bits of a 128-bit /// vector of [2 x double]. The high-order bits are copied from the /// high-order bits of the first operand. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { _mm_setr_pd(*mem_addr, simd_extract(a, 1)) } @@ -2035,18 +2590,24 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// aligned memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { intrinsics::nontemporal_store(mem::transmute(mem_addr), a); } /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { *mem_addr = simd_extract(a, 0) } @@ -2054,9 +2615,12 @@ pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. `mem_addr` must be aligned /// on a 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { *(mem_addr as *mut __m128d) = a; } @@ -2064,9 +2628,12 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { storeupd(mem_addr as *mut i8, a); } @@ -2074,8 +2641,11 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); *(mem_addr as *mut __m128d) = b; @@ -2084,8 +2654,11 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); *(mem_addr as *mut __m128d) = b; @@ -2095,8 +2668,11 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { /// memory in reverse order. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [1, 0]); *(mem_addr as *mut __m128d) = b; @@ -2104,27 +2680,36 @@ pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a /// memory location. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { *mem_addr = simd_extract(a, 1); } /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlps))] // FIXME movlpd +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { *mem_addr = simd_extract(a, 0); } /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd) #[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { let d = *mem_addr; _mm_setr_pd(d, d) @@ -2132,9 +2717,12 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. 
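// Illustrative usage sketch, not part of the patch above: an unaligned load
// followed by per-lane stores of a `__m128d`. The aligned variants
// (`_mm_load_pd`, `_mm_store_pd`) have the same shape but require
// 16-byte-aligned pointers; `is_x86_feature_detected!` is assumed.
#[cfg(target_arch = "x86_64")]
pub fn split_lanes(src: &[f64; 2]) -> (f64, f64) {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("sse2") {
        return (src[0], src[1]);
    }
    unsafe {
        let v = _mm_loadu_pd(src.as_ptr()); // tolerates any alignment
        let (mut lo, mut hi) = (0.0f64, 0.0f64);
        _mm_storel_pd(&mut lo, v); // low lane
        _mm_storeh_pd(&mut hi, v); // high lane
        (lo, hi)
    }
}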
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1) #[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { _mm_load1_pd(mem_addr) } @@ -2142,9 +2730,12 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { /// Load 2 double-precision (64-bit) floating-point elements from memory into /// the returned vector in reverse order. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movapd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { let a = _mm_load_pd(mem_addr); simd_shuffle2(a, a, [1, 0]) @@ -2153,9 +2744,12 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { /// Load 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from memory into the returned vector. /// `mem_addr` does not need to be aligned on any particular boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { let mut dst = _mm_undefined_pd(); ptr::copy_nonoverlapping( @@ -2169,10 +2763,13 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double] from two /// 128-bit vector parameters of [2 x double], using the immediate-value /// parameter as a specifier. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(shufpd, imm8 = 1))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { match imm8 & 0b11 { 0b00 => simd_shuffle2(a, b, [0, 2]), @@ -2185,71 +2782,98 @@ pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { _mm_setr_pd(simd_extract(b, 0), simd_extract(a, 1)) } /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// floating-point vector of [4 x float]. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { mem::transmute(a) } /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// integer vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { mem::transmute::(simd_cast(a)) } /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// floating-point vector of [2 x double]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { mem::transmute(a) } /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// integer vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_si128) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { mem::transmute(a) } /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [2 x double]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { simd_cast(a.as_i64x2()) } /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [4 x float]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ps) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 { mem::transmute(a) } /// Return vector of type __m128d with undefined elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_pd) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_undefined_pd() -> __m128d { _mm_set1_pd(mem::uninitialized()) } /// Return vector of type __m128i with undefined elements. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_si128) #[inline] #[target_feature(enable = "sse2")] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_undefined_si128() -> __m128i { _mm_set1_epi8(mem::uninitialized()) } @@ -2259,9 +2883,12 @@ pub unsafe fn _mm_undefined_si128() -> __m128i { /// /// * The [127:64] bits are copied from the [127:64] bits of the second input /// * The [63:0] bits are copied from the [127:64] bits of the first input +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpckhpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { simd_shuffle2(a, b, [1, 3]) } @@ -2271,9 +2898,12 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { /// /// * The [127:64] bits are copied from the [63:0] bits of the second input /// * The [63:0] bits are copied from the [63:0] bits of the first input +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlhps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { simd_shuffle2(a, b, [0, 2]) } diff --git a/coresimd/x86/sse3.rs b/coresimd/x86/sse3.rs index ea41164b24..edcbf1deac 100644 --- a/coresimd/x86/sse3.rs +++ b/coresimd/x86/sse3.rs @@ -9,54 +9,72 @@ use stdsimd_test::assert_instr; /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_ps) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { addsubps(a, b) } /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { addsubpd(a, b) } /// Horizontally add adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { haddpd(a, b) } /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_ps) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { haddps(a, b) } /// Horizontally subtract adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { hsubpd(a, b) } /// Horizontally subtract adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { hsubps(a, b) } @@ -64,45 +82,60 @@ pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { /// Load 128-bits of integer data from unaligned memory. /// This intrinsic may perform better than `_mm_loadu_si128` /// when the data crosses a cache line boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(lddqu))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { mem::transmute(lddqu(mem_addr as *const _)) } /// Duplicate the low double-precision (64-bit) floating-point element /// from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { simd_shuffle2(a, a, [0, 0]) } /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of the return vector. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { _mm_load1_pd(mem_addr) } /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movshdup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { simd_shuffle4(a, a, [1, 1, 3, 3]) } /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`.
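// Illustrative usage sketch, not part of the patch above: two rounds of SSE3
// horizontal addition reduce four f32 lanes to a single sum. `_mm_setr_ps`
// and `_mm_cvtss_f32` are SSE intrinsics outside this hunk;
// `is_x86_feature_detected!` is assumed.
#[cfg(target_arch = "x86_64")]
pub fn horizontal_sum() {
    use std::arch::x86_64::*;

    if !is_x86_feature_detected!("sse3") {
        return;
    }
    unsafe {
        let v = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let pairs = _mm_hadd_ps(v, v);         // [1+2, 3+4, 1+2, 3+4]
        let total = _mm_hadd_ps(pairs, pairs); // every lane holds 10.0
        assert_eq!(_mm_cvtss_f32(total), 10.0);
    }
}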
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps) #[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movsldup))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { simd_shuffle4(a, a, [0, 0, 2, 2]) } diff --git a/coresimd/x86/sse41.rs b/coresimd/x86/sse41.rs index 5e03bcc8b5..81819f7b34 100644 --- a/coresimd/x86/sse41.rs +++ b/coresimd/x86/sse41.rs @@ -10,34 +10,47 @@ use stdsimd_test::assert_instr; // SSE4 rounding constants /// round to nearest +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00; /// round down +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01; /// round up +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_TO_POS_INF: i32 = 0x02; /// truncate +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_TO_ZERO: i32 = 0x03; /// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04; /// do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_RAISE_EXC: i32 = 0x00; /// suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_NO_EXC: i32 = 0x08; /// round to nearest and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_NINT: i32 = 0x00; /// round down and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_FLOOR: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF); /// round up and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_CEIL: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF); /// truncate and do not suppress exceptions +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_TRUNC: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO); /// use MXCSR.RC and do not suppress exceptions; see /// `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_RINT: i32 = (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION); /// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE` +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _MM_FROUND_NEARBYINT: i32 = (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION); @@ -46,9 +59,12 @@ pub const _MM_FROUND_NEARBYINT: i32 = /// The high bit of each corresponding mask byte determines the selection. /// If the high bit is set the element of `a` is selected. The element /// of `b` is selected otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_epi8) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendvb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_epi8( a: __m128i, b: __m128i, mask: __m128i ) -> __m128i { @@ -64,10 +80,13 @@ pub unsafe fn _mm_blendv_epi8( /// The mask bits determine the selection. A clear bit selects the /// corresponding element of `a`, and a set bit the corresponding /// element of `b`.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_epi16) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { let a = a.as_i16x8(); let b = b.as_i16x8(); @@ -81,28 +100,37 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using `mask` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { blendvpd(a, b, mask) } /// Blend packed single-precision (32-bit) floating-point elements from `a` /// and `b` using `mask` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { blendvps(a, b, mask) } /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using control mask `imm2` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { macro_rules! call { ($imm2:expr) => { @@ -114,10 +142,13 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { /// Blend packed single-precision (32-bit) floating-point elements from `a` /// and `b` using mask `imm4` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { @@ -129,11 +160,14 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { /// Extract a single-precision (32-bit) floating-point element from `a`, /// selected with `imm8` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_ps) #[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(test, assert_instr(extractps, imm8 = 0))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { mem::transmute(simd_extract::<_, f32>(a, imm8 as u32 & 0b11)) } @@ -142,21 +176,27 @@ pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. 
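The variable blends above make branch-free selects straightforward. A small sketch (the helper name is hypothetical) computes a per-lane minimum by pairing an SSE compare with `_mm_blendv_ps`:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
unsafe fn select_smaller(a: __m128, b: __m128) -> __m128 {
    // Lanes where `b < a` have every mask bit set, so the blend takes `b`
    // there and `a` everywhere else.
    let mask = _mm_cmplt_ps(b, a);
    _mm_blendv_ps(a, b, mask)
}
```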
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi8) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { let imm8 = (imm8 & 15) as u32; simd_extract::<_, u8>(a.as_u8x16(), imm8) as i32 } /// Extract an 32-bit integer from `a` selected with `imm8` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi32) #[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(test, assert_instr(extractps, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { let imm8 = (imm8 & 3) as u32; simd_extract::<_, i32>(a.as_i32x4(), imm8) @@ -184,10 +224,13 @@ pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { /// /// * Bits `[3:0]`: If any of these bits are set, the corresponding result /// element is cleared. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { macro_rules! call { ($imm8:expr) => { @@ -199,10 +242,13 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Return a copy of `a` with the 8-bit integer from `i` inserted at a /// location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi8) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i { mem::transmute(simd_insert( a.as_i8x16(), @@ -213,10 +259,13 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32, imm8: i32) -> __m128i { /// Return a copy of `a` with the 32-bit integer from `i` inserted at a /// location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { mem::transmute(simd_insert( a.as_i32x4(), @@ -227,97 +276,130 @@ pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed maximum /// values in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaxsb(a.as_i8x16(), b.as_i8x16())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// maximum. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaxuw(a.as_u16x8(), b.as_u16x8())) } /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum /// values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaxsd(a.as_i32x4(), b.as_i32x4())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// maximum values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxud))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaxud(a.as_u32x4(), b.as_u32x4())) } /// Compare packed 8-bit integers in `a` and `b` and return packed minimum /// values in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pminsb(a.as_i8x16(), b.as_i8x16())) } /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// minimum. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pminuw(a.as_u16x8(), b.as_u16x8())) } /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum /// values. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pminsd(a.as_i32x4(), b.as_i32x4())) } /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// minimum values. 
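One common way the packed min/max pair above is combined, shown as a sketch with an illustrative helper name, is a per-lane clamp:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
unsafe fn clamp_epi32(v: __m128i, lo: __m128i, hi: __m128i) -> __m128i {
    // max(v, lo) raises lanes below the floor, min(.., hi) lowers lanes
    // above the ceiling, clamping each 32-bit lane into [lo, hi].
    _mm_min_epi32(_mm_max_epi32(v, lo), hi)
}
```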
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminud))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pminud(a.as_u32x4(), b.as_u32x4())) } /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_packus_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(packusdw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(packusdw(a.as_i32x4(), b.as_i32x4())) } /// Compare packed 64-bit integers in `a` and `b` for equality +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi16) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { let a = a.as_i8x16(); let a = simd_shuffle8::<_, i8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); @@ -325,9 +407,12 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { let a = a.as_i8x16(); let a = simd_shuffle4::<_, i8x4>(a, a, [0, 1, 2, 3]); @@ -336,9 +421,12 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed /// 64-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi8_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { let a = a.as_i8x16(); let a = simd_shuffle2::<_, i8x2>(a, a, [0, 1]); @@ -346,9 +434,12 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { let a = a.as_i16x8(); let a = simd_shuffle4::<_, i16x4>(a, a, [0, 1, 2, 3]); @@ -356,9 +447,12 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit 
integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { let a = a.as_i16x8(); let a = simd_shuffle2::<_, i16x2>(a, a, [0, 1]); @@ -366,9 +460,12 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { let a = a.as_i32x4(); let a = simd_shuffle2::<_, i32x2>(a, a, [0, 1]); @@ -376,9 +473,12 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi16) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { let a = a.as_u8x16(); let a = simd_shuffle8::<_, u8x8>(a, a, [0, 1, 2, 3, 4, 5, 6, 7]); @@ -386,9 +486,12 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { let a = a.as_u8x16(); let a = simd_shuffle4::<_, u8x4>(a, a, [0, 1, 2, 3]); @@ -396,9 +499,12 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu8_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { let a = a.as_u8x16(); let a = simd_shuffle2::<_, u8x2>(a, a, [0, 1]); @@ -407,9 +513,12 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 32-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { let a = a.as_u16x8(); let a = simd_shuffle4::<_, u16x4>(a, a, [0, 1, 2, 3]); @@ -418,9 +527,12 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 64-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, 
assert_instr(pmovzxwq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { let a = a.as_u16x8(); let a = simd_shuffle2::<_, u16x2>(a, a, [0, 1]); @@ -429,9 +541,12 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 32-bit integers in `a` /// to packed 64-bit integers +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxdq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { let a = a.as_u32x4(); let a = simd_shuffle2::<_, u32x2>(a, a, [0, 1]); @@ -445,10 +560,13 @@ pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_pd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dppd, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { macro_rules! call { ($imm8:expr) => { @@ -465,10 +583,13 @@ pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dpps, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { macro_rules! call { ($imm8:expr) => { @@ -481,9 +602,12 @@ pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed double-precision /// floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_pd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { roundpd(a, _MM_FROUND_FLOOR) } @@ -491,9 +615,12 @@ pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed single-precision /// floating-point elements. 
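The conditional dot product described above is easiest to see in scalar form. The following model (not the intrinsic itself) mirrors how the two halves of `imm8` are used in the four-lane `dpps` case:

```
/// Scalar model of the conditional dot product: bits [7:4] of `imm8` gate
/// which products enter the sum, bits [3:0] gate which result lanes
/// receive the broadcast sum.
fn dp_ps_model(a: [f32; 4], b: [f32; 4], imm8: u8) -> [f32; 4] {
    let mut sum = 0.0;
    for i in 0..4 {
        if imm8 & (1 << (i + 4)) != 0 {
            sum += a[i] * b[i];
        }
    }
    let mut r = [0.0f32; 4];
    for i in 0..4 {
        if imm8 & (1 << i) != 0 {
            r[i] = sum;
        }
    }
    r
}
```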
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { roundps(a, _MM_FROUND_FLOOR) } @@ -503,9 +630,12 @@ pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper element from `a` to the upper element of the intrinsic /// result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_sd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { roundsd(a, b, _MM_FROUND_FLOOR) } @@ -515,9 +645,12 @@ pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { roundss(a, b, _MM_FROUND_FLOOR) } @@ -525,9 +658,12 @@ pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed double-precision /// floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_pd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { roundpd(a, _MM_FROUND_CEIL) } @@ -535,9 +671,12 @@ pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed single-precision /// floating-point elements. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { roundps(a, _MM_FROUND_CEIL) } @@ -547,9 +686,12 @@ pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrisic result, /// and copy the upper element from `a` to the upper element /// of the intrinsic result. 
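A small sketch of the packed rounding helpers above; the input values are arbitrary:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
unsafe fn floor_ceil_demo() {
    let v = _mm_set_ps(2.5, 0.5, 1.1, -1.9); // lanes, low to high: -1.9, 1.1, 0.5, 2.5
    let mut lo = [0.0f32; 4];
    let mut hi = [0.0f32; 4];
    _mm_storeu_ps(lo.as_mut_ptr(), _mm_floor_ps(v));
    _mm_storeu_ps(hi.as_mut_ptr(), _mm_ceil_ps(v));
    assert_eq!(lo, [-2.0, 1.0, 0.0, 2.0]);
    assert_eq!(hi, [-1.0, 2.0, 1.0, 3.0]);
}
```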
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_sd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { roundsd(a, b, _MM_FROUND_CEIL) } @@ -559,9 +701,12 @@ pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { roundss(a, b, _MM_FROUND_CEIL) } @@ -602,10 +747,13 @@ pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { /// _MM_FROUND_CUR_DIRECTION; /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_pd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd, rounding = 0))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { @@ -652,10 +800,13 @@ pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { /// _MM_FROUND_CUR_DIRECTION; /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps, rounding = 0))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { @@ -703,10 +854,13 @@ pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { /// _MM_FROUND_CUR_DIRECTION; /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_sd) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd, rounding = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { macro_rules! call { ($imm4:expr) => { @@ -754,10 +908,13 @@ pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { /// _MM_FROUND_CUR_DIRECTION; /// # } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ss) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss, rounding = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { macro_rules! call { ($imm4:expr) => { @@ -786,18 +943,24 @@ pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// * bits `[15:0]` - contain the minimum value found in parameter `a`, /// * bits `[18:16]` - contain the index of the minimum value /// * remaining bits are set to `0`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(phminposuw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { mem::transmute(phminposuw(a.as_u16x8())) } /// Multiply the low 32-bit integers from each packed 64-bit /// element in `a` and `b`, and return the signed 64-bit result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmuldq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmuldq(a.as_i32x4(), b.as_i32x4())) } @@ -808,9 +971,12 @@ pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { /// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping /// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would /// return a negative number. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mullo_epi32) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmulld))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_mul(a.as_i32x4(), b.as_i32x4())) } @@ -846,10 +1012,13 @@ pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { /// /// * A `__m128i` vector containing the sums of the sets of /// absolute differences between both operands. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mpsadbw_epu8) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { let a = a.as_u8x16(); let b = b.as_u8x16(); @@ -874,9 +1043,12 @@ pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { ptestz(a.as_i64x2(), mask.as_i64x2()) } @@ -894,9 +1066,12 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all ones, /// * `0` - otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { ptestc(a.as_i64x2(), mask.as_i64x2()) } @@ -914,9 +1089,12 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. 
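A typical use of the `ptest`-based predicates above is an all-zero check; a minimal sketch with an illustrative helper name:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1")]
unsafe fn is_all_zero(v: __m128i) -> bool {
    // The zero flag is set (nonzero return) exactly when `v & v` has no bits set.
    _mm_testz_si128(v, v) != 0
}
```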
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testnzc_si128) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { ptestnzc(a.as_i64x2(), mask.as_i64x2()) } @@ -934,9 +1112,12 @@ pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { _mm_testz_si128(a, mask) } @@ -952,10 +1133,13 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the bits specified in the operand are all set to 1, /// * `0` - otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqd))] #[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { _mm_testc_si128(a, _mm_cmpeq_epi32(a, a)) } @@ -973,9 +1157,12 @@ pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_mix_ones_zeros) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { _mm_testnzc_si128(a, mask) } diff --git a/coresimd/x86/sse42.rs b/coresimd/x86/sse42.rs index 21284f1cde..efbfe79d3e 100644 --- a/coresimd/x86/sse42.rs +++ b/coresimd/x86/sse42.rs @@ -10,49 +10,68 @@ use coresimd::simd_llvm::*; use coresimd::x86::*; /// String contains unsigned 8-bit characters *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_UBYTE_OPS: i32 = 0b0000_0000; /// String contains unsigned 16-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_UWORD_OPS: i32 = 0b0000_0001; /// String contains signed 8-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_SBYTE_OPS: i32 = 0b0000_0010; /// String contains unsigned 16-bit characters +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_SWORD_OPS: i32 = 0b0000_0011; /// For each character in `a`, find if it is in `b` *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_CMP_EQUAL_ANY: i32 = 0b0000_0000; /// For each character in `a`, determine if /// `b[0] <= c <= b[1] or b[1] <= c <= b[2]...` +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_CMP_RANGES: i32 = 0b0000_0100; /// The strings defined by `a` and `b` are equal +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_CMP_EQUAL_EACH: i32 = 0b0000_1000; /// Search for the defined substring in the target +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_CMP_EQUAL_ORDERED: i32 = 0b0000_1100; /// Do not negate results *(Default)* +#[stable(feature = "simd_x86", since = 
"1.27.0")] pub const _SIDD_POSITIVE_POLARITY: i32 = 0b0000_0000; /// Negate results +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_NEGATIVE_POLARITY: i32 = 0b0001_0000; /// Do not negate results before the end of the string +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_MASKED_POSITIVE_POLARITY: i32 = 0b0010_0000; /// Negate results only before the end of the string +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_MASKED_NEGATIVE_POLARITY: i32 = 0b0011_0000; /// **Index only**: return the least significant bit *(Default)* +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_LEAST_SIGNIFICANT: i32 = 0b0000_0000; /// **Index only**: return the most significant bit +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_MOST_SIGNIFICANT: i32 = 0b0100_0000; /// **Mask only**: return the bit mask +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_BIT_MASK: i32 = 0b0000_0000; /// **Mask only**: return the byte mask +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return the generated mask. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrm) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -296,10 +315,13 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpestri`]: fn._mm_cmpestri.html +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistri) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -314,10 +336,13 @@ pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if any character in `b` was null. /// and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrz) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -332,10 +357,13 @@ pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if the resulting mask was non-zero, /// and `0` otherwise. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrc) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -350,10 +378,13 @@ pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and returns `1` if any character in `a` was null, /// and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistrs) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -367,10 +398,13 @@ pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return bit `0` of the resulting bit mask. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistro) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -385,10 +419,13 @@ pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if `b` did not contain a null /// character and the resulting mask was zero, and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpistra) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { let a = a.as_i8x16(); let b = b.as_i8x16(); @@ -402,10 +439,13 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return the generated mask. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrm) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrm( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> __m128i { @@ -506,10 +546,13 @@ pub unsafe fn _mm_cmpestrm( /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpistri`]: fn._mm_cmpistri.html +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestri) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestri( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> i32 { @@ -526,10 +569,13 @@ pub unsafe fn _mm_cmpestri( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// `b` was null, and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrz) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrz( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> i32 { @@ -546,10 +592,13 @@ pub unsafe fn _mm_cmpestrz( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if the resulting mask /// was non-zero, and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrc) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrc( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> i32 { @@ -566,10 +615,13 @@ pub unsafe fn _mm_cmpestrc( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// a was null, and `0` otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestrs) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestrs( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> i32 { @@ -586,10 +638,13 @@ pub unsafe fn _mm_cmpestrs( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return bit `0` of the resulting /// bit mask. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestro) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestro( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> i32 { @@ -607,10 +662,13 @@ pub unsafe fn _mm_cmpestro( /// using the control in `imm8`, and return `1` if `b` did not /// contain a null character and the resulting mask was zero, and `0` /// otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpestra) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] #[rustc_args_required_const(4)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpestra( a: __m128i, la: i32, b: __m128i, lb: i32, imm8: i32 ) -> i32 { @@ -626,36 +684,48 @@ pub unsafe fn _mm_cmpestra( /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 8-bit integer `v`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u8) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { crc32_32_8(crc, v) } /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 16-bit integer `v`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u16) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { crc32_32_16(crc, v) } /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 32-bit integer `v`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u32) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { crc32_32_32(crc, v) } /// Compare packed 64-bit integers in `a` and `b` for greater-than, /// return the results. 
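A sketch of how the byte-granularity CRC32 intrinsic above is typically folded over a buffer, using the customary all-ones seed and final inversion of the CRC-32C convention; the helper name and that convention are assumptions, not part of this patch:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn crc32c(bytes: &[u8]) -> u32 {
    let mut crc = !0u32;
    for &b in bytes {
        // Each call folds one more byte into the running checksum.
        crc = _mm_crc32_u8(crc, b);
    }
    !crc
}
```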
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi64) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpgtq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { mem::transmute(simd_gt::<_, i64x2>(a.as_i64x2(), b.as_i64x2())) } diff --git a/coresimd/x86/sse4a.rs b/coresimd/x86/sse4a.rs index 370034c38a..fff6043564 100644 --- a/coresimd/x86/sse4a.rs +++ b/coresimd/x86/sse4a.rs @@ -36,6 +36,7 @@ extern "C" { #[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(extrq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { mem::transmute(extrq(x.as_i64x2(), y.as_i8x16())) } @@ -52,6 +53,7 @@ pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { #[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(insertq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { mem::transmute(insertq(x.as_i64x2(), y.as_i64x2())) } @@ -60,6 +62,7 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { #[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { movntsd(p, a); } @@ -68,6 +71,7 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { #[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { movntss(p, a); } diff --git a/coresimd/x86/ssse3.rs b/coresimd/x86/ssse3.rs index 4bee30e776..ec0477a611 100644 --- a/coresimd/x86/ssse3.rs +++ b/coresimd/x86/ssse3.rs @@ -10,9 +10,12 @@ use stdsimd_test::assert_instr; /// Compute the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. 
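A short sketch of the 64-bit greater-than compare above; each lane of the result is either all ones or all zeros:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.2")]
unsafe fn cmpgt_epi64_demo() {
    let a = _mm_set_epi64x(10, -1); // lanes, low to high: -1, 10
    let b = _mm_set_epi64x(3, 0);   // lanes, low to high:  0, 3
    let r = _mm_cmpgt_epi64(a, b);
    let mut out = [0i64; 2];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    assert_eq!(out, [0, -1]); // -1 > 0 is false; 10 > 3 is true
}
```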
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { mem::transmute(pabsb128(a.as_i8x16())) } @@ -20,9 +23,12 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 16-bit signed integers in /// `a` and /// return the 16-bit unsigned integer +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { mem::transmute(pabsw128(a.as_i16x8())) } @@ -30,9 +36,12 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 32-bit signed integers in /// `a` and /// return the 32-bit unsigned integer +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { mem::transmute(pabsd128(a.as_i32x4())) } @@ -61,19 +70,25 @@ pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { /// r /// } /// ``` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pshufb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pshufb128(a.as_u8x16(), b.as_u8x16())) } /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, /// shift the result right by `n` bytes, and return the low 16 bytes. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(palignr, n = 15))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { let n = n as u32; // If palignr is shifting the pair of vectors more than the size of two @@ -141,9 +156,12 @@ pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(phaddw128(a.as_i16x8(), b.as_i16x8())) } @@ -151,27 +169,36 @@ pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. 
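Since `_mm_shuffle_epi8` above treats each control byte as a source-lane index, a classic use is reversing the sixteen bytes of a vector; a minimal sketch:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn reverse_bytes(v: __m128i) -> __m128i {
    // Control byte `i` selects source byte `15 - i`, so the result is `v` reversed.
    let idx = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                            7, 6, 5, 4, 3, 2, 1, 0);
    _mm_shuffle_epi8(v, idx)
}
```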
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadds_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) } /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [4 x i32]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_epi32) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(phaddd128(a.as_i32x4(), b.as_i32x4())) } /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [8 x i16]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(phsubw128(a.as_i16x8(), b.as_i16x8())) } @@ -180,18 +207,24 @@ pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { /// packed 128-bit vectors of [8 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) } /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [4 x i32]. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_epi32) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(phsubd128(a.as_i32x4(), b.as_i32x4())) } @@ -201,9 +234,12 @@ pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// integer values contained in the second source operand, add pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maddubs_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmaddubsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmaddubsw128(a.as_u8x16(), b.as_i8x16())) } @@ -211,9 +247,12 @@ pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed 16-bit signed integer values, truncate the 32-bit /// product to the 18 most significant bits by right-shifting, round the /// truncated value by adding 1, and write bits [16:1] to the destination. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhrs_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmulhrsw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(pmulhrsw128(a.as_i16x8(), b.as_i16x8())) } @@ -222,9 +261,12 @@ pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the result. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi8) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignb))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psignb128(a.as_i8x16(), b.as_i8x16())) } @@ -233,9 +275,12 @@ pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi16) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignw))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psignw128(a.as_i16x8(), b.as_i16x8())) } @@ -244,9 +289,12 @@ pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` /// is zero. 
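A small sketch of the `psign` behaviour described above, with arbitrary inputs:

```
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn sign_demo() {
    let a = _mm_setr_epi32(5, 6, 7, 8);
    let b = _mm_setr_epi32(-1, 0, 2, -9);
    // Negative lanes of `b` negate, zero lanes clear, positive lanes pass through.
    let r = _mm_sign_epi32(a, b);
    let mut out = [0i32; 4];
    _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, r);
    assert_eq!(out, [-5, 0, 7, -8]);
}
```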
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_epi32) #[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { mem::transmute(psignd128(a.as_i32x4(), b.as_i32x4())) } diff --git a/coresimd/x86/tbm.rs b/coresimd/x86/tbm.rs index fa5b08d197..9793edfede 100644 --- a/coresimd/x86/tbm.rs +++ b/coresimd/x86/tbm.rs @@ -70,6 +70,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcfill_u32(x: u32) -> u32 { x & (x.wrapping_add(1)) } @@ -81,6 +82,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcfill_u64(x: u64) -> u64 { x & (x.wrapping_add(1)) } @@ -91,6 +93,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blci_u32(x: u32) -> u32 { x | !(x.wrapping_add(1)) } @@ -102,6 +105,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blci_u64(x: u64) -> u64 { x | !(x.wrapping_add(1)) } @@ -112,6 +116,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcic_u32(x: u32) -> u32 { !x & (x.wrapping_add(1)) } @@ -123,6 +128,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcic_u64(x: u64) -> u64 { !x & (x.wrapping_add(1)) } @@ -134,6 +140,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcmsk_u32(x: u32) -> u32 { x ^ (x.wrapping_add(1)) } @@ -146,6 +153,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcmsk_u64(x: u64) -> u64 { x ^ (x.wrapping_add(1)) } @@ -156,6 +164,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcs_u32(x: u32) -> u32 { x | (x.wrapping_add(1)) } @@ -167,6 +176,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blcs_u64(x: u64) -> u64 { x | x.wrapping_add(1) } @@ -177,6 +187,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = 
"tbm")] #[cfg_attr(test, assert_instr(blsfill))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsfill_u32(x: u32) -> u32 { x | (x.wrapping_sub(1)) } @@ -188,6 +199,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsfill_u64(x: u64) -> u64 { x | (x.wrapping_sub(1)) } @@ -198,6 +210,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsic_u32(x: u32) -> u32 { !x | (x.wrapping_sub(1)) } @@ -209,6 +222,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsic_u64(x: u64) -> u64 { !x | (x.wrapping_sub(1)) } @@ -220,6 +234,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _t1mskc_u32(x: u32) -> u32 { !x | (x.wrapping_add(1)) } @@ -232,6 +247,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _t1mskc_u64(x: u64) -> u64 { !x | (x.wrapping_add(1)) } @@ -243,6 +259,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 { #[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _tzmsk_u32(x: u32) -> u32 { !x & (x.wrapping_sub(1)) } @@ -255,6 +272,7 @@ pub unsafe fn _tzmsk_u32(x: u32) -> u32 { #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _tzmsk_u64(x: u64) -> u64 { !x & (x.wrapping_sub(1)) } diff --git a/coresimd/x86/xsave.rs b/coresimd/x86/xsave.rs index 06f5e9a215..6f59c384ea 100644 --- a/coresimd/x86/xsave.rs +++ b/coresimd/x86/xsave.rs @@ -31,9 +31,12 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave) #[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { xsave( mem_addr, @@ -48,9 +51,12 @@ pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor) #[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { xrstor(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); } @@ -58,24 +64,31 @@ pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { /// `XFEATURE_ENABLED_MASK` for `XCR` /// /// This intrinsic maps to `XSETBV` instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; /// Copy 64-bits from `val` to the extended control register (`XCR`) specified /// by `a`. /// /// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsetbv) #[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsetbv))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsetbv(a: u32, val: u64) { xsetbv(a, (val >> 32) as u32, val as u32); } /// Reads the contents of the extended control register `XCR` /// specified in `xcr_no`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xgetbv) #[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xgetbv))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { let eax: u32; let edx: u32; @@ -90,9 +103,12 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. The performance of this instruction will /// be equal to or better than using the `XSAVE` instruction. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt) #[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { xsaveopt( mem_addr, @@ -107,9 +123,12 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec) #[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { xsavec( mem_addr, @@ -125,9 +144,12 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves) #[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { xsaves( mem_addr, @@ -145,9 +167,12 @@ pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors) #[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { xrstors(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); } diff --git a/coresimd/x86_64/abm.rs b/coresimd/x86_64/abm.rs index 43fbee28ef..8175c10f99 100644 --- a/coresimd/x86_64/abm.rs +++ b/coresimd/x86_64/abm.rs @@ -23,17 +23,23 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_lzcnt_u64) #[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _lzcnt_u64(x: u64) -> u64 { x.leading_zeros() as u64 } /// Counts the bits that are set. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_popcnt64) #[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _popcnt64(x: i64) -> i32 { x.count_ones() as i32 } diff --git a/coresimd/x86_64/avx.rs b/coresimd/x86_64/avx.rs index df534d0aa6..510949ad4a 100644 --- a/coresimd/x86_64/avx.rs +++ b/coresimd/x86_64/avx.rs @@ -19,10 +19,13 @@ use mem; /// Copy `a` to result, and insert the 64-bit integer `i` into result /// at the location specified by `index`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insert_epi64) #[inline] #[rustc_args_required_const(2)] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i { mem::transmute(simd_insert(a.as_i64x4(), (index as u32) & 3, i)) } diff --git a/coresimd/x86_64/avx2.rs b/coresimd/x86_64/avx2.rs index 86d2863739..3e15e2595e 100644 --- a/coresimd/x86_64/avx2.rs +++ b/coresimd/x86_64/avx2.rs @@ -22,10 +22,13 @@ use coresimd::simd_llvm::*; use coresimd::x86::*; /// Extract a 64-bit integer from `a`, selected with `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extract_epi64) #[inline] #[target_feature(enable = "avx2")] #[rustc_args_required_const(1)] // This intrinsic has no corresponding instruction. 
+#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 { let imm8 = (imm8 & 3) as u32; simd_extract(a.as_i64x4(), imm8) } diff --git a/coresimd/x86_64/bmi.rs b/coresimd/x86_64/bmi.rs index 2f55b7109c..e9841c36ef 100644 --- a/coresimd/x86_64/bmi.rs +++ b/coresimd/x86_64/bmi.rs @@ -14,10 +14,13 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { _bextr2_u64(a, ((start & 0xff) | ((len & 0xff) << 8)) as u64) } @@ -27,36 +30,48 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bextr2_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 { x86_bmi_bextr_64(a, control) } /// Bitwise logical `AND` of inverted `a` with `b`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_andn_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(andn))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { !a & b } /// Extract lowest set isolated bit. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsi_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(blsi))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsi_u64(x: u64) -> u64 { x & x.wrapping_neg() } /// Get mask up to lowest set bit. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsmsk_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(blsmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsmsk_u64(x: u64) -> u64 { x ^ (x.wrapping_sub(1_u64)) } @@ -64,10 +79,13 @@ pub unsafe fn _blsmsk_u64(x: u64) -> u64 { /// Resets the lowest set bit of `x`. /// /// If `x` is zero, sets CF. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_blsr_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(blsr))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _blsr_u64(x: u64) -> u64 { x & (x.wrapping_sub(1)) } @@ -75,9 +93,12 @@ pub unsafe fn _blsr_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_tzcnt_u64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _tzcnt_u64(x: u64) -> u64 { x.trailing_zeros() as u64 } @@ -85,9 +106,12 @@ pub unsafe fn _tzcnt_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_tzcnt_64) #[inline] #[target_feature(enable = "bmi1")] #[cfg_attr(test, assert_instr(tzcnt))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 { x.trailing_zeros() as i64 } diff --git a/coresimd/x86_64/bmi2.rs b/coresimd/x86_64/bmi2.rs index d97d371b3e..29c5ee0c77 100644 --- a/coresimd/x86_64/bmi2.rs +++ b/coresimd/x86_64/bmi2.rs @@ -17,10 +17,13 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mulx_u64) #[inline] #[cfg_attr(test, assert_instr(mulx))] #[target_feature(enable = "bmi2")] #[cfg(not(target_arch = "x86"))] // calls an intrinsic +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 { let result: u128 = (a as u128) * (b as u128); *hi = (result >> 64) as u64; @@ -28,30 +31,39 @@ pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 { } /// Zero higher bits of `a` >= `index`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bzhi_u64) #[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] #[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 { x86_bmi2_bzhi_64(a, index as u64) } /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pdep_u64) #[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] #[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 { x86_bmi2_pdep_64(a, mask) } /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u64) #[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] #[cfg(not(target_arch = "x86"))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _pext_u64(a: u64, mask: u64) -> u64 { x86_bmi2_pext_64(a, mask) } diff --git a/coresimd/x86_64/bswap.rs b/coresimd/x86_64/bswap.rs index 340780f7f5..bcd3c4bbc7 100644 --- a/coresimd/x86_64/bswap.rs +++ b/coresimd/x86_64/bswap.rs @@ -6,8 +6,11 @@ use stdsimd_test::assert_instr; /// Return an integer with the reversed byte order of x +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_bswap64) #[inline] #[cfg_attr(test, assert_instr(bswap))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _bswap64(x: i64) -> i64 { bswap_i64(x) } diff --git a/coresimd/x86_64/fxsr.rs b/coresimd/x86_64/fxsr.rs index 42280b4b3a..1bc5441647 100644 --- a/coresimd/x86_64/fxsr.rs +++ b/coresimd/x86_64/fxsr.rs @@ -21,9 +21,12 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxsave64) #[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _fxsave64(mem_addr: *mut u8) { fxsave64(mem_addr) } @@ -42,9 +45,12 @@ pub unsafe fn _fxsave64(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_fxrstor64) #[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _fxrstor64(mem_addr: *const u8) { fxrstor64(mem_addr) } diff --git a/coresimd/x86_64/rdrand.rs b/coresimd/x86_64/rdrand.rs index 917e900fef..7a64697591 100644 --- a/coresimd/x86_64/rdrand.rs +++ b/coresimd/x86_64/rdrand.rs @@ -12,10 +12,13 @@ use stdsimd_test::assert_instr; /// Read a hardware generated 64-bit random value and store the result in val. /// Return 1 if a random value was generated, and 0 otherwise. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdrand64_step) #[inline] #[target_feature(enable = "rdrand")] #[cfg_attr(test, assert_instr(rdrand))] #[cfg_attr(feature = "cargo-clippy", allow(stutter))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 { let (v, flag) = x86_rdrand64_step(); *val = v; @@ -24,9 +27,12 @@ pub unsafe fn _rdrand64_step(val: &mut u64) -> i32 { /// Read a 64-bit NIST SP800-90B and SP800-90C compliant random value and store /// in val. Return 1 if a random value was generated, and 0 otherwise. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_rdseed64_step) #[inline] #[target_feature(enable = "rdseed")] #[cfg_attr(test, assert_instr(rdseed))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _rdseed64_step(val: &mut u64) -> i32 { let (v, flag) = x86_rdseed64_step(); *val = v; diff --git a/coresimd/x86_64/sse.rs b/coresimd/x86_64/sse.rs index 808470c17f..a06a515bc4 100644 --- a/coresimd/x86_64/sse.rs +++ b/coresimd/x86_64/sse.rs @@ -24,9 +24,12 @@ extern "C" { /// [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 64 bit output). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { cvtss2si64(a) } @@ -40,9 +43,12 @@ pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { /// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { cvttss2si64(a) } @@ -52,9 +58,12 @@ pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit /// input). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss) #[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 { cvtsi642ss(a, b) } diff --git a/coresimd/x86_64/sse2.rs b/coresimd/x86_64/sse2.rs index e48708ee59..30089da6f8 100644 --- a/coresimd/x86_64/sse2.rs +++ b/coresimd/x86_64/sse2.rs @@ -17,34 +17,46 @@ extern "C" { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 64-bit integer. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { cvtsd2si64(a) } /// Alias for `_mm_cvtsd_si64` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_si64x) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { _mm_cvtsd_si64(a) } /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 64-bit integer with truncation. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { cvttsd2si64(a) } /// Alias for `_mm_cvttsd_si64` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { _mm_cvttsd_si64(a) } @@ -52,61 +64,82 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { /// Stores a 64-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { intrinsics::nontemporal_store(mem_addr, a); } /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { _mm_set_epi64x(0, a) } /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_si128) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { _mm_cvtsi64_si128(a) } /// Return the lowest element of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { simd_extract(a.as_i64x2(), 0) } /// Return the lowest element of `a`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { _mm_cvtsi128_si64(a) } /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { simd_insert(a, 0, b as f64) } /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. 
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64x_sd) #[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d { _mm_cvtsi64_sd(a, b) } diff --git a/coresimd/x86_64/sse41.rs b/coresimd/x86_64/sse41.rs index 8a0bcdb05f..63b4a6c4ca 100644 --- a/coresimd/x86_64/sse41.rs +++ b/coresimd/x86_64/sse41.rs @@ -8,11 +8,14 @@ use mem; use stdsimd_test::assert_instr; /// Extract a 64-bit integer from `a` selected with `imm8` +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_epi64) #[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(test, assert_instr(pextrq, imm8 = 1))] #[rustc_args_required_const(1)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 { let imm8 = (imm8 & 1) as u32; simd_extract(a.as_i64x2(), imm8) @@ -20,10 +23,13 @@ pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 { /// Return a copy of `a` with the 64-bit integer from `i` inserted at a /// location specified by `imm8`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_epi64) #[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] #[rustc_args_required_const(2)] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i { mem::transmute(simd_insert(a.as_i64x2(), (imm8 & 1) as u32, i)) } diff --git a/coresimd/x86_64/sse42.rs b/coresimd/x86_64/sse42.rs index 1cbd04d6a8..bf42fd2c59 100644 --- a/coresimd/x86_64/sse42.rs +++ b/coresimd/x86_64/sse42.rs @@ -11,9 +11,12 @@ extern "C" { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 64-bit integer `v`. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_crc32_u64) #[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 { crc32_64_64(crc, v) } diff --git a/coresimd/x86_64/xsave.rs b/coresimd/x86_64/xsave.rs index 0faaef1864..8fc721ada8 100644 --- a/coresimd/x86_64/xsave.rs +++ b/coresimd/x86_64/xsave.rs @@ -29,9 +29,12 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsave64) #[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { xsave64( mem_addr, @@ -46,9 +49,12 @@ pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary.
+/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstor64) #[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { xrstor64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); } @@ -60,9 +66,12 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. The performance of this instruction will /// be equal to or better than using the `XSAVE64` instruction. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaveopt64) #[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { xsaveopt64( mem_addr, @@ -77,9 +86,12 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsavec64) #[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { xsavec64( mem_addr, @@ -95,9 +107,12 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xsaves64) #[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { xsaves64( mem_addr, @@ -115,9 +130,12 @@ pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_xrstors64) #[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors64))] +#[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) { xrstors64(mem_addr, (rs_mask >> 32) as u32, rs_mask as u32); } diff --git a/crates/stdsimd/src/lib.rs b/crates/stdsimd/src/lib.rs index c77b7afd9f..e797f66d6f 100644 --- a/crates/stdsimd/src/lib.rs +++ b/crates/stdsimd/src/lib.rs @@ -8,7 +8,7 @@ //! 
[stdsimd]: https://rust-lang-nursery.github.io/stdsimd/x86_64/stdsimd/ #![feature(const_fn, integer_atomics, staged_api, stdsimd)] -#![feature(cfg_target_feature, doc_cfg)] +#![feature(cfg_target_feature, doc_cfg, allow_internal_unstable)] #![cfg_attr(feature = "cargo-clippy", allow(shadow_reuse))] #![cfg_attr(target_os = "linux", feature(linkage))] #![no_std] diff --git a/stdsimd/arch/detect/arch/x86.rs b/stdsimd/arch/detect/arch/x86.rs index eb43021375..ed08bea3c6 100644 --- a/stdsimd/arch/detect/arch/x86.rs +++ b/stdsimd/arch/detect/arch/x86.rs @@ -16,8 +16,70 @@ //! in a global `AtomicUsize` variable. The query is performed by just checking //! whether the feature bit in this global variable is set or cleared. +/// A macro to test at *runtime* whether a CPU feature is available on +/// x86/x86-64 platforms. +/// +/// This macro is provided in the standard library and will detect at runtime +/// whether the specified CPU feature is detected. This does *not* resolve at +/// compile time unless the specified feature is already enabled for the entire +/// crate. Runtime detection currently relies mostly on the `cpuid` instruction. +/// +/// This macro only takes one argument which is a string literal of the feature +/// being tested for. The feature names supported are the lowercase versions of +/// the ones defined by Intel in [their documentation][docs]. +/// +/// ## Supported arguments +/// +/// This macro supports the same names that `#[target_feature]` supports. Unlike +/// `#[target_feature]`, however, this macro does not support names separated +/// with a comma. Instead testing for multiple features must be done through +/// separate macro invocations for now. +/// +/// Supported arguments are: +/// +/// * `"aes"` +/// * `"pclmulqdq"` +/// * `"rdrand"` +/// * `"rdseed"` +/// * `"tsc"` +/// * `"mmx"` +/// * `"sse"` +/// * `"sse2"` +/// * `"sse3"` +/// * `"ssse3"` +/// * `"sse4.1"` +/// * `"sse4.2"` +/// * `"sse4a"` +/// * `"sha"` +/// * `"avx"` +/// * `"avx2"` +/// * `"avx512f"` +/// * `"avx512cd"` +/// * `"avx512er"` +/// * `"avx512pf"` +/// * `"avx512bw"` +/// * `"avx512dq"` +/// * `"avx512vl"` +/// * `"avx512ifma"` +/// * `"avx512vbmi"` +/// * `"avx512vpopcntdq"` +/// * `"fma"` +/// * `"bmi1"` +/// * `"bmi2"` +/// * `"abm"` +/// * `"lzcnt"` +/// * `"tbm"` +/// * `"popcnt"` +/// * `"fxsr"` +/// * `"xsave"` +/// * `"xsaveopt"` +/// * `"xsaves"` +/// * `"xsavec"` +/// +/// [docs]: https://software.intel.com/sites/landingpage/IntrinsicsGuide #[macro_export] -#[unstable(feature = "stdsimd", issue = "0")] +#[stable(feature = "simd_x86", since = "1.27.0")] +#[allow_internal_unstable] macro_rules! 
is_x86_feature_detected { ("aes") => { cfg!(target_feature = "aes") || $crate::arch::detect::check_for( diff --git a/stdsimd/mod.rs b/stdsimd/mod.rs index bec4884054..30b5beced1 100644 --- a/stdsimd/mod.rs +++ b/stdsimd/mod.rs @@ -343,30 +343,38 @@ /// } /// } /// ``` -#[unstable(feature = "stdsimd", issue = "0")] +#[stable(feature = "simd_arch", since = "1.27.0")] pub mod arch { #[cfg(all(not(dox), target_arch = "x86"))] + #[stable(feature = "simd_x86", since = "1.27.0")] pub use coresimd::arch::x86; #[cfg(all(not(dox), target_arch = "x86_64"))] + #[stable(feature = "simd_x86", since = "1.27.0")] pub use coresimd::arch::x86_64; #[cfg(all(not(dox), target_arch = "arm"))] + #[unstable(feature = "stdsimd", issue = "0")] pub use coresimd::arch::arm; #[cfg(all(not(dox), target_arch = "aarch64"))] + #[unstable(feature = "stdsimd", issue = "0")] pub use coresimd::arch::aarch64; #[cfg(target_arch = "wasm32")] + #[unstable(feature = "stdsimd", issue = "0")] pub use coresimd::arch::wasm32; #[cfg(all(not(dox), target_arch = "mips"))] + #[unstable(feature = "stdsimd", issue = "0")] pub use coresimd::arch::mips; #[cfg(all(not(dox), target_arch = "mips64"))] + #[unstable(feature = "stdsimd", issue = "0")] pub use coresimd::arch::mips64; #[doc(hidden)] // unstable implementation detail + #[unstable(feature = "stdsimd", issue = "0")] pub mod detect; /// Platform-specific intrinsics for the `x86` platform. @@ -378,6 +386,7 @@ pub mod arch { /// [libcore]: ../../../core/arch/x86/index.html #[cfg(dox)] #[doc(cfg(target_arch = "x86"))] + #[stable(feature = "simd_x86", since = "1.27.0")] pub mod x86 {} /// Platform-specific intrinsics for the `x86_64` platform. @@ -389,6 +398,7 @@ pub mod arch { /// [libcore]: ../../../core/arch/x86_64/index.html #[cfg(dox)] #[doc(cfg(target_arch = "x86_64"))] + #[stable(feature = "simd_x86", since = "1.27.0")] pub mod x86_64 {} /// Platform-specific intrinsics for the `arm` platform. @@ -400,6 +410,7 @@ pub mod arch { /// [libcore]: ../../../core/arch/arm/index.html #[cfg(dox)] #[doc(cfg(target_arch = "arm"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod arm {} /// Platform-specific intrinsics for the `aarch64` platform. @@ -411,6 +422,7 @@ pub mod arch { /// [libcore]: ../../../core/arch/aarch64/index.html #[cfg(dox)] #[doc(cfg(target_arch = "aarch64"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod aarch64 {} /// Platform-specific intrinsics for the `mips` platform. @@ -422,6 +434,7 @@ pub mod arch { /// [libcore]: ../../../core/arch/mips/index.html #[cfg(dox)] #[doc(cfg(target_arch = "mips"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod mips {} /// Platform-specific intrinsics for the `mips64` platform. @@ -433,6 +446,7 @@ pub mod arch { /// [libcore]: ../../../core/arch/mips64/index.html #[cfg(dox)] #[doc(cfg(target_arch = "mips64"))] + #[unstable(feature = "stdsimd", issue = "0")] pub mod mips64 {} }
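
The stabilizations above pair naturally with the `is_x86_feature_detected!` macro stabilized in this patch. As a minimal sketch (assuming the intrinsics are reached through `std::arch::x86_64` once this lands; the helper name `crc32c_u64` and the fallback arm are illustrative only), stable code can dispatch on the detected feature at runtime:

    #[cfg(target_arch = "x86_64")]
    fn crc32c_u64(crc: u64, value: u64) -> u64 {
        if is_x86_feature_detected!("sse4.2") {
            // Sound only because `sse4.2` was just detected at runtime.
            unsafe { std::arch::x86_64::_mm_crc32_u64(crc, value) }
        } else {
            // A software CRC32-C implementation would serve as the fallback.
            unimplemented!("software fallback")
        }
    }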
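
The BMI1 wrappers stabilized here are thin shims over the scalar expressions visible in the diff, so their behavior can be shown directly. A minimal sketch, again assuming the `std::arch::x86_64` paths; the function carries `#[target_feature(enable = "bmi1")]` and should only be called after `is_x86_feature_detected!("bmi1")` returns true:

    #[cfg(target_arch = "x86_64")]
    #[target_feature(enable = "bmi1")]
    unsafe fn bmi1_demo() {
        use std::arch::x86_64::{_blsr_u64, _tzcnt_u64};
        let x: u64 = 0b1011_0000;
        // `_blsr_u64` clears the lowest set bit: x & (x - 1).
        assert_eq!(_blsr_u64(x), 0b1010_0000);
        // `_tzcnt_u64` counts trailing zero bits (and returns 64 for a zero input).
        assert_eq!(_tzcnt_u64(x), 4);
    }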
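
Similarly, the newly stable `_xgetbv` can read `XCR0` via `_XCR_XFEATURE_ENABLED_MASK`; a common use is confirming that the OS saves SSE and AVX register state before trusting an `avx` detection result. A sketch under the same path assumptions (the helper name is hypothetical; the bit positions follow the Intel SDM, where bit 1 covers SSE/XMM state and bit 2 covers AVX/YMM state):

    #[cfg(target_arch = "x86_64")]
    fn os_saves_avx_state() -> bool {
        use std::arch::x86_64::{_xgetbv, _XCR_XFEATURE_ENABLED_MASK};
        if !is_x86_feature_detected!("xsave") {
            return false;
        }
        // Sound only because `xsave` support was detected above.
        let xcr0 = unsafe { _xgetbv(_XCR_XFEATURE_ENABLED_MASK) };
        (xcr0 & 0b110) == 0b110
    }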