From 79c86004c755cc3b87b8f915326ccb44145b205e Mon Sep 17 00:00:00 2001 From: Taiki Endo Date: Tue, 19 Nov 2024 00:02:44 +0900 Subject: [PATCH] More cleanups for AVR/M68k/MSP430/PowerPC/RISC-V/s390x/SPARC --- .github/.cspell/project-dictionary.txt | 11 +- .github/workflows/ci.yml | 6 + README.md | 2 +- build.rs | 61 +-- src/arch/avr.rs | 175 +++++--- src/arch/m68k.rs | 16 +- src/arch/msp430.rs | 37 +- src/arch/powerpc.rs | 534 ++++++++++++++++--------- src/arch/riscv.rs | 22 +- src/arch/s390x.rs | 89 +++-- src/arch/sparc.rs | 152 +++++-- src/lib.rs | 2 +- src/utils.rs | 47 ++- 13 files changed, 770 insertions(+), 384 deletions(-) diff --git a/.github/.cspell/project-dictionary.txt b/.github/.cspell/project-dictionary.txt index 04fcdcc7..ee485a5c 100644 --- a/.github/.cspell/project-dictionary.txt +++ b/.github/.cspell/project-dictionary.txt @@ -5,6 +5,7 @@ andc andn aqrl armasm +balign beqz Bicc bnez @@ -12,9 +13,9 @@ casp cbnz ccmp cdsg +CDSY cinc clrex -cmpd cmpw cmpxchg cset @@ -26,14 +27,18 @@ fistp gaisler getex GRLIB +hwsync IMAFD inequal +instrs ishld isync kuser +LAAL lclang ldapr ldar +ldarx ldaxp ldclrp ldiapp @@ -41,6 +46,7 @@ ldrd ldrex ldrexd ldsetp +ldstub ldxp leoncasa lgcc @@ -75,6 +81,7 @@ opensbi orrs partword pshufd +pstq putchar qbsp quadword @@ -94,6 +101,7 @@ sltui sreg srlv stbar +stdcx stilp stlxp stpq @@ -116,6 +124,7 @@ uxth virt wokwi xchg +xmegau xmmword xorps zaamo diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3d154226..603285b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -354,6 +354,12 @@ jobs: RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-cpu=pwr8 RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-cpu=pwr8 if: startsWith(matrix.target, 'powerpc64-') + # powerpc64 pwr10 + - run: tools/test.sh -vv --tests ${TARGET:-} ${BUILD_STD:-} ${RELEASE:-} + env: + RUSTDOCFLAGS: ${{ env.RUSTDOCFLAGS }} -C target-cpu=pwr10 + RUSTFLAGS: ${{ env.RUSTFLAGS }} -C target-cpu=pwr10 + if: 
startsWith(matrix.target, 'powerpc64') # riscv +zabha - run: tools/test.sh -vv --tests ${TARGET:-} ${BUILD_STD:-} ${RELEASE:-} env: diff --git a/README.md b/README.md index 02918ab9..9fd25a2b 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Currently, x86, x86_64, Arm, AArch64, RISC-V, LoongArch64, Arm64EC, s390x, MIPS, | mips64 / mips64r6 \[8] | isize,usize,i8,u8,i16,u16,i32,u32,i64,u64 | ✓ | ✓ | | powerpc \[8] | isize,usize,i8,u8,i16,u16,i32,u32 | ✓ | ✓ | | powerpc64 \[8] | isize,usize,i8,u8,i16,u16,i32,u32,i64,u64 | ✓ | ✓ | -| powerpc64 (pwr8+) \[4] \[8] | i128,u128 | ✓ | ✓ | +| powerpc64 (+quadword-atomics) \[4] \[8]| i128,u128 | ✓ | ✓ | | msp430 \[8] (experimental) | isize,usize,i8,u8,i16,u16 | ✓ | ✓ | | avr \[8] (experimental) | isize,usize,i8,u8,i16,u16 | ✓ | ✓ | | sparc \[5] \[8] (experimental) | isize,usize,i8,u8,i16,u16,i32,u32 | ✓ | ✓ | diff --git a/build.rs b/build.rs index c89716e8..35f256d5 100644 --- a/build.rs +++ b/build.rs @@ -30,7 +30,7 @@ fn main() { if version.minor >= 80 { println!( - r#"cargo:rustc-check-cfg=cfg(target_feature,values("x87","v8m","fast-serialization","isa-68020"))"# + r#"cargo:rustc-check-cfg=cfg(target_feature,values("x87","v8m","prefix-instrs","fast-serialization","isa-68020"))"# ); // Custom cfgs set by build script. Not public API. 
@@ -41,7 +41,7 @@ fn main() { // TODO: handle multi-line target_feature_fallback // grep -F 'target_feature_fallback("' build.rs | grep -Ev '^ *//' | sed -E 's/^.*target_feature_fallback\(//; s/",.*$/"/' | LC_ALL=C sort -u | tr '\n' ',' | sed -E 's/,$/\n/' println!( - r#"cargo:rustc-check-cfg=cfg(atomic_maybe_uninit_target_feature,values("a","cmpxchg16b","fast-serialization","isa-68020","leoncasa","lse","lse128","lse2","mclass","partword-atomics","quadword-atomics","rcpc","rcpc3","v5te","v6","v7","v8","v8m","v9","x87","zaamo","zabha"))"# + r#"cargo:rustc-check-cfg=cfg(atomic_maybe_uninit_target_feature,values("a","cmpxchg16b","fast-serialization","isa-68020","leoncasa","lse","lse128","lse2","mclass","partword-atomics","prefix-instrs","quadword-atomics","rcpc","rcpc3","v5te","v6","v7","v8","v8m","v9","x87","zaamo","zabha"))"# ); } @@ -317,35 +317,43 @@ fn main() { target_feature_fallback("a", a); } } - "powerpc64" => { - // target_feature "quadword-atomics" is unstable and available on rustc side since nightly-2024-09-28: /~https://github.com/rust-lang/rust/pull/130873 - if !version.probe(83, 2024, 9, 27) || needs_target_feature_fallback(&version, None) { - let target_endian = - env::var("CARGO_CFG_TARGET_ENDIAN").expect("CARGO_CFG_TARGET_ENDIAN not set"); - // powerpc64le is pwr8+ by default /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 - // See also /~https://github.com/rust-lang/rust/issues/59932 - let mut has_pwr8_features = target_endian == "little"; - // /~https://github.com/llvm/llvm-project/commit/549e118e93c666914a1045fde38a2cac33e1e445 - if let Some(cpu) = target_cpu().as_deref() { - if let Some(mut cpu_version) = cpu.strip_prefix("pwr") { - cpu_version = cpu_version.strip_suffix('x').unwrap_or(cpu_version); // for pwr5x and pwr6x - if let Ok(cpu_version) = cpu_version.parse::() { - has_pwr8_features = cpu_version >= 8; - } - } else { - // 
/~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 - // /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L483 - // On the minimum external LLVM version of the oldest rustc version which we can use asm_experimental_arch - // on this target (see CI config for more), "future" is based on pwr10 features. - // /~https://github.com/llvm/llvm-project/blob/llvmorg-12.0.0/llvm/lib/Target/PowerPC/PPC.td#L370 - has_pwr8_features = cpu == "ppc64le" || cpu == "future"; + "powerpc" | "powerpc64" => { + // powerpc64le is pwr8 by default /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 + // See also /~https://github.com/rust-lang/rust/issues/59932 + let mut pwr8_features = target_arch == "powerpc64" + && env::var("CARGO_CFG_TARGET_ENDIAN").expect("CARGO_CFG_TARGET_ENDIAN not set") + == "little"; + let mut pwr10_features = false; + if let Some(cpu) = &target_cpu() { + if let Some(mut cpu_version) = cpu.strip_prefix("pwr") { + cpu_version = cpu_version.strip_suffix('x').unwrap_or(cpu_version); // for pwr5x and pwr6x + if let Ok(cpu_version) = cpu_version.parse::() { + pwr8_features = cpu_version >= 8; + pwr10_features = cpu_version >= 10; } + } else { + // /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L702 + // /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L483 + // On the minimum external LLVM version of the oldest rustc version which we can use asm_experimental_arch + // on this target (see CI config for more), "future" is based on pwr10 features. 
+ // /~https://github.com/llvm/llvm-project/blob/llvmorg-12.0.0/llvm/lib/Target/PowerPC/PPC.td#L370 + let future = cpu == "future"; + pwr8_features = future || cpu == "ppc64le"; + pwr10_features = future; } + } + // target_feature "quadword-atomics" is unstable and available on rustc side since nightly-2024-09-28: /~https://github.com/rust-lang/rust/pull/130873 + if !version.probe(83, 2024, 9, 27) || needs_target_feature_fallback(&version, None) { + // power8 features: /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L409 // l[bh]arx and st[bh]cx. - target_feature_fallback("partword-atomics", has_pwr8_features); + target_feature_fallback("partword-atomics", pwr8_features); // lqarx and stqcx. - target_feature_fallback("quadword-atomics", has_pwr8_features); + target_feature_fallback("quadword-atomics", pwr8_features); } + // As of rustc 1.80, target_feature "prefix-instrs" is not available on rustc side: + // /~https://github.com/rust-lang/rust/blob/1.80.0/compiler/rustc_target/src/target_features.rs + // power10 features: /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/PowerPC/PPC.td#L460 + target_feature_fallback("prefix-instrs", pwr10_features); } "s390x" => { // /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZFeatures.td @@ -362,6 +370,7 @@ fn main() { } // As of rustc 1.80, target_feature "fast-serialization" is not available on rustc side: // /~https://github.com/rust-lang/rust/blob/1.80.0/compiler/rustc_target/src/target_features.rs + // arch9 features: /~https://github.com/llvm/llvm-project/blob/llvmorg-19.1.0/llvm/lib/Target/SystemZ/SystemZFeatures.td#L103 // bcr 14,0 target_feature_fallback("fast-serialization", arch9_features); } diff --git a/src/arch/avr.rs b/src/arch/avr.rs index 1df51634..e8a348d3 100644 --- a/src/arch/avr.rs +++ b/src/arch/avr.rs @@ -3,12 +3,28 @@ /* AVR +This architecture is always single-core and the following operations 
are atomic: + +- Operations that complete within a single instruction. + This is because the currently executing instruction must be completed before entering the + interrupt service routine. + (Refs: https://developerhelp.microchip.com/xwiki/bin/view/products/mcu-mpu/8-bit-avr/structure/interrupts/) + The following two kinds of instructions are related to memory access: + - 8-bit load/store + - XCH, LAC, LAS, LAT: 8-bit swap,fetch-and-{clear,or,xor} (xmegau family) +- Operations performed in a situation where all interrupts are disabled. + However, pure operations that are not affected by compiler fences (note: the correct interrupt + disabling and restoring implementation must imply compiler fences, e.g., asm without nomem/readonly) + may be moved out of the critical section by compiler optimizations. + Refs: -- AVR Instruction Set Manual https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf -- portable-atomic /~https://github.com/taiki-e/portable-atomic +- AVR® Instruction Set Manual, Rev. DS40002198B + https://ww1.microchip.com/downloads/en/DeviceDoc/AVR-InstructionSet-Manual-DS40002198.pdf +- portable-atomic + /~https://github.com/taiki-e/portable-atomic Generated asm: -- avr https://godbolt.org/z/Yn6s3hcGf +- avr https://godbolt.org/z/5TYW8x6T9 */ #[path = "cfgs/avr.rs"] @@ -27,23 +43,27 @@ fn disable() -> u8 { // Do not use `nomem` and `readonly` because prevent subsequent memory accesses from being reordered before interrupts are disabled. // Do not use `preserves_flags` because CLI modifies the I bit of the status register (SREG). 
asm!( - "in {0}, 0x3F", - "cli", - out(reg) sreg, + "in {sreg}, 0x3F", // sreg = SREG + "cli", // SREG.I = 0 + sreg = out(reg) sreg, options(nostack), ); } sreg } #[inline(always)] -unsafe fn restore(sreg: u8) { +unsafe fn restore(prev_sreg: u8) { // SAFETY: the caller must guarantee that the state was retrieved by the previous `disable`, unsafe { // This clobbers the entire status register. See msp430.rs to safety on this. // // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. // Do not use `preserves_flags` because OUT modifies the status register (SREG). - asm!("out 0x3F, {0}", in(reg) sreg, options(nostack)); + asm!( + "out 0x3F, {prev_sreg}", // SREG = prev_sreg + prev_sreg = in(reg) prev_sreg, + options(nostack), + ); } } @@ -53,7 +73,12 @@ fn xor8(a: MaybeUninit, b: MaybeUninit) -> u8 { // SAFETY: calling eor is safe. unsafe { // Do not use `preserves_flags` because EOR modifies Z, N, V, and S bits in the status register (SREG). - asm!("eor {a}, {b}", a = inout(reg) a => out, b = in(reg) b, options(pure, nomem, nostack)); + asm!( + "eor {a}, {b}", // a ^= b + a = inout(reg) a => out, + b = in(reg) b, + options(pure, nomem, nostack), + ); } out } @@ -71,38 +96,8 @@ fn cmp16(a: MaybeUninit, b: MaybeUninit) -> bool { xor8(a1, b1) | xor8(a2, b2) == 0 } -macro_rules! atomic { - ($ty:ident, $cmp:ident, $cmp_ty:ident) => { - impl AtomicLoad for $ty { - #[inline] - unsafe fn atomic_load( - src: *const MaybeUninit, - _order: Ordering, - ) -> MaybeUninit { - let s = disable(); - // SAFETY: the caller must guarantee that pointer is valid and properly aligned. - // On single-core systems, disabling interrupts is enough to prevent data race. - let out = unsafe { src.read() }; - // SAFETY: the state was retrieved by the previous `disable`. 
- unsafe { restore(s) } - out - } - } - impl AtomicStore for $ty { - #[inline] - unsafe fn atomic_store( - dst: *mut MaybeUninit, - val: MaybeUninit, - _order: Ordering, - ) { - let s = disable(); - // SAFETY: the caller must guarantee that pointer is valid and properly aligned. - // On single-core systems, disabling interrupts is enough to prevent data race. - unsafe { dst.write(val) } - // SAFETY: the state was retrieved by the previous `disable`. - unsafe { restore(s) } - } - } +macro_rules! atomic_swap { + ($ty:ident) => { impl AtomicSwap for $ty { #[inline] unsafe fn atomic_swap( @@ -121,6 +116,10 @@ macro_rules! atomic { out } } + }; +} +macro_rules! atomic_cas { + ($ty:ident, $cmp:ident, $cmp_ty:ident) => { impl AtomicCompareExchange for $ty { #[inline] unsafe fn atomic_compare_exchange( @@ -155,9 +154,91 @@ macro_rules! atomic { }; } -atomic!(i8, cmp8, u8); -atomic!(u8, cmp8, u8); -atomic!(i16, cmp16, u16); -atomic!(u16, cmp16, u16); -atomic!(isize, cmp16, u16); -atomic!(usize, cmp16, u16); +macro_rules! atomic8 { + ($ty:ident) => { + impl AtomicLoad for $ty { + #[inline] + unsafe fn atomic_load( + src: *const MaybeUninit, + _order: Ordering, + ) -> MaybeUninit { + let out: MaybeUninit; + + // SAFETY: the caller must uphold the safety contract. + unsafe { + asm!( + "ld {out}, Z", // atomic { out = *src } + in("Z") src, + out = out(reg) out, + options(nostack, preserves_flags), + ); + } + out + } + } + impl AtomicStore for $ty { + #[inline] + unsafe fn atomic_store( + dst: *mut MaybeUninit, + val: MaybeUninit, + _order: Ordering, + ) { + // SAFETY: the caller must uphold the safety contract. + unsafe { + asm!( + "st Z, {val}", // atomic { *dst = val } + in("Z") dst, + val = in(reg) val, + options(nostack, preserves_flags), + ); + } + } + } + atomic_swap!($ty); + atomic_cas!($ty, cmp8, u8); + }; +} + +macro_rules! 
atomic16 { + ($ty:ident) => { + impl AtomicLoad for $ty { + #[inline] + unsafe fn atomic_load( + src: *const MaybeUninit, + _order: Ordering, + ) -> MaybeUninit { + let s = disable(); + // SAFETY: the caller must guarantee that pointer is valid and properly aligned. + // On single-core systems, disabling interrupts is enough to prevent data race. + let out = unsafe { src.read() }; + // SAFETY: the state was retrieved by the previous `disable`. + unsafe { restore(s) } + out + } + } + impl AtomicStore for $ty { + #[inline] + unsafe fn atomic_store( + dst: *mut MaybeUninit, + val: MaybeUninit, + _order: Ordering, + ) { + let s = disable(); + // SAFETY: the caller must guarantee that pointer is valid and properly aligned. + // On single-core systems, disabling interrupts is enough to prevent data race. + unsafe { dst.write(val) } + // SAFETY: the state was retrieved by the previous `disable`. + unsafe { restore(s) } + } + } + atomic_swap!($ty); + atomic_cas!($ty, cmp16, u16); + }; +} + +atomic8!(i8); +atomic8!(u8); +atomic16!(i16); +atomic16!(u16); +atomic16!(isize); +atomic16!(usize); diff --git a/src/arch/m68k.rs b/src/arch/m68k.rs index ea77cf1d..7c32c493 100644 --- a/src/arch/m68k.rs +++ b/src/arch/m68k.rs @@ -3,16 +3,26 @@ /* M68k +This architecture provides the following atomic instructions: + +- Load/Store Instructions + - {8,16,32}-bit +- Multiprocessor Instructions + - TAS: 8-bit TAS (M68000 or later) + - CAS: {8,16,32}-bit CAS (M68020 or later) + - CAS2: {16,32}-bit double CAS (M68020 or later) + (Refs: Section 3.1.11 "Multiprocessor Instructions" of M68000 FAMILY Programmer's Reference Manual) + +Note that CAS2 is not yet supported in LLVM. + Refs: - M68000 FAMILY Programmer's Reference Manual https://www.nxp.com/docs/en/reference-manual/M68000PRM.pdf - M68060 User’s Manual https://www.nxp.com/docs/en/data-sheet/MC68060UM.pdf -Note that cas2 (double CAS) is not yet supported in LLVM. 
- Generated asm: -- m68k M68020 https://godbolt.org/z/7rhzK9d8n +- m68k (M68020) https://godbolt.org/z/87Wxq1Wdj */ #[path = "cfgs/m68k.rs"] diff --git a/src/arch/msp430.rs b/src/arch/msp430.rs index ade2377c..3a7752e8 100644 --- a/src/arch/msp430.rs +++ b/src/arch/msp430.rs @@ -3,12 +3,25 @@ /* MSP430 +This architecture is always single-core and the following operations are atomic: + +- Operations that complete within a single instruction. + This is because the currently executing instruction must be completed before entering the + interrupt service routine. + (Refs: Section 1.3.4.1 "Interrupt Acceptance" of MSP430x5xx and MSP430x6xx Family User's Guide, Rev. Q: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=59) +- Operations performed in a situation where all interrupts are disabled. + However, pure operations that are not affected by compiler fences (note: the correct interrupt + disabling and restoring implementation must imply compiler fences, e.g., asm without nomem/readonly) + may be moved out of the critical section by compiler optimizations. + Refs: -- MSP430x5xx and MSP430x6xx Family User's Guide https://www.ti.com/lit/ug/slau208q/slau208q.pdf -- portable-atomic /~https://github.com/taiki-e/portable-atomic +- MSP430x5xx and MSP430x6xx Family User's Guide, Rev. Q + https://www.ti.com/lit/ug/slau208q/slau208q.pdf +- portable-atomic + /~https://github.com/taiki-e/portable-atomic Generated asm: -- msp430 https://godbolt.org/z/zzncaW6Y5 +- msp430 https://godbolt.org/z/W8PYT7xx4 */ #[path = "cfgs/msp430.rs"] @@ -26,17 +39,18 @@ fn disable() -> u16 { unsafe { // Do not use `nomem` and `readonly` because prevent subsequent memory accesses from being reordered before interrupts are disabled. // Do not use `preserves_flags` because DINT modifies the GIE (global interrupt enable) bit of the status register. 
+ // See "NOTE: Enable and Disable Interrupt" of User's Guide for NOP: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=60 asm!( - "mov r2, {0}", - "dint {{ nop", - out(reg) sr, + "mov r2, {sr}", // sr = SR + "dint {{ nop", // SR.GIE = 0 + sr = out(reg) sr, options(nostack), ); } sr } #[inline(always)] -unsafe fn restore(sr: u16) { +unsafe fn restore(prev_sr: u16) { // SAFETY: the caller must guarantee that the state was retrieved by the previous `disable`, unsafe { // This clobbers the entire status register, but we never explicitly modify @@ -48,7 +62,12 @@ unsafe fn restore(sr: u16) { // // Do not use `nomem` and `readonly` because prevent preceding memory accesses from being reordered after interrupts are enabled. // Do not use `preserves_flags` because MOV modifies the status register. - asm!("nop {{ mov {0}, r2 {{ nop", in(reg) sr, options(nostack)); + // See "NOTE: Enable and Disable Interrupt" of User's Guide for NOP: https://www.ti.com/lit/ug/slau208q/slau208q.pdf#page=60 + asm!( + "nop {{ mov {prev_sr}, r2 {{ nop", // SR = prev_sr + prev_sr = in(reg) prev_sr, + options(nostack), + ); } } @@ -127,7 +146,7 @@ macro_rules! atomic { let r = unsafe { let r: $ty; asm!( - concat!("xor", $suffix, " {b}, {a}"), + concat!("xor", $suffix, " {b}, {a}"), // a ^= b a = inout(reg) old => r, b = in(reg) out, // Do not use `preserves_flags` because XOR modifies the V, N, Z, and C bits of the status register. diff --git a/src/arch/powerpc.rs b/src/arch/powerpc.rs index f9cf21e0..1e537e3c 100644 --- a/src/arch/powerpc.rs +++ b/src/arch/powerpc.rs @@ -5,32 +5,55 @@ PowerPC and PowerPC64 This architecture provides the following atomic instructions: -- Load/Store Instructions (relaxed load/store) - - All aligned {8,16,32}-bit and 64-bit (for PowerPC64) single load/store instructions - other than Move Assist instruction are atomic. - - ISA 2.07 or later: 128-bit for PowerPC64 (lq, stq, lqarx, stqcx.) 
- (Section 1.4 "Single-Copy Atomicity" of Power ISA 3.1C Book II) +- Load/Store Instructions + - All aligned {8,16,32}-bit and 64-bit (PPC64-only) single load/store instructions other than Move Assist instruction + - lq/stq: 128-bit load/store (PPC64-only, ISA 2.07 or later, included in the Linux Compliancy subset and AIX Compliancy subset) + (lq and stq are available since ISA 2.03, but were privileged instructions and big-endian mode only and not guaranteed to be atomic, in pre-2.07 ISA) + - plq/pstq: 128-bit load/store (PPC64-only, ISA 3.1 or later, included in the Linux Compliancy subset and AIX Compliancy subset) + (Refs: Section 1.4 "Single-Copy Atomicity" of Power ISA 3.1C Book II) - Load And Reserve and Store Conditional Instructions (aka LL/SC) - - PowerPC Architecture prior to v2.00, or later: 32-bit, 64-bit (for PowerPC64) - - ISA 2.06 or later: {8,16}-bit for PowerPC64 - - ISA 2.07 or later: 128-bit for PowerPC64 - (Section 4.6.2 "Load And Reserve and Store Conditional Instructions" of Power ISA 3.1C Book II) -- Atomic Memory Operation (AMO) Instructions (RMW) - - ISA 3.0 or later: {32,64}-bit swap,fetch_{add,and,or,xor,max,min},add,max,min for PowerPC64 - (Section 4.5 "Atomic Memory Operations" of Power ISA 3.1C Book II) + - l{b,h}arx/st{b,h}cx.: {8,16}-bit LL/SC (ISA 2.06 or later, included in all compliancy subsets) + - lwarx/stwcx.: 32-bit LL/SC (PPC or later, included in all compliancy subsets) + - ldarx/stdcx.: 64-bit LL/SC (PPC64-only, PPC or later, included in all compliancy subsets) + - lqarx/stqcx.: 128-bit LL/SC (PPC64-only, ISA 2.07 or later, included in the Linux Compliancy subset and AIX Compliancy subset) + (Refs: Section 4.6.2 "Load And Reserve and Store Conditional Instructions" of Power ISA 3.1C Book II) +- Atomic Memory Operation (AMO) Instructions + - l{w,d}at: {32,64}-bit swap,fetch-and-{add,and,or,xor,max,min} (PPC64-only, ISA 3.0 or later, included in the AIX Compliancy subset) + (Others: Compare and Swap Not Equal, Fetch and 
Increment Bounded, Fetch and Increment Equal, Fetch and Decrement Bounded) + - st{w,d}at: {32,64}-bit add,and,or,xor,max,min (PPC64-only, ISA 3.0 or later, included in the AIX Compliancy subset) + (Others: Store Twin) + (Refs: Section 4.5 "Atomic Memory Operations" of Power ISA 3.1C Book II) + +None of the above instructions imply a memory barrier. +- A sync (sync 0, sync 0,0, hwsync) instruction can be used as both an “import barrier” and an “export barrier”. +- A lwsync (sync 1, sync 1,0) instruction can be used as both an “import barrier” and an “export barrier”, + if the specified storage location is in storage that is neither Write Through Required nor Caching Inhibited. +- An “import barrier” can be constructed by a branch that depends on the loaded value (even a branch + that depends on a comparison of the same register is okay), followed by an isync instruction. +(sync, lwsync, and isync are available since POWER1, included in all compliancy subsets) +(Refs: Section 1.7.1 "Storage Access Ordering" and Section B.2 "Lock Acquisition and Release, and Related Techniques" of Power ISA 3.1C Book II) + +sync corresponds to SeqCst semantics, lwsync corresponds to AcqRel semantics, and +isync with appropriate sequence corresponds to Acquire semantics. 
Refs: -- Power ISA https://openpowerfoundation.org/specifications/isa -- AIX Assembler language reference https://www.ibm.com/docs/en/aix/7.3?topic=aix-assembler-language-reference -- Example POWER Implementation for C/C++ Memory Model http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2010.02.19a.html -- portable-atomic /~https://github.com/taiki-e/portable-atomic +- Power ISA + https://openpowerfoundation.org/specifications/isa +- AIX Assembler language reference + https://www.ibm.com/docs/en/aix/7.3?topic=aix-assembler-language-reference +- Example POWER Implementation for C/C++ Memory Model + http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2010.02.19a.html +- portable-atomic + /~https://github.com/taiki-e/portable-atomic Generated asm: -- powerpc https://godbolt.org/z/1MTdM6qKj -- powerpc64 https://godbolt.org/z/E7453fxnn -- powerpc64 (pwr8) https://godbolt.org/z/ccW95s1Ex -- powerpc64le https://godbolt.org/z/dv1E9qac6 -- powerpc64le (pwr7) https://godbolt.org/z/d6cnYrMq5 +- powerpc https://godbolt.org/z/6PMzWfhEM +- powerpc (pwr8) https://godbolt.org/z/5KvMh4Phn +- powerpc64 https://godbolt.org/z/exbfnjTW4 +- powerpc64 (pwr8) https://godbolt.org/z/4r3xGo8ef +- powerpc64le (pwr7) https://godbolt.org/z/9zzaKcWbe +- powerpc64le https://godbolt.org/z/3cs6ennKG +- powerpc64le (pwr10) https://godbolt.org/z/GzW367svM */ #[path = "cfgs/powerpc.rs"] @@ -50,41 +73,51 @@ use crate::raw::{AtomicCompareExchange, AtomicLoad, AtomicStore, AtomicSwap}; ))] use crate::utils::{MaybeUninit128, Pair}; -#[cfg(target_arch = "powerpc")] -macro_rules! cmp { - () => { - "cmpw" - }; -} -#[cfg(target_arch = "powerpc64")] -macro_rules! cmp { - () => { - "cmpd" - }; -} - macro_rules! 
atomic_rmw { ($op:ident, $order:ident) => { match $order { Ordering::Relaxed => $op!("", ""), - Ordering::Acquire => $op!("lwsync", ""), + Ordering::Acquire => $op!("isync", ""), Ordering::Release => $op!("", "lwsync"), - Ordering::AcqRel => $op!("lwsync", "lwsync"), - Ordering::SeqCst => $op!("lwsync", "sync"), + Ordering::AcqRel => $op!("isync", "lwsync"), + Ordering::SeqCst => $op!("isync", "sync"), _ => unreachable!(), } }; } +macro_rules! atomic_cas { + ($op:ident, $success:ident, $failure:ident) => { + if $failure == Ordering::Relaxed { + match $success { + Ordering::Relaxed => $op!("", "", ""), + Ordering::Acquire => $op!("", "isync", ""), + Ordering::Release => $op!("", "", "lwsync"), + Ordering::AcqRel => $op!("", "isync", "lwsync"), + Ordering::SeqCst => $op!("", "isync", "sync"), + _ => unreachable!(), + } + } else { + let order = crate::utils::upgrade_success_ordering($success, $failure); + match order { + // Relaxed and Release are covered in $failure == Relaxed branch. + Ordering::Acquire => $op!("isync", "", ""), + Ordering::AcqRel => $op!("isync", "", "lwsync"), + Ordering::SeqCst => $op!("isync", "", "sync"), + _ => unreachable!(), + } + } + }; +} // Extracts and checks the EQ bit of cr0. -#[inline(always)] -fn extract_cr0(r: crate::utils::RegSize) -> bool { - r & 0x20000000 != 0 +#[inline] +fn test_cr0_eq(cr: crate::utils::RegSize) -> bool { + cr & 0x20000000 != 0 } #[rustfmt::skip] macro_rules! atomic_load_store { - ($ty:ident, $l_suffix:tt, $suffix:tt) => { + ($ty:ident, $size:tt, $load_ext:tt) => { impl AtomicLoad for $ty { #[inline] unsafe fn atomic_load( @@ -100,16 +133,14 @@ macro_rules! 
atomic_load_store { ($release:tt) => { asm!( $release, - concat!("l", $l_suffix, " {out}, 0({src})"), // atomic { out = *src } - // Lightweight acquire sync - // Refs: /~https://github.com/boostorg/atomic/blob/boost-1.79.0/include/boost/atomic/detail/core_arch_ops_gcc_ppc.hpp#L47-L62 - concat!(cmp!(), " %cr7, {out}, {out}"), - "bne- %cr7, 2f", - "2:", - "isync", + concat!("l", $size, $load_ext, " {out}, 0({src})"), // atomic { out = *src } + "cmpw {out}, {out}", // if out == out { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne- %cr0, 2f", // if unlikely(cr0.EQ == 0) { jump 'never } + "2:", // 'never: + "isync", // fence (works in combination with a branch that depends on the loaded value) src = in(reg_nonzero) ptr_reg!(src), out = lateout(reg) out, - out("cr7") _, + out("cr0") _, options(nostack, preserves_flags), ) }; @@ -117,7 +148,7 @@ macro_rules! atomic_load_store { match order { Ordering::Relaxed => { asm!( - concat!("l", $l_suffix, " {out}, 0({src})"), // atomic { out = *src } + concat!("l", $size, $load_ext, " {out}, 0({src})"), // atomic { out = *src } src = in(reg_nonzero) ptr_reg!(src), out = lateout(reg) out, options(nostack, preserves_flags), @@ -145,8 +176,8 @@ macro_rules! atomic_load_store { macro_rules! atomic_store { ($release:tt) => { asm!( - $release, // fence - concat!("st", $suffix, " {val}, 0({dst})"), // atomic { *dst = val } + $release, // fence + concat!("st", $size, " {val}, 0({dst})"), // atomic { *dst = val } dst = in(reg_nonzero) ptr_reg!(dst), val = in(reg) val, options(nostack, preserves_flags), @@ -167,8 +198,8 @@ macro_rules! atomic_load_store { #[rustfmt::skip] macro_rules! atomic { - ($ty:ident, $l_suffix:tt, $suffix:tt, $cmp_suffix:tt) => { - atomic_load_store!($ty, $l_suffix, $suffix); + ($ty:ident, $size:tt, $load_ext:tt, $cmp_size:tt) => { + atomic_load_store!($ty, $size, $load_ext); impl AtomicSwap for $ty { #[inline] unsafe fn atomic_swap( @@ -184,12 +215,12 @@ macro_rules! atomic { macro_rules! 
swap { ($acquire:tt, $release:tt) => { asm!( - $release, // fence + $release, // fence "2:", // 'retry: - concat!("l", $suffix, "arx {out}, 0, {dst}"), // atomic { RESERVE = (dst, size_of($ty)); out = *dst } - concat!("st", $suffix, "cx. {val}, 0, {dst}"), // atomic { if RESERVE == (dst, size_of($ty)) { *dst = val; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } - "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } - $acquire, // fence + concat!("l", $size, "arx {out}, 0, {dst}"), // atomic { RESERVE = (dst, size_of($ty)); out = *dst } + concat!("st", $size, "cx. {val}, 0, {dst}"), // atomic { if RESERVE == (dst, size_of($ty)) { *dst = val; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire, // fence dst = in(reg_nonzero) ptr_reg!(dst), val = in(reg) val, out = out(reg) out, @@ -213,25 +244,25 @@ macro_rules! atomic { failure: Ordering, ) -> (MaybeUninit, bool) { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); - let order = crate::utils::upgrade_success_ordering(success, failure); let mut out: MaybeUninit; + let mut r; // SAFETY: the caller must uphold the safety contract. unsafe { - let mut r; macro_rules! cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( - $release, // fence + $release, // fence "2:", // 'retry: - concat!("l", $suffix, "arx {out}, 0, {dst}"), // atomic { RESERVE = (dst, size_of($ty)); out = *dst } - concat!("cmp", $cmp_suffix, " {old}, {out}"), // if old == out { cr0.EQ = 1 } else { cr0.EQ = 0 } - "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } - concat!("st", $suffix, "cx. 
{new}, 0, {dst}"), // atomic { if RESERVE == (dst, size_of($ty)) { *dst = new; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } - "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + concat!("l", $size, "arx {out}, 0, {dst}"), // atomic { RESERVE = (dst, size_of($ty)); out = *dst } + concat!("cmp", $cmp_size, " {old}, {out}"), // if old == out { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + concat!("st", $size, "cx. {new}, 0, {dst}"), // atomic { if RESERVE == (dst, size_of($ty)) { *dst = new; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire_success, // fence "3:", // 'cmp-fail: - "mfcr {r}", // r = zero_extend(cr) - $acquire, // fence + $acquire_always, // fence + "mfcr {r}", // r = zero_extend(cr) dst = in(reg_nonzero) ptr_reg!(dst), old = in(reg) crate::utils::ZeroExtend::zero_extend(old), new = in(reg) new, @@ -242,9 +273,9 @@ macro_rules! atomic { ) }; } - atomic_rmw!(cmpxchg, order); + atomic_cas!(cmpxchg, success, failure); // if compare failed EQ bit is cleared, if store succeeds EQ bit is set. - (out, extract_cr0(r)) + (out, test_cr0_eq(r)) } } #[inline] @@ -256,23 +287,23 @@ macro_rules! atomic { failure: Ordering, ) -> (MaybeUninit, bool) { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); - let order = crate::utils::upgrade_success_ordering(success, failure); let mut out: MaybeUninit; + let mut r; // SAFETY: the caller must uphold the safety contract. unsafe { - let mut r; macro_rules! cmpxchg_weak { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( - $release, // fence - concat!("l", $suffix, "arx {out}, 0, {dst}"), // atomic { RESERVE = (dst, size_of($ty)); out = *dst } - concat!("cmp", $cmp_suffix, " {old}, {out}"), // if old == out { cr0.EQ = 1 } else { cr0.EQ = 0 } - "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } - concat!("st", $suffix, "cx. 
{new}, 0, {dst}"), // atomic { if RESERVE == (dst, size_of($ty)) { *dst = new; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + $release, // fence + concat!("l", $size, "arx {out}, 0, {dst}"), // atomic { RESERVE = (dst, size_of($ty)); out = *dst } + concat!("cmp", $cmp_size, " {old}, {out}"), // if old == out { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + concat!("st", $size, "cx. {new}, 0, {dst}"), // atomic { if RESERVE == (dst, size_of($ty)) { *dst = new; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + $acquire_success, // fence "3:", // 'cmp-fail: - "mfcr {r}", // r = zero_extend(cr) - $acquire, // fence + $acquire_always, // fence + "mfcr {r}", // r = zero_extend(cr) dst = in(reg_nonzero) ptr_reg!(dst), old = in(reg) crate::utils::ZeroExtend::zero_extend(old), new = in(reg) new, @@ -283,26 +314,32 @@ macro_rules! atomic { ) }; } - atomic_rmw!(cmpxchg_weak, order); + atomic_cas!(cmpxchg_weak, success, failure); // if compare or store failed EQ bit is cleared, if store succeeds EQ bit is set. - (out, extract_cr0(r)) + (out, test_cr0_eq(r)) } } } }; } -#[cfg(not(all( - target_arch = "powerpc64", - any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", - ), -)))] #[rustfmt::skip] macro_rules! atomic_sub_word { - ($ty:ident, $l_suffix:tt, $suffix:tt) => { - atomic_load_store!($ty, $l_suffix, $suffix); + ($ty:ident, $size:tt) => { + #[cfg(any( + target_feature = "partword-atomics", + atomic_maybe_uninit_target_feature = "partword-atomics", + ))] + atomic!($ty, $size, "z", "w"); + #[cfg(not(any( + target_feature = "partword-atomics", + atomic_maybe_uninit_target_feature = "partword-atomics", + )))] + atomic_load_store!($ty, $size, "z"); + #[cfg(not(any( + target_feature = "partword-atomics", + atomic_maybe_uninit_target_feature = "partword-atomics", + )))] impl AtomicSwap for $ty { #[inline] unsafe fn atomic_swap( @@ -330,8 +367,8 @@ macro_rules! 
atomic_sub_word { "or {tmp}, {val}, {tmp}", // tmp |= val "stwcx. {tmp}, 0, {dst}", // atomic { if RESERVE == (dst, 4) { *dst = tmp; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } - "srw {out}, {out}, {shift}", // out >>= shift $acquire, // fence + "srw {out}, {out}, {shift}", // out >>= shift dst = in(reg_nonzero) ptr_reg!(dst), val = inout(reg) crate::utils::ZeroExtend::zero_extend(val) => _, out = out(reg) out, @@ -348,6 +385,10 @@ macro_rules! atomic_sub_word { out } } + #[cfg(not(any( + target_feature = "partword-atomics", + atomic_maybe_uninit_target_feature = "partword-atomics", + )))] impl AtomicCompareExchange for $ty { #[inline] unsafe fn atomic_compare_exchange( @@ -358,18 +399,17 @@ macro_rules! atomic_sub_word { failure: Ordering, ) -> (MaybeUninit, bool) { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); - let order = crate::utils::upgrade_success_ordering(success, failure); let (dst, shift, mask) = crate::utils::create_sub_word_mask_values(dst); let mut out: MaybeUninit; + let mut r; // SAFETY: the caller must uphold the safety contract. unsafe { - let mut r; // Implement sub-word atomic operations using word-sized LL/SC loop. // Based on assemblies generated by rustc/LLVM. // See also create_sub_word_mask_values. macro_rules! cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( "slw {old}, {old}, {shift}", // old <<= shift "slw {new}, {new}, {shift}", // new <<= shift @@ -383,10 +423,11 @@ macro_rules! atomic_sub_word { "or {tmp}, {tmp}, {new}", // tmp |= new "stwcx. 
{tmp}, 0, {dst}", // atomic { if RESERVE == (dst, size_of($ty)) { *dst = tmp; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire_success, // fence "3:", // 'cmp-fail: + $acquire_always, // fence "srw {out}, {out}, {shift}", // out >>= shift "mfcr {tmp}", // r = zero_extend(cr) - $acquire, // fence dst = in(reg_nonzero) ptr_reg!(dst), old = inout(reg) crate::utils::ZeroExtend::zero_extend(old) => _, new = inout(reg) crate::utils::ZeroExtend::zero_extend(new) => _, @@ -399,88 +440,87 @@ macro_rules! atomic_sub_word { ) }; } - atomic_rmw!(cmpxchg, order); + atomic_cas!(cmpxchg, success, failure); // if compare failed EQ bit is cleared, if stqcx succeeds EQ bit is set. - (out, extract_cr0(r)) + (out, test_cr0_eq(r)) + } + } + #[inline] + unsafe fn atomic_compare_exchange_weak( + dst: *mut MaybeUninit, + old: MaybeUninit, + new: MaybeUninit, + success: Ordering, + failure: Ordering, + ) -> (MaybeUninit, bool) { + debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); + let (dst, shift, mask) = crate::utils::create_sub_word_mask_values(dst); + let mut out: MaybeUninit; + let mut r; + + // SAFETY: the caller must uphold the safety contract. + unsafe { + // Implement sub-word atomic operations using word-sized LL/SC loop. + // Based on assemblies generated by rustc/LLVM. + // See also create_sub_word_mask_values. + macro_rules! cmpxchg_weak { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { + asm!( + "slw {old}, {old}, {shift}", // old <<= shift + "slw {new}, {new}, {shift}", // new <<= shift + $release, // fence + "lwarx {out}, 0, {dst}", // atomic { RESERVE = (dst, 4); out = *dst } + "and {tmp}, {out}, {mask}", // tmp = out & mask + "cmpw {tmp}, {old}", // if tmp == old { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } + "andc {tmp}, {out}, {mask}", // tmp = out & !mask + "or {tmp}, {tmp}, {new}", // tmp |= new + "stwcx. 
{tmp}, 0, {dst}", // atomic { if RESERVE == (dst, size_of($ty)) { *dst = tmp; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + $acquire_success, // fence + "3:", // 'cmp-fail: + $acquire_always, // fence + "srw {out}, {out}, {shift}", // out >>= shift + "mfcr {tmp}", // r = zero_extend(cr) + dst = in(reg_nonzero) ptr_reg!(dst), + old = inout(reg) crate::utils::ZeroExtend::zero_extend(old) => _, + new = inout(reg) crate::utils::ZeroExtend::zero_extend(new) => _, + out = out(reg) out, + shift = in(reg) shift, + mask = in(reg) mask, + tmp = out(reg) r, + out("cr0") _, + options(nostack, preserves_flags), + ) + }; + } + atomic_cas!(cmpxchg_weak, success, failure); + // if compare or store failed EQ bit is cleared, if store succeeds EQ bit is set. + (out, test_cr0_eq(r)) } } } }; } +atomic_sub_word!(i8, "b"); +atomic_sub_word!(u8, "b"); +atomic_sub_word!(i16, "h"); +atomic_sub_word!(u16, "h"); +atomic!(i32, "w", "z", "w"); +atomic!(u32, "w", "z", "w"); #[cfg(target_arch = "powerpc64")] -#[cfg(any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", -))] -atomic!(i8, "bz", "b", "w"); -#[cfg(target_arch = "powerpc64")] -#[cfg(any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", -))] -atomic!(u8, "bz", "b", "w"); -#[cfg(target_arch = "powerpc64")] -#[cfg(any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", -))] -atomic!(i16, "hz", "h", "w"); +atomic!(i64, "d", "", "d"); #[cfg(target_arch = "powerpc64")] -#[cfg(any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", -))] -atomic!(u16, "hz", "h", "w"); -#[cfg(not(all( - target_arch = "powerpc64", - any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", - ), -)))] -atomic_sub_word!(i8, "bz", "b"); -#[cfg(not(all( - target_arch = "powerpc64", - any( - target_feature = 
"partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", - ), -)))] -atomic_sub_word!(u8, "bz", "b"); -#[cfg(not(all( - target_arch = "powerpc64", - any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", - ), -)))] -atomic_sub_word!(i16, "hz", "h"); -#[cfg(not(all( - target_arch = "powerpc64", - any( - target_feature = "partword-atomics", - atomic_maybe_uninit_target_feature = "partword-atomics", - ), -)))] -atomic_sub_word!(u16, "hz", "h"); -atomic!(i32, "wz", "w", "w"); -atomic!(u32, "wz", "w", "w"); -#[cfg(target_arch = "powerpc64")] -atomic!(i64, "d", "d", "d"); -#[cfg(target_arch = "powerpc64")] -atomic!(u64, "d", "d", "d"); +atomic!(u64, "d", "", "d"); #[cfg(target_pointer_width = "32")] -atomic!(isize, "wz", "w", "w"); +atomic!(isize, "w", "z", "w"); #[cfg(target_pointer_width = "32")] -atomic!(usize, "wz", "w", "w"); +atomic!(usize, "w", "z", "w"); #[cfg(target_pointer_width = "64")] -atomic!(isize, "d", "d", "d"); +atomic!(isize, "d", "", "d"); #[cfg(target_pointer_width = "64")] -atomic!(usize, "d", "d", "d"); +atomic!(usize, "d", "", "d"); -// powerpc64 on pwr8+ support 128-bit atomics (load/store/LL/SC): -// See /~https://github.com/taiki-e/portable-atomic/blob/HEAD/src/imp/atomic128/README.md for details. #[cfg(target_arch = "powerpc64")] #[cfg(any( target_feature = "quadword-atomics", @@ -494,41 +534,112 @@ macro_rules! atomic128 { src: *const MaybeUninit, order: Ordering, ) -> MaybeUninit { + #[cfg(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + ))] + use crate::utils::{MaybeUninit128Be as MaybeUninit128, PairBe as Pair}; + debug_assert!(src as usize % mem::size_of::<$ty>() == 0); - let (prev_hi, prev_lo); + let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. 
unsafe { + #[cfg(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + ))] + macro_rules! atomic_load_acquire { + ($release:tt) => { + asm!( + $release, + // plq is unsupported in LLVM 19. + // plq %r4, 0(%r3) // atomic { r4:r5 = *src } + ".balign 64,, 4", + ".long 0x04000000", // p + ".long 0xe0830000", // lq %r4, 0(%r3) + "cmpw %r4, %r4", // if r4 == r4 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne- %cr0, 2f", // if unlikely(cr0.EQ == 0) { jump 'never } + "2:", // 'never: + "isync", // fence (works in combination with a branch that depends on the loaded value) + in("r3") ptr_reg!(src), + // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. + // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. + out("r4") out_hi, + out("r5") out_lo, + out("cr0") _, + options(nostack, preserves_flags), + ) + }; + } + #[cfg(not(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + )))] macro_rules! atomic_load_acquire { ($release:tt) => { asm!( $release, "lq %r4, 0({src})", // atomic { r4:r5 = *src } - // Lightweight acquire sync - // Refs: /~https://github.com/boostorg/atomic/blob/boost-1.79.0/include/boost/atomic/detail/core_arch_ops_gcc_ppc.hpp#L47-L62 - "cmpd %cr7, %r4, %r4", - "bne- %cr7, 2f", - "2:", - "isync", + "cmpw %r4, %r4", // if r4 == r4 { cr0.EQ = 1 } else { cr0.EQ = 0 } + "bne- %cr0, 2f", // if unlikely(cr0.EQ == 0) { jump 'never } + "2:", // 'never: + "isync", // fence (works in combination with a branch that depends on the loaded value) src = in(reg_nonzero) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. 
- out("r4") prev_hi, - out("r5") prev_lo, - out("cr7") _, + out("r4") out_hi, + out("r5") out_lo, + out("cr0") _, options(nostack, preserves_flags), ) }; } match order { Ordering::Relaxed => { + #[cfg(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + ))] + asm!( + // plq is unsupported in LLVM 19. + // plq %r4, 0(%r3) // atomic { r4:r5 = *src } + ".balign 64,, 4", + ".long 0x04000000", // p + ".long 0xe0830000", // lq %r4, 0(%r3) + in("r3") ptr_reg!(src), + // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. + // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. + out("r4") out_hi, + out("r5") out_lo, + options(nostack, preserves_flags), + ); + #[cfg(not(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + )))] asm!( "lq %r4, 0({src})", // atomic { r4:r5 = *src } src = in(reg_nonzero) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. - out("r4") prev_hi, - out("r5") prev_lo, + out("r4") out_hi, + out("r5") out_lo, options(nostack, preserves_flags), ); } @@ -536,7 +647,7 @@ macro_rules! atomic128 { Ordering::SeqCst => atomic_load_acquire!("sync"), _ => unreachable!(), } - MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.$ty + MaybeUninit128 { pair: Pair { lo: out_lo, hi: out_hi } }.$ty } } } @@ -547,11 +658,52 @@ macro_rules! 
atomic128 { val: MaybeUninit, order: Ordering, ) { + #[cfg(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + ))] + use crate::utils::MaybeUninit128Be as MaybeUninit128; + debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); let val = MaybeUninit128 { $ty: val }; // SAFETY: the caller must uphold the safety contract. unsafe { + #[cfg(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + ))] + macro_rules! atomic_store { + ($release:tt) => { + asm!( + $release, // fence + // pstq is unsupported in LLVM 19. + // "pstq %r4, 0(%r3)", // atomic { *dst = r4:r5 } + ".balign 64,, 4", + ".long 0x04000000", // p + ".long 0xf0830000", // stq %r4, 0(%r3) + in("r3") ptr_reg!(dst), + // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. + // We cannot use r1 (sp) and r2 (system reserved), so start with r4 or grater. + in("r4") val.pair.hi, + in("r5") val.pair.lo, + options(nostack, preserves_flags), + ) + }; + } + #[cfg(not(all( + any( + target_feature = "prefix-instrs", + atomic_maybe_uninit_target_feature = "prefix-instrs", + ), + target_endian = "little", + )))] macro_rules! atomic_store { ($release:tt) => { asm!( @@ -624,16 +776,15 @@ macro_rules! atomic128 { failure: Ordering, ) -> (MaybeUninit, bool) { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); - let order = crate::utils::upgrade_success_ordering(success, failure); let old = MaybeUninit128 { $ty: old }; let new = MaybeUninit128 { $ty: new }; let (mut prev_hi, mut prev_lo); + let mut r; // SAFETY: the caller must uphold the safety contract. unsafe { - let mut r; macro_rules! cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( $release, // fence "2:", // 'retry: @@ -644,9 +795,10 @@ macro_rules! 
atomic128 { "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } "stqcx. %r6, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r6:r7; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } "bne %cr0, 2b", // if cr0.EQ == 0 { jump 'retry } + $acquire_success, // fence "3:", // 'cmp-fail: + $acquire_always, // fence "mfcr {tmp_lo}", // tmp_lo = zero_extend(cr) - $acquire, // fence dst = in(reg_nonzero) ptr_reg!(dst), old_hi = in(reg) old.pair.hi, old_lo = in(reg) old.pair.lo, @@ -663,11 +815,11 @@ macro_rules! atomic128 { ) }; } - atomic_rmw!(cmpxchg, order); + atomic_cas!(cmpxchg, success, failure); // if compare failed EQ bit is cleared, if store succeeds EQ bit is set. ( MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.$ty, - extract_cr0(r) + test_cr0_eq(r) ) } } @@ -680,16 +832,15 @@ macro_rules! atomic128 { failure: Ordering, ) -> (MaybeUninit, bool) { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); - let order = crate::utils::upgrade_success_ordering(success, failure); let old = MaybeUninit128 { $ty: old }; let new = MaybeUninit128 { $ty: new }; let (mut prev_hi, mut prev_lo); + let mut r; // SAFETY: the caller must uphold the safety contract. unsafe { - let mut r; macro_rules! cmpxchg_weak { - ($acquire:tt, $release:tt) => { + ($acquire_always:tt, $acquire_success:tt, $release:tt) => { asm!( $release, // fence "lqarx %r8, 0, {dst}", // atomic { RESERVE = (dst, 16); r8:r9 = *dst } @@ -698,9 +849,10 @@ macro_rules! atomic128 { "or. {tmp_lo}, {tmp_lo}, {tmp_hi}", // tmp_lo |= tmp_hi; if tmp_lo == 0 { cr0.EQ = 1 } else { cr0.EQ = 0 } "bne %cr0, 3f", // if cr0.EQ == 0 { jump 'cmp-fail } "stqcx. 
%r6, 0, {dst}", // atomic { if RESERVE == (dst, 16) { *dst = r6:r7; cr0.EQ = 1 } else { cr0.EQ = 0 }; RESERVE = None } + $acquire_success, // fence "3:", // 'cmp-fail: + $acquire_always, // fence "mfcr {tmp_lo}", // tmp_lo = zero_extend(cr) - $acquire, // fence dst = in(reg_nonzero) ptr_reg!(dst), old_hi = in(reg) old.pair.hi, old_lo = in(reg) old.pair.lo, @@ -717,11 +869,11 @@ macro_rules! atomic128 { ) }; } - atomic_rmw!(cmpxchg_weak, order); + atomic_cas!(cmpxchg_weak, success, failure); // if compare or store failed EQ bit is cleared, if store succeeds EQ bit is set. ( MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.$ty, - extract_cr0(r) + test_cr0_eq(r) ) } } diff --git a/src/arch/riscv.rs b/src/arch/riscv.rs index 72c99f52..1fdf2e6c 100644 --- a/src/arch/riscv.rs +++ b/src/arch/riscv.rs @@ -6,7 +6,7 @@ RISC-V This architecture provides the following atomic instructions: - Load/Store Instructions (relaxed load/store) - - All aligned {8,16,32}-bit (for RV32 & RV64) and 64-bit (for RV64) load/store instructions are atomic. + - All aligned {8,16,32}-bit (for RV32 & RV64) and 64-bit (for RV64) load/store instructions Currently, there is no guaranteed 128-bit atomic load/store even on RV128. 
/~https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/rvwmo.adoc#memory-model-primitives - Load-Acquire and Store-Release Instructions (acquire/seqcst load and release/seqcst store) @@ -15,7 +15,7 @@ This architecture provides the following atomic instructions: - Load-Reserved/Store-Conditional (LR/SC) Instructions (aka LL/SC) - Zalrsc extension: 32-bit (for RV32 & RV64) and 64-bit (for RV64) /~https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/a-st-ext.adoc#zalrsc-extension-for-load-reservedstore-conditional-instructions -- Atomic Memory Operation (AMO) Instructions (RMW) +- Atomic Memory Operation (AMO) Instructions - Zaamo extension: 32-bit (for RV32 & RV64) and 64-bit (for RV64) swap,fetch_{add,and,or,xor,max.min} /~https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/a-st-ext.adoc#zaamo-extension-for-atomic-memory-operations - Zabha extension: {8,16}-bit swap,fetch_{add,and,or,xor,max.min} @@ -26,21 +26,25 @@ This architecture provides the following atomic instructions: - Zacas and Zabha extensions: {8,16}-bit /~https://github.com/riscv/riscv-isa-manual/blob/riscv-isa-release-8b9dc50-2024-08-30/src/zabha.adoc -Note: "A" extension comprises instructions provided by "Zalrsc" and "Zaamo" extensions, -"Zabha" and "Zacas" depends upon "Zaamo" extension. +Of the above instructions, instructions other than relaxed load/store, can specify the memory ordering. +The mappings from the C/C++ atomic operations are described in the RISC-V Atomics ABI Specification. + +Note: "A" extension comprises instructions provided by Zalrsc and Zaamo extensions, +Zabha and Zacas extensions depends upon Zaamo extension. 
Refs: - RISC-V Instruction Set Manual /~https://github.com/riscv/riscv-isa-manual - RISC-V Atomics ABI Specification /~https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/draft-20240829-13bfa9f54634cb60d86b9b333e109f077805b4b3/riscv-atomic.adoc -- portable-atomic /~https://github.com/taiki-e/portable-atomic +- portable-atomic + /~https://github.com/taiki-e/portable-atomic Generated asm: -- riscv64gc https://godbolt.org/z/4bzozeK8d -- riscv64gc (+zabha) https://godbolt.org/z/KEdoMn6re -- riscv32imac https://godbolt.org/z/9nT3qh33v -- riscv32imac (+zabha) https://godbolt.org/z/d1Tr7W3E3 +- riscv64gc https://godbolt.org/z/nfjcYcn1a +- riscv64gc (+zabha) https://godbolt.org/z/9En1f3G8v +- riscv32imac https://godbolt.org/z/3v8Yq7Ejh +- riscv32imac (+zabha) https://godbolt.org/z/9xWrW4Ynn */ // TODO: Zacas extension, and Zalrsc extension without A extension diff --git a/src/arch/s390x.rs b/src/arch/s390x.rs index 0f480e39..f3b9a0c4 100644 --- a/src/arch/s390x.rs +++ b/src/arch/s390x.rs @@ -1,30 +1,49 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT /* -s390x +s390x (SystemZ) This architecture provides the following atomic instructions: - Load/Store Instructions - - Baseline: {8,16,32,64}-bit - (Section "Storage-Operand Fetch References" and "Storage-Operand Store References" of z/Architecture Principles of Operation) - - Baseline: 128-bit (lpq, stpq) - (Section "LOAD PAIR FROM QUADWORD" and "STORE PAIR TO QUADWORD" of z/Architecture Principles of Operation) -- Compare-and-Swap Instructions (CAS) - - Baseline: {32,64,128}-bit - (Section "Storage-Operand Update References" of z/Architecture Principles of Operation) -- Interlocked-Access Facilities (RMW) - - Interlocked-Access Facility 1: {32,64}-bit fetch_{add,and,or,xor}, add with immediate value - - Interlocked-Access Facility 2: {and,or,xor} with immediate value - (Section "Storage-Operand Update References" of z/Architecture Principles of Operation) + - All {8,16,32,64}-bit load/store instructions that having 
Single-Access References + (Refs: Section "Storage-Operand Fetch References", "Storage-Operand Store References", and "Storage-Operand Consistency" of z/Architecture Principles of Operation, Fourteenth Edition) + - LPQ/STPQ: 128-bit load/store (arch1 or later) + (Refs: Section "LOAD PAIR FROM QUADWORD" and "STORE PAIR TO QUADWORD" of z/Architecture Principles of Operation, Fourteenth Edition) +- Instructions that having Interlocked-Update References + - TS: 8-bit TAS (360 or later) + (TEST AND SET) + - CS{,Y,G}, CDS{,Y,G}: {32,64,128}-bit CAS (CS,CDS: 370 or later, CSG,CDSG: arch1 or later, CSY,CDSY: long-displacement facility added in arch3) + (COMPARE AND SWAP, COMPARE DOUBLE AND SWAP) + - LAA{,G}, LAAL{,G}, LAN{,G}, LAO{,G}, LAX{,G}: {32,64}-bit fetch-and-{add,and,or,xor} (interlocked-access facility 1 added in arch9) + (LOAD AND ADD, LOAD AND ADD LOGICAL, LOAD AND AND, LOAD AND OR, LOAD AND EXCLUSIVE OR) + - Aligned A{,G}SI, AL{,G}SI: {32,64}-bit add with immediate (interlocked-access facility 1 added in arch9) + (Storage-and-immediate formats of ADD IMMEDIATE and ADD LOGICAL WITH SIGNED IMMEDIATE) + - NI{,Y}, OI{,Y}, XI{,Y}: 8-bit {and,or,xor} with immediate (interlocked-access facility 2 added in arch10) + (Storage-and-immediate formats of AND, OR, and EXCLUSIVE OR) + - (Others: COMPARE AND REPLACE DAT TABLE ENTRY, COMPARE AND SWAP AND PURGE, COMPARE AND SWAP AND STORE, STORE CHARACTERS UNDER MASK (conditional)) + (Refs: Section "Storage-Operand Update References" of z/Architecture Principles of Operation, Fourteenth Edition) + +Of the above instructions, instructions that having Interlocked-Update References +other than STORE CHARACTERS UNDER MASK perform serialization. 
+(Refs: Section "CPU Serialization" of z/Architecture Principles of Operation, Fourteenth Edition) + +The following instructions are usually used as standalone memory barrier: +- BCR 15,0 (360 or later) +- BCR 14,0 (fast-BCR-serialization facility added in arch9) +(Refs: Section "BRANCH ON CONDITION" of z/Architecture Principles of Operation, Fourteenth Edition) + +Serialization corresponds to SeqCst semantics, all memory access has Acquire/Release semantics. Refs: -- z/Architecture Principles of Operation https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf -- portable-atomic /~https://github.com/taiki-e/portable-atomic +- z/Architecture Principles of Operation, Fourteenth Edition (SA22-7832-13) + https://publibfp.dhe.ibm.com/epubs/pdf/a227832d.pdf +- portable-atomic + /~https://github.com/taiki-e/portable-atomic Generated asm: -- s390x https://godbolt.org/z/PK1xK95xb -- s390x (z196) https://godbolt.org/z/rW8cx3h5W +- s390x https://godbolt.org/z/Y4YvPsTWz +- s390x (z196) https://godbolt.org/z/v9Wbro8oj */ #[path = "cfgs/s390x.rs"] @@ -41,7 +60,7 @@ use crate::{ utils::{MaybeUninit128, Pair}, }; -// bcr 14,0 (fast-BCR-serialization) requires z196 or later. +// bcr 14,0 requires fast-BCR-serialization facility added in arch9 (z196). #[cfg(any( target_feature = "fast-serialization", atomic_maybe_uninit_target_feature = "fast-serialization", @@ -62,16 +81,11 @@ macro_rules! serialization { } // Extracts and checks condition code. -#[inline(always)] +#[inline] fn extract_cc(r: i64) -> bool { r.wrapping_add(-268435456) & (1 << 31) != 0 } -#[inline(always)] -fn complement(v: u32) -> u32 { - (v ^ !0).wrapping_add(1) -} - macro_rules! atomic_load_store { ($ty:ident, $l_suffix:tt, $suffix:tt) => { impl AtomicLoad for $ty { @@ -108,10 +122,10 @@ macro_rules! atomic_load_store { // SAFETY: the caller must uphold the safety contract. unsafe { macro_rules! 
atomic_store { - ($fence:expr) => { + ($acquire:expr) => { asm!( concat!("st", $suffix, " {val}, 0({dst})"), // atomic { *dst = val } - $fence, // fence + $acquire, // fence dst = in(reg_addr) ptr_reg!(dst), val = in(reg) val, options(nostack, preserves_flags), @@ -172,10 +186,10 @@ macro_rules! atomic { ) -> (MaybeUninit, bool) { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); let out: MaybeUninit; + let r; // SAFETY: the caller must uphold the safety contract. unsafe { - let r; // compare_exchange is always SeqCst. asm!( concat!("cs", $suffix, " {old}, {new}, 0({dst})"), // atomic { if *dst == old { cc = 0; *dst = new } else { cc = 1; old = *dst } } @@ -226,7 +240,7 @@ macro_rules! atomic_sub_word { val = in(reg) val, out = out(reg) out, shift = in(reg_addr) shift, - shift_c = in(reg_addr) complement(shift), + shift_c = in(reg_addr) shift.wrapping_neg(), prev = out(reg) _, // Do not use `preserves_flags` because CS and RISBG modify the condition code. options(nostack), @@ -247,10 +261,10 @@ macro_rules! atomic_sub_word { debug_assert!(dst as usize % mem::size_of::<$ty>() == 0); let (dst, shift, _mask) = crate::utils::create_sub_word_mask_values(dst); let mut out: MaybeUninit; + let r; // SAFETY: the caller must uphold the safety contract. unsafe { - let r; // Implement sub-word atomic operations using word-sized CAS loop. // Based on assemblies generated by rustc/LLVM. // See also create_sub_word_mask_values. @@ -261,6 +275,7 @@ macro_rules! atomic_sub_word { concat!("risbg {new}, {out}, 32, ", $risbg_cas, ", 0"), concat!("ll", $suffix, "r {out}, {out}"), "cr {out}, {old}", + // TODO?? "jlh 3f", concat!("rll {tmp}, {new}, -", $bits ,"({shift_c})"), "cs {prev}, {tmp}, 0({dst})", @@ -272,7 +287,7 @@ macro_rules! 
atomic_sub_word { old = in(reg) crate::utils::ZeroExtend::zero_extend(old), new = inout(reg) new => _, shift = in(reg_addr) shift, - shift_c = in(reg_addr) complement(shift), + shift_c = in(reg_addr) shift.wrapping_neg(), tmp = out(reg) _, r = lateout(reg) r, out = out(reg) out, @@ -297,8 +312,6 @@ atomic!(u64, "g"); atomic!(isize, "g"); atomic!(usize, "g"); -// s390x has 128-bit atomic load/store/CAS instructions and other operations are emulated by CAS loop. -// See /~https://github.com/taiki-e/portable-atomic/blob/HEAD/src/imp/atomic128/README.md for details. macro_rules! atomic128 { ($ty:ident) => { impl AtomicLoad for $ty { @@ -308,7 +321,7 @@ macro_rules! atomic128 { _order: Ordering, ) -> MaybeUninit { debug_assert!(src as usize % mem::size_of::<$ty>() == 0); - let (prev_hi, prev_lo); + let (out_hi, out_lo); // SAFETY: the caller must uphold the safety contract. unsafe { @@ -317,11 +330,11 @@ macro_rules! atomic128 { "lpq %r0, 0({src})", // atomic { r0:r1 = *src } src = in(reg_addr) ptr_reg!(src), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. - out("r0") prev_hi, - out("r1") prev_lo, + out("r0") out_hi, + out("r1") out_lo, options(nostack, preserves_flags), ); - MaybeUninit128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.$ty + MaybeUninit128 { pair: Pair { lo: out_lo, hi: out_hi } }.$ty } } } @@ -338,10 +351,10 @@ macro_rules! atomic128 { // SAFETY: the caller must uphold the safety contract. unsafe { macro_rules! atomic_store { - ($fence:expr) => { + ($acquire:expr) => { asm!( "stpq %r0, 0({dst})", // atomic { *dst = r0:r1 } - $fence, // fence + $acquire, // acquire dst = in(reg_addr) ptr_reg!(dst), // Quadword atomic instructions work with even/odd pair of specified register and subsequent register. in("r0") val.pair.hi, @@ -405,10 +418,10 @@ macro_rules! 
atomic128 { let old = MaybeUninit128 { $ty: old }; let new = MaybeUninit128 { $ty: new }; let (prev_hi, prev_lo); + let r; // SAFETY: the caller must uphold the safety contract. unsafe { - let r; // compare_exchange is always SeqCst. asm!( "cdsg %r0, %r12, 0({dst})", // atomic { if *dst == r0:r1 { cc = 0; *dst = r12:13 } else { cc = 1; r0:r1 = *dst } } diff --git a/src/arch/sparc.rs b/src/arch/sparc.rs index 2c3cdc9e..2e618928 100644 --- a/src/arch/sparc.rs +++ b/src/arch/sparc.rs @@ -6,19 +6,29 @@ SPARC This architecture provides the following atomic instructions: - Load/Store Instructions - - V7-V9: {8,16,32}-bit - - V8+,V9: 64-bit (LDX, STX) - (Section 8.4.4 "Memory Models" of the SPARC Architecture Manual, Version 9) -- Compare-and-Swap Instructions (CAS) - - V8+,V9: {32,64}-bit - - V8 with CAS (e.g., LEON4): 32-bit - (Section 8.4.6 "Hardware Primitives for Mutual Exclusion" of the SPARC Architecture Manual, Version 9) -- SWAP Instructions (RMW) - - V8-V9: 32-bit (deprecated in V9) - (Section 8.4.6 "Hardware Primitives for Mutual Exclusion" and A.57 "Swap Register with Memory" of the SPARC Architecture Manual, Version 9) -- Load Store Unsigned Byte Instructions (RMW) - - V7-V9: 8-bit - (Section 8.4.6 "Hardware Primitives for Mutual Exclusion" of the SPARC Architecture Manual, Version 9) + - V7 or later: {8,16,32}-bit + - V8+,V9: 64-bit + (Refs: Section D.4.1 "Value Atomicity" of the SPARC Architecture Manual, Version 9) +- Compare-and-Swap Instructions + - V8+,V9: {32,64}-bit CAS + - V8 with LEONCASA: 32-bit CAS + (Refs: Section 8.4.6 "Hardware Primitives for Mutual Exclusion" of the SPARC Architecture Manual, Version 9) +- SWAP Instructions + - V7 or later: 32-bit swap (deprecated in V9) + (Refs: Section 8.4.6 "Hardware Primitives for Mutual Exclusion" and A.57 "Swap Register with Memory" of the SPARC Architecture Manual, Version 9) +- Load Store Unsigned Byte Instructions + - V7 or later: 8-bit TAS + (Refs: Section 8.4.6 "Hardware Primitives for Mutual 
Exclusion" of the SPARC Architecture Manual, Version 9) + +Which memory barrier the above instructions imply depends on the memory model used. +V8+ and V9 have three memory models: Total Store Order (TSO), Partial Store Order (PSO), and Relaxed +Memory Order (RMO). V8 has TSO and PSO. Implementation of TSO (or a more strongly ordered model +which implies TSO) is mandatory, and PSO and RMO are optional. +(Refs: Section 8.4.4 "Memory Models" of the SPARC Architecture Manual, Version 9) + +Memory access instructions require proper alignment, but some instructions are implementation-dependent +and may work with insufficient alignment. +(Refs: Section 6.3.1.1 Memory Alignment Restrictions" of the SPARC Architecture Manual, Version 9) Refs: - The SPARC Architecture Manual, Version 9 @@ -28,9 +38,9 @@ Refs: https://temlib.org/pub/SparcStation/Standards/V8plus.pdf Generated asm: -- sparcv8+leoncasa https://godbolt.org/z/n96j1W87s -- sparcv8plus https://godbolt.org/z/qse81K1M5 -- sparc64 https://godbolt.org/z/PbxqToxj4 +- sparcv8+leoncasa https://godbolt.org/z/4TPGbfPo4 +- sparcv8plus https://godbolt.org/z/rvnPono8j +- sparc64 https://godbolt.org/z/ejM3ooeec */ #[path = "cfgs/sparc.rs"] @@ -53,8 +63,7 @@ macro_rules! cas { #[cfg(any(target_feature = "leoncasa", atomic_maybe_uninit_target_feature = "leoncasa"))] macro_rules! cas { ("", $rs1:tt, $rs2:tt, $rd:tt) => { - // .p2align 4 is workaround for errata (GRLIB-TN-0011). - concat!(".p2align 4", "\n", "casa ", $rs1, " 10, ", $rs2, ", ", $rd) + concat!(leon_align!(), "casa ", $rs1, " 10, ", $rs2, ", ", $rd) }; } @@ -113,7 +122,26 @@ macro_rules! leon_nop { #[cfg(any(target_feature = "leoncasa", atomic_maybe_uninit_target_feature = "leoncasa"))] macro_rules! leon_nop { () => { - "nop" + "nop\n" + }; +} +// Workaround for errata (GRLIB-TN-0011). 
+// https://www.gaisler.com/index.php/information/app-tech-notes +#[cfg(not(any( + target_arch = "sparc64", + target_feature = "v9", + atomic_maybe_uninit_target_feature = "v9", +)))] +#[cfg(not(any(target_feature = "leoncasa", atomic_maybe_uninit_target_feature = "leoncasa")))] +macro_rules! leon_align { + () => { + "" + }; +} +#[cfg(any(target_feature = "leoncasa", atomic_maybe_uninit_target_feature = "leoncasa"))] +macro_rules! leon_align { + () => { + ".p2align 4\n" }; } @@ -126,16 +154,10 @@ macro_rules! atomic_rmw { ($op:ident, $order:ident) => { match $order { Ordering::Relaxed => $op!("", ""), - Ordering::Acquire => { - $op!("membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore", "") - } - Ordering::Release => { - $op!("", "membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore") - } - Ordering::AcqRel | Ordering::SeqCst => $op!( - "membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore", - "membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore" - ), + // 15 == #LoadLoad | #StoreLoad | #LoadStore | #StoreStore + Ordering::Acquire => $op!("membar 15", ""), + Ordering::Release => $op!("", "membar 15"), + Ordering::AcqRel | Ordering::SeqCst => $op!("membar 15", "membar 15"), _ => unreachable!(), } }; @@ -147,11 +169,19 @@ macro_rules! atomic_rmw { )))] macro_rules! atomic_rmw { ($op:ident, $order:ident) => { + // GCC and LLVM use different types of memory barriers in SPARC-V8, and probably have + // different semantics to obtain as a result. My experience with this platform is that LLVM + // is often incomplete and GCC's is more likely to be correct, but I use code with both + // semantics just to be safe. 
match $order { Ordering::Relaxed => $op!("", ""), Ordering::Acquire => $op!("stbar", ""), - Ordering::Release => $op!("", "stbar"), - Ordering::AcqRel | Ordering::SeqCst => $op!("stbar", "stbar"), + Ordering::Release => { + $op!("", concat!("stbar\n", leon_align!(), "ldstub [%sp-1], %g0")) + } + Ordering::AcqRel | Ordering::SeqCst => { + $op!("stbar", concat!("stbar\n", leon_align!(), "ldstub [%sp-1], %g0")) + } _ => unreachable!(), } }; @@ -171,8 +201,9 @@ macro_rules! atomic_load_store { // SAFETY: the caller must uphold the safety contract. unsafe { macro_rules! atomic_load { - ($acquire:tt) => { + ($acquire:expr, $release:expr) => { asm!( + $release, concat!("ld", $load_sign, $size, " [{src}], {out}"), // atomic { out = *src } $acquire, // fence src = in(reg) ptr_reg!(src), @@ -187,8 +218,11 @@ macro_rules! atomic_load_store { atomic_maybe_uninit_target_feature = "v9", ))] match order { - Ordering::Relaxed => atomic_load!(""), - Ordering::Acquire | Ordering::SeqCst => atomic_load!("membar #LoadLoad | #StoreLoad | #LoadStore | #StoreStore"), + Ordering::Relaxed => atomic_load!("", ""), + // 3 == #LoadLoad | #StoreLoad + // 5 == #LoadLoad | #LoadStore + Ordering::Acquire => atomic_load!("membar 5", ""), + Ordering::SeqCst => atomic_load!("membar 5", "membar 3"), _ => unreachable!(), } #[cfg(not(any( @@ -196,9 +230,14 @@ macro_rules! atomic_load_store { target_feature = "v9", atomic_maybe_uninit_target_feature = "v9", )))] + // GCC and LLVM use different types of memory barriers in SPARC-V8, and probably have + // different semantics to obtain as a result. My experience with this platform is that LLVM + // is often incomplete and GCC's is more likely to be correct, but I use code with both + // semantics just to be safe. 
match order { - Ordering::Relaxed => atomic_load!(""), - Ordering::Acquire | Ordering::SeqCst => atomic_load!("stbar"), + Ordering::Relaxed => atomic_load!("", ""), + Ordering::Acquire => atomic_load!("stbar", ""), + Ordering::SeqCst => atomic_load!("stbar", concat!(leon_nop!(), leon_align!(), "ldstub [%sp-1], %g0")), _ => unreachable!(), } } @@ -214,8 +253,8 @@ macro_rules! atomic_load_store { ) { // SAFETY: the caller must uphold the safety contract. unsafe { - macro_rules! store { - ($acquire:tt, $release:tt) => { + macro_rules! atomic_store { + ($acquire:expr, $release:expr) => { asm!( leon_nop!(), // Workaround for for errata (GRLIB-TN-0009). $release, // fence @@ -228,7 +267,34 @@ macro_rules! atomic_load_store { ) }; } - atomic_rmw!(store, order); + #[cfg(any( + target_arch = "sparc64", + target_feature = "v9", + atomic_maybe_uninit_target_feature = "v9", + ))] + match order { + Ordering::Relaxed => atomic_store!("", ""), + // 10 == #StoreLoad | #StoreStore + // 12 == #LoadStore | #StoreStore + Ordering::Release => atomic_store!("", "membar 12"), + Ordering::SeqCst => atomic_store!("membar 10", "membar 12"), + _ => unreachable!(), + } + #[cfg(not(any( + target_arch = "sparc64", + target_feature = "v9", + atomic_maybe_uninit_target_feature = "v9", + )))] + // GCC and LLVM use different types of memory barriers in SPARC-V8, and probably have + // different semantics to obtain as a result. My experience with this platform is that LLVM + // is often incomplete and GCC's is more likely to be correct, but I use code with both + // semantics just to be safe. + match order { + Ordering::Relaxed => atomic_store!("", ""), + Ordering::Release => atomic_store!("", "stbar"), + Ordering::SeqCst => atomic_store!(concat!("stbar\n", leon_align!(), "ldstub [%sp-1], %g0"), "stbar"), + _ => unreachable!(), + } } } } @@ -251,7 +317,7 @@ macro_rules! atomic { // SAFETY: the caller must uphold the safety contract. unsafe { macro_rules! 
swap { - ($acquire:tt, $release:tt) => { + ($acquire:expr, $release:expr) => { asm!( $release, // fence concat!("ld", $size, " [{dst}], {out}"), // atomic { out = *dst } @@ -292,7 +358,7 @@ macro_rules! atomic { unsafe { let mut r: crate::utils::RegSize; macro_rules! cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire:expr, $release:expr) => { asm!( leon_nop!(), // Workaround for errata (GRLIB-TN-0010). $release, // fence @@ -336,7 +402,7 @@ macro_rules! atomic_sub_word { // SAFETY: the caller must uphold the safety contract. unsafe { macro_rules! swap { - ($acquire:tt, $release:tt) => { + ($acquire:expr, $release:expr) => { // Implement sub-word atomic operations using word-sized CAS loop. // See also create_sub_word_mask_values. asm!( @@ -386,7 +452,7 @@ macro_rules! atomic_sub_word { unsafe { let mut r: crate::utils::RegSize; macro_rules! cmpxchg { - ($acquire:tt, $release:tt) => { + ($acquire:expr, $release:expr) => { // Implement sub-word atomic operations using word-sized CAS loop. // See also create_sub_word_mask_values. 
asm!( diff --git a/src/lib.rs b/src/lib.rs index 2393d67f..2af8b910 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -31,7 +31,7 @@ Currently, x86, x86_64, Arm, AArch64, RISC-V, LoongArch64, Arm64EC, s390x, MIPS, | mips64 / mips64r6 \[8] | isize,usize,i8,u8,i16,u16,i32,u32,i64,u64 | ✓ | ✓ | | powerpc \[8] | isize,usize,i8,u8,i16,u16,i32,u32 | ✓ | ✓ | | powerpc64 \[8] | isize,usize,i8,u8,i16,u16,i32,u32,i64,u64 | ✓ | ✓ | -| powerpc64 (pwr8+) \[4] \[8] | i128,u128 | ✓ | ✓ | +| powerpc64 (+quadword-atomics) \[4] \[8]| i128,u128 | ✓ | ✓ | | msp430 \[8] (experimental) | isize,usize,i8,u8,i16,u16 | ✓ | ✓ | | avr \[8] (experimental) | isize,usize,i8,u8,i16,u16 | ✓ | ✓ | | sparc \[5] \[8] (experimental) | isize,usize,i8,u8,i16,u16,i32,u32 | ✓ | ✓ | diff --git a/src/utils.rs b/src/utils.rs index 3e75ace3..4d9ce381 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -206,6 +206,15 @@ pub(crate) union MaybeUninit128 { pub(crate) i128: MaybeUninit, pub(crate) pair: Pair, } +#[cfg(target_arch = "powerpc64")] +#[allow(dead_code)] +#[derive(Clone, Copy)] +#[repr(C)] +pub(crate) union MaybeUninit128Be { + pub(crate) u128: MaybeUninit, + pub(crate) i128: MaybeUninit, + pub(crate) pair: PairBe, +} /// A 64-bit value represented as a pair of 32-bit values. /// /// This type is `#[repr(C)]`, both fields have the same in-memory representation @@ -218,26 +227,34 @@ pub(crate) union MaybeUninit64 { pub(crate) i64: MaybeUninit, pub(crate) pair: Pair, } +#[cfg(not(any( + target_endian = "little", + target_arch = "aarch64", + target_arch = "arm", + target_arch = "arm64ec", +)))] +pub(crate) use self::PairBe as Pair; +#[cfg(any( + target_endian = "little", + target_arch = "aarch64", + target_arch = "arm", + target_arch = "arm64ec", +))] +pub(crate) use self::PairLe as Pair; +/// Little-endian order pair. 
#[allow(dead_code)] #[derive(Clone, Copy)] #[repr(C)] -pub(crate) struct Pair { - // little endian order - #[cfg(any( - target_endian = "little", - target_arch = "aarch64", - target_arch = "arm", - target_arch = "arm64ec", - ))] +pub(crate) struct PairLe { pub(crate) lo: MaybeUninit, pub(crate) hi: MaybeUninit, - // big endian order - #[cfg(not(any( - target_endian = "little", - target_arch = "aarch64", - target_arch = "arm", - target_arch = "arm64ec", - )))] +} +/// Big-endian order pair. +#[allow(dead_code)] +#[derive(Clone, Copy)] +#[repr(C)] +pub(crate) struct PairBe { + pub(crate) hi: MaybeUninit, pub(crate) lo: MaybeUninit, }