-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Small POD struct equality ineffective #83585
Comments
See also: #83623 |
I am quite surprised but fix of #83623 doesn't fix this issue. ASMplayground::foo1a: # @playground::foo1a
# %bb.0:
cmpb %sil, %dil
jne .LBB0_6
# %bb.1:
movq %rsi, %rcx
shrq $8, %rcx
movq %rdi, %rdx
shrq $8, %rdx
xorl %eax, %eax
cmpb %cl, %dl
jne .LBB0_7
# %bb.2:
movq %rsi, %rcx
shrq $16, %rcx
movq %rdi, %rdx
shrq $16, %rdx
cmpb %cl, %dl
jne .LBB0_7
# %bb.3:
movq %rdi, %rcx
shrq $24, %rcx
movq %rsi, %rdx
shrq $24, %rdx
xorl %eax, %eax
cmpb %dl, %cl
jne .LBB0_7
# %bb.4:
movq %rdi, %rcx
shrq $32, %rcx
movq %rsi, %rdx
shrq $32, %rdx
cmpb %dl, %cl
jne .LBB0_7
# %bb.5:
movq %rdi, %r8
shrq $40, %r8
movq %rdi, %rcx
shrq $48, %rcx
shrq $56, %rdi
movq %rsi, %rdx
shrq $40, %rdx
movq %rsi, %rax
shrq $48, %rax
shrq $56, %rsi
xorb %r8b, %dl
xorb %cl, %al
orb %dl, %al
xorb %dil, %sil
orb %al, %sil
sete %al
# kill: def $al killed $al killed $eax
retq
.LBB0_6:
xorl %eax, %eax
.LBB0_7:
# kill: def $al killed $al killed $eax
retq |
With the last nightly I'm seeing a probable improvement in foo1b (the second): foo1a:
cmp dil, sil
jne .LBB0_6
mov rcx, rsi
shr rcx, 8
mov rdx, rdi
shr rdx, 8
xor eax, eax
cmp dl, cl
jne .LBB0_7
mov rcx, rsi
shr rcx, 16
mov rdx, rdi
shr rdx, 16
cmp dl, cl
jne .LBB0_7
mov rcx, rdi
shr rcx, 24
mov rdx, rsi
shr rdx, 24
xor eax, eax
cmp cl, dl
jne .LBB0_7
mov rcx, rdi
shr rcx, 32
mov rdx, rsi
shr rdx, 32
cmp cl, dl
jne .LBB0_7
mov r8, rdi
shr r8, 40
mov rcx, rdi
shr rcx, 48
shr rdi, 56
mov rdx, rsi
shr rdx, 40
mov rax, rsi
shr rax, 48
shr rsi, 56
xor dl, r8b
xor al, cl
or al, dl
xor sil, dil
or sil, al
sete al
ret
.LBB0_6:
xor eax, eax
.LBB0_7:
ret
foo1b:
mov al, byte ptr [rdi]
cmp al, byte ptr [rsi]
jne .LBB1_1
mov eax, dword ptr [rdi + 1]
mov ecx, dword ptr [rdi + 4]
xor eax, dword ptr [rsi + 1]
xor ecx, dword ptr [rsi + 4]
or ecx, eax
sete al
ret
.LBB1_1:
xor eax, eax
ret
foo2a:
cmp rdi, rsi
sete al
ret
foo2b:
mov rax, qword ptr [rdi]
cmp rax, qword ptr [rsi]
sete al
ret |
It looks like that one of SROA passes transforms getting of tuple fields into nearly this: ((a >> (0*8)) as u8 == (b >> (0*8)) as u8) &&
((a >> (7*8)) as u8 == (b >> (7*8)) as u8) &&
((a >> (6*8)) as u8 == (b >> (6*8)) as u8) &&
((a >> (5*8)) as u8 == (b >> (5*8)) as u8) &&
((a >> (4*8)) as u8 == (b >> (4*8)) as u8) &&
((a >> (3*8)) as u8 == (b >> (3*8)) as u8) &&
((a >> (2*8)) as u8 == (b >> (2*8)) as u8) &&
((a >> (1*8)) as u8 == (b >> (1*8)) as u8) Edit: Removing SROA doesn't prevent this transformation, it just happens in other steps. |
This leads to big overhead of tuples and structs (because tuples is structs). Benchmark use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
type TElem = u8;
type Tpl = (TElem, TElem, TElem, TElem, TElem, TElem, TElem, TElem, );
struct Data {
original: Vec<Tpl>,
diff_random_field: Vec<Tpl>,
diff_first_field: Vec<Tpl>,
diff_last_field: Vec<Tpl>,
diff_every_second: Vec<Tpl>,
}
fn generate_data(n: usize) -> Data {
use rand::prelude::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
let mut rng = ChaCha8Rng::seed_from_u64(357u64);
let mut data: Vec<TElem> = vec![0; n * 8];
rng.fill(data.as_mut_slice());
let original: Vec<Tpl> = data.windows(8).map(|x|{
(x[0],x[1],x[2],x[3],x[4],x[5],x[6],x[7],)
}).collect();
let diff_random_field: Vec<Tpl> = original
.iter()
.map(|&x| unsafe {
let mut arr: [TElem; 8] = std::mem::transmute(x);
let index = rng.gen_range(0..arr.len());
arr[index] = arr[index].wrapping_add(5);
std::mem::transmute(arr)
})
.collect();
let diff_first_field: Vec<Tpl> = original
.iter()
.copied()
.map(|mut x| {
x.0 = x.0.wrapping_add(5);
x
})
.collect();
let diff_last_field: Vec<Tpl> = original
.iter()
.copied()
.map(|mut x| {
x.7 = x.7.wrapping_add(5);
x
})
.collect();
let mut diff_every_second: Vec<Tpl> = original.clone();
for (a, b) in diff_every_second
.iter_mut()
.zip(diff_random_field.iter())
.step_by(2)
{
*a = b.clone();
}
Data {
original,
diff_random_field,
diff_first_field,
diff_last_field,
diff_every_second,
}
}
fn arr_cmp(a: Tpl, b: Tpl)->bool{
unsafe{
let a: [TElem; 8] = std::mem::transmute(a);
let b: [TElem; 8] = std::mem::transmute(b);
a==b
}
}
pub fn criterion_benchmark(c: &mut Criterion) {
let Data {
original,
diff_random_field,
diff_first_field,
diff_last_field,
diff_every_second,
} = generate_data(1000);
let original_copy = original.clone();
let mut group = c.benchmark_group("u8");
let pairs = [
("Self", original_copy),
("Random field", diff_random_field),
("First field", diff_first_field),
("Last field", diff_last_field),
("Every Second", diff_every_second),
];
for (name, to_compare) in pairs.iter().cloned() {
group.bench_with_input(
BenchmarkId::new("u8 tuple", name),
&to_compare,
|b, to_compare| {
b.iter_batched(
|| -> (Vec<bool>, Vec<Tpl>) {
let buffer = Vec::with_capacity(to_compare.len());
let to_compare_copy = to_compare.clone();
(buffer, to_compare_copy)
},
|(mut out_buff, other)| {
let comparison = original.iter().zip(other.iter()).map(|(a, b)| a == b);
out_buff.extend(comparison);
(out_buff, other)
},
criterion::BatchSize::LargeInput,
);
},
);
}
for (name, to_compare) in pairs.iter().cloned() {
group.bench_with_input(
BenchmarkId::new("u8 arr", name),
&to_compare,
|b, to_compare| {
b.iter_batched(
|| -> (Vec<bool>, Vec<Tpl>) {
let buffer = Vec::with_capacity(to_compare.len());
let to_compare_copy = to_compare.clone();
(buffer, to_compare_copy)
},
|(mut out_buff, other)| {
let comparison = original.iter().zip(other.iter()).map(|(a, b)| arr_cmp(*a, *b));
out_buff.extend(comparison);
(out_buff, other)
},
criterion::BatchSize::LargeInput,
);
},
);
}
group.finish();
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches); Results:
So, PartialEq slower from 4 to 40 times. |
Hack We could change std using this weird trick but possibly shouldn't. Hacky codetype T = u8;
pub struct T1<A, B, C, D, E, F, G, H>(A, B, C, D, E, F, G, H);
impl<A, B, C, D, E, F, G, H> PartialEq for T1<A, B, C, D, E, F, G, H>
where
A: PartialEq, B: PartialEq, C: PartialEq, D: PartialEq, E: PartialEq, F: PartialEq, G: PartialEq, H: PartialEq
{
default fn eq(&self, b:&Self)->bool{
let a = self;
a.0 == b.0 &&
a.1 == b.1 &&
a.2 == b.2 &&
a.3 == b.3 &&
a.4 == b.4 &&
a.5 == b.5 &&
a.6 == b.6 &&
a.7 == b.7
}
}
impl<A: PartialEq> PartialEq for T1<A,A,A,A,A,A,A,A>
{
fn eq(&self, b:&Self)->bool{
assert_eq!(
std::mem::size_of::<Self>(),
std::mem::size_of::<[A;8]>()
);
unsafe{
let a: &[A;8] = std::mem::transmute(self);
let b: &[A;8] = std::mem::transmute(b);
a == b
}
}
}
#[no_mangle]
pub fn is_eq0(a: T1<T, T, T, T, T, T, T, T>,
b: T1<T, T, T, T, T, T, T, T>)->bool{
a==b
} The problem is that we still:
Unfortunately, argpromotion and inlining would convert this function to I suggest you to rename the issue to something like "Small POD struct equality ineffective". |
It looks like Clang incapable for making such optimizations as well. |
On the LLVM side, we're basically missing this fold: https://alive2.llvm.org/ce/z/Ljo0nX |
Upstream review: https://reviews.llvm.org/D101232 |
Upstream fix: llvm/llvm-project@463ea28 |
I ran into an issue where using a struct of two arrays was significant slower than passing those arrays directly. |
Unfortunately this has not been fixed by the LLVM 13 upgrade. Might be an unfortunate interaction with the migration from bitwise to logical and. |
Upstream fix for the logical and/or issue: llvm/llvm-project@be4b836, llvm/llvm-project@fafe5a6 Might make sense to cherry-pick this to our fork to avoid waiting another release cycle. |
For the case foo1b the MergeICmps pass would usually handle this in the backend, but it's getting blocked by additional |
We're now generating good code for everything but foo1b: https://rust.godbolt.org/z/6rs8EbhEb foo1b still unnecessarily splits out one comparison. Either https://reviews.llvm.org/D108517 or https://reviews.llvm.org/D108782 should address that. |
Remaining case is fixed by the LLVM upgrade in #93577. Adding a codegen test for this may be worthwhile. |
Thats true for |
Add codegen tests for E-needs-test close rust-lang#36010 close rust-lang#68667 close rust-lang#74938 close rust-lang#83585 close rust-lang#93036 close rust-lang#109328 close rust-lang#110797 close rust-lang#111508 close rust-lang#112509 close rust-lang#113757 close rust-lang#120440 close rust-lang#118392 close rust-lang#71096 r? nikic
Add codegen tests for E-needs-test close rust-lang#36010 close rust-lang#68667 close rust-lang#74938 close rust-lang#83585 close rust-lang#93036 close rust-lang#109328 close rust-lang#110797 close rust-lang#111508 close rust-lang#112509 close rust-lang#113757 close rust-lang#120440 close rust-lang#118392 close rust-lang#71096 r? nikic
Add codegen tests for E-needs-test close rust-lang#36010 close rust-lang#68667 close rust-lang#74938 close rust-lang#83585 close rust-lang#93036 close rust-lang#109328 close rust-lang#110797 close rust-lang#111508 close rust-lang#112509 close rust-lang#113757 close rust-lang#120440 close rust-lang#118392 close rust-lang#71096 r? nikic
Add codegen tests for E-needs-test close rust-lang#36010 close rust-lang#68667 close rust-lang#74938 close rust-lang#83585 close rust-lang#93036 close rust-lang#109328 close rust-lang#110797 close rust-lang#111508 close rust-lang#112509 close rust-lang#113757 close rust-lang#120440 close rust-lang#118392 close rust-lang#71096 r? nikic
From a recent Reddit discussion (https://old.reddit.com/r/rust/comments/medh15/why_are_derived_partialeqimplementations_not_more/ ):
Gives (rustc 1.53.0-nightly 5e65467 2021-03-26):
That Reddit discussion shows other cases.
The text was updated successfully, but these errors were encountered: