Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Skip uninteresting commits for blame #1743

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 10 additions & 6 deletions gitoxide-core/src/repository/blame.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,17 @@ pub fn blame_file(
.next()
.expect("exactly one pattern");

let suspect = repo.head()?.peel_to_commit_in_place()?;
let traverse =
gix::traverse::commit::topo::Builder::from_iters(&repo.objects, [suspect.id], None::<Vec<gix::ObjectId>>)
.with_commit_graph(repo.commit_graph_if_enabled()?)
.build()?;
let suspect: gix::ObjectId = repo.head()?.into_peeled_id()?.into();
let cache: Option<gix::commitgraph::Graph> = repo.commit_graph_if_enabled()?;
let mut resource_cache = repo.diff_resource_cache_for_tree_diff()?;
let outcome = gix::blame::file(&repo.objects, traverse, &mut resource_cache, file.as_bstr(), range)?;
let outcome = gix::blame::file(
&repo.objects,
suspect,
cache,
&mut resource_cache,
file.as_bstr(),
range,
)?;
let statistics = outcome.statistics;
write_blame_entries(out, outcome)?;

Expand Down
3 changes: 3 additions & 0 deletions gix-blame/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ rust-version = "1.70"
doctest = false

[dependencies]
gix-commitgraph = { version = "^0.26.0", path = "../gix-commitgraph" }
gix-revwalk = { version = "^0.18.0", path = "../gix-revwalk" }
gix-trace = { version = "^0.1.12", path = "../gix-trace" }
gix-diff = { version = "^0.50.0", path = "../gix-diff", default-features = false, features = ["blob"] }
gix-object = { version = "^0.47.0", path = "../gix-object" }
gix-hash = { version = "^0.16.0", path = "../gix-hash" }
gix-worktree = { version = "^0.39.0", path = "../gix-worktree", default-features = false, features = ["attributes"] }
gix-traverse = { version = "^0.44.0", path = "../gix-traverse" }

smallvec = "1.10.0"
thiserror = "2.0.0"

[dev-dependencies]
Expand Down
185 changes: 154 additions & 31 deletions gix-blame/src/file/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,23 @@ use gix_object::{
bstr::{BStr, BString},
FindExt,
};
use gix_traverse::commit::find;
use smallvec::SmallVec;
use std::num::NonZeroU32;
use std::ops::Range;

/// Produce a list of consecutive [`BlameEntry`] instances to indicate in which commits the ranges of the file
/// at `traverse[0]:<file_path>` originated in.
/// at `suspect:<file_path>` originated in.
///
/// ## Paramters
///
/// * `odb`
/// - Access to database objects, also for used for diffing.
/// - Should have an object cache for good diff performance.
/// * `traverse`
/// - The list of commits from the most recent to prior ones, following all parents sorted
/// by time.
/// - It's paramount that older commits are returned after newer ones.
/// - The first commit returned here is the first eligible commit to be responsible for parts of `file_path`.
/// * `suspect`
/// - The first commit to be responsible for parts of `file_path`.
/// * `cache`
/// - Optionally, the commitgraph cache.
/// * `file_path`
/// - A *slash-separated* worktree-relative path to the file to blame.
/// * `range`
Expand Down Expand Up @@ -60,20 +61,14 @@ use std::ops::Range;
// <---><----------><-------><-----><------->
// <---><---><-----><-------><-----><------->
// <---><---><-----><-------><-----><-><-><->
pub fn file<E>(
pub fn file(
odb: impl gix_object::Find + gix_object::FindHeader,
traverse: impl IntoIterator<Item = Result<gix_traverse::commit::Info, E>>,
suspect: ObjectId,
cache: Option<gix_commitgraph::Graph>,
resource_cache: &mut gix_diff::blob::Platform,
file_path: &BStr,
range: Option<Range<u32>>,
) -> Result<Outcome, Error>
where
E: Into<Box<dyn std::error::Error + Send + Sync + 'static>>,
{
let mut traverse = traverse.into_iter().peekable();
let Some(Ok(suspect)) = traverse.peek().map(|res| res.as_ref().map(|item| item.id)) else {
return Err(Error::EmptyTraversal);
};
) -> Result<Outcome, Error> {
let _span = gix_trace::coarse!("gix_blame::file()", ?file_path, ?suspect);

let mut stats = Statistics::default();
Expand Down Expand Up @@ -103,25 +98,43 @@ where
suspects: [(suspect, range_in_blamed_file)].into(),
}];

let mut buf = Vec::new();
let commit = find(cache.as_ref(), &odb, &suspect, &mut buf)?;

let mut queue: gix_revwalk::PriorityQueue<CommitTime, ObjectId> = gix_revwalk::PriorityQueue::new();

let commit_time = commit_time(commit);
queue.insert(commit_time, suspect);

let mut out = Vec::new();
let mut diff_state = gix_diff::tree::State::default();
let mut previous_entry: Option<(ObjectId, ObjectId)> = None;
'outer: while let Some(item) = traverse.next() {
'outer: while let Some(suspect) = queue.pop_value() {
if hunks_to_blame.is_empty() {
break;
}
let commit = item.map_err(|err| Error::Traverse(err.into()))?;
let suspect = commit.id;

let is_still_suspect = hunks_to_blame.iter().any(|hunk| hunk.suspects.contains_key(&suspect));

if !is_still_suspect {
// There are no `UnblamedHunk`s associated with this `suspect`, so we can continue with
// the next one.
continue 'outer;
}

stats.commits_traversed += 1;

let parent_ids = commit.parent_ids;
let commit = find(cache.as_ref(), &odb, &suspect, &mut buf)?;

let parent_ids: ParentIds = collect_parents(commit, &odb, cache.as_ref());

if parent_ids.is_empty() {
if traverse.peek().is_none() {
// I’m not entirely sure if this is correct yet. `suspect`, at this point, is the `id` of
// the last `item` that was yielded by `traverse`, so it makes sense to assign the
// remaining lines to it, even though we don’t explicitly check whether that is true
// here. We could perhaps use diff-tree-to-tree to compare `suspect`
// against an empty tree to validate this assumption.
if queue.is_empty() {
// I’m not entirely sure if this is correct yet. `suspect`, at this point, is the
// `id` of the last `item` that was yielded by `queue`, so it makes sense to assign
// the remaining lines to it, even though we don’t explicitly check whether that is
// true here. We could perhaps use diff-tree-to-tree to compare `suspect` against
// an empty tree to validate this assumption.
if unblamed_to_out_is_done(&mut hunks_to_blame, &mut out, suspect) {
break 'outer;
}
Expand All @@ -143,7 +156,41 @@ where
continue;
};

for (pid, parent_id) in parent_ids.iter().enumerate() {
// This block asserts that, for every `UnblamedHunk`, all lines in the *Blamed File* are
// identical to the corresponding lines in the *Source File*.
#[cfg(debug_assertions)]
{
let source_blob = odb.find_blob(&entry_id, &mut buf)?.data.to_vec();
let mut source_interner = gix_diff::blob::intern::Interner::new(source_blob.len() / 100);
let source_lines_as_tokens: Vec<_> = tokens_for_diffing(&source_blob)
.tokenize()
.map(|token| source_interner.intern(token))
.collect();

let mut blamed_interner = gix_diff::blob::intern::Interner::new(blamed_file_blob.len() / 100);
let blamed_lines_as_tokens: Vec<_> = tokens_for_diffing(&blamed_file_blob)
.tokenize()
.map(|token| blamed_interner.intern(token))
.collect();

for hunk in hunks_to_blame.iter() {
if let Some(range_in_suspect) = hunk.suspects.get(&suspect) {
let range_in_blamed_file = hunk.range_in_blamed_file.clone();

for (blamed_line_number, source_line_number) in range_in_blamed_file.zip(range_in_suspect.clone()) {
let source_token = source_lines_as_tokens[source_line_number as usize];
let blame_token = blamed_lines_as_tokens[blamed_line_number as usize];

let source_line = BString::new(source_interner[source_token].into());
let blamed_line = BString::new(blamed_interner[blame_token].into());

assert_eq!(source_line, blamed_line);
}
}
}
}

for (pid, (parent_id, parent_commit_time)) in parent_ids.iter().enumerate() {
if let Some(parent_entry_id) =
find_path_entry_in_commit(&odb, parent_id, file_path, &mut buf, &mut buf2, &mut stats)?
{
Expand All @@ -153,17 +200,19 @@ where
}
if no_change_in_entry {
pass_blame_from_to(suspect, *parent_id, &mut hunks_to_blame);
queue.insert(*parent_commit_time, *parent_id);
continue 'outer;
}
}
}

let more_than_one_parent = parent_ids.len() > 1;
for parent_id in parent_ids {
for (parent_id, parent_commit_time) in parent_ids {
queue.insert(parent_commit_time, parent_id);
let changes_for_file_path = tree_diff_at_file_path(
&odb,
file_path,
commit.id,
suspect,
parent_id,
&mut stats,
&mut diff_state,
Expand Down Expand Up @@ -588,8 +637,82 @@ fn find_path_entry_in_commit(
Ok(res.map(|e| e.oid))
}

/// Return an iterator over tokens for use in diffing. These usually lines, but iit's important to unify them
/// so the later access shows the right thing.
type CommitTime = i64;

fn commit_time(commit: gix_traverse::commit::Either<'_, '_>) -> CommitTime {
match commit {
gix_traverse::commit::Either::CommitRefIter(commit_ref_iter) => {
let mut commit_time = 0;
for token in commit_ref_iter {
use gix_object::commit::ref_iter::Token as T;
match token {
Ok(T::Tree { .. }) => continue,
Ok(T::Parent { .. }) => continue,
Ok(T::Author { .. }) => continue,
Ok(T::Committer { signature }) => {
commit_time = signature.time.seconds;
break;
}
Ok(_unused_token) => break,
Err(_err) => todo!(),
}
}
commit_time
}
gix_traverse::commit::Either::CachedCommit(commit) => commit.committer_timestamp() as i64,
}
}

type ParentIds = SmallVec<[(gix_hash::ObjectId, i64); 2]>;

fn collect_parents(
commit: gix_traverse::commit::Either<'_, '_>,
odb: &impl gix_object::Find,
cache: Option<&gix_commitgraph::Graph>,
) -> ParentIds {
let mut parent_ids: ParentIds = Default::default();

match commit {
gix_traverse::commit::Either::CachedCommit(commit) => {
let cache = cache
.as_ref()
.expect("find returned a cached commit, so we expect cache to be present");
for parent_id in commit.iter_parents() {
match parent_id {
Ok(pos) => {
let parent = cache.commit_at(pos);

parent_ids.push((parent.id().to_owned(), parent.committer_timestamp() as i64));
}
Err(_) => todo!(),
}
}
}
gix_traverse::commit::Either::CommitRefIter(commit_ref_iter) => {
for token in commit_ref_iter {
match token {
Ok(gix_object::commit::ref_iter::Token::Tree { .. }) => continue,
Ok(gix_object::commit::ref_iter::Token::Parent { id }) => {
let mut buf = Vec::new();
let parent = odb.find_commit_iter(id.as_ref(), &mut buf).ok();
let parent_commit_time = parent
.and_then(|parent| parent.committer().ok().map(|committer| committer.time.seconds))
.unwrap_or_default();

parent_ids.push((id, parent_commit_time));
}
Ok(_unused_token) => break,
Err(_err) => todo!(),
}
}
}
};

parent_ids
}

/// Return an iterator over tokens for use in diffing. These are usually lines, but it's important
/// to unify them so the later access shows the right thing.
pub(crate) fn tokens_for_diffing(data: &[u8]) -> impl TokenSource<Token = &[u8]> {
gix_diff::blob::sources::byte_lines_with_terminator(data)
}
Loading
Loading