fix: document retrieval metrics for non-document_id document_relevance_criteria #3885

Merged: 15 commits, Feb 2, 2023
35 changes: 30 additions & 5 deletions haystack/pipelines/base.py
@@ -1413,7 +1413,7 @@ def _reorder_columns_in_eval_result(self, eval_result: EvaluationResult) -> Eval
"multilabel_id", # generic
"query", # generic
"filters", # generic
"gold_answers", # answer-specific
"gold_answers", # generic
"answer", # answer-specific
"context", # generic
"exact_match", # answer-specific
@@ -1690,6 +1690,7 @@ def _build_eval_dataframe(
df_docs.map_rows = partial(df_docs.apply, axis=1)
df_docs.rename(columns={"id": "document_id", "content": "context"}, inplace=True)
df_docs["gold_document_ids"] = [gold_document_ids] * len(df_docs)
df_docs["gold_answers"] = [gold_answers] * len(df_docs)
Member Author:
added for easier analysis, e.g. if document_relevance_criterion="answer"

Contributor:
Just to clarify: this won't be a problem if there are no gold_answers in the eval set? For example, in the case where we use context as the relevance criterion, gold_answers may not be available.

Member Author:
Exactly. I've added tests for this case.
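For readers following the thread: a minimal sketch (not part of the PR) of how the new per-document gold_answers column can be used for analysis, e.g. to check which retrieved documents actually contain a gold answer. The contains_gold_answer helper and the plain substring check are assumptions for illustration, not the library's implementation; the last line also shows that an empty gold_answers list (as in context-only eval sets) simply yields no match instead of an error.

```python
import pandas as pd

# Toy stand-in for the per-document eval dataframe built in _build_eval_dataframe.
df_docs = pd.DataFrame(
    {
        "document_id": ["d1", "d2"],
        "context": [
            "Berlin is the capital of Germany.",
            "Paris is the capital of France.",
        ],
        # The same gold labels are repeated for every retrieved document of a query.
        "gold_answers": [["Berlin"], ["Berlin"]],
    }
)

def contains_gold_answer(row: pd.Series) -> float:
    """Hypothetical helper: 1.0 if any gold answer appears verbatim in the document context."""
    return 1.0 if any(answer in row["context"] for answer in row["gold_answers"]) else 0.0

df_docs["answer_in_context"] = df_docs.apply(contains_gold_answer, axis=1)
print(df_docs[["document_id", "answer_in_context"]])  # d1 -> 1.0, d2 -> 0.0

# Eval sets without gold answers are handled gracefully: no answers, no match.
print(contains_gold_answer(pd.Series({"context": "anything", "gold_answers": []})))  # 0.0
```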

Member Author:
> Thanks @tstadel, this looks good to me! I don't fully understand how the code is structured, but I appreciate that the tests for the metrics look correct now.

I agree, the current code could be better structured. But since no other parts use it besides calculate_metrics, refactoring it here wouldn't bring much value. Let's refactor in another PR, probably connecting it to a feature to support custom metrics.

df_docs["gold_contexts"] = [gold_contexts] * len(df_docs)
df_docs["gold_contexts_similarity"] = df_docs.map_rows(
lambda row: [
@@ -1740,7 +1741,12 @@ def _build_eval_dataframe(

# document_relevance_criterion: "document_id_and_answer",
df_docs["gold_id_and_answer_match"] = df_docs.map_rows(
lambda row: min(row["gold_id_match"], row["answer_match"])
lambda row: max(
min(id_match, answer_match)
for id_match, answer_match in zip(
row["gold_documents_id_match"] + [0.0], row["gold_answers_match"] + [0.0]
)
)
)
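The new lambda no longer compares two scalar per-document columns; it pairs each gold label's id match with the same label's answer match, takes the min per label, and the max across labels. The appended [0.0] only guarantees that max() never sees an empty sequence when a query has no gold labels. A standalone sketch of that pattern with made-up match lists (not the PR's code):

```python
from typing import List

def id_and_answer_match(id_matches: List[float], answer_matches: List[float]) -> float:
    # Per gold label: both the document id and the answer must match (min).
    # Across labels: one fully matching label is enough (max).
    # Appending 0.0 keeps max() from failing on empty lists.
    return max(
        min(id_match, answer_match)
        for id_match, answer_match in zip(id_matches + [0.0], answer_matches + [0.0])
    )

print(id_and_answer_match([1.0, 0.0], [0.0, 1.0]))  # 0.0 – no single label matches on both criteria
print(id_and_answer_match([1.0, 0.0], [1.0, 0.0]))  # 1.0 – the first label matches on both
print(id_and_answer_match([], []))                  # 0.0 – queries without gold labels
```

The same max-over-min pattern is applied to the document_id_and_context and document_id_and_context_and_answer criteria below.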

# document_relevance_criterion: "context",
@@ -1757,17 +1763,36 @@

# document_relevance_criterion: "document_id_and_context",
df_docs["gold_id_and_context_match"] = df_docs.map_rows(
lambda row: min(row["gold_id_match"], row["context_match"])
lambda row: max(
min(id_match, 1.0 if context_similarity > context_matching_threshold else 0.0)
for id_match, context_similarity in zip(
row["gold_documents_id_match"] + [0.0], row["gold_contexts_similarity"] + [0.0]
)
)
)

# document_relevance_criterion: "document_id_and_context_and_answer",
df_docs["gold_id_and_context_and_answer_match"] = df_docs.map_rows(
lambda row: min(row["gold_id_match"], row["context_match"], row["answer_match"])
lambda row: max(
min(id_match, 1.0 if context_similarity > context_matching_threshold else 0.0, answer_match)
for id_match, context_similarity, answer_match in zip(
row["gold_documents_id_match"] + [0.0],
row["gold_contexts_similarity"] + [0.0],
row["gold_answers_match"] + [0.0],
)
)
)

# document_relevance_criterion: "context_and_answer",
df_docs["context_and_answer_match"] = df_docs.map_rows(
lambda row: min(row["context_match"], row["answer_match"])
lambda row: max(
min(1.0 if context_similarity > context_matching_threshold else 0.0, answer_match)
for context_similarity, answer_match in zip(
row["gold_contexts_similarity"], row["gold_answers_match"]
)
)
if any(row["gold_answers_match"]) and any(row["gold_contexts_similarity"])
else 0.0
)
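Unlike the id-based criteria above, context_and_answer_match guards with any(...) instead of appending [0.0]: if the eval set provides no gold answers at all, or no gold context has a non-zero similarity, the match is simply 0.0, which is the behavior the gold_answers discussion above relies on. A small illustrative sketch (the 65.0 default stands in for the context_matching_threshold used in the lambda; all numbers are invented):

```python
from typing import List

def context_and_answer_match(
    contexts_similarity: List[float],
    answers_match: List[float],
    context_matching_threshold: float = 65.0,
) -> float:
    # Without any gold answer match or any non-zero context similarity, the criterion cannot be met.
    if not (any(answers_match) and any(contexts_similarity)):
        return 0.0
    # Per gold label: the context must be similar enough AND the answer must match.
    return max(
        min(1.0 if similarity > context_matching_threshold else 0.0, answer_match)
        for similarity, answer_match in zip(contexts_similarity, answers_match)
    )

print(context_and_answer_match([90.0, 10.0], [1.0, 0.0]))  # 1.0 – the same label matches context and answer
print(context_and_answer_match([90.0, 10.0], [0.0, 1.0]))  # 0.0 – the matches come from different labels
print(context_and_answer_match([], []))                    # 0.0 – eval set without gold answers/contexts
```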

df_docs["rank"] = np.arange(1, len(df_docs) + 1)
113 changes: 90 additions & 23 deletions haystack/schema.py
@@ -1436,36 +1436,103 @@ def _build_document_metrics_df(
if simulated_top_k_retriever != -1:
documents = documents[documents["rank"] <= simulated_top_k_retriever]

# find out which label matched
def find_matched_label_idxs(row) -> List[int]: # pylint: disable=too-many-return-statements
id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]
context_matches = [
idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0
] # TODO: hardcoded threshold for now, will be param of calculate_metrics
answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]
if document_relevance_criterion == "document_id":
return id_matches
elif document_relevance_criterion == "context":
return context_matches
elif document_relevance_criterion == "answer":
return answer_matches
elif document_relevance_criterion == "document_id_and_context":
return list(set(id_matches) & set(context_matches))
elif document_relevance_criterion == "document_id_or_context":
return list(set(id_matches) | set(context_matches))
elif document_relevance_criterion == "document_id_and_answer":
return list(set(id_matches) & set(answer_matches))
elif document_relevance_criterion == "document_id_or_answer":
return list(set(id_matches) | set(answer_matches))
elif document_relevance_criterion == "context_and_answer":
return list(set(context_matches) & set(answer_matches))
elif document_relevance_criterion == "document_id_and_context_and_answer":
return list(set(id_matches) & set(context_matches) & set(answer_matches))
else:
raise ValueError(f"document_relevance_criterion '{document_relevance_criterion}' not supported.")

documents["matched_label_idxs"] = documents.apply(find_matched_label_idxs, axis=1)
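To make the bookkeeping concrete: matched_label_idxs stores, per retrieved document, the indices of the gold labels that this document satisfies under the chosen criterion; combined criteria are intersections or unions of the per-criterion index lists. A short sketch with invented values for one document and three gold labels (65.0 is the hardcoded threshold from above):

```python
# One retrieved document, evaluated against three gold labels (indices 0, 1, 2).
row = {
    "gold_documents_id_match": [1.0, 0.0, 1.0],
    "gold_contexts_similarity": [80.0, 70.0, 20.0],
    "gold_answers_match": [0.0, 1.0, 1.0],
}

id_matches = [idx for idx, val in enumerate(row["gold_documents_id_match"]) if val == 1.0]        # [0, 2]
context_matches = [idx for idx, val in enumerate(row["gold_contexts_similarity"]) if val > 65.0]  # [0, 1]
answer_matches = [idx for idx, val in enumerate(row["gold_answers_match"]) if val == 1.0]         # [1, 2]

# document_relevance_criterion == "document_id_and_context" -> intersection of the index sets
print(sorted(set(id_matches) & set(context_matches)))   # [0]
# document_relevance_criterion == "document_id_or_answer" -> union of the index sets
print(sorted(set(id_matches) | set(answer_matches)))    # [0, 1, 2]
```

These indices are what the recall computation below counts: a gold label is considered matched if any retrieved document lists its index.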

metrics = []

for multilabel_id in documents["multilabel_id"].unique():
query_df = documents[documents["multilabel_id"] == multilabel_id]
gold_ids = list(query_df["gold_document_ids"].iloc[0])
retrieved = len(query_df)

# Note: Metrics are always calculated on document_ids.
# For some document relevance criteria (e.g. context), the gold_document_ids are not enough or not useful at all.
# So, we have to adjust the relevant ids according to the document_relevance_criterion.
relevance_criterion_col = f"{document_relevance_criterion.replace('document_id', 'gold_id')}_match"
relevance_criterion_ids = list(query_df[query_df[relevance_criterion_col] == 1]["document_id"].values)
num_relevants = len(set(gold_ids + relevance_criterion_ids))
num_retrieved_relevants = query_df[relevance_criterion_col].values.sum()
rank_retrieved_relevants = query_df[query_df[relevance_criterion_col] == 1]["rank"].values
avp_retrieved_relevants = [
query_df[relevance_criterion_col].values[: int(rank)].sum() / rank for rank in rank_retrieved_relevants
]

avg_precision = np.sum(avp_retrieved_relevants) / num_relevants if num_relevants > 0 else 0.0
recall_multi_hit = num_retrieved_relevants / num_relevants if num_relevants > 0 else 1.0
recall_single_hit = min(num_retrieved_relevants, 1) if num_relevants > 0 else 1.0
precision = num_retrieved_relevants / retrieved if retrieved > 0 else 0.0
rr = 1.0 / rank_retrieved_relevants.min() if len(rank_retrieved_relevants) > 0 else 0.0
dcg = (
np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
if len(rank_retrieved_relevants) > 0
else 0.0
)
idcg = (
np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)]) if num_relevants > 0 else 1.0
)
ndcg = dcg / idcg
relevant_rows = query_df[query_df[relevance_criterion_col] == 1]

# all labels without no_answers
# we need to match all (except for single hit recall)
gold_document_ids = (
list(query_df["gold_custom_document_ids"].iloc[0])
if "gold_custom_document_ids" in query_df
else list(query_df["gold_document_ids"].iloc[0])
)
# remove no_answer label
gold_document_ids = [id for id in gold_document_ids if id != "00"]

num_labels = len(gold_document_ids)
num_matched_labels = len(set(idx for idxs in relevant_rows["matched_label_idxs"] for idx in idxs))
num_missing_labels = num_labels - num_matched_labels

relevance_criterion_ids = list(relevant_rows["document_id"].values)
num_relevants = len(set(relevance_criterion_ids)) + num_missing_labels

num_retrieved = len(query_df["document_id"])
num_retrieved_relevants = len(relevant_rows)
rank_retrieved_relevants = relevant_rows["rank"].values

if num_labels == 0:
# For no_answer queries, we set all metrics to 1.0, to indicate that the retriever cannot improve the pipeline.
# This behavior is different from pytrec_eval, which sets the metrics to 0.0 if there is no relevant document in the evalset.
rr = 1.0
avg_precision = 1.0
recall_multi_hit = 1.0
recall_single_hit = 1.0
precision = 1.0
ndcg = 1.0
elif num_retrieved_relevants == 0:
# Set all metrics to 0.0 if no relevant document has been retrieved to avoid undefined metrics.
rr = 0.0
avg_precision = 0.0
recall_multi_hit = 0.0
recall_single_hit = 0.0
precision = 0.0
ndcg = 0.0
else:
# The previous checks ensure:
# - `num_labels` > 0
# - `num_retrieved_relevants` > 0
# - `num_relevants` > 0 (`num_relevants` is always >= `num_labels`)
# - `num_retrieved` > 0 (`num_retrieved` is always >= `num_retrieved_relevants`)
# - `len(rank_retrieved_relevants)` > 0 (`len(rank_retrieved_relevants)` is always == `num_retrieved_relevants`)
avp_retrieved_relevants = [
len(relevant_rows[relevant_rows["rank"] <= rank]) / rank for rank in rank_retrieved_relevants
]
avg_precision = np.sum(avp_retrieved_relevants) / num_relevants
recall_multi_hit = num_matched_labels / num_labels
recall_single_hit = 1.0
precision = num_retrieved_relevants / num_retrieved
rr = 1.0 / rank_retrieved_relevants.min()
dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])
idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])
ndcg = dcg / idcg
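A worked example with invented numbers, mirroring the formulas of this else branch: two gold labels, three retrieved documents, and the documents at ranks 1 and 3 satisfy the relevance criterion and together match both labels.

```python
import numpy as np

num_labels = 2                                           # gold labels after dropping no_answer
num_matched_labels = 2                                   # both labels matched by some retrieved doc
num_missing_labels = num_labels - num_matched_labels     # 0
num_relevants = 2 + num_missing_labels                   # 2 distinct relevant document_ids
num_retrieved = 3
rank_retrieved_relevants = np.array([1, 3])
num_retrieved_relevants = len(rank_retrieved_relevants)  # 2

avp = [np.sum(rank_retrieved_relevants <= rank) / rank for rank in rank_retrieved_relevants]
avg_precision = np.sum(avp) / num_relevants              # (1.0 + 2/3) / 2 ≈ 0.83
recall_multi_hit = num_matched_labels / num_labels       # 1.0
recall_single_hit = 1.0
precision = num_retrieved_relevants / num_retrieved      # ≈ 0.67
rr = 1.0 / rank_retrieved_relevants.min()                # 1.0
dcg = np.sum([1.0 / np.log2(rank + 1) for rank in rank_retrieved_relevants])      # 1.5
idcg = np.sum([1.0 / np.log2(rank + 1) for rank in range(1, num_relevants + 1)])  # ≈ 1.63
ndcg = dcg / idcg                                        # ≈ 0.92
```

If one of the two labels had not been matched by any retrieved document, num_missing_labels would be 1, so num_relevants would grow by 1 and recall_multi_hit would drop to 0.5, lowering avg_precision and ndcg as well.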

metrics.append(
{