Skip to content

Commit

Permalink
Remove wrong retriever top_1 metrics from print_eval_report (#2510)
Browse files Browse the repository at this point in the history
* remove wrong retriever top_1 metrics

* Update Documentation & Code Style

* don't show wrong examples frame when n_wrong_examples is 0

* Update Documentation & Code Style

* Update Documentation & Code Style

* only use farm reader during eval tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
tstadel and github-actions[bot] authored May 12, 2022
1 parent 738e008 commit 771ed0b
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 4 deletions.
28 changes: 28 additions & 0 deletions haystack/json-schemas/haystack-pipeline.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,34 @@
}
]
},
{
"allOf": [
{
"properties": {
"version": {
"const": "1.4.0"
}
}
},
{
"$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
}
]
},
{
"allOf": [
{
"properties": {
"version": {
"const": "1.4.0"
}
}
},
{
"$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.4.0.schema.json"
}
]
},
{
"allOf": [
{
Expand Down
10 changes: 8 additions & 2 deletions haystack/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,13 @@ def print_eval_report(
logger.warning("Pipelines with junctions are currently not supported.")
return

answer_nodes = {node for node, df in eval_result.node_results.items() if len(df[df["type"] == "answer"]) > 0}
all_top_1_metrics = eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1)
answer_top_1_metrics = {node: metrics for node, metrics in all_top_1_metrics.items() if node in answer_nodes}

calculated_metrics = {
"": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col),
- "_top_1": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, simulated_top_k_reader=1),
+ "_top_1": answer_top_1_metrics,
" upper bound": eval_result.calculate_metrics(doc_relevance_col=doc_relevance_col, eval_mode="isolated"),
}

Expand Down Expand Up @@ -242,7 +246,9 @@ def _format_wrong_examples_report(eval_result: EvaluationResult, n_wrong_example
node: eval_result.wrong_examples(node, doc_relevance_col="gold_id_or_answer_match", n=n_wrong_examples)
for node in eval_result.node_results.keys()
}
- examples_formatted = {node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items()}
+ examples_formatted = {
+ node: "\n".join(map(_format_wrong_example, examples)) for node, examples in examples.items() if any(examples)
+ }

return "\n".join(map(_format_wrong_examples_node, examples_formatted.keys(), examples_formatted.values()))

Expand Down
16 changes: 14 additions & 2 deletions test/test_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ def test_eval_data_split_passage(document_store):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):
labels = EVAL_LABELS[:1]

Expand Down Expand Up @@ -357,6 +358,7 @@ def test_extractive_qa_eval(reader, retriever_with_docs, tmp_path):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_path):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
Expand Down Expand Up @@ -429,6 +431,7 @@ def test_extractive_qa_eval_multiple_queries(reader, retriever_with_docs, tmp_pa

@pytest.mark.parametrize("retriever_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path):
labels = [
# MultiLabel with filter that selects only the document about Carla
Expand Down Expand Up @@ -498,6 +501,7 @@ def test_extractive_qa_labels_with_filters(reader, retriever_with_docs, tmp_path

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_sas(reader, retriever_with_docs):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(
Expand All @@ -520,6 +524,7 @@ def test_extractive_qa_eval_sas(reader, retriever_with_docs):
assert metrics["Reader"]["sas"] == pytest.approx(1.0)


@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_reader_eval_in_pipeline(reader):
pipeline = Pipeline()
pipeline.add_node(component=reader, name="Reader", inputs=["Query"])
Expand All @@ -537,6 +542,7 @@ def test_reader_eval_in_pipeline(reader):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
Expand All @@ -553,6 +559,7 @@ def test_extractive_qa_eval_doc_relevance_col(reader, retriever_with_docs):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(
Expand Down Expand Up @@ -600,6 +607,7 @@ def test_extractive_qa_eval_simulated_top_k_reader(reader, retriever_with_docs):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_docs):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 5}})
Expand Down Expand Up @@ -651,6 +659,7 @@ def test_extractive_qa_eval_simulated_top_k_retriever(reader, retriever_with_doc

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriever_with_docs):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(labels=EVAL_LABELS, params={"Retriever": {"top_k": 10}})
Expand Down Expand Up @@ -709,6 +718,7 @@ def test_extractive_qa_eval_simulated_top_k_reader_and_retriever(reader, retriev

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_isolated(reader, retriever_with_docs):
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
eval_result: EvaluationResult = pipeline.eval(
Expand Down Expand Up @@ -738,6 +748,7 @@ def test_extractive_qa_eval_isolated(reader, retriever_with_docs):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):

labels = [
Expand Down Expand Up @@ -785,6 +796,7 @@ def test_extractive_qa_eval_wrong_examples(reader, retriever_with_docs):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_print_eval_report(reader, retriever_with_docs):

labels = [
Expand Down Expand Up @@ -885,6 +897,7 @@ def test_faq_calculate_metrics(retriever_with_docs):

@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("document_store_with_docs", ["memory"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_eval_translation(reader, retriever_with_docs):

# FIXME it makes no sense to have DE->EN input and DE->EN output, right?
Expand Down Expand Up @@ -1017,8 +1030,7 @@ def test_qa_multi_retriever_pipeline_eval(document_store_with_docs, reader):


@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True)
- @pytest.mark.parametrize("reader", ["farm"], indirect=True)
- def test_multi_retriever_pipeline_eval(document_store_with_docs, reader):
+ def test_multi_retriever_pipeline_eval(document_store_with_docs):
es_retriever = BM25Retriever(document_store=document_store_with_docs)
dpr_retriever = DensePassageRetriever(document_store_with_docs)
document_store_with_docs.update_embeddings(retriever=dpr_retriever)
Expand Down
1 change: 1 addition & 0 deletions test/test_pipeline_extractive_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_extractive_qa_answers_single_result(reader, retriever_with_docs):

@pytest.mark.slow
@pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True)
@pytest.mark.parametrize("reader", ["farm"], indirect=True)
def test_extractive_qa_answers_with_translator(reader, retriever_with_docs, en_to_de_translator, de_to_en_translator):
base_pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs)
pipeline = TranslationWrapperPipeline(
Expand Down

0 comments on commit 771ed0b

Please sign in to comment.