Skip to content

Commit

Permalink
Fix TableReader for tables without rows (#2369)
Browse files: browse the repository at this point in the history
* Skip tables without rows

* Update Documentation & Code Style

* Add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
  • Loading branch information
bogdankostic and github-actions[bot] authored Mar 30, 2022
1 parent eb514a6 commit ca98891
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 2 deletions.
14 changes: 12 additions & 2 deletions haystack/nodes/reader/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,15 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
no_answer_score = 1.0
for document in documents:
if document.content_type != "table":
logger.warning(f"Skipping document with id {document.id} in TableReader, as it is not of type table.")
logger.warning(f"Skipping document with id '{document.id}' in TableReader as it is not of type table.")
continue

table: pd.DataFrame = document.content
if table.shape[0] == 0:
logger.warning(
f"Skipping document with id '{document.id}' in TableReader as it does not contain any rows."
)
continue
# Tokenize query and current table
inputs = self.tokenizer(
table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
Expand Down Expand Up @@ -525,10 +530,15 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
answers = []
for document in documents:
if document.content_type != "table":
logger.warning(f"Skipping document with id {document.id} in RCIReader, as it is not of type table.")
logger.warning(f"Skipping document with id '{document.id}' in RCIReader as it is not of type table.")
continue

table: pd.DataFrame = document.content
if table.shape[0] == 0:
logger.warning(
f"Skipping document with id '{document.id}' in RCIReader as it does not contain any rows."
)
continue
table = table.astype(str)
# Create row and column representations
row_reps, column_reps = self._create_row_column_representations(table)
Expand Down
20 changes: 20 additions & 0 deletions test/test_table_reader.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

import pandas as pd
import pytest

Expand Down Expand Up @@ -60,3 +62,21 @@ def test_table_reader_aggregation(table_reader):
assert prediction["answers"][0].answer == "43046.0 m"
assert prediction["answers"][0].meta["aggregation_operator"] == "SUM"
assert prediction["answers"][0].meta["answer_cells"] == ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"]


def test_table_without_rows(caplog, table_reader):
    """Check that a table document with zero rows is skipped.

    The reader is expected to log a warning that names the document id and
    to return no answers for it.
    """
    # A DataFrame built without any data has no rows.
    empty_table = pd.DataFrame()
    rowless_doc = Document(content=empty_table, content_type="table", id="no_rows")
    expected_message = "Skipping document with id 'no_rows'"

    with caplog.at_level(logging.WARNING):
        result = table_reader.predict(query="test", documents=[rowless_doc])
        assert expected_message in caplog.text
        assert len(result["answers"]) == 0


def test_text_document(caplog, table_reader):
    """Check that a document holding plain string content is skipped.

    The document is created without ``content_type="table"``; the reader is
    expected to log a warning that names the document id and to return no
    answers for it.
    """
    plain_doc = Document(content="text", id="text_doc")
    expected_message = "Skipping document with id 'text_doc'"

    with caplog.at_level(logging.WARNING):
        result = table_reader.predict(query="test", documents=[plain_doc])
        assert expected_message in caplog.text
        assert len(result["answers"]) == 0

0 comments on commit ca98891

Please sign in to comment.