Fix TableReader for tables without rows (#2369)

* Skip tables without rows
* Update Documentation & Code Style
* Add tests

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
deepset-ai · Mar 30, 2022 · ca98891
1 parent eb514a6
commit ca98891
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 2 deletions.
diff --git a/haystack/nodes/reader/table.py b/haystack/nodes/reader/table.py
@@ -137,10 +137,15 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
         no_answer_score = 1.0
         for document in documents:
             if document.content_type != "table":
-                logger.warning(f"Skipping document with id {document.id} in TableReader, as it is not of type table.")
+                logger.warning(f"Skipping document with id '{document.id}' in TableReader as it is not of type table.")
                 continue
 
             table: pd.DataFrame = document.content
+            if table.shape[0] == 0:
+                logger.warning(
+                    f"Skipping document with id '{document.id}' in TableReader as it does not contain any rows."
+                )
+                continue
             # Tokenize query and current table
             inputs = self.tokenizer(
                 table=table, queries=query, max_length=self.max_seq_len, return_tensors="pt", truncation=True
@@ -525,10 +530,15 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
         answers = []
         for document in documents:
             if document.content_type != "table":
-                logger.warning(f"Skipping document with id {document.id} in RCIReader, as it is not of type table.")
+                logger.warning(f"Skipping document with id '{document.id}' in RCIReader as it is not of type table.")
                 continue
 
             table: pd.DataFrame = document.content
+            if table.shape[0] == 0:
+                logger.warning(
+                    f"Skipping document with id '{document.id}' in RCIReader as it does not contain any rows."
+                )
+                continue
             table = table.astype(str)
             # Create row and column representations
             row_reps, column_reps = self._create_row_column_representations(table)

diff --git a/test/test_table_reader.py b/test/test_table_reader.py
@@ -1,3 +1,5 @@
+import logging
+
 import pandas as pd
 import pytest
 
@@ -60,3 +62,21 @@ def test_table_reader_aggregation(table_reader):
     assert prediction["answers"][0].answer == "43046.0 m"
     assert prediction["answers"][0].meta["aggregation_operator"] == "SUM"
     assert prediction["answers"][0].meta["answer_cells"] == ["8848m", "8,611 m", "8 586m", "8 516 m", "8,485m"]
+
+
+def test_table_without_rows(caplog, table_reader):
+    # empty DataFrame
+    table = pd.DataFrame()
+    document = Document(content=table, content_type="table", id="no_rows")
+    with caplog.at_level(logging.WARNING):
+        predictions = table_reader.predict(query="test", documents=[document])
+        assert "Skipping document with id 'no_rows'" in caplog.text
+        assert len(predictions["answers"]) == 0
+
+
+def test_text_document(caplog, table_reader):
+    document = Document(content="text", id="text_doc")
+    with caplog.at_level(logging.WARNING):
+        predictions = table_reader.predict(query="test", documents=[document])
+        assert "Skipping document with id 'text_doc'" in caplog.text
+        assert len(predictions["answers"]) == 0