Skip to content

Commit

Permalink
Removing (deprecation) warnings (#530)
Browse files Browse the repository at this point in the history
1. A few warnings need a fix in FARM
2. The warning from the docx library can't be removed.
  • Loading branch information
lalitpagaria authored Nov 2, 2020
1 parent f541916 commit 5d45992
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion haystack/document_store/elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[s

def update_document_meta(self, id: str, meta: Dict[str, str]):
    """Partially update the metadata of an existing document.

    :param id: Elasticsearch document id of the document to update.
    :param meta: Metadata key/value pairs to merge into the stored document
                 (sent as a partial-document update, not a full replace).
    """
    body = {"doc": meta}
    # NOTE: `doc_type` is deprecated since Elasticsearch 7.x, so it is
    # intentionally omitted from the update call to avoid the warning.
    self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type)

def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
index = index or self.index
Expand Down
8 changes: 5 additions & 3 deletions haystack/preprocessor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,14 @@ def tika_convert_files_to_dicts(
last_para = ''
for para in paras:
para = para.strip()
if not para: continue
if not para:
continue
# merge paragraphs to improve qa
# merge this paragraph if less than 10 characters or 2 words
# or this paragraph starts with a lower case and last paragraph does not end with a punctuation
if merge_short and len(para) < 10 or len(re.findall('\s+', para)) < 2 \
or merge_lowercase and para and para[0].islower() and last_para and last_para[-1] not in '.?!"\'\]\)':
if merge_short and len(para) < 10 or len(re.findall(r'\s+', para)) < 2 \
or merge_lowercase and para and para[0].islower() and last_para \
and last_para[-1] not in r'.?!"\'\]\)':
last_para += ' ' + para
else:
if last_para:
Expand Down
1 change: 1 addition & 0 deletions haystack/reader/farm.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
inputs.append(cur)

# get answers from QA model
# TODO: Need fix in FARM's `to_dict` function of `QAInput` class
predictions = self.inferencer.inference_from_objects(
objects=inputs, return_json=False, multiprocessing_chunksize=1
)
Expand Down
6 changes: 2 additions & 4 deletions haystack/retriever/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,9 @@
from pathlib import Path
from tqdm import tqdm

from farm.infer import Inferencer

from haystack.document_store.base import BaseDocumentStore
from haystack import Document
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.base import BaseRetriever
from haystack.retriever.sparse import logger

from farm.infer import Inferencer
from farm.modeling.tokenization import Tokenizer
Expand Down Expand Up @@ -374,6 +370,8 @@ def embed(self, texts: Union[List[str], str]) -> List[np.array]:
assert type(texts) == list, "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])"

if self.model_format == "farm" or self.model_format == "transformers":
# TODO: FARM's `sample_to_features_text` need to fix following warning -
# tokenization_utils.py:460: FutureWarning: `is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.
emb = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts]) # type: ignore
emb = [(r["vec"]) for r in emb]
elif self.model_format == "sentence_transformers":
Expand Down

0 comments on commit 5d45992

Please sign in to comment.