From dbf3abdd570d55afa4c50fdb118d5dffcc39f8f2 Mon Sep 17 00:00:00 2001
From: Giulio
Date: Fri, 23 Aug 2024 11:13:52 -0300
Subject: [PATCH] Add pre-commit, fix and format code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml                       | 30 +++++++++++
 data_extraction/__init__.py                   |  6 +++
 data_extraction/interfaces.py                 |  1 +
 data_extraction/text_extraction.py            |  3 +-
 database/__init__.py                          |  6 +++
 database/interfaces.py                        |  2 +-
 database/postgresql.py                        | 16 +++---
 index/__init__.py                             |  5 ++
 index/interfaces.py                           |  4 +-
 index/opensearch.py                           | 33 +++++++++---
 main/__init__.py                              |  7 ++-
 main/__main__.py                              | 24 ++++++---
 requirements-dev.txt                          |  1 +
 segmentation/__init__.py                      |  2 +-
 segmentation/base/__init__.py                 |  4 +-
 segmentation/base/association_segmenter.py    | 10 ++--
 segmentation/base/gazette_segment.py          |  5 +-
 segmentation/factory.py                       |  7 +--
 segmentation/segmenters/__init__.py           |  2 +-
 .../segmenters/al_associacao_municipios.py    | 24 +++++----
 storage/__init__.py                           |  6 +++
 storage/digital_ocean_spaces.py               | 36 +++++++------
 storage/interfaces.py                         |  8 +--
 tasks/__init__.py                             |  1 -
 tasks/create_aggregates_table.py              |  5 +-
 tasks/create_index.py                         | 14 ++---
 tasks/gazette_excerpts_embedding_reranking.py |  1 +
 tasks/gazette_excerpts_entities_tagging.py    |  5 +-
 tasks/gazette_text_extraction.py              |  4 +-
 tasks/gazette_themed_excerpts_extraction.py   | 17 ++++--
 tasks/gazette_txt_to_xml.py                   |  3 +-
 tasks/list_gazettes_to_be_processed.py        |  1 -
 tasks/list_territories.py                     |  2 +-
 tasks/utils/__init__.py                       | 29 +++++++---
 tasks/utils/datetime.py                       |  1 -
 tasks/utils/hash.py                           |  4 +-
 tasks/utils/iter.py                           |  2 +-
 tasks/utils/territories.py                    | 15 ++++--
 tasks/utils/text.py                           |  4 +-
 tests/__init__.py                             | 40 +++++++++-----
 tests/digital_ocean_spaces.py                 | 11 +---
 tests/main_tests.py                           |  5 +-
 tests/opensearch.py                           | 54 +++++++++----------
 tests/postgresql.py                           |  5 +-
 tests/text_extraction_task_tests.py           | 12 ++---
 tests/text_extraction_tests.py                | 13 +++--
 46 files changed, 311 insertions(+), 179 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 requirements-dev.txt

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..87e5f3a
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,30 @@
+repos:
+  - repo: /~https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-case-conflict
+        fail_fast: true
+      - id: check-merge-conflict
+        fail_fast: true
+      - id: debug-statements
+        fail_fast: true
+      - id: detect-aws-credentials
+        fail_fast: true
+        args: ["--allow-missing-credentials"]
+      - id: detect-private-key
+        fail_fast: true
+
+  - repo: /~https://github.com/astral-sh/ruff-pre-commit
+    rev: 'v0.6.2'
+    hooks:
+      - id: ruff
+        fail_fast: true
+        args: ["--fix", "--ignore", "E501", "--select", "I"]
+      - id: ruff-format
+
+  - repo: /~https://github.com/Lucas-C/pre-commit-hooks-safety
+    rev: v1.3.3
+    hooks:
+      - id: python-safety-dependencies-check
+        fail_fast: true
+        files: requirements\/[a-z]+\.txt
diff --git a/data_extraction/__init__.py b/data_extraction/__init__.py
index 96c887b..6cb6d07 100644
--- a/data_extraction/__init__.py
+++ b/data_extraction/__init__.py
@@ -1,2 +1,8 @@
 from .interfaces import TextExtractorInterface
 from .text_extraction import ApacheTikaTextExtractor, create_apache_tika_text_extraction
+
+__all__ = [
+    "ApacheTikaTextExtractor",
+    "create_apache_tika_text_extraction",
+    "TextExtractorInterface",
+]
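
The __all__ blocks this patch adds (here and in the other package __init__ files below) pin each package's public surface and mark the re-exports as intentional — the usual way to keep linters from flagging them as unused imports. A minimal sketch of what that buys a consumer, assuming a configured Tika endpoint (the factory's requirements sit outside this diff):

    # Only the names listed in __all__ are pulled in by a star-import:
    from data_extraction import *

    extractor = create_apache_tika_text_extraction()  # hypothetical usage
    assert isinstance(extractor, TextExtractorInterface)
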
diff --git a/data_extraction/interfaces.py b/data_extraction/interfaces.py
index b708f4b..6563571 100644
--- a/data_extraction/interfaces.py
+++ b/data_extraction/interfaces.py
@@ -1,5 +1,6 @@
 import abc
 
+
 class TextExtractorInterface(abc.ABC):
     @abc.abstractmethod
     def extract_text(self, filepath: str) -> str:
diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py
index 4de0f1b..8595aaf 100644
--- a/data_extraction/text_extraction.py
+++ b/data_extraction/text_extraction.py
@@ -1,8 +1,7 @@
 import logging
-import magic
 import os
-import subprocess
 
+import magic
 import requests
 
 from .interfaces import TextExtractorInterface
diff --git a/database/__init__.py b/database/__init__.py
index b6da446..22d2af8 100644
--- a/database/__init__.py
+++ b/database/__init__.py
@@ -1,2 +1,8 @@
 from .interfaces import DatabaseInterface
 from .postgresql import PostgreSQL, create_database_interface
+
+__all__ = [
+    "create_database_interface",
+    "DatabaseInterface",
+    "PostgreSQL",
+]
diff --git a/database/interfaces.py b/database/interfaces.py
index d555d27..fe082b4 100644
--- a/database/interfaces.py
+++ b/database/interfaces.py
@@ -1,5 +1,5 @@
-from typing import Dict, Iterable, Tuple
 import abc
+from typing import Dict, Iterable, Tuple
 
 
 class DatabaseInterface(abc.ABC):
diff --git a/database/postgresql.py b/database/postgresql.py
index dd34674..541b5e0 100644
--- a/database/postgresql.py
+++ b/database/postgresql.py
@@ -1,6 +1,6 @@
-from typing import Dict, Iterable, Tuple
-import os
 import logging
+import os
+from typing import Dict, Iterable, Tuple
 
 import psycopg2
 
@@ -59,16 +59,16 @@ def select(self, command: str) -> Iterable[Tuple]:
         logging.debug(f"Finished query: {cursor.query}")
 
     def insert(self, command: str, data: Dict = {}):
-        logging.debug(f"Inserting:")
+        logging.debug("Inserting:")
         self._commit_changes(command, data)
-        logging.debug(f"Finished inserting")
+        logging.debug("Finished inserting")
 
     def update(self, command: str, data: Dict = {}):
-        logging.debug(f"Updating:")
+        logging.debug("Updating:")
         self._commit_changes(command, data)
-        logging.debug(f"Finished updating")
+        logging.debug("Finished updating")
 
     def delete(self, command: str, data: Dict = {}):
-        logging.debug(f"Deleting:")
+        logging.debug("Deleting:")
         self._commit_changes(command, data)
-        logging.debug(f"Finished deleting")
+        logging.debug("Finished deleting")
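
The f-strings dropped above had no placeholders, so the f prefix was pure noise (linters flag this as F541). Where a message does interpolate, logging's lazy %-style keeps the formatting cost behind the level check — a small illustration, not from the patch:

    import logging

    logging.debug("Inserting:")  # constant message: plain string literal
    logging.debug("Finished query: %s", "SELECT 1")  # rendered only if DEBUG is enabled
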
diff --git a/index/__init__.py b/index/__init__.py
index 78d0d60..482c40a 100644
--- a/index/__init__.py
+++ b/index/__init__.py
@@ -1,2 +1,7 @@
 from .interfaces import IndexInterface
 from .opensearch import create_index_interface
+
+__all__ = [
+    "create_index_interface",
+    "IndexInterface",
+]
diff --git a/index/interfaces.py b/index/interfaces.py
index 41a80cd..59c69ad 100644
--- a/index/interfaces.py
+++ b/index/interfaces.py
@@ -1,5 +1,5 @@
-from typing import Dict, Iterable
 import abc
+from typing import Dict, Iterable
 
 
 class IndexInterface(abc.ABC):
@@ -40,5 +40,3 @@ def paginated_search(
         """
         Searches the index with the provided query, with pagination
         """
-
-
diff --git a/index/opensearch.py b/index/opensearch.py
index 08fed97..33cfdb7 100644
--- a/index/opensearch.py
+++ b/index/opensearch.py
@@ -1,5 +1,5 @@
-from typing import Dict, Iterable, List, Union
 import os
+from typing import Dict, Iterable, List, Union
 
 import opensearchpy
 
@@ -7,8 +7,17 @@ class OpenSearchInterface(IndexInterface):
-    def __init__(self, hosts: List, user: str, password: str, timeout: int = 30, default_index: str = ""):
-        self._search_engine = opensearchpy.OpenSearch(hosts=hosts, http_auth=(user, password))
+    def __init__(
+        self,
+        hosts: List,
+        user: str,
+        password: str,
+        timeout: int = 30,
+        default_index: str = "",
+    ):
+        self._search_engine = opensearchpy.OpenSearch(
+            hosts=hosts, http_auth=(user, password)
+        )
         self._timeout = timeout
         self._default_index = default_index
@@ -51,7 +60,9 @@ def index_document(
         refresh: bool = False,
     ) -> None:
         index = self.get_index_name(index)
-        self._search_engine.index(index=index, body=document, id=document_id, refresh=refresh)
+        self._search_engine.index(
+            index=index, body=document, id=document_id, refresh=refresh
+        )
 
     def search(self, query: Dict, index: str = "") -> Dict:
         index = self.get_index_name(index)
@@ -60,7 +71,9 @@ def search(self, query: Dict, index: str = "") -> Dict:
 
     def analyze(self, text: str, field: str, index: str = "") -> Dict:
         index = self.get_index_name(index)
-        result = self._search_engine.indices.analyze(body={"text": text, "field":field}, index=index)
+        result = self._search_engine.indices.analyze(
+            body={"text": text, "field": field}, index=index
+        )
         return result
 
     def paginated_search(
@@ -96,12 +109,15 @@ def get_opensearch_host():
 
 def get_opensearch_index():
     return os.environ["OPENSEARCH_INDEX"]
 
+
 def get_opensearch_user():
     return os.environ["OPENSEARCH_USER"]
 
+
 def get_opensearch_password():
     return os.environ["OPENSEARCH_PASSWORD"]
 
+
 def create_index_interface() -> IndexInterface:
     hosts = get_opensearch_host()
     if not isinstance(hosts, str) or len(hosts) == 0:
@@ -109,4 +125,9 @@ def create_index_interface() -> IndexInterface:
     default_index_name = get_opensearch_index()
     if not isinstance(default_index_name, str) or len(default_index_name) == 0:
         raise Exception("Invalid index name")
-    return OpenSearchInterface([hosts], get_opensearch_user(), get_opensearch_password(), default_index=default_index_name)
+    return OpenSearchInterface(
+        [hosts],
+        get_opensearch_user(),
+        get_opensearch_password(),
+        default_index=default_index_name,
+    )
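
For context, create_index_interface() above is driven entirely by environment variables. OPENSEARCH_INDEX, OPENSEARCH_USER and OPENSEARCH_PASSWORD are read in the hunk; the host variable's name is an assumption, since get_opensearch_host()'s body is outside the diff:

    import os

    os.environ["OPENSEARCH_HOST"] = "https://localhost:9200"  # name assumed
    os.environ["OPENSEARCH_INDEX"] = "querido-diario"
    os.environ["OPENSEARCH_USER"] = "admin"
    os.environ["OPENSEARCH_PASSWORD"] = "admin"

    from index import create_index_interface

    index = create_index_interface()  # raises on an empty host or index name
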
diff --git a/main/__init__.py b/main/__init__.py
index 0285851..45eb901 100644
--- a/main/__init__.py
+++ b/main/__init__.py
@@ -1,4 +1,9 @@
 from .__main__ import (
-    is_debug_enabled,
     enable_debug_if_necessary,
+    is_debug_enabled,
 )
+
+__all__ = [
+    "is_debug_enabled",
+    "enable_debug_if_necessary",
+]
diff --git a/main/__main__.py b/main/__main__.py
index cd1da3f..5aa3664 100644
--- a/main/__main__.py
+++ b/main/__main__.py
@@ -1,11 +1,11 @@
-from os import environ
 import argparse
 import logging
+from os import environ
 
 from data_extraction import create_apache_tika_text_extraction
 from database import create_database_interface
-from storage import create_storage_interface
 from index import create_index_interface
+from storage import create_storage_interface
 from tasks import run_task
 
 
@@ -37,12 +37,24 @@ def gazette_texts_pipeline():
     run_task("create_gazettes_index", index)
     territories = run_task("get_territories", database)
-    gazettes_to_be_processed = run_task("get_gazettes_to_be_processed", execution_mode, database)
-    indexed_gazette_ids = run_task("extract_text_from_gazettes", gazettes_to_be_processed, territories, database, storage, index, text_extractor)
-
+    gazettes_to_be_processed = run_task(
+        "get_gazettes_to_be_processed", execution_mode, database
+    )
+    indexed_gazette_ids = run_task(
+        "extract_text_from_gazettes",
+        gazettes_to_be_processed,
+        territories,
+        database,
+        storage,
+        index,
+        text_extractor,
+    )
+
     for theme in themes:
         run_task("create_themed_excerpts_index", theme, index)
-        themed_excerpt_ids = run_task("extract_themed_excerpts_from_gazettes", theme, indexed_gazette_ids, index)
+        themed_excerpt_ids = run_task(
+            "extract_themed_excerpts_from_gazettes", theme, indexed_gazette_ids, index
+        )
         run_task("embedding_rerank_excerpts", theme, themed_excerpt_ids, index)
         run_task("tag_entities_in_excerpts", theme, themed_excerpt_ids, index)
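
main/__init__.py now re-exports enable_debug_if_necessary and is_debug_enabled. Their bodies are not part of this diff, so the following is only a plausible sketch of the contract they expose, with the DEBUG variable name assumed:

    import logging
    import os

    def is_debug_enabled() -> bool:
        return os.environ.get("DEBUG", "0") == "1"  # env var name is a guess

    def enable_debug_if_necessary():
        if is_debug_enabled():
            logging.getLogger().setLevel(logging.DEBUG)
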
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..959c060
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1 @@
+pre-commit==3.5.0
diff --git a/segmentation/__init__.py b/segmentation/__init__.py
index ce1ea7c..7bdf164 100644
--- a/segmentation/__init__.py
+++ b/segmentation/__init__.py
@@ -2,4 +2,4 @@
 
 __all__ = [
     "get_segmenter",
-]
\ No newline at end of file
+]
diff --git a/segmentation/base/__init__.py b/segmentation/base/__init__.py
index bdbbd2b..4f5314b 100644
--- a/segmentation/base/__init__.py
+++ b/segmentation/base/__init__.py
@@ -1,7 +1,7 @@
-from .gazette_segment import GazetteSegment
 from .association_segmenter import AssociationSegmenter
+from .gazette_segment import GazetteSegment
 
 __all__ = [
     "GazetteSegment",
     "AssociationSegmenter",
-]
\ No newline at end of file
+]
diff --git a/segmentation/base/association_segmenter.py b/segmentation/base/association_segmenter.py
index 0d777be..9d65ef1 100644
--- a/segmentation/base/association_segmenter.py
+++ b/segmentation/base/association_segmenter.py
@@ -1,4 +1,5 @@
 from typing import Any, Dict, Iterable, List, Union
+
 from segmentation.base import GazetteSegment
 
 
@@ -6,13 +7,17 @@ class AssociationSegmenter:
     def __init__(self, territories: Iterable[Dict[str, Any]]):
         self.territories = territories
 
-    def get_gazette_segments(self, *args, **kwargs) -> List[Union[GazetteSegment, Dict]]:
+    def get_gazette_segments(
+        self, *args, **kwargs
+    ) -> List[Union[GazetteSegment, Dict]]:
         """
         Returns a list of GazetteSegment
        """
         raise NotImplementedError
 
-    def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
+    def split_text_by_territory(
+        self, *args, **kwargs
+    ) -> Union[Dict[str, str], List[str]]:
         """
         Segment a association text by territory
         and returns a list of text segments
@@ -24,4 +29,3 @@ def build_segment(self, *args, **kwargs) -> GazetteSegment:
         Returns a GazetteSegment
         """
         raise NotImplementedError
-
diff --git a/segmentation/base/gazette_segment.py b/segmentation/base/gazette_segment.py
index aef8fb1..b31d05c 100644
--- a/segmentation/base/gazette_segment.py
+++ b/segmentation/base/gazette_segment.py
@@ -1,5 +1,5 @@
-from datetime import date, datetime
 from dataclasses import dataclass
+from datetime import date, datetime
 
 
 @dataclass
@@ -8,6 +8,7 @@ class GazetteSegment:
     Dataclass to represent a gazette segment of a association
     related to a city
     """
+
     id: str
     territory_name: str
     source_text: str
@@ -24,4 +25,4 @@ class GazetteSegment:
     state_code: str
     territory_id: str
     file_raw_txt: str
-    url: str
\ No newline at end of file
+    url: str
diff --git a/segmentation/factory.py b/segmentation/factory.py
index 65c693e..0e57284 100644
--- a/segmentation/factory.py
+++ b/segmentation/factory.py
@@ -1,13 +1,14 @@
 from typing import Any, Dict, Iterable
 
-from segmentation.base import AssociationSegmenter
 from segmentation import segmenters
-
+from segmentation.base import AssociationSegmenter
 
 _segmenter_instances = {}
 
 
-def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> AssociationSegmenter:
+def get_segmenter(
+    territory_id: str, territories: Iterable[Dict[str, Any]]
+) -> AssociationSegmenter:
     """
     Factory method to return a AssociationSegmenter
diff --git a/segmentation/segmenters/__init__.py b/segmentation/segmenters/__init__.py
index 39de174..d4db466 100644
--- a/segmentation/segmenters/__init__.py
+++ b/segmentation/segmenters/__init__.py
@@ -2,4 +2,4 @@
 
 __all__ = [
     "ALAssociacaoMunicipiosSegmenter",
-]
\ No newline at end of file
+]
diff --git a/segmentation/segmenters/al_associacao_municipios.py b/segmentation/segmenters/al_associacao_municipios.py
index 7485e51..9ec6990 100644
--- a/segmentation/segmenters/al_associacao_municipios.py
+++ b/segmentation/segmenters/al_associacao_municipios.py
@@ -1,7 +1,7 @@
-import re
 import logging
-
+import re
 from typing import Any, Dict, List
+
 from segmentation.base import AssociationSegmenter, GazetteSegment
 from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug
 
@@ -64,15 +64,17 @@ def build_segment(
         )
         territory_data = get_territory_data(territory_slug, self.territories)
 
-        return GazetteSegment(**{
-            **gazette,
-            # segment specific values
-            "processed": True,
-            "file_checksum": get_checksum(segment_text),
-            "source_text": segment_text.strip(),
-            "territory_name": territory_data["territory_name"],
-            "territory_id": territory_data["id"],
-        })
+        return GazetteSegment(
+            **{
+                **gazette,
+                # segment specific values
+                "processed": True,
+                "file_checksum": get_checksum(segment_text),
+                "source_text": segment_text.strip(),
+                "territory_name": territory_data["territory_name"],
+                "territory_id": territory_data["id"],
+            }
+        )
 
     def _normalize_territory_name(self, territory_name: str) -> str:
         clean_name = territory_name.strip().replace("\n", "")
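
The reformatted build_segment above leans on dict unpacking: start from the parent gazette's fields and let the later, segment-specific keys win the merge. A self-contained toy version of the pattern (field set reduced for brevity):

    from dataclasses import dataclass

    @dataclass
    class Segment:
        territory_id: str
        source_text: str
        processed: bool

    gazette = {"territory_id": "2700000", "source_text": "full text", "processed": False}
    # later keys win, so the overrides replace the inherited values:
    segment = Segment(**{**gazette, "processed": True, "source_text": "one slice"})
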
diff --git a/storage/__init__.py b/storage/__init__.py
index 8d9c47c..9603c9f 100644
--- a/storage/__init__.py
+++ b/storage/__init__.py
@@ -1,2 +1,8 @@
 from .digital_ocean_spaces import DigitalOceanSpaces, create_storage_interface
 from .interfaces import StorageInterface
+
+__all__ = [
+    "create_storage_interface",
+    "DigitalOceanSpaces",
+    "StorageInterface",
+]
diff --git a/storage/digital_ocean_spaces.py b/storage/digital_ocean_spaces.py
index 99b738a..e1feab2 100644
--- a/storage/digital_ocean_spaces.py
+++ b/storage/digital_ocean_spaces.py
@@ -1,8 +1,8 @@
 import logging
 import os
-from typing import Union
 from io import BytesIO
 from pathlib import Path
+from typing import Union
 
 import boto3
 
@@ -71,7 +71,9 @@ def __init__(
 
     def get_file(self, file_to_be_downloaded: Union[str, Path], destination) -> None:
         logging.debug(f"Getting {file_to_be_downloaded}")
-        self._client.download_fileobj(self._bucket, str(file_to_be_downloaded), destination)
+        self._client.download_fileobj(
+            self._bucket, str(file_to_be_downloaded), destination
+        )
 
     def upload_content(
         self,
@@ -88,7 +90,10 @@ def upload_content(
             )
         else:
             self._client.upload_fileobj(
-                content_to_be_uploaded, self._bucket, file_key, ExtraArgs={"ACL": permission}
+                content_to_be_uploaded,
+                self._bucket,
+                file_key,
+                ExtraArgs={"ACL": permission},
             )
 
     def upload_file(
@@ -111,13 +116,15 @@ def upload_file_multipart(
     ) -> None:
         logging.debug(f"Uploading {file_key} with multipart")
 
-        multipart_upload = self._client.create_multipart_upload(Bucket=self._bucket, Key=file_key, ACL=permission)
-        upload_id = multipart_upload['UploadId']
+        multipart_upload = self._client.create_multipart_upload(
+            Bucket=self._bucket, Key=file_key, ACL=permission
+        )
+        upload_id = multipart_upload["UploadId"]
 
         parts = []
 
         try:
-            with open(file_path, 'rb') as file:
+            with open(file_path, "rb") as file:
                 part_number = 1
                 while True:
                     data = file.read(part_size)
@@ -129,25 +136,24 @@ def upload_file_multipart(
                         Key=file_key,
                         PartNumber=part_number,
                         UploadId=upload_id,
-                        Body=data
+                        Body=data,
                     )
-                    parts.append({
-                        'PartNumber': part_number,
-                        'ETag': response['ETag']
-                    })
+                    parts.append({"PartNumber": part_number, "ETag": response["ETag"]})
                     part_number += 1
 
             self._client.complete_multipart_upload(
                 Bucket=self._bucket,
                 Key=file_key,
                 UploadId=upload_id,
-                MultipartUpload={'Parts': parts}
+                MultipartUpload={"Parts": parts},
             )
         except Exception as e:
             logging.debug(f"Aborted uploading {file_key} with multipart")
-            self._client.abort_multipart_upload(Bucket=self._bucket, Key=file_key, UploadId=upload_id)
+            self._client.abort_multipart_upload(
+                Bucket=self._bucket, Key=file_key, UploadId=upload_id
+            )
             raise e
         else:
             logging.debug(f"Finished uploading {file_key} with multipart")
@@ -156,8 +162,8 @@ def copy_file(self, source_file_key: str, destination_file_key: str) -> None:
         logging.debug(f"Copying {source_file_key} to {destination_file_key}")
         self._client.copy_object(
             Bucket=self._bucket,
-            CopySource={'Bucket': self._bucket, 'Key': source_file_key},
-            Key=destination_file_key
+            CopySource={"Bucket": self._bucket, "Key": source_file_key},
+            Key=destination_file_key,
         )
 
     def delete_file(self, file_key: str) -> None:
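
How a caller might drive the multipart path above — illustrative only: the key, local path and part size are invented, and the default permission value is not visible in the hunk:

    from storage import create_storage_interface

    storage = create_storage_interface()  # needs the usual Spaces credentials
    storage.upload_file_multipart(
        file_key="aggregates/al/2024.zip",  # hypothetical object key
        file_path="/tmp/2024.zip",          # hypothetical local file
        permission="public-read",
        part_size=100 * 1024 * 1024,        # 100 MiB parts, assumed
    )
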
["source_territory_id", "source_date"], - "sort.order": ["asc", "desc"] + "sort.field": ["source_territory_id", "source_date"], + "sort.order": ["asc", "desc"], }, "analysis": { "filter": { @@ -147,7 +147,7 @@ def create_themed_excerpts_index(theme: Dict, index: IndexInterface) -> None: "filter": ["lowercase"], }, }, - } + }, }, } index.create_index(index_name=theme["index"], body=body) diff --git a/tasks/gazette_excerpts_embedding_reranking.py b/tasks/gazette_excerpts_embedding_reranking.py index 3ffc40a..a2f282b 100644 --- a/tasks/gazette_excerpts_embedding_reranking.py +++ b/tasks/gazette_excerpts_embedding_reranking.py @@ -4,6 +4,7 @@ import sentence_transformers from index import IndexInterface + from .utils import get_documents_with_ids diff --git a/tasks/gazette_excerpts_entities_tagging.py b/tasks/gazette_excerpts_entities_tagging.py index c63522e..fb14056 100644 --- a/tasks/gazette_excerpts_entities_tagging.py +++ b/tasks/gazette_excerpts_entities_tagging.py @@ -2,6 +2,7 @@ from typing import Dict, List from index import IndexInterface + from .utils import ( get_documents_from_query_with_highlights, get_documents_with_ids, @@ -24,9 +25,7 @@ def tag_theme_cases(theme: Dict, excerpt_ids: List[str], index: IndexInterface) ) for document in documents: excerpt = document["_source"] - highlight = document["highlight"][ - "excerpt.with_stopwords" - ][0] + highlight = document["highlight"]["excerpt.with_stopwords"][0] excerpt.update( { "excerpt_entities": list( diff --git a/tasks/gazette_text_extraction.py b/tasks/gazette_text_extraction.py index 18c0c3a..61db32d 100644 --- a/tasks/gazette_text_extraction.py +++ b/tasks/gazette_text_extraction.py @@ -1,13 +1,13 @@ import logging -import tempfile import os +import tempfile from pathlib import Path from typing import Any, Dict, Iterable, List, Union -from segmentation import get_segmenter from data_extraction import TextExtractorInterface from database import DatabaseInterface from index import IndexInterface +from segmentation import get_segmenter from storage import StorageInterface diff --git a/tasks/gazette_themed_excerpts_extraction.py b/tasks/gazette_themed_excerpts_extraction.py index 1b05882..336dd7f 100644 --- a/tasks/gazette_themed_excerpts_extraction.py +++ b/tasks/gazette_themed_excerpts_extraction.py @@ -2,7 +2,12 @@ from typing import Dict, Iterable, List from index import IndexInterface -from .utils import batched, clean_extra_whitespaces, get_documents_from_query_with_highlights + +from .utils import ( + batched, + clean_extra_whitespaces, + get_documents_from_query_with_highlights, +) def extract_themed_excerpts_from_gazettes( @@ -14,7 +19,7 @@ def extract_themed_excerpts_from_gazettes( for excerpt in get_excerpts_from_gazettes_with_themed_query( theme_query, batch, index ): - # excerpts with less than 10% of the expected size of excerpt account for + # excerpts with less than 10% of the expected size of excerpt account for # fewer than 1% of excerpts yet their score is usually high if len(excerpt["excerpt"]) < 200: continue @@ -99,9 +104,13 @@ def get_es_query_from_themed_query( phrase_block = { "span_near": {"clauses": [], "slop": 0, "in_order": True} } - tokenized_term = index.analyze(text=term, field="source_text.with_stopwords") + tokenized_term = index.analyze( + text=term, field="source_text.with_stopwords" + ) for token in tokenized_term["tokens"]: - word_block = {"span_term": {"source_text.with_stopwords": token["token"]}} + word_block = { + "span_term": {"source_text.with_stopwords": token["token"]} + } 
phrase_block["span_near"]["clauses"].append(word_block) synonym_block["span_or"]["clauses"].append(phrase_block) proximity_block["span_near"]["clauses"].append(synonym_block) diff --git a/tasks/gazette_txt_to_xml.py b/tasks/gazette_txt_to_xml.py index 68491c9..474f9e7 100644 --- a/tasks/gazette_txt_to_xml.py +++ b/tasks/gazette_txt_to_xml.py @@ -5,7 +5,7 @@ from datetime import datetime from io import BytesIO from pathlib import Path -from tempfile import mkstemp, TemporaryDirectory, NamedTemporaryFile +from tempfile import NamedTemporaryFile, TemporaryDirectory, mkstemp from typing import Iterable from zipfile import ZIP_DEFLATED, ZipFile @@ -13,6 +13,7 @@ from database import DatabaseInterface from storage import StorageInterface + from .utils import br_timezone, get_territory_slug, hash_file logger = logging.getLogger(__name__) diff --git a/tasks/list_gazettes_to_be_processed.py b/tasks/list_gazettes_to_be_processed.py index 0dd0984..32285d2 100644 --- a/tasks/list_gazettes_to_be_processed.py +++ b/tasks/list_gazettes_to_be_processed.py @@ -7,7 +7,6 @@ def get_gazettes_to_be_processed( execution_mode: str, database: DatabaseInterface ) -> Iterable[Dict]: - if execution_mode == "DAILY": yield from get_gazettes_extracted_since_yesterday(database) elif execution_mode == "ALL": diff --git a/tasks/list_territories.py b/tasks/list_territories.py index 55f4c32..2d52084 100644 --- a/tasks/list_territories.py +++ b/tasks/list_territories.py @@ -1,4 +1,4 @@ -from functools import lru_cache +from functools import lru_cache from typing import Dict, Iterable from database import DatabaseInterface diff --git a/tasks/utils/__init__.py b/tasks/utils/__init__.py index 882450b..be80e87 100644 --- a/tasks/utils/__init__.py +++ b/tasks/utils/__init__.py @@ -1,4 +1,8 @@ from .datetime import br_timezone +from .hash import ( + hash_content, + hash_file, +) from .index import ( get_documents_from_query_with_highlights, get_documents_with_ids, @@ -6,15 +10,24 @@ from .iter import ( batched, ) -from .text import ( - clean_extra_whitespaces, - get_checksum, -) from .territories import ( - get_territory_slug, get_territory_data, + get_territory_slug, ) -from .hash import ( - hash_content, - hash_file, +from .text import ( + clean_extra_whitespaces, + get_checksum, ) + +__all__ = [ + "batched", + "br_timezone", + "clean_extra_whitespaces", + "get_checksum", + "get_documents_from_query_with_highlights", + "get_documents_with_ids", + "get_territory_data", + "get_territory_slug", + "hash_content", + "hash_file", +] diff --git a/tasks/utils/datetime.py b/tasks/utils/datetime.py index e0028c9..f06c8ac 100644 --- a/tasks/utils/datetime.py +++ b/tasks/utils/datetime.py @@ -1,4 +1,3 @@ from datetime import timedelta, timezone - br_timezone = timezone(timedelta(hours=-3)) diff --git a/tasks/utils/hash.py b/tasks/utils/hash.py index 9e0a139..f69ab43 100644 --- a/tasks/utils/hash.py +++ b/tasks/utils/hash.py @@ -19,7 +19,7 @@ def hash_file(file) -> str: chunk_size = 128 * hash.block_size if isinstance(file, str): - with open(file, 'rb') as f: + with open(file, "rb") as f: _chunk_hashing(hash, chunk_size, f) else: file.seek(0) @@ -30,5 +30,5 @@ def hash_file(file) -> str: def _chunk_hashing(hash, chunk_size, file): - for chunk in iter(lambda: file.read(chunk_size), b''): + for chunk in iter(lambda: file.read(chunk_size), b""): hash.update(chunk) diff --git a/tasks/utils/iter.py b/tasks/utils/iter.py index 034715a..d456cca 100644 --- a/tasks/utils/iter.py +++ b/tasks/utils/iter.py @@ -5,7 +5,7 @@ def batched(iterable, n): # 
diff --git a/tasks/utils/iter.py b/tasks/utils/iter.py
index 034715a..d456cca 100644
--- a/tasks/utils/iter.py
+++ b/tasks/utils/iter.py
@@ -5,7 +5,7 @@ def batched(iterable, n):
     # batched('ABCDEFG', 3) --> ABC DEF G
     # can be removed when using Python 3.12, in favor of itertools.batched
     if n < 1:
-        raise ValueError('n must be at least one')
+        raise ValueError("n must be at least one")
     it = iter(iterable)
     while batch := tuple(islice(it, n)):
         yield batch
diff --git a/tasks/utils/territories.py b/tasks/utils/territories.py
index e77235c..846adbf 100644
--- a/tasks/utils/territories.py
+++ b/tasks/utils/territories.py
@@ -2,7 +2,6 @@
 
 from slugify import slugify
 
-
 _territory_slug_to_data_map = {}
 
 
@@ -10,10 +9,14 @@ def get_territory_slug(name: str, state_code: str) -> str:
     full_name = f"{state_code} {name}"
     stopwords = ["de", "d", "da", "do", "das", "dos"]
     replacements = [("´", "'"), ("`", "'")]
-    return slugify(full_name, separator="", stopwords=stopwords, replacements=replacements)
+    return slugify(
+        full_name, separator="", stopwords=stopwords, replacements=replacements
+    )
 
 
-def get_territory_data(identifier: Union[str, Tuple[str, str]], territories: Iterable[Dict[str, Any]]) -> Dict[str, Dict]:
+def get_territory_data(
+    identifier: Union[str, Tuple[str, str]], territories: Iterable[Dict[str, Any]]
+) -> Dict[str, Dict]:
     if isinstance(identifier, tuple):
         territory_name, state_code = identifier
         territory_slug = get_territory_slug(territory_name, state_code)
@@ -25,12 +28,14 @@ def get_territory_data(identifier: Union[str, Tuple[str, str]], territories: Ite
     slug_to_data = get_territory_slug_to_data_map(territories)
 
     if territory_slug not in slug_to_data:
-        raise KeyError(f"Couldn't find info for \"{territory_slug}\"")
+        raise KeyError(f'Couldn\'t find info for "{territory_slug}"')
 
     return slug_to_data[territory_slug]
 
 
-def get_territory_slug_to_data_map(territories: Iterable[Dict[str, Any]]) -> Dict[str, Dict]:
+def get_territory_slug_to_data_map(
+    territories: Iterable[Dict[str, Any]],
+) -> Dict[str, Dict]:
     global _territory_slug_to_data_map
     if not _territory_slug_to_data_map:
         territory_to_data = {
diff --git a/tasks/utils/text.py b/tasks/utils/text.py
index 7d40bca..41fc4c0 100644
--- a/tasks/utils/text.py
+++ b/tasks/utils/text.py
@@ -1,5 +1,5 @@
-import re
 import hashlib
+import re
 from io import BytesIO
 
 
@@ -25,4 +25,4 @@ def get_checksum(source_text: str) -> str:
         if not d:
             break
         m.update(d)
-    return m.hexdigest()
\ No newline at end of file
+    return m.hexdigest()
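
Circling back to the batched() helper in tasks/utils/iter.py above: as its comment says, it is a stand-in for Python 3.12's itertools.batched, yielding tuples of at most n items:

    from tasks.utils import batched

    assert list(batched("ABCDEFG", 3)) == [
        ("A", "B", "C"),
        ("D", "E", "F"),
        ("G",),
    ]
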
"IndexInterfaceFactoryFunctionTests", + "MainModuleTests", + "OpensearchBasicTests", + "OpensearchIntegrationTests", + "PostgreSQLConnectionTests", + "PostgreSQLTests", + "StorageInterfaceCreationTests", + "TextExtractionTaskTests", + "unittest", +] diff --git a/tests/digital_ocean_spaces.py b/tests/digital_ocean_spaces.py index 31ad4f4..62befd8 100644 --- a/tests/digital_ocean_spaces.py +++ b/tests/digital_ocean_spaces.py @@ -1,13 +1,7 @@ -import datetime -import hashlib import tempfile -from io import BytesIO from unittest import TestCase, expectedFailure from unittest.mock import patch, sentinel -import boto3 -from botocore.stub import Stubber - from storage import DigitalOceanSpaces, StorageInterface, create_storage_interface @@ -43,7 +37,6 @@ def test_create_storage_interface_creation_function(self): class DigitalOceanSpacesIntegrationTests(TestCase): - REGION = "fake3" ACCESS_KEY = "fake key" ACCESS_SECRET = "fake secret" @@ -53,7 +46,7 @@ class DigitalOceanSpacesIntegrationTests(TestCase): def test_if_digital_ocean_spaces_class_implements_the_right_tasks_interface(self): with patch( "boto3.Session.client", - ) as mock: + ): spaces = DigitalOceanSpaces( self.REGION, self.ENDPOINT, @@ -107,7 +100,7 @@ def test_download_files_should_receive_the_bucket_filekey_destination(self): def test_get_file_when_boto3_fail(self): with patch( "boto3.s3.inject.download_fileobj", side_effect=Exception("Dummy error") - ) as mock: + ): spaces = DigitalOceanSpaces( self.REGION, self.ENDPOINT, diff --git a/tests/main_tests.py b/tests/main_tests.py index 60892cc..0d1f0d3 100644 --- a/tests/main_tests.py +++ b/tests/main_tests.py @@ -1,9 +1,8 @@ -import os import logging -from unittest import TestCase, expectedFailure +from unittest import TestCase from unittest.mock import patch -from main import enable_debug_if_necessary, start_to_process_pending_gazettes +from main import enable_debug_if_necessary class MainModuleTests(TestCase): diff --git a/tests/opensearch.py b/tests/opensearch.py index 2d2e3c9..0733f6d 100644 --- a/tests/opensearch.py +++ b/tests/opensearch.py @@ -23,7 +23,7 @@ def test_create_index_interface_factory_method_with_valid_arguments(self): @expectedFailure def test_index_interface_factory_method_failed_without_required_info(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -33,7 +33,7 @@ def test_index_interface_factory_method_failed_without_required_info(self): ) @expectedFailure def test_index_interface_factory_method_failed_with_no_hosts(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -43,7 +43,7 @@ def test_index_interface_factory_method_failed_with_no_hosts(self): ) @expectedFailure def test_create_index_interface_factory_method_with_no_index(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -54,7 +54,7 @@ def test_create_index_interface_factory_method_with_no_index(self): ) @expectedFailure def test_create_index_interface_factory_method_with_empty_index(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -65,7 +65,7 @@ def test_create_index_interface_factory_method_with_empty_index(self): ) @expectedFailure def test_create_index_interface_factory_method_with_empty_hosts(self): - interface = create_index_interface() + create_index_interface() class OpensearchBasicTests(TestCase): @@ -93,7 +93,7 @@ def test_opensearch_should_implement_index_interface(self): 
@patch("opensearchpy.Opensearch", autospec=True) def test_opensearch_connection(self, opensearch_mock): - interface = OpenSearchInterface(["127.0.0.1"]) + OpenSearchInterface(["127.0.0.1"]) opensearch_mock.assert_called_once_with(hosts=["127.0.0.1"]) @patch("opensearchpy.Opensearch", autospec=True) @@ -104,7 +104,9 @@ def test_opensearch_index_creation_should_check_if_index_exists( interface.search_engine.indices = MagicMock() interface.search_engine.indices.exists = MagicMock() interface.create_index("querido-diario") - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario" + ) @patch("opensearchpy.Opensearch", autospec=True) def test_opensearch_index_creation_should_failed_when_no_index_is_provided( @@ -117,21 +119,17 @@ def test_opensearch_index_creation_should_failed_when_no_index_is_provided( interface.create_index() @patch("opensearchpy.Opensearch", autospec=True) - def test_opensearch_index_creation_with_default_index_value( - self, opensearch_mock - ): - interface = OpenSearchInterface( - ["127.0.0.1"], default_index="querido-diario2" - ) + def test_opensearch_index_creation_with_default_index_value(self, opensearch_mock): + interface = OpenSearchInterface(["127.0.0.1"], default_index="querido-diario2") interface.search_engine.indices = MagicMock() interface.search_engine.indices.exists = MagicMock() interface.create_index() - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario2") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario2" + ) @patch("opensearchpy.Opensearch", autospec=True) - def test_opensearch_index_default_timeout_should_be_30s( - self, opensearch_mock - ): + def test_opensearch_index_default_timeout_should_be_30s(self, opensearch_mock): interface = OpenSearchInterface(["127.0.0.1"]) interface.search_engine.indices = MagicMock() interface.search_engine.indices.exists = MagicMock(return_value=False) @@ -167,7 +165,9 @@ def test_opensearch_index_creation_should_not_recreate_index_if_it_exists( interface.search_engine.indices.exists = MagicMock(return_value=True) interface.search_engine.indices.create = MagicMock() interface.create_index("querido-diario") - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario" + ) interface.search_engine.indices.create.assert_not_called() @patch("opensearchpy.Opensearch", autospec=True) @@ -179,7 +179,9 @@ def test_opensearch_should_create_index_if_it_does_not_exists( interface.search_engine.indices.exists = MagicMock(return_value=False) interface.search_engine.indices.create = MagicMock() interface.create_index("querido-diario") - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario" + ) interface.search_engine.indices.create.assert_called_once_with( index="querido-diario", body={"mappings": {"properties": {"date": {"type": "date"}}}}, @@ -190,14 +192,14 @@ def test_opensearch_should_create_index_if_it_does_not_exists( def test_opensearch_should_create_index_with_default_value_with_function_has_no_arguments( self, opensearch_mock ): - interface = OpenSearchInterface( - ["127.0.0.1"], default_index="querido-diario2" - ) + interface = OpenSearchInterface(["127.0.0.1"], default_index="querido-diario2") 
         interface.search_engine.indices = MagicMock()
         interface.search_engine.indices.exists = MagicMock(return_value=False)
         interface.search_engine.indices.create = MagicMock()
         interface.create_index()
-        interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario2")
+        interface.search_engine.indices.exists.assert_called_once_with(
+            index="querido-diario2"
+        )
         interface.search_engine.indices.create.assert_called_once_with(
             index="querido-diario2",
             body={"mappings": {"properties": {"date": {"type": "date"}}}},
@@ -207,7 +209,6 @@ def test_opensearch_should_create_index_with_default_value_with_function_has_no_
     @patch("opensearchpy.Opensearch", autospec=True)
     def test_upload_document_to_index(self, opensearch_mock):
         interface = OpenSearchInterface(["127.0.0.1"])
-        document_checksum = str(uuid.uuid1())
         interface.index_document(self.fake_document, "querido-diario")
         interface.search_engine.index.assert_called_once_with(
             index="querido-diario",
@@ -217,10 +218,7 @@ def test_upload_document_to_index(self, opensearch_mock):
 
     @patch("opensearchpy.Opensearch", autospec=True)
     def test_upload_document_to_index_using_default_index(self, opensearch_mock):
-        interface = OpenSearchInterface(
-            ["127.0.0.1"], default_index="querido-diario2"
-        )
-        document_checksum = str(uuid.uuid1())
+        interface = OpenSearchInterface(["127.0.0.1"], default_index="querido-diario2")
         interface.index_document(self.fake_document)
         interface.search_engine.index.assert_called_once_with(
             index="querido-diario2",
diff --git a/tests/postgresql.py b/tests/postgresql.py
index 6677173..70fd81a 100644
--- a/tests/postgresql.py
+++ b/tests/postgresql.py
@@ -1,7 +1,7 @@
 import os
 import uuid
 from datetime import date, datetime
-from unittest import TestCase, expectedFailure
+from unittest import TestCase
 from unittest.mock import patch
 
 import psycopg2
@@ -71,7 +71,6 @@ def test_postgresql_connection(self):
 
 
 class PostgreSQLTests(TestCase):
-
     _data = []
 
     def setUp(self):
@@ -267,7 +266,7 @@ def set_some_fake_data_as_ingested_by_the_system_and_no_need_to_be_processed(sel
 
     def get_gazettes_pending_to_be_processed(self):
         for gazette in self._data:
-            if gazette["processed"] == False:
+            if not gazette["processed"]:
                 yield gazette
 
     def clean_database(self):
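
The comparison rewrite in get_gazettes_pending_to_be_processed above is the standard PEP 8 fix for equality checks against False (pycodestyle's E712) — truthiness reads better and behaves the same for real booleans:

    gazette = {"processed": False}

    if not gazette["processed"]:  # preferred
        ...
    # instead of: if gazette["processed"] == False:
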
diff --git a/tests/text_extraction_task_tests.py b/tests/text_extraction_task_tests.py
index 5c1a3fe..078ce1b 100644
--- a/tests/text_extraction_task_tests.py
+++ b/tests/text_extraction_task_tests.py
@@ -1,9 +1,8 @@
-from unittest import TestCase
-from unittest.mock import MagicMock, patch
 import os
-import logging
-from datetime import date, datetime
 import tempfile
+from datetime import date, datetime
+from unittest import TestCase
+from unittest.mock import MagicMock, patch
 
 from data_extraction import TextExtractorInterface
 from tasks import (
@@ -155,9 +154,7 @@ def test_indexed_document_should_contain_gazette_content(self):
         database_mock.get_pending_gazettes = MagicMock(return_value=data)
         database_mock.set_gazette_as_processed = MagicMock()
 
-        tmp_gazette_file = self.copy_file_to_temporary_file(
-            "tests/data/fake_gazette.txt"
-        )
+        self.copy_file_to_temporary_file("tests/data/fake_gazette.txt")
         text_extraction_function = MagicMock(spec=TextExtractorInterface)
         text_extraction_function.extract_text.return_value = expected_data[
             "source_text"
@@ -177,7 +174,6 @@ def file_should_not_exist(self, file_to_check):
         )
 
     def test_invalid_file_type_should_be_skipped(self):
-
         text_extraction_function = MagicMock(spec=TextExtractorInterface)
         text_extraction_function.extract_text.side_effect = Exception(
             "Unsupported file type"
diff --git a/tests/text_extraction_tests.py b/tests/text_extraction_tests.py
index 2091650..9c30a40 100644
--- a/tests/text_extraction_tests.py
+++ b/tests/text_extraction_tests.py
@@ -1,8 +1,11 @@
-from unittest import TestCase, skip
-from unittest.mock import patch, mock_open, MagicMock
-import os
+from unittest import TestCase
+from unittest.mock import MagicMock, mock_open, patch
 
-from data_extraction import ApacheTikaTextExtractor, TextExtractorInterface
+from data_extraction import (
+    ApacheTikaTextExtractor,
+    TextExtractorInterface,
+    create_apache_tika_text_extraction,
+)
 
 
 class ApacheTikaTextExtractorTest(TestCase):
@@ -40,7 +43,7 @@ def test_request_reponse_return(self, magic_mock, open_mock, request_get_mock):
     @patch("magic.from_file", return_value="application/pdf")
     def test_odt_file_content_extraction(self, magic_mock, open_mock, request_get_mock):
         with self.assertRaisesRegex(Exception, "Could not extract file content"):
-            text = self.extractor.extract_text("tests/data/fake_gazette.pdf")
+            self.extractor.extract_text("tests/data/fake_gazette.pdf")
 
     def test_extract_from_pdf_file_should_return_text_file(self):
         text = self.extractor.extract_text("tests/data/fake_gazette.pdf")