From dbf3abdd570d55afa4c50fdb118d5dffcc39f8f2 Mon Sep 17 00:00:00 2001
From: Giulio
Date: Fri, 23 Aug 2024 11:13:52 -0300
Subject: [PATCH] Add pre-commit, fix and format code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .pre-commit-config.yaml                       | 30 +++++++++++
 data_extraction/__init__.py                   |  6 +++
 data_extraction/interfaces.py                 |  1 +
 data_extraction/text_extraction.py            |  3 +-
 database/__init__.py                          |  6 +++
 database/interfaces.py                        |  2 +-
 database/postgresql.py                        | 16 +++---
 index/__init__.py                             |  5 ++
 index/interfaces.py                           |  4 +-
 index/opensearch.py                           | 33 +++++++++---
 main/__init__.py                              |  7 ++-
 main/__main__.py                              | 24 ++++++---
 requirements-dev.txt                          |  1 +
 segmentation/__init__.py                      |  2 +-
 segmentation/base/__init__.py                 |  4 +-
 segmentation/base/association_segmenter.py    | 10 ++--
 segmentation/base/gazette_segment.py          |  5 +-
 segmentation/factory.py                       |  7 +--
 segmentation/segmenters/__init__.py           |  2 +-
 .../segmenters/al_associacao_municipios.py    | 24 +++++----
 storage/__init__.py                           |  6 +++
 storage/digital_ocean_spaces.py               | 36 +++++++------
 storage/interfaces.py                         |  8 +--
 tasks/__init__.py                             |  1 -
 tasks/create_aggregates_table.py              |  5 +-
 tasks/create_index.py                         | 14 ++---
 tasks/gazette_excerpts_embedding_reranking.py |  1 +
 tasks/gazette_excerpts_entities_tagging.py    |  5 +-
 tasks/gazette_text_extraction.py              |  4 +-
 tasks/gazette_themed_excerpts_extraction.py   | 17 ++++--
 tasks/gazette_txt_to_xml.py                   |  3 +-
 tasks/list_gazettes_to_be_processed.py        |  1 -
 tasks/list_territories.py                     |  2 +-
 tasks/utils/__init__.py                       | 29 +++++++---
 tasks/utils/datetime.py                       |  1 -
 tasks/utils/hash.py                           |  4 +-
 tasks/utils/iter.py                           |  2 +-
 tasks/utils/territories.py                    | 15 ++++--
 tasks/utils/text.py                           |  4 +-
 tests/__init__.py                             | 40 +++++++++-----
 tests/digital_ocean_spaces.py                 | 11 +---
 tests/main_tests.py                           |  5 +-
 tests/opensearch.py                           | 54 +++++++++----------
 tests/postgresql.py                           |  5 +-
 tests/text_extraction_task_tests.py           | 12 ++---
 tests/text_extraction_tests.py                | 13 +++--
 46 files changed, 311 insertions(+), 179 deletions(-)
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 requirements-dev.txt

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..87e5f3a
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,30 @@
+repos:
+  - repo: /~https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: check-case-conflict
+        fail_fast: true
+      - id: check-merge-conflict
+        fail_fast: true
+      - id: debug-statements
+        fail_fast: true
+      - id: detect-aws-credentials
+        fail_fast: true
+        args: ["--allow-missing-credentials"]
+      - id: detect-private-key
+        fail_fast: true
+
+  - repo: /~https://github.com/astral-sh/ruff-pre-commit
+    rev: 'v0.6.2'
+    hooks:
+      - id: ruff
+        fail_fast: true
+        args: ["--fix", "--ignore", "E501", "--select", "I"]
+      - id: ruff-format
+
+  - repo: /~https://github.com/Lucas-C/pre-commit-hooks-safety
+    rev: v1.3.3
+    hooks:
+      - id: python-safety-dependencies-check
+        fail_fast: true
+        files: requirements\/[a-z]+\.txt
diff --git a/data_extraction/__init__.py b/data_extraction/__init__.py
index 96c887b..6cb6d07 100644
--- a/data_extraction/__init__.py
+++ b/data_extraction/__init__.py
@@ -1,2 +1,8 @@
 from .interfaces import TextExtractorInterface
 from .text_extraction import ApacheTikaTextExtractor, create_apache_tika_text_extraction
+
+__all__ = [
+    "ApacheTikaTextExtractor",
+    "create_apache_tika_text_extraction",
+    "TextExtractorInterface",
+]
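
The __all__ blocks this patch adds (here and in the other package __init__ files below) pin each package's public surface and mark the re-exports as intentional — the usual way to keep linters from flagging them as unused imports. A minimal sketch of what that buys a consumer, assuming a configured Tika endpoint (the factory's requirements sit outside this diff):

    # Only the names listed in __all__ are pulled in by a star-import:
    from data_extraction import *

    extractor = create_apache_tika_text_extraction()  # hypothetical usage
    assert isinstance(extractor, TextExtractorInterface)
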
diff --git a/data_extraction/interfaces.py b/data_extraction/interfaces.py
index b708f4b..6563571 100644
--- a/data_extraction/interfaces.py
+++ b/data_extraction/interfaces.py
@@ -1,5 +1,6 @@
 import abc
 
+
 class TextExtractorInterface(abc.ABC):
     @abc.abstractmethod
     def extract_text(self, filepath: str) -> str:
diff --git a/data_extraction/text_extraction.py b/data_extraction/text_extraction.py
index 4de0f1b..8595aaf 100644
--- a/data_extraction/text_extraction.py
+++ b/data_extraction/text_extraction.py
@@ -1,8 +1,7 @@
 import logging
-import magic
 import os
-import subprocess
 
+import magic
 import requests
 
 from .interfaces import TextExtractorInterface
diff --git a/database/__init__.py b/database/__init__.py
index b6da446..22d2af8 100644
--- a/database/__init__.py
+++ b/database/__init__.py
@@ -1,2 +1,8 @@
 from .interfaces import DatabaseInterface
 from .postgresql import PostgreSQL, create_database_interface
+
+__all__ = [
+    "create_database_interface",
+    "DatabaseInterface",
+    "PostgreSQL",
+]
diff --git a/database/interfaces.py b/database/interfaces.py
index d555d27..fe082b4 100644
--- a/database/interfaces.py
+++ b/database/interfaces.py
@@ -1,5 +1,5 @@
-from typing import Dict, Iterable, Tuple
 import abc
+from typing import Dict, Iterable, Tuple
 
 
 class DatabaseInterface(abc.ABC):
diff --git a/database/postgresql.py b/database/postgresql.py
index dd34674..541b5e0 100644
--- a/database/postgresql.py
+++ b/database/postgresql.py
@@ -1,6 +1,6 @@
-from typing import Dict, Iterable, Tuple
-import os
 import logging
+import os
+from typing import Dict, Iterable, Tuple
 
 import psycopg2
 
@@ -59,16 +59,16 @@ def select(self, command: str) -> Iterable[Tuple]:
         logging.debug(f"Finished query: {cursor.query}")
 
     def insert(self, command: str, data: Dict = {}):
-        logging.debug(f"Inserting:")
+        logging.debug("Inserting:")
         self._commit_changes(command, data)
-        logging.debug(f"Finished inserting")
+        logging.debug("Finished inserting")
 
     def update(self, command: str, data: Dict = {}):
-        logging.debug(f"Updating:")
+        logging.debug("Updating:")
         self._commit_changes(command, data)
-        logging.debug(f"Finished updating")
+        logging.debug("Finished updating")
 
     def delete(self, command: str, data: Dict = {}):
-        logging.debug(f"Deleting:")
+        logging.debug("Deleting:")
         self._commit_changes(command, data)
-        logging.debug(f"Finished deleting")
+        logging.debug("Finished deleting")
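
The f-strings dropped above had no placeholders, so the f prefix was pure noise (linters flag this as F541). Where a message does interpolate, logging's lazy %-style keeps the formatting cost behind the level check — a small illustration, not from the patch:

    import logging

    logging.debug("Inserting:")  # constant message: plain string literal
    logging.debug("Finished query: %s", "SELECT 1")  # rendered only if DEBUG is enabled
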
diff --git a/index/__init__.py b/index/__init__.py
index 78d0d60..482c40a 100644
--- a/index/__init__.py
+++ b/index/__init__.py
@@ -1,2 +1,7 @@
 from .interfaces import IndexInterface
 from .opensearch import create_index_interface
+
+__all__ = [
+    "create_index_interface",
+    "IndexInterface",
+]
diff --git a/index/interfaces.py b/index/interfaces.py
index 41a80cd..59c69ad 100644
--- a/index/interfaces.py
+++ b/index/interfaces.py
@@ -1,5 +1,5 @@
-from typing import Dict, Iterable
 import abc
+from typing import Dict, Iterable
 
 
 class IndexInterface(abc.ABC):
@@ -40,5 +40,3 @@ def paginated_search(
         """
         Searches the index with the provided query, with pagination
         """
-
-
diff --git a/index/opensearch.py b/index/opensearch.py
index 08fed97..33cfdb7 100644
--- a/index/opensearch.py
+++ b/index/opensearch.py
@@ -1,5 +1,5 @@
-from typing import Dict, Iterable, List, Union
 import os
+from typing import Dict, Iterable, List, Union
 
 import opensearchpy
 
@@ -7,8 +7,17 @@ class OpenSearchInterface(IndexInterface):
-    def __init__(self, hosts: List, user: str, password: str, timeout: int = 30, default_index: str = ""):
-        self._search_engine = opensearchpy.OpenSearch(hosts=hosts, http_auth=(user, password))
+    def __init__(
+        self,
+        hosts: List,
+        user: str,
+        password: str,
+        timeout: int = 30,
+        default_index: str = "",
+    ):
+        self._search_engine = opensearchpy.OpenSearch(
+            hosts=hosts, http_auth=(user, password)
+        )
         self._timeout = timeout
         self._default_index = default_index
@@ -51,7 +60,9 @@ def index_document(
         refresh: bool = False,
     ) -> None:
         index = self.get_index_name(index)
-        self._search_engine.index(index=index, body=document, id=document_id, refresh=refresh)
+        self._search_engine.index(
+            index=index, body=document, id=document_id, refresh=refresh
+        )
 
     def search(self, query: Dict, index: str = "") -> Dict:
         index = self.get_index_name(index)
@@ -60,7 +71,9 @@ def search(self, query: Dict, index: str = "") -> Dict:
 
     def analyze(self, text: str, field: str, index: str = "") -> Dict:
         index = self.get_index_name(index)
-        result = self._search_engine.indices.analyze(body={"text": text, "field":field}, index=index)
+        result = self._search_engine.indices.analyze(
+            body={"text": text, "field": field}, index=index
+        )
         return result
 
     def paginated_search(
@@ -96,12 +109,15 @@ def get_opensearch_host():
 
 def get_opensearch_index():
     return os.environ["OPENSEARCH_INDEX"]
 
+
 def get_opensearch_user():
     return os.environ["OPENSEARCH_USER"]
 
+
 def get_opensearch_password():
     return os.environ["OPENSEARCH_PASSWORD"]
 
+
 def create_index_interface() -> IndexInterface:
     hosts = get_opensearch_host()
     if not isinstance(hosts, str) or len(hosts) == 0:
@@ -109,4 +125,9 @@ def create_index_interface() -> IndexInterface:
     default_index_name = get_opensearch_index()
     if not isinstance(default_index_name, str) or len(default_index_name) == 0:
         raise Exception("Invalid index name")
-    return OpenSearchInterface([hosts], get_opensearch_user(), get_opensearch_password(), default_index=default_index_name)
+    return OpenSearchInterface(
+        [hosts],
+        get_opensearch_user(),
+        get_opensearch_password(),
+        default_index=default_index_name,
+    )
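
For context, create_index_interface() above is driven entirely by environment variables. OPENSEARCH_INDEX, OPENSEARCH_USER and OPENSEARCH_PASSWORD are read in the hunk; the host variable's name is an assumption, since get_opensearch_host()'s body is outside the diff:

    import os

    os.environ["OPENSEARCH_HOST"] = "https://localhost:9200"  # name assumed
    os.environ["OPENSEARCH_INDEX"] = "querido-diario"
    os.environ["OPENSEARCH_USER"] = "admin"
    os.environ["OPENSEARCH_PASSWORD"] = "admin"

    from index import create_index_interface

    index = create_index_interface()  # raises on an empty host or index name
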
diff --git a/main/__init__.py b/main/__init__.py
index 0285851..45eb901 100644
--- a/main/__init__.py
+++ b/main/__init__.py
@@ -1,4 +1,9 @@
 from .__main__ import (
-    is_debug_enabled,
     enable_debug_if_necessary,
+    is_debug_enabled,
 )
+
+__all__ = [
+    "is_debug_enabled",
+    "enable_debug_if_necessary",
+]
diff --git a/main/__main__.py b/main/__main__.py
index cd1da3f..5aa3664 100644
--- a/main/__main__.py
+++ b/main/__main__.py
@@ -1,11 +1,11 @@
-from os import environ
 import argparse
 import logging
+from os import environ
 
 from data_extraction import create_apache_tika_text_extraction
 from database import create_database_interface
-from storage import create_storage_interface
 from index import create_index_interface
+from storage import create_storage_interface
 from tasks import run_task
 
 
@@ -37,12 +37,24 @@ def gazette_texts_pipeline():
     run_task("create_gazettes_index", index)
     territories = run_task("get_territories", database)
-    gazettes_to_be_processed = run_task("get_gazettes_to_be_processed", execution_mode, database)
-    indexed_gazette_ids = run_task("extract_text_from_gazettes", gazettes_to_be_processed, territories, database, storage, index, text_extractor)
-
+    gazettes_to_be_processed = run_task(
+        "get_gazettes_to_be_processed", execution_mode, database
+    )
+    indexed_gazette_ids = run_task(
+        "extract_text_from_gazettes",
+        gazettes_to_be_processed,
+        territories,
+        database,
+        storage,
+        index,
+        text_extractor,
+    )
+
     for theme in themes:
         run_task("create_themed_excerpts_index", theme, index)
-        themed_excerpt_ids = run_task("extract_themed_excerpts_from_gazettes", theme, indexed_gazette_ids, index)
+        themed_excerpt_ids = run_task(
+            "extract_themed_excerpts_from_gazettes", theme, indexed_gazette_ids, index
+        )
         run_task("embedding_rerank_excerpts", theme, themed_excerpt_ids, index)
         run_task("tag_entities_in_excerpts", theme, themed_excerpt_ids, index)
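
main/__init__.py now re-exports enable_debug_if_necessary and is_debug_enabled. Their bodies are not part of this diff, so the following is only a plausible sketch of the contract they expose, with the DEBUG variable name assumed:

    import logging
    import os

    def is_debug_enabled() -> bool:
        return os.environ.get("DEBUG", "0") == "1"  # env var name is a guess

    def enable_debug_if_necessary():
        if is_debug_enabled():
            logging.getLogger().setLevel(logging.DEBUG)
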
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..959c060
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1 @@
+pre-commit==3.5.0
diff --git a/segmentation/__init__.py b/segmentation/__init__.py
index ce1ea7c..7bdf164 100644
--- a/segmentation/__init__.py
+++ b/segmentation/__init__.py
@@ -2,4 +2,4 @@
 
 __all__ = [
     "get_segmenter",
-]
\ No newline at end of file
+]
diff --git a/segmentation/base/__init__.py b/segmentation/base/__init__.py
index bdbbd2b..4f5314b 100644
--- a/segmentation/base/__init__.py
+++ b/segmentation/base/__init__.py
@@ -1,7 +1,7 @@
-from .gazette_segment import GazetteSegment
 from .association_segmenter import AssociationSegmenter
+from .gazette_segment import GazetteSegment
 
 __all__ = [
     "GazetteSegment",
     "AssociationSegmenter",
-]
\ No newline at end of file
+]
diff --git a/segmentation/base/association_segmenter.py b/segmentation/base/association_segmenter.py
index 0d777be..9d65ef1 100644
--- a/segmentation/base/association_segmenter.py
+++ b/segmentation/base/association_segmenter.py
@@ -1,4 +1,5 @@
 from typing import Any, Dict, Iterable, List, Union
+
 from segmentation.base import GazetteSegment
 
 
@@ -6,13 +7,17 @@ class AssociationSegmenter:
     def __init__(self, territories: Iterable[Dict[str, Any]]):
         self.territories = territories
 
-    def get_gazette_segments(self, *args, **kwargs) -> List[Union[GazetteSegment, Dict]]:
+    def get_gazette_segments(
+        self, *args, **kwargs
+    ) -> List[Union[GazetteSegment, Dict]]:
         """
         Returns a list of GazetteSegment
        """
         raise NotImplementedError
 
-    def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
+    def split_text_by_territory(
+        self, *args, **kwargs
+    ) -> Union[Dict[str, str], List[str]]:
         """
         Segment a association text by territory
         and returns a list of text segments
@@ -24,4 +29,3 @@ def build_segment(self, *args, **kwargs) -> GazetteSegment:
         Returns a GazetteSegment
         """
         raise NotImplementedError
-
diff --git a/segmentation/base/gazette_segment.py b/segmentation/base/gazette_segment.py
index aef8fb1..b31d05c 100644
--- a/segmentation/base/gazette_segment.py
+++ b/segmentation/base/gazette_segment.py
@@ -1,5 +1,5 @@
-from datetime import date, datetime
 from dataclasses import dataclass
+from datetime import date, datetime
 
 
 @dataclass
@@ -8,6 +8,7 @@ class GazetteSegment:
     Dataclass to represent a gazette segment of a association
     related to a city
     """
+
     id: str
     territory_name: str
     source_text: str
@@ -24,4 +25,4 @@ class GazetteSegment:
     state_code: str
     territory_id: str
     file_raw_txt: str
-    url: str
\ No newline at end of file
+    url: str
diff --git a/segmentation/factory.py b/segmentation/factory.py
index 65c693e..0e57284 100644
--- a/segmentation/factory.py
+++ b/segmentation/factory.py
@@ -1,13 +1,14 @@
 from typing import Any, Dict, Iterable
 
-from segmentation.base import AssociationSegmenter
 from segmentation import segmenters
-
+from segmentation.base import AssociationSegmenter
 
 _segmenter_instances = {}
 
 
-def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> AssociationSegmenter:
+def get_segmenter(
+    territory_id: str, territories: Iterable[Dict[str, Any]]
+) -> AssociationSegmenter:
     """
     Factory method to return a AssociationSegmenter
diff --git a/segmentation/segmenters/__init__.py b/segmentation/segmenters/__init__.py
index 39de174..d4db466 100644
--- a/segmentation/segmenters/__init__.py
+++ b/segmentation/segmenters/__init__.py
@@ -2,4 +2,4 @@
 
 __all__ = [
     "ALAssociacaoMunicipiosSegmenter",
-]
\ No newline at end of file
+]
diff --git a/segmentation/segmenters/al_associacao_municipios.py b/segmentation/segmenters/al_associacao_municipios.py
index 7485e51..9ec6990 100644
--- a/segmentation/segmenters/al_associacao_municipios.py
+++ b/segmentation/segmenters/al_associacao_municipios.py
@@ -1,7 +1,7 @@
-import re
 import logging
-
+import re
 from typing import Any, Dict, List
+
 from segmentation.base import AssociationSegmenter, GazetteSegment
 from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug
 
@@ -64,15 +64,17 @@ def build_segment(
         )
         territory_data = get_territory_data(territory_slug, self.territories)
 
-        return GazetteSegment(**{
-            **gazette,
-            # segment specific values
-            "processed": True,
-            "file_checksum": get_checksum(segment_text),
-            "source_text": segment_text.strip(),
-            "territory_name": territory_data["territory_name"],
-            "territory_id": territory_data["id"],
-        })
+        return GazetteSegment(
+            **{
+                **gazette,
+                # segment specific values
+                "processed": True,
+                "file_checksum": get_checksum(segment_text),
+                "source_text": segment_text.strip(),
+                "territory_name": territory_data["territory_name"],
+                "territory_id": territory_data["id"],
+            }
+        )
 
     def _normalize_territory_name(self, territory_name: str) -> str:
         clean_name = territory_name.strip().replace("\n", "")
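
The reformatted build_segment above leans on dict unpacking: start from the parent gazette's fields and let the later, segment-specific keys win the merge. A self-contained toy version of the pattern (field set reduced for brevity):

    from dataclasses import dataclass

    @dataclass
    class Segment:
        territory_id: str
        source_text: str
        processed: bool

    gazette = {"territory_id": "2700000", "source_text": "full text", "processed": False}
    # later keys win, so the overrides replace the inherited values:
    segment = Segment(**{**gazette, "processed": True, "source_text": "one slice"})
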
diff --git a/storage/__init__.py b/storage/__init__.py
index 8d9c47c..9603c9f 100644
--- a/storage/__init__.py
+++ b/storage/__init__.py
@@ -1,2 +1,8 @@
 from .digital_ocean_spaces import DigitalOceanSpaces, create_storage_interface
 from .interfaces import StorageInterface
+
+__all__ = [
+    "create_storage_interface",
+    "DigitalOceanSpaces",
+    "StorageInterface",
+]
diff --git a/storage/digital_ocean_spaces.py b/storage/digital_ocean_spaces.py
index 99b738a..e1feab2 100644
--- a/storage/digital_ocean_spaces.py
+++ b/storage/digital_ocean_spaces.py
@@ -1,8 +1,8 @@
 import logging
 import os
-from typing import Union
 from io import BytesIO
 from pathlib import Path
+from typing import Union
 
 import boto3
 
@@ -71,7 +71,9 @@ def __init__(
 
     def get_file(self, file_to_be_downloaded: Union[str, Path], destination) -> None:
         logging.debug(f"Getting {file_to_be_downloaded}")
-        self._client.download_fileobj(self._bucket, str(file_to_be_downloaded), destination)
+        self._client.download_fileobj(
+            self._bucket, str(file_to_be_downloaded), destination
+        )
 
     def upload_content(
         self,
@@ -88,7 +90,10 @@ def upload_content(
             )
         else:
             self._client.upload_fileobj(
-                content_to_be_uploaded, self._bucket, file_key, ExtraArgs={"ACL": permission}
+                content_to_be_uploaded,
+                self._bucket,
+                file_key,
+                ExtraArgs={"ACL": permission},
             )
 
     def upload_file(
@@ -111,13 +116,15 @@ def upload_file_multipart(
     ) -> None:
         logging.debug(f"Uploading {file_key} with multipart")
 
-        multipart_upload = self._client.create_multipart_upload(Bucket=self._bucket, Key=file_key, ACL=permission)
-        upload_id = multipart_upload['UploadId']
+        multipart_upload = self._client.create_multipart_upload(
+            Bucket=self._bucket, Key=file_key, ACL=permission
+        )
+        upload_id = multipart_upload["UploadId"]
 
         parts = []
 
         try:
-            with open(file_path, 'rb') as file:
+            with open(file_path, "rb") as file:
                 part_number = 1
                 while True:
                     data = file.read(part_size)
@@ -129,25 +136,24 @@ def upload_file_multipart(
                         Key=file_key,
                         PartNumber=part_number,
                         UploadId=upload_id,
-                        Body=data
+                        Body=data,
                     )
-                    parts.append({
-                        'PartNumber': part_number,
-                        'ETag': response['ETag']
-                    })
+                    parts.append({"PartNumber": part_number, "ETag": response["ETag"]})
                     part_number += 1
 
             self._client.complete_multipart_upload(
                 Bucket=self._bucket,
                 Key=file_key,
                 UploadId=upload_id,
-                MultipartUpload={'Parts': parts}
+                MultipartUpload={"Parts": parts},
             )
         except Exception as e:
             logging.debug(f"Aborted uploading {file_key} with multipart")
-            self._client.abort_multipart_upload(Bucket=self._bucket, Key=file_key, UploadId=upload_id)
+            self._client.abort_multipart_upload(
+                Bucket=self._bucket, Key=file_key, UploadId=upload_id
+            )
             raise e
         else:
             logging.debug(f"Finished uploading {file_key} with multipart")
@@ -156,8 +162,8 @@ def copy_file(self, source_file_key: str, destination_file_key: str) -> None:
         logging.debug(f"Copying {source_file_key} to {destination_file_key}")
         self._client.copy_object(
             Bucket=self._bucket,
-            CopySource={'Bucket': self._bucket, 'Key': source_file_key},
-            Key=destination_file_key
+            CopySource={"Bucket": self._bucket, "Key": source_file_key},
+            Key=destination_file_key,
         )
 
     def delete_file(self, file_key: str) -> None:
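
How a caller might drive the multipart path above — illustrative only: the key, local path and part size are invented, and the default permission value is not visible in the hunk:

    from storage import create_storage_interface

    storage = create_storage_interface()  # needs the usual Spaces credentials
    storage.upload_file_multipart(
        file_key="aggregates/al/2024.zip",  # hypothetical object key
        file_path="/tmp/2024.zip",          # hypothetical local file
        permission="public-read",
        part_size=100 * 1024 * 1024,        # 100 MiB parts, assumed
    )
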
["source_territory_id", "source_date"], - "sort.order": ["asc", "desc"] + "sort.field": ["source_territory_id", "source_date"], + "sort.order": ["asc", "desc"], }, "analysis": { "filter": { @@ -147,7 +147,7 @@ def create_themed_excerpts_index(theme: Dict, index: IndexInterface) -> None: "filter": ["lowercase"], }, }, - } + }, }, } index.create_index(index_name=theme["index"], body=body) diff --git a/tasks/gazette_excerpts_embedding_reranking.py b/tasks/gazette_excerpts_embedding_reranking.py index 3ffc40a..a2f282b 100644 --- a/tasks/gazette_excerpts_embedding_reranking.py +++ b/tasks/gazette_excerpts_embedding_reranking.py @@ -4,6 +4,7 @@ import sentence_transformers from index import IndexInterface + from .utils import get_documents_with_ids diff --git a/tasks/gazette_excerpts_entities_tagging.py b/tasks/gazette_excerpts_entities_tagging.py index c63522e..fb14056 100644 --- a/tasks/gazette_excerpts_entities_tagging.py +++ b/tasks/gazette_excerpts_entities_tagging.py @@ -2,6 +2,7 @@ from typing import Dict, List from index import IndexInterface + from .utils import ( get_documents_from_query_with_highlights, get_documents_with_ids, @@ -24,9 +25,7 @@ def tag_theme_cases(theme: Dict, excerpt_ids: List[str], index: IndexInterface) ) for document in documents: excerpt = document["_source"] - highlight = document["highlight"][ - "excerpt.with_stopwords" - ][0] + highlight = document["highlight"]["excerpt.with_stopwords"][0] excerpt.update( { "excerpt_entities": list( diff --git a/tasks/gazette_text_extraction.py b/tasks/gazette_text_extraction.py index 18c0c3a..61db32d 100644 --- a/tasks/gazette_text_extraction.py +++ b/tasks/gazette_text_extraction.py @@ -1,13 +1,13 @@ import logging -import tempfile import os +import tempfile from pathlib import Path from typing import Any, Dict, Iterable, List, Union -from segmentation import get_segmenter from data_extraction import TextExtractorInterface from database import DatabaseInterface from index import IndexInterface +from segmentation import get_segmenter from storage import StorageInterface diff --git a/tasks/gazette_themed_excerpts_extraction.py b/tasks/gazette_themed_excerpts_extraction.py index 1b05882..336dd7f 100644 --- a/tasks/gazette_themed_excerpts_extraction.py +++ b/tasks/gazette_themed_excerpts_extraction.py @@ -2,7 +2,12 @@ from typing import Dict, Iterable, List from index import IndexInterface -from .utils import batched, clean_extra_whitespaces, get_documents_from_query_with_highlights + +from .utils import ( + batched, + clean_extra_whitespaces, + get_documents_from_query_with_highlights, +) def extract_themed_excerpts_from_gazettes( @@ -14,7 +19,7 @@ def extract_themed_excerpts_from_gazettes( for excerpt in get_excerpts_from_gazettes_with_themed_query( theme_query, batch, index ): - # excerpts with less than 10% of the expected size of excerpt account for + # excerpts with less than 10% of the expected size of excerpt account for # fewer than 1% of excerpts yet their score is usually high if len(excerpt["excerpt"]) < 200: continue @@ -99,9 +104,13 @@ def get_es_query_from_themed_query( phrase_block = { "span_near": {"clauses": [], "slop": 0, "in_order": True} } - tokenized_term = index.analyze(text=term, field="source_text.with_stopwords") + tokenized_term = index.analyze( + text=term, field="source_text.with_stopwords" + ) for token in tokenized_term["tokens"]: - word_block = {"span_term": {"source_text.with_stopwords": token["token"]}} + word_block = { + "span_term": {"source_text.with_stopwords": token["token"]} + } 
phrase_block["span_near"]["clauses"].append(word_block) synonym_block["span_or"]["clauses"].append(phrase_block) proximity_block["span_near"]["clauses"].append(synonym_block) diff --git a/tasks/gazette_txt_to_xml.py b/tasks/gazette_txt_to_xml.py index 68491c9..474f9e7 100644 --- a/tasks/gazette_txt_to_xml.py +++ b/tasks/gazette_txt_to_xml.py @@ -5,7 +5,7 @@ from datetime import datetime from io import BytesIO from pathlib import Path -from tempfile import mkstemp, TemporaryDirectory, NamedTemporaryFile +from tempfile import NamedTemporaryFile, TemporaryDirectory, mkstemp from typing import Iterable from zipfile import ZIP_DEFLATED, ZipFile @@ -13,6 +13,7 @@ from database import DatabaseInterface from storage import StorageInterface + from .utils import br_timezone, get_territory_slug, hash_file logger = logging.getLogger(__name__) diff --git a/tasks/list_gazettes_to_be_processed.py b/tasks/list_gazettes_to_be_processed.py index 0dd0984..32285d2 100644 --- a/tasks/list_gazettes_to_be_processed.py +++ b/tasks/list_gazettes_to_be_processed.py @@ -7,7 +7,6 @@ def get_gazettes_to_be_processed( execution_mode: str, database: DatabaseInterface ) -> Iterable[Dict]: - if execution_mode == "DAILY": yield from get_gazettes_extracted_since_yesterday(database) elif execution_mode == "ALL": diff --git a/tasks/list_territories.py b/tasks/list_territories.py index 55f4c32..2d52084 100644 --- a/tasks/list_territories.py +++ b/tasks/list_territories.py @@ -1,4 +1,4 @@ -from functools import lru_cache +from functools import lru_cache from typing import Dict, Iterable from database import DatabaseInterface diff --git a/tasks/utils/__init__.py b/tasks/utils/__init__.py index 882450b..be80e87 100644 --- a/tasks/utils/__init__.py +++ b/tasks/utils/__init__.py @@ -1,4 +1,8 @@ from .datetime import br_timezone +from .hash import ( + hash_content, + hash_file, +) from .index import ( get_documents_from_query_with_highlights, get_documents_with_ids, @@ -6,15 +10,24 @@ from .iter import ( batched, ) -from .text import ( - clean_extra_whitespaces, - get_checksum, -) from .territories import ( - get_territory_slug, get_territory_data, + get_territory_slug, ) -from .hash import ( - hash_content, - hash_file, +from .text import ( + clean_extra_whitespaces, + get_checksum, ) + +__all__ = [ + "batched", + "br_timezone", + "clean_extra_whitespaces", + "get_checksum", + "get_documents_from_query_with_highlights", + "get_documents_with_ids", + "get_territory_data", + "get_territory_slug", + "hash_content", + "hash_file", +] diff --git a/tasks/utils/datetime.py b/tasks/utils/datetime.py index e0028c9..f06c8ac 100644 --- a/tasks/utils/datetime.py +++ b/tasks/utils/datetime.py @@ -1,4 +1,3 @@ from datetime import timedelta, timezone - br_timezone = timezone(timedelta(hours=-3)) diff --git a/tasks/utils/hash.py b/tasks/utils/hash.py index 9e0a139..f69ab43 100644 --- a/tasks/utils/hash.py +++ b/tasks/utils/hash.py @@ -19,7 +19,7 @@ def hash_file(file) -> str: chunk_size = 128 * hash.block_size if isinstance(file, str): - with open(file, 'rb') as f: + with open(file, "rb") as f: _chunk_hashing(hash, chunk_size, f) else: file.seek(0) @@ -30,5 +30,5 @@ def hash_file(file) -> str: def _chunk_hashing(hash, chunk_size, file): - for chunk in iter(lambda: file.read(chunk_size), b''): + for chunk in iter(lambda: file.read(chunk_size), b""): hash.update(chunk) diff --git a/tasks/utils/iter.py b/tasks/utils/iter.py index 034715a..d456cca 100644 --- a/tasks/utils/iter.py +++ b/tasks/utils/iter.py @@ -5,7 +5,7 @@ def batched(iterable, n): # 
diff --git a/tasks/utils/iter.py b/tasks/utils/iter.py
index 034715a..d456cca 100644
--- a/tasks/utils/iter.py
+++ b/tasks/utils/iter.py
@@ -5,7 +5,7 @@ def batched(iterable, n):
     # batched('ABCDEFG', 3) --> ABC DEF G
     # can be removed when using Python 3.12, in favor of itertools.batched
     if n < 1:
-        raise ValueError('n must be at least one')
+        raise ValueError("n must be at least one")
     it = iter(iterable)
     while batch := tuple(islice(it, n)):
         yield batch
diff --git a/tasks/utils/territories.py b/tasks/utils/territories.py
index e77235c..846adbf 100644
--- a/tasks/utils/territories.py
+++ b/tasks/utils/territories.py
@@ -2,7 +2,6 @@
 
 from slugify import slugify
 
-
 _territory_slug_to_data_map = {}
 
 
@@ -10,10 +9,14 @@ def get_territory_slug(name: str, state_code: str) -> str:
     full_name = f"{state_code} {name}"
     stopwords = ["de", "d", "da", "do", "das", "dos"]
     replacements = [("´", "'"), ("`", "'")]
-    return slugify(full_name, separator="", stopwords=stopwords, replacements=replacements)
+    return slugify(
+        full_name, separator="", stopwords=stopwords, replacements=replacements
+    )
 
 
-def get_territory_data(identifier: Union[str, Tuple[str, str]], territories: Iterable[Dict[str, Any]]) -> Dict[str, Dict]:
+def get_territory_data(
+    identifier: Union[str, Tuple[str, str]], territories: Iterable[Dict[str, Any]]
+) -> Dict[str, Dict]:
     if isinstance(identifier, tuple):
         territory_name, state_code = identifier
         territory_slug = get_territory_slug(territory_name, state_code)
@@ -25,12 +28,14 @@ def get_territory_data(identifier: Union[str, Tuple[str, str]], territories: Ite
     slug_to_data = get_territory_slug_to_data_map(territories)
 
     if territory_slug not in slug_to_data:
-        raise KeyError(f"Couldn't find info for \"{territory_slug}\"")
+        raise KeyError(f'Couldn\'t find info for "{territory_slug}"')
 
     return slug_to_data[territory_slug]
 
 
-def get_territory_slug_to_data_map(territories: Iterable[Dict[str, Any]]) -> Dict[str, Dict]:
+def get_territory_slug_to_data_map(
+    territories: Iterable[Dict[str, Any]],
+) -> Dict[str, Dict]:
     global _territory_slug_to_data_map
     if not _territory_slug_to_data_map:
         territory_to_data = {
diff --git a/tasks/utils/text.py b/tasks/utils/text.py
index 7d40bca..41fc4c0 100644
--- a/tasks/utils/text.py
+++ b/tasks/utils/text.py
@@ -1,5 +1,5 @@
-import re
 import hashlib
+import re
 from io import BytesIO
 
 
@@ -25,4 +25,4 @@ def get_checksum(source_text: str) -> str:
         if not d:
             break
         m.update(d)
-    return m.hexdigest()
\ No newline at end of file
+    return m.hexdigest()
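
Circling back to the batched() helper in tasks/utils/iter.py above: as its comment says, it is a stand-in for Python 3.12's itertools.batched, yielding tuples of at most n items:

    from tasks.utils import batched

    assert list(batched("ABCDEFG", 3)) == [
        ("A", "B", "C"),
        ("D", "E", "F"),
        ("G",),
    ]
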
"IndexInterfaceFactoryFunctionTests", + "MainModuleTests", + "OpensearchBasicTests", + "OpensearchIntegrationTests", + "PostgreSQLConnectionTests", + "PostgreSQLTests", + "StorageInterfaceCreationTests", + "TextExtractionTaskTests", + "unittest", +] diff --git a/tests/digital_ocean_spaces.py b/tests/digital_ocean_spaces.py index 31ad4f4..62befd8 100644 --- a/tests/digital_ocean_spaces.py +++ b/tests/digital_ocean_spaces.py @@ -1,13 +1,7 @@ -import datetime -import hashlib import tempfile -from io import BytesIO from unittest import TestCase, expectedFailure from unittest.mock import patch, sentinel -import boto3 -from botocore.stub import Stubber - from storage import DigitalOceanSpaces, StorageInterface, create_storage_interface @@ -43,7 +37,6 @@ def test_create_storage_interface_creation_function(self): class DigitalOceanSpacesIntegrationTests(TestCase): - REGION = "fake3" ACCESS_KEY = "fake key" ACCESS_SECRET = "fake secret" @@ -53,7 +46,7 @@ class DigitalOceanSpacesIntegrationTests(TestCase): def test_if_digital_ocean_spaces_class_implements_the_right_tasks_interface(self): with patch( "boto3.Session.client", - ) as mock: + ): spaces = DigitalOceanSpaces( self.REGION, self.ENDPOINT, @@ -107,7 +100,7 @@ def test_download_files_should_receive_the_bucket_filekey_destination(self): def test_get_file_when_boto3_fail(self): with patch( "boto3.s3.inject.download_fileobj", side_effect=Exception("Dummy error") - ) as mock: + ): spaces = DigitalOceanSpaces( self.REGION, self.ENDPOINT, diff --git a/tests/main_tests.py b/tests/main_tests.py index 60892cc..0d1f0d3 100644 --- a/tests/main_tests.py +++ b/tests/main_tests.py @@ -1,9 +1,8 @@ -import os import logging -from unittest import TestCase, expectedFailure +from unittest import TestCase from unittest.mock import patch -from main import enable_debug_if_necessary, start_to_process_pending_gazettes +from main import enable_debug_if_necessary class MainModuleTests(TestCase): diff --git a/tests/opensearch.py b/tests/opensearch.py index 2d2e3c9..0733f6d 100644 --- a/tests/opensearch.py +++ b/tests/opensearch.py @@ -23,7 +23,7 @@ def test_create_index_interface_factory_method_with_valid_arguments(self): @expectedFailure def test_index_interface_factory_method_failed_without_required_info(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -33,7 +33,7 @@ def test_index_interface_factory_method_failed_without_required_info(self): ) @expectedFailure def test_index_interface_factory_method_failed_with_no_hosts(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -43,7 +43,7 @@ def test_index_interface_factory_method_failed_with_no_hosts(self): ) @expectedFailure def test_create_index_interface_factory_method_with_no_index(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -54,7 +54,7 @@ def test_create_index_interface_factory_method_with_no_index(self): ) @expectedFailure def test_create_index_interface_factory_method_with_empty_index(self): - interface = create_index_interface() + create_index_interface() @patch.dict( "os.environ", @@ -65,7 +65,7 @@ def test_create_index_interface_factory_method_with_empty_index(self): ) @expectedFailure def test_create_index_interface_factory_method_with_empty_hosts(self): - interface = create_index_interface() + create_index_interface() class OpensearchBasicTests(TestCase): @@ -93,7 +93,7 @@ def test_opensearch_should_implement_index_interface(self): 
@patch("opensearchpy.Opensearch", autospec=True) def test_opensearch_connection(self, opensearch_mock): - interface = OpenSearchInterface(["127.0.0.1"]) + OpenSearchInterface(["127.0.0.1"]) opensearch_mock.assert_called_once_with(hosts=["127.0.0.1"]) @patch("opensearchpy.Opensearch", autospec=True) @@ -104,7 +104,9 @@ def test_opensearch_index_creation_should_check_if_index_exists( interface.search_engine.indices = MagicMock() interface.search_engine.indices.exists = MagicMock() interface.create_index("querido-diario") - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario" + ) @patch("opensearchpy.Opensearch", autospec=True) def test_opensearch_index_creation_should_failed_when_no_index_is_provided( @@ -117,21 +119,17 @@ def test_opensearch_index_creation_should_failed_when_no_index_is_provided( interface.create_index() @patch("opensearchpy.Opensearch", autospec=True) - def test_opensearch_index_creation_with_default_index_value( - self, opensearch_mock - ): - interface = OpenSearchInterface( - ["127.0.0.1"], default_index="querido-diario2" - ) + def test_opensearch_index_creation_with_default_index_value(self, opensearch_mock): + interface = OpenSearchInterface(["127.0.0.1"], default_index="querido-diario2") interface.search_engine.indices = MagicMock() interface.search_engine.indices.exists = MagicMock() interface.create_index() - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario2") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario2" + ) @patch("opensearchpy.Opensearch", autospec=True) - def test_opensearch_index_default_timeout_should_be_30s( - self, opensearch_mock - ): + def test_opensearch_index_default_timeout_should_be_30s(self, opensearch_mock): interface = OpenSearchInterface(["127.0.0.1"]) interface.search_engine.indices = MagicMock() interface.search_engine.indices.exists = MagicMock(return_value=False) @@ -167,7 +165,9 @@ def test_opensearch_index_creation_should_not_recreate_index_if_it_exists( interface.search_engine.indices.exists = MagicMock(return_value=True) interface.search_engine.indices.create = MagicMock() interface.create_index("querido-diario") - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario" + ) interface.search_engine.indices.create.assert_not_called() @patch("opensearchpy.Opensearch", autospec=True) @@ -179,7 +179,9 @@ def test_opensearch_should_create_index_if_it_does_not_exists( interface.search_engine.indices.exists = MagicMock(return_value=False) interface.search_engine.indices.create = MagicMock() interface.create_index("querido-diario") - interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario") + interface.search_engine.indices.exists.assert_called_once_with( + index="querido-diario" + ) interface.search_engine.indices.create.assert_called_once_with( index="querido-diario", body={"mappings": {"properties": {"date": {"type": "date"}}}}, @@ -190,14 +192,14 @@ def test_opensearch_should_create_index_if_it_does_not_exists( def test_opensearch_should_create_index_with_default_value_with_function_has_no_arguments( self, opensearch_mock ): - interface = OpenSearchInterface( - ["127.0.0.1"], default_index="querido-diario2" - ) + interface = OpenSearchInterface(["127.0.0.1"], default_index="querido-diario2") 
         interface.search_engine.indices = MagicMock()
         interface.search_engine.indices.exists = MagicMock(return_value=False)
         interface.search_engine.indices.create = MagicMock()
         interface.create_index()
-        interface.search_engine.indices.exists.assert_called_once_with(index="querido-diario2")
+        interface.search_engine.indices.exists.assert_called_once_with(
+            index="querido-diario2"
+        )
         interface.search_engine.indices.create.assert_called_once_with(
             index="querido-diario2",
             body={"mappings": {"properties": {"date": {"type": "date"}}}},
@@ -207,7 +209,6 @@ def test_opensearch_should_create_index_with_default_value_with_function_has_no_
     @patch("opensearchpy.Opensearch", autospec=True)
     def test_upload_document_to_index(self, opensearch_mock):
         interface = OpenSearchInterface(["127.0.0.1"])
-        document_checksum = str(uuid.uuid1())
         interface.index_document(self.fake_document, "querido-diario")
         interface.search_engine.index.assert_called_once_with(
             index="querido-diario",
@@ -217,10 +218,7 @@ def test_upload_document_to_index(self, opensearch_mock):
 
     @patch("opensearchpy.Opensearch", autospec=True)
     def test_upload_document_to_index_using_default_index(self, opensearch_mock):
-        interface = OpenSearchInterface(
-            ["127.0.0.1"], default_index="querido-diario2"
-        )
-        document_checksum = str(uuid.uuid1())
+        interface = OpenSearchInterface(["127.0.0.1"], default_index="querido-diario2")
         interface.index_document(self.fake_document)
         interface.search_engine.index.assert_called_once_with(
             index="querido-diario2",
diff --git a/tests/postgresql.py b/tests/postgresql.py
index 6677173..70fd81a 100644
--- a/tests/postgresql.py
+++ b/tests/postgresql.py
@@ -1,7 +1,7 @@
 import os
 import uuid
 from datetime import date, datetime
-from unittest import TestCase, expectedFailure
+from unittest import TestCase
 from unittest.mock import patch
 
 import psycopg2
@@ -71,7 +71,6 @@ def test_postgresql_connection(self):
 
 
 class PostgreSQLTests(TestCase):
-
     _data = []
 
     def setUp(self):
@@ -267,7 +266,7 @@ def set_some_fake_data_as_ingested_by_the_system_and_no_need_to_be_processed(sel
 
     def get_gazettes_pending_to_be_processed(self):
         for gazette in self._data:
-            if gazette["processed"] == False:
+            if not gazette["processed"]:
                 yield gazette
 
     def clean_database(self):
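
The comparison rewrite in get_gazettes_pending_to_be_processed above is the standard PEP 8 fix for equality checks against False (pycodestyle's E712) — truthiness reads better and behaves the same for real booleans:

    gazette = {"processed": False}

    if not gazette["processed"]:  # preferred
        ...
    # instead of: if gazette["processed"] == False:
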
diff --git a/tests/text_extraction_task_tests.py b/tests/text_extraction_task_tests.py
index 5c1a3fe..078ce1b 100644
--- a/tests/text_extraction_task_tests.py
+++ b/tests/text_extraction_task_tests.py
@@ -1,9 +1,8 @@
-from unittest import TestCase
-from unittest.mock import MagicMock, patch
 import os
-import logging
-from datetime import date, datetime
 import tempfile
+from datetime import date, datetime
+from unittest import TestCase
+from unittest.mock import MagicMock, patch
 
 from data_extraction import TextExtractorInterface
 from tasks import (
@@ -155,9 +154,7 @@ def test_indexed_document_should_contain_gazette_content(self):
         database_mock.get_pending_gazettes = MagicMock(return_value=data)
         database_mock.set_gazette_as_processed = MagicMock()
 
-        tmp_gazette_file = self.copy_file_to_temporary_file(
-            "tests/data/fake_gazette.txt"
-        )
+        self.copy_file_to_temporary_file("tests/data/fake_gazette.txt")
         text_extraction_function = MagicMock(spec=TextExtractorInterface)
         text_extraction_function.extract_text.return_value = expected_data[
             "source_text"
@@ -177,7 +174,6 @@ def file_should_not_exist(self, file_to_check):
         )
 
     def test_invalid_file_type_should_be_skipped(self):
-
         text_extraction_function = MagicMock(spec=TextExtractorInterface)
         text_extraction_function.extract_text.side_effect = Exception(
             "Unsupported file type"
diff --git a/tests/text_extraction_tests.py b/tests/text_extraction_tests.py
index 2091650..9c30a40 100644
--- a/tests/text_extraction_tests.py
+++ b/tests/text_extraction_tests.py
@@ -1,8 +1,11 @@
-from unittest import TestCase, skip
-from unittest.mock import patch, mock_open, MagicMock
-import os
+from unittest import TestCase
+from unittest.mock import MagicMock, mock_open, patch
 
-from data_extraction import ApacheTikaTextExtractor, TextExtractorInterface
+from data_extraction import (
+    ApacheTikaTextExtractor,
+    TextExtractorInterface,
+    create_apache_tika_text_extraction,
+)
 
 
 class ApacheTikaTextExtractorTest(TestCase):
@@ -40,7 +43,7 @@ def test_request_reponse_return(self, magic_mock, open_mock, request_get_mock):
     @patch("magic.from_file", return_value="application/pdf")
     def test_odt_file_content_extraction(self, magic_mock, open_mock, request_get_mock):
         with self.assertRaisesRegex(Exception, "Could not extract file content"):
-            text = self.extractor.extract_text("tests/data/fake_gazette.pdf")
+            self.extractor.extract_text("tests/data/fake_gazette.pdf")
 
     def test_extract_from_pdf_file_should_return_text_file(self):
         text = self.extractor.extract_text("tests/data/fake_gazette.pdf")