Skip to content

Commit

Permalink
Adiciona pre-commit, corrige e formata código
Browse files Browse the repository at this point in the history
  • Loading branch information
ogecece committed Aug 23, 2024
1 parent 915a3b9 commit dbf3abd
Show file tree
Hide file tree
Showing 46 changed files with 311 additions and 179 deletions.
30 changes: 30 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: check-case-conflict
fail_fast: true
- id: check-merge-conflict
fail_fast: true
- id: debug-statements
fail_fast: true
- id: detect-aws-credentials
fail_fast: true
args: ["--allow-missing-credentials"]
- id: detect-private-key
fail_fast: true

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.6.2'
hooks:
- id: ruff
fail_fast: true
args: ["--fix", "--ignore", "E501", "--select", "I"]
- id: ruff-format

- repo: https://github.com/Lucas-C/pre-commit-hooks-safety
rev: v1.3.3
hooks:
- id: python-safety-dependencies-check
fail_fast: true
files: requirements\/[a-z]+\.txt
6 changes: 6 additions & 0 deletions data_extraction/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
from .interfaces import TextExtractorInterface
from .text_extraction import ApacheTikaTextExtractor, create_apache_tika_text_extraction

__all__ = [
"ApacheTikaTextExtractor",
"create_apache_tika_text_extraction",
"TextExtractorInterface",
]
1 change: 1 addition & 0 deletions data_extraction/interfaces.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import abc


class TextExtractorInterface(abc.ABC):
@abc.abstractmethod
def extract_text(self, filepath: str) -> str:
Expand Down
3 changes: 1 addition & 2 deletions data_extraction/text_extraction.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import logging
import magic
import os
import subprocess

import magic
import requests

from .interfaces import TextExtractorInterface
Expand Down
6 changes: 6 additions & 0 deletions database/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
from .interfaces import DatabaseInterface
from .postgresql import PostgreSQL, create_database_interface

__all__ = [
"create_database_interface",
"DatabaseInterface",
"PostgreSQL",
]
2 changes: 1 addition & 1 deletion database/interfaces.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, Iterable, Tuple
import abc
from typing import Dict, Iterable, Tuple


class DatabaseInterface(abc.ABC):
Expand Down
16 changes: 8 additions & 8 deletions database/postgresql.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, Iterable, Tuple
import os
import logging
import os
from typing import Dict, Iterable, Tuple

import psycopg2

Expand Down Expand Up @@ -59,16 +59,16 @@ def select(self, command: str) -> Iterable[Tuple]:
logging.debug(f"Finished query: {cursor.query}")

def insert(self, command: str, data: Dict = None):
    """
    Run an insert command through ``_commit_changes``.

    command: statement handed to ``_commit_changes``.
    data: values forwarded together with the command; when omitted, a
        fresh empty dict is used for this call.
    """
    # A literal ``{}`` default is built once at definition time and shared by
    # every call, so a mutation downstream would leak into later calls.
    if data is None:
        data = {}
    logging.debug("Inserting:")
    self._commit_changes(command, data)
    logging.debug("Finished inserting")

def update(self, command: str, data: Dict = None):
    """
    Run an update command through ``_commit_changes``.

    command: statement handed to ``_commit_changes``.
    data: values forwarded together with the command; when omitted, a
        fresh empty dict is used for this call.
    """
    # A literal ``{}`` default is built once at definition time and shared by
    # every call, so a mutation downstream would leak into later calls.
    if data is None:
        data = {}
    logging.debug("Updating:")
    self._commit_changes(command, data)
    logging.debug("Finished updating")

def delete(self, command: str, data: Dict = None):
    """
    Run a delete command through ``_commit_changes``.

    command: statement handed to ``_commit_changes``.
    data: values forwarded together with the command; when omitted, a
        fresh empty dict is used for this call.
    """
    # A literal ``{}`` default is built once at definition time and shared by
    # every call, so a mutation downstream would leak into later calls.
    if data is None:
        data = {}
    logging.debug("Deleting:")
    self._commit_changes(command, data)
    logging.debug("Finished deleting")
5 changes: 5 additions & 0 deletions index/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
from .interfaces import IndexInterface
from .opensearch import create_index_interface

__all__ = [
"create_index_interface",
"IndexInterface",
]
4 changes: 1 addition & 3 deletions index/interfaces.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from typing import Dict, Iterable
import abc
from typing import Dict, Iterable


class IndexInterface(abc.ABC):
Expand Down Expand Up @@ -40,5 +40,3 @@ def paginated_search(
"""
Searches the index with the provided query, with pagination
"""


33 changes: 27 additions & 6 deletions index/opensearch.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
from typing import Dict, Iterable, List, Union
import os
from typing import Dict, Iterable, List, Union

import opensearchpy

from .interfaces import IndexInterface


class OpenSearchInterface(IndexInterface):
def __init__(
    self,
    hosts: List,
    user: str,
    password: str,
    timeout: int = 30,
    default_index: str = "",
):
    """
    Create the OpenSearch client and remember the interface settings.

    hosts: host list handed to ``opensearchpy.OpenSearch``.
    user, password: credentials sent as the client's ``http_auth`` pair.
    timeout: stored on the instance; how it is used is not visible here.
    default_index: stored on the instance as the default index name.
    """
    credentials = (user, password)
    self._search_engine = opensearchpy.OpenSearch(
        hosts=hosts, http_auth=credentials
    )
    self._timeout = timeout
    self._default_index = default_index

Expand Down Expand Up @@ -51,7 +60,9 @@ def index_document(
refresh: bool = False,
) -> None:
index = self.get_index_name(index)
self._search_engine.index(index=index, body=document, id=document_id, refresh=refresh)
self._search_engine.index(
index=index, body=document, id=document_id, refresh=refresh
)

def search(self, query: Dict, index: str = "") -> Dict:
index = self.get_index_name(index)
Expand All @@ -60,7 +71,9 @@ def search(self, query: Dict, index: str = "") -> Dict:

def analyze(self, text: str, field: str, index: str = "") -> Dict:
    """
    Ask the search engine to analyze ``text`` for the given ``field``.

    The index name is resolved through ``get_index_name`` before the
    request is sent; the engine's response is returned as received.
    """
    target_index = self.get_index_name(index)
    request_body = {"text": text, "field": field}
    return self._search_engine.indices.analyze(
        body=request_body, index=target_index
    )

def paginated_search(
Expand Down Expand Up @@ -96,17 +109,25 @@ def get_opensearch_host():
def get_opensearch_index():
    """
    Read the OPENSEARCH_INDEX environment variable.

    Raises KeyError when the variable is not set.
    """
    index_name = os.environ["OPENSEARCH_INDEX"]
    return index_name


def get_opensearch_user():
    """
    Read the OPENSEARCH_USER environment variable.

    Raises KeyError when the variable is not set.
    """
    user_name = os.environ["OPENSEARCH_USER"]
    return user_name


def get_opensearch_password():
    """
    Read the OPENSEARCH_PASSWORD environment variable.

    Raises KeyError when the variable is not set.
    """
    secret = os.environ["OPENSEARCH_PASSWORD"]
    return secret


def create_index_interface() -> IndexInterface:
    """
    Build an OpenSearchInterface from the environment-backed getters.

    The host and the default index name must both be non-empty strings;
    otherwise an Exception is raised before any interface is created.
    """
    host = get_opensearch_host()
    host_is_valid = isinstance(host, str) and len(host) > 0
    if not host_is_valid:
        raise Exception("Missing index hosts")

    index_name = get_opensearch_index()
    index_name_is_valid = isinstance(index_name, str) and len(index_name) > 0
    if not index_name_is_valid:
        raise Exception("Invalid index name")

    # Credentials are read only after both checks pass, user first.
    user = get_opensearch_user()
    password = get_opensearch_password()
    return OpenSearchInterface(
        [host],
        user,
        password,
        default_index=index_name,
    )
7 changes: 6 additions & 1 deletion main/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
from .__main__ import (
is_debug_enabled,
enable_debug_if_necessary,
is_debug_enabled,
)

__all__ = [
"is_debug_enabled",
"enable_debug_if_necessary",
]
24 changes: 18 additions & 6 deletions main/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from os import environ
import argparse
import logging
from os import environ

from data_extraction import create_apache_tika_text_extraction
from database import create_database_interface
from storage import create_storage_interface
from index import create_index_interface
from storage import create_storage_interface
from tasks import run_task


Expand Down Expand Up @@ -37,12 +37,24 @@ def gazette_texts_pipeline():

run_task("create_gazettes_index", index)
territories = run_task("get_territories", database)
gazettes_to_be_processed = run_task("get_gazettes_to_be_processed", execution_mode, database)
indexed_gazette_ids = run_task("extract_text_from_gazettes", gazettes_to_be_processed, territories, database, storage, index, text_extractor)

gazettes_to_be_processed = run_task(
"get_gazettes_to_be_processed", execution_mode, database
)
indexed_gazette_ids = run_task(
"extract_text_from_gazettes",
gazettes_to_be_processed,
territories,
database,
storage,
index,
text_extractor,
)

for theme in themes:
run_task("create_themed_excerpts_index", theme, index)
themed_excerpt_ids = run_task("extract_themed_excerpts_from_gazettes", theme, indexed_gazette_ids, index)
themed_excerpt_ids = run_task(
"extract_themed_excerpts_from_gazettes", theme, indexed_gazette_ids, index
)
run_task("embedding_rerank_excerpts", theme, themed_excerpt_ids, index)
run_task("tag_entities_in_excerpts", theme, themed_excerpt_ids, index)

Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pre-commit==3.5.0
2 changes: 1 addition & 1 deletion segmentation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__all__ = [
"get_segmenter",
]
]
4 changes: 2 additions & 2 deletions segmentation/base/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .gazette_segment import GazetteSegment
from .association_segmenter import AssociationSegmenter
from .gazette_segment import GazetteSegment

__all__ = [
"GazetteSegment",
"AssociationSegmenter",
]
]
10 changes: 7 additions & 3 deletions segmentation/base/association_segmenter.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
from typing import Any, Dict, Iterable, List, Union

from segmentation.base import GazetteSegment


class AssociationSegmenter:
def __init__(self, territories: Iterable[Dict[str, Any]]):
self.territories = territories

def get_gazette_segments(
    self, *args, **kwargs
) -> List[Union[GazetteSegment, Dict]]:
    """
    Return the gazette segments as a list.

    This implementation is a placeholder and always raises
    NotImplementedError; presumably subclasses override it.
    """
    raise NotImplementedError

def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
def split_text_by_territory(
self, *args, **kwargs
) -> Union[Dict[str, str], List[str]]:
"""
Segment a association text by territory
and returns a list of text segments
Expand All @@ -24,4 +29,3 @@ def build_segment(self, *args, **kwargs) -> GazetteSegment:
Returns a GazetteSegment
"""
raise NotImplementedError

5 changes: 3 additions & 2 deletions segmentation/base/gazette_segment.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from datetime import date, datetime
from dataclasses import dataclass
from datetime import date, datetime


@dataclass
Expand All @@ -8,6 +8,7 @@ class GazetteSegment:
Dataclass to represent a gazette segment of a association
related to a city
"""

id: str
territory_name: str
source_text: str
Expand All @@ -24,4 +25,4 @@ class GazetteSegment:
state_code: str
territory_id: str
file_raw_txt: str
url: str
url: str
7 changes: 4 additions & 3 deletions segmentation/factory.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import Any, Dict, Iterable

from segmentation.base import AssociationSegmenter
from segmentation import segmenters

from segmentation.base import AssociationSegmenter

_segmenter_instances = {}


def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> AssociationSegmenter:
def get_segmenter(
territory_id: str, territories: Iterable[Dict[str, Any]]
) -> AssociationSegmenter:
"""
Factory method to return a AssociationSegmenter
Expand Down
2 changes: 1 addition & 1 deletion segmentation/segmenters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__all__ = [
"ALAssociacaoMunicipiosSegmenter",
]
]
24 changes: 13 additions & 11 deletions segmentation/segmenters/al_associacao_municipios.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import re
import logging

import re
from typing import Any, Dict, List

from segmentation.base import AssociationSegmenter, GazetteSegment
from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug

Expand Down Expand Up @@ -64,15 +64,17 @@ def build_segment(
)
territory_data = get_territory_data(territory_slug, self.territories)

return GazetteSegment(**{
**gazette,
# segment specific values
"processed": True,
"file_checksum": get_checksum(segment_text),
"source_text": segment_text.strip(),
"territory_name": territory_data["territory_name"],
"territory_id": territory_data["id"],
})
return GazetteSegment(
**{
**gazette,
# segment specific values
"processed": True,
"file_checksum": get_checksum(segment_text),
"source_text": segment_text.strip(),
"territory_name": territory_data["territory_name"],
"territory_id": territory_data["id"],
}
)

def _normalize_territory_name(self, territory_name: str) -> str:
clean_name = territory_name.strip().replace("\n", "")
Expand Down
6 changes: 6 additions & 0 deletions storage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,8 @@
from .digital_ocean_spaces import DigitalOceanSpaces, create_storage_interface
from .interfaces import StorageInterface

__all__ = [
"create_storage_interface",
"DigitalOceanSpaces",
"StorageInterface",
]
Loading

0 comments on commit dbf3abd

Please sign in to comment.