Skip to content

Commit

Permalink
Update term utilities
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Mar 19, 2024
1 parent bd1ccc7 commit fe91d6c
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 29 deletions.
4 changes: 2 additions & 2 deletions traiter/pylib/rules/color.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
class Color(Base):
# Class vars ----------
color_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "color_terms.csv"
replace: ClassVar[dict[str, str]] = term_util.term_data(color_csv, "replace")
remove: ClassVar[dict[str, int]] = term_util.term_data(color_csv, "remove", int)
replace: ClassVar[dict[str, str]] = term_util.look_up_table(color_csv, "replace")
remove: ClassVar[dict[str, int]] = term_util.look_up_table(color_csv, "remove", int)
# ---------------------

color: str = None
Expand Down
2 changes: 1 addition & 1 deletion traiter/pylib/rules/date_.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ class Date(Base):
month_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "month_terms.csv"
all_csvs: ClassVar[list[Path]] = [date_csv, month_csv]
sep: ClassVar[str] = "(.,;/_'-"
replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
# ---------------------

date: str = None
Expand Down
4 changes: 2 additions & 2 deletions traiter/pylib/rules/elevation.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ class Elevation(Base):
tic_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_tic_terms.csv"
all_csvs: ClassVar[list[Path]] = [elevation_csv, unit_csv, about_csv, tic_csv]

replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
factors_cm: ClassVar[dict[str, float]] = term_util.term_data(
replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
factors_cm: ClassVar[dict[str, float]] = term_util.look_up_table(
(unit_csv, tic_csv),
"factor_cm",
float,
Expand Down
2 changes: 1 addition & 1 deletion traiter/pylib/rules/habitat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
class Habitat(Base):
# Class vars ----------
habitat_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "habitat_terms.csv"
replace: ClassVar[dict[str, str]] = term_util.term_data(habitat_csv, "replace")
replace: ClassVar[dict[str, str]] = term_util.look_up_table(habitat_csv, "replace")
sep: ClassVar[str] = "/,-"
# ---------------------

Expand Down
27 changes: 12 additions & 15 deletions traiter/pylib/rules/lat_long.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ class LatLong(Base):
)
unit_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_length_terms.csv"
all_csvs: ClassVar[list[Path]] = [lat_long_csv, unit_csv, datum_csv]
replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
factors_cm: ClassVar[dict[str, float]] = term_util.term_data(
replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
factors_cm: ClassVar[dict[str, float]] = term_util.look_up_table(
unit_csv,
"factor_cm",
float,
Expand Down Expand Up @@ -149,12 +149,12 @@ def lat_long_patterns(cls):
),
(
"label* sp? [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* sp? "
" [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? "
" [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? "
"(? datum* )?"
),
(
"label* sp? [-]? 99.0 deg 99.0? min* -? 99.0? 's ,* sp? "
" [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* "
" [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* "
"(? datum* )?"
),
"label* sp? [-]? 99.99 dir? ,* sp? [-]? 99.99 dir? ,* (? datum* )?",
Expand All @@ -181,10 +181,9 @@ def lat_long_plus_patterns(cls):
}
return [
Compiler(
label="lat_long_uncertain",
id="lat_long",
on_match="lat_long_uncertain",
keep=["lat_long"],
label="lat_long",
on_match="lat_long_plus",
keep="lat_long",
decoder=decoder,
patterns=[
"lat_long+ ,* datum_label+ ,* (? datum+ )?",
Expand Down Expand Up @@ -246,7 +245,7 @@ def lat_long_match(cls, ent):
return trait

@classmethod
def lat_long_uncertain(cls, ent):
def lat_long_plus(cls, ent):
value = 0.0
unit = []
datum = []
Expand Down Expand Up @@ -290,16 +289,14 @@ def lat_long_uncertain(cls, ent):
datum = "".join(datum)
kwargs["datum"] = cls.replace.get(datum, datum) if datum else None

trait = super().from_ent(ent, **kwargs)
trait.trait = "lat_long"
return trait
return LatLong.from_ent(ent, **kwargs)


@registry.misc("lat_long_match")
def lat_long_match(ent):
    """Registry entry point "lat_long_match"; delegates to the LatLong class."""
    trait = LatLong.lat_long_match(ent)
    return trait


@registry.misc("lat_long_uncertain")
def lat_long_uncertain(ent):
return LatLong.lat_long_uncertain(ent)
@registry.misc("lat_long_plus")
def lat_long_plus(ent):
    """Registry entry point "lat_long_plus"; delegates to the LatLong class."""
    trait = LatLong.lat_long_plus(ent)
    return trait
51 changes: 51 additions & 0 deletions traiter/pylib/rules/number.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import re
from dataclasses import dataclass
from pathlib import Path
from typing import ClassVar

from spacy import Language, registry

from traiter.pylib import const as t_const
from traiter.pylib import term_util
from traiter.pylib.darwin_core import DarwinCore
from traiter.pylib.pattern_compiler import Compiler
from traiter.pylib.pipes import add
Expand All @@ -23,8 +26,16 @@

@dataclass(eq=False)
class Number(Base):
# Class vars ----------
csv: ClassVar[Path] = Path(__file__).parent / "terms" / "numeric_terms.csv"
numeric_terms: ClassVar[list[dict]] = term_util.read_terms(csv)
words: ClassVar[list[dict]] = term_util.filter_labels(numeric_terms, "number_word")
replace: ClassVar[dict[str, str]] = term_util.term_patterns(words, "replace", int)
# ---------------------

number: float = None
is_fraction: bool = None
is_word: bool = None

def to_dwc(self, dwc) -> DarwinCore:
return dwc.add_dyn()
Expand All @@ -33,11 +44,17 @@ def to_dwc(self, dwc) -> DarwinCore:
def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
global NUMBER_COUNT
NUMBER_COUNT += 1

add.trait_pipe(
nlp, name=f"fraction_{NUMBER_COUNT}", compiler=cls.fraction_patterns()
)
# add.debug_tokens(nlp) # ###########################################

add.trait_pipe(
nlp, name=f"number_word_{NUMBER_COUNT}", compiler=cls.number_word_patterns()
)
# add.debug_tokens(nlp) # ###########################################

add.trait_pipe(
nlp, name=f"number_{NUMBER_COUNT}", compiler=cls.number_patterns()
)
Expand Down Expand Up @@ -66,6 +83,23 @@ def number_patterns(cls):
),
]

@classmethod
def number_word_patterns(cls):
    """Return the pattern compilers for numbers written as words.

    The single compiler matches one token whose entity type is
    "number_word", labels the match "number", and names
    "number_word_match" as its on-match callback.
    """
    return [
        Compiler(
            label="number",
            keep="number",
            on_match="number_word_match",
            # "word" in the pattern string stands for a number_word entity token.
            decoder={"word": {"ENT_TYPE": "number_word"}},
            patterns=[" word "],
        ),
    ]

@classmethod
def fraction_patterns(cls):
decoder = {
Expand Down Expand Up @@ -108,12 +142,29 @@ def fract_match(cls, ent):

return trait

@classmethod
def number_word_match(cls, ent):
    """Build a Number trait from an entity that spells a number as a word.

    The lower-cased entity text is looked up in ``cls.replace``; a word that
    is not in the table yields ``number=None``. The trait is also stored on
    the entity's first token, which is flagged as "number".
    """
    trait = cls.from_ent(
        ent,
        number=cls.replace.get(ent.text.lower()),
        is_word=True,
    )

    first_token = ent[0]
    first_token._.trait = trait
    first_token._.flag = "number"

    return trait


@registry.misc("number_match")
def number_match(ent):
    """Registry entry point "number_match"; delegates to the Number class."""
    trait = Number.number_match(ent)
    return trait


@registry.misc("number_word_match")
def number_word_match(ent):
    """Registry entry point "number_word_match"; delegates to the Number class."""
    trait = Number.number_word_match(ent)
    return trait


@registry.misc("fract_match")
def fract_match(ent):
    """Registry entry point "fract_match"; delegates to the Number class."""
    trait = Number.fract_match(ent)
    return trait
2 changes: 1 addition & 1 deletion traiter/pylib/rules/trs.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
class TRS(Base):
# Class vars ----------
trs_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "trs_terms.csv"
replace: ClassVar[dict[str, str]] = term_util.term_data([trs_csv], "replace")
replace: ClassVar[dict[str, str]] = term_util.look_up_table([trs_csv], "replace")
dir_: ClassVar[str] = """((north|east|south|west)(ing)?|[nesw])"""
min_len: ClassVar[int] = 2
# ---------------------
Expand Down
2 changes: 1 addition & 1 deletion traiter/pylib/rules/utm.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class UTM(Base):
utm_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "utm_terms.csv"
unit_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_length_terms.csv"
all_csvs: ClassVar[list[Path]] = [utm_csv, datum_csv, unit_csv]
replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
dir_: ClassVar[str] = """((north|east|south|west)(ing)?|[nesw])"""
# ---------------------

Expand Down
29 changes: 23 additions & 6 deletions traiter/pylib/term_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,30 @@
from zipfile import ZipFile


def term_data(
def look_up_table(
csv_path: Path | Iterable[Path],
field: str,
type_=None,
) -> dict[str, Any]:
paths = csv_path if isinstance(csv_path, Iterable) else [csv_path]
type_ = type_ if type_ else str
data = {}
for path in paths:
terms = read_terms(path)
for term in terms:
value = term.get(field)
if value not in (None, ""):
data[term["pattern"]] = type_(value)
data |= term_patterns(terms, field, type_)
return data


def term_patterns(
    terms: list[dict[str, Any]],
    field: str,
    type_=None,
) -> dict[str, Any]:
    """Build a pattern -> value lookup table from already-loaded terms.

    Args:
        terms: Term records; each record that contributes an entry must
            have a "pattern" key.
        field: Name of the record key whose value goes into the table.
        type_: Callable used to convert each value; ``str`` when omitted
            or falsy.

    Returns:
        A dict keyed by each term's "pattern". Terms whose ``field`` is
        missing, ``None``, or the empty string are left out. When two terms
        share a pattern, the later one wins.
    """
    convert = type_ or str
    table = {}
    for record in terms:
        raw = record.get(field)
        if raw in (None, ""):
            continue
        table[record["pattern"]] = convert(raw)
    return table


Expand All @@ -43,6 +53,13 @@ def get_labels(
def delete_terms(terms: list, patterns: list[str] | str) -> list:
patterns = patterns if isinstance(patterns, list) else patterns.split()
terms = [t for t in terms if t["pattern"] not in patterns]
return terms


def filter_labels(terms: list, keep: list[str] | str) -> list:
keep = keep if isinstance(keep, list) else keep.split()
terms = [t for t in terms if t["label"] in keep]
return terms


def labels_to_remove(
Expand Down

0 comments on commit fe91d6c

Please sign in to comment.