Update term utilities

rafelafrance · Mar 19, 2024 · fe91d6c · fe91d6c
1 parent bd1ccc7
commit fe91d6c
Show file tree

Hide file tree

Showing 9 changed files with 94 additions and 29 deletions.
diff --git a/traiter/pylib/rules/color.py b/traiter/pylib/rules/color.py
@@ -17,8 +17,8 @@
 class Color(Base):
     # Class vars ----------
     color_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "color_terms.csv"
-    replace: ClassVar[dict[str, str]] = term_util.term_data(color_csv, "replace")
-    remove: ClassVar[dict[str, int]] = term_util.term_data(color_csv, "remove", int)
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table(color_csv, "replace")
+    remove: ClassVar[dict[str, int]] = term_util.look_up_table(color_csv, "remove", int)
     # ---------------------
 
     color: str = None

diff --git a/traiter/pylib/rules/date_.py b/traiter/pylib/rules/date_.py
@@ -25,7 +25,7 @@ class Date(Base):
     month_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "month_terms.csv"
     all_csvs: ClassVar[list[Path]] = [date_csv, month_csv]
     sep: ClassVar[str] = "(.,;/_'-"
-    replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
     # ---------------------
 
     date: str = None

diff --git a/traiter/pylib/rules/elevation.py b/traiter/pylib/rules/elevation.py
@@ -28,8 +28,8 @@ class Elevation(Base):
     tic_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_tic_terms.csv"
     all_csvs: ClassVar[list[Path]] = [elevation_csv, unit_csv, about_csv, tic_csv]
 
-    replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
-    factors_cm: ClassVar[dict[str, float]] = term_util.term_data(
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
+    factors_cm: ClassVar[dict[str, float]] = term_util.look_up_table(
         (unit_csv, tic_csv),
         "factor_cm",
         float,

diff --git a/traiter/pylib/rules/habitat.py b/traiter/pylib/rules/habitat.py
@@ -18,7 +18,7 @@
 class Habitat(Base):
     # Class vars ----------
     habitat_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "habitat_terms.csv"
-    replace: ClassVar[dict[str, str]] = term_util.term_data(habitat_csv, "replace")
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table(habitat_csv, "replace")
     sep: ClassVar[str] = "/,-"
     # ---------------------
 

diff --git a/traiter/pylib/rules/lat_long.py b/traiter/pylib/rules/lat_long.py
@@ -30,8 +30,8 @@ class LatLong(Base):
     )
     unit_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_length_terms.csv"
     all_csvs: ClassVar[list[Path]] = [lat_long_csv, unit_csv, datum_csv]
-    replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
-    factors_cm: ClassVar[dict[str, float]] = term_util.term_data(
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
+    factors_cm: ClassVar[dict[str, float]] = term_util.look_up_table(
         unit_csv,
         "factor_cm",
         float,
@@ -149,12 +149,12 @@ def lat_long_patterns(cls):
                     ),
                     (
                         "label* sp? [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* sp? "
-                        "          [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? "
+                        "           [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? "
                         "(? datum* )?"
                     ),
                     (
                         "label* sp? [-]? 99.0 deg 99.0? min* -? 99.0? 's ,* sp? "
-                        "          [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* "
+                        "           [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* "
                         "(? datum* )?"
                     ),
                     "label* sp? [-]? 99.99 dir? ,* sp? [-]? 99.99 dir? ,* (? datum* )?",
@@ -181,10 +181,9 @@ def lat_long_plus_patterns(cls):
         }
         return [
             Compiler(
-                label="lat_long_uncertain",
-                id="lat_long",
-                on_match="lat_long_uncertain",
-                keep=["lat_long"],
+                label="lat_long",
+                on_match="lat_long_plus",
+                keep="lat_long",
                 decoder=decoder,
                 patterns=[
                     "lat_long+                         ,* datum_label+ ,* (? datum+ )?",
@@ -246,7 +245,7 @@ def lat_long_match(cls, ent):
         return trait
 
     @classmethod
-    def lat_long_uncertain(cls, ent):
+    def lat_long_plus(cls, ent):
         value = 0.0
         unit = []
         datum = []
@@ -290,16 +289,14 @@ def lat_long_uncertain(cls, ent):
         datum = "".join(datum)
         kwargs["datum"] = cls.replace.get(datum, datum) if datum else None
 
-        trait = super().from_ent(ent, **kwargs)
-        trait.trait = "lat_long"
-        return trait
+        return LatLong.from_ent(ent, **kwargs)
 
 
 @registry.misc("lat_long_match")
 def lat_long_match(ent):
+    """Registry entry point that forwards the entity to LatLong.lat_long_match."""
     return LatLong.lat_long_match(ent)
 
 
-@registry.misc("lat_long_uncertain")
-def lat_long_uncertain(ent):
-    return LatLong.lat_long_uncertain(ent)
+@registry.misc("lat_long_plus")
+def lat_long_plus(ent):
+    """Registry entry point that forwards the entity to LatLong.lat_long_plus."""
+    return LatLong.lat_long_plus(ent)
diff --git a/traiter/pylib/rules/number.py b/traiter/pylib/rules/number.py
@@ -1,9 +1,12 @@
 import re
 from dataclasses import dataclass
+from pathlib import Path
+from typing import ClassVar
 
 from spacy import Language, registry
 
 from traiter.pylib import const as t_const
+from traiter.pylib import term_util
 from traiter.pylib.darwin_core import DarwinCore
 from traiter.pylib.pattern_compiler import Compiler
 from traiter.pylib.pipes import add
@@ -23,8 +26,16 @@
 
 @dataclass(eq=False)
 class Number(Base):
+    # Class vars ----------
+    # Term file that sits in the "terms" directory next to this module
+    csv: ClassVar[Path] = Path(__file__).parent / "terms" / "numeric_terms.csv"
+    numeric_terms: ClassVar[list[dict]] = term_util.read_terms(csv)
+    # Only the terms whose label is "number_word"
+    words: ClassVar[list[dict]] = term_util.filter_labels(numeric_terms, "number_word")
+    # Pattern -> "replace" value; int is passed as the converter, so values are ints
+    replace: ClassVar[dict[str, int]] = term_util.term_patterns(words, "replace", int)
+    # ---------------------
+
     number: float = None
     is_fraction: bool = None
+    is_word: bool = None
 
     def to_dwc(self, dwc) -> DarwinCore:
         return dwc.add_dyn()
@@ -33,11 +44,17 @@ def to_dwc(self, dwc) -> DarwinCore:
     def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
         global NUMBER_COUNT
         NUMBER_COUNT += 1
+
         add.trait_pipe(
             nlp, name=f"fraction_{NUMBER_COUNT}", compiler=cls.fraction_patterns()
         )
         # add.debug_tokens(nlp)  # ###########################################
 
+        add.trait_pipe(
+            nlp, name=f"number_word_{NUMBER_COUNT}", compiler=cls.number_word_patterns()
+        )
+        # add.debug_tokens(nlp)  # ###########################################
+
         add.trait_pipe(
             nlp, name=f"number_{NUMBER_COUNT}", compiler=cls.number_patterns()
         )
@@ -66,6 +83,23 @@ def number_patterns(cls):
             ),
         ]
 
+    @classmethod
+    def number_word_patterns(cls):
+        """Build the pattern compilers used for numbers written as words.
+
+        Returns a one-element list holding a Compiler labeled "number" that
+        keeps "number" and names "number_word_match" as its on_match callback.
+        Its only pattern is the single decoder key "word", which the decoder
+        maps to tokens whose ENT_TYPE is "number_word".
+        """
+        decoder = {
+            "word": {"ENT_TYPE": "number_word"},
+        }
+        return [
+            Compiler(
+                label="number",
+                keep="number",
+                on_match="number_word_match",
+                decoder=decoder,
+                patterns=[
+                    " word ",
+                ],
+            ),
+        ]
+
     @classmethod
     def fraction_patterns(cls):
         decoder = {
@@ -108,12 +142,29 @@ def fract_match(cls, ent):
 
         return trait
 
+    @classmethod
+    def number_word_match(cls, ent):
+        """Build a Number trait from an entity that is a number word.
+
+        The entity text is lower-cased and looked up in cls.replace; the result
+        becomes the trait's number and is_word is set to True. The trait is
+        also stored on the entity's first token, which is flagged "number".
+
+        Returns the trait created by cls.from_ent.
+        """
+        word = ent.text.lower()
+        # .get() yields None when the word has no entry in cls.replace
+        number = cls.replace.get(word)
+
+        trait = cls.from_ent(ent, number=number, is_word=True)
+
+        # Attach the trait and a "number" flag to the first token of the entity
+        ent[0]._.trait = trait
+        ent[0]._.flag = "number"
+
+        return trait
+
 
 @registry.misc("number_match")
 def number_match(ent):
+    """Registry entry point that forwards the entity to Number.number_match."""
     return Number.number_match(ent)
 
 
+@registry.misc("number_word_match")
+def number_word_match(ent):
+    """Registry entry point that forwards the entity to Number.number_word_match."""
+    return Number.number_word_match(ent)
+
+
 @registry.misc("fract_match")
 def fract_match(ent):
+    """Registry entry point that forwards the entity to Number.fract_match."""
     return Number.fract_match(ent)
diff --git a/traiter/pylib/rules/trs.py b/traiter/pylib/rules/trs.py
@@ -18,7 +18,7 @@
 class TRS(Base):
     # Class vars ----------
     trs_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "trs_terms.csv"
-    replace: ClassVar[dict[str, str]] = term_util.term_data([trs_csv], "replace")
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table([trs_csv], "replace")
     dir_: ClassVar[str] = """((north|east|south|west)(ing)?|[nesw])"""
     min_len: ClassVar[int] = 2
     # ---------------------

diff --git a/traiter/pylib/rules/utm.py b/traiter/pylib/rules/utm.py
@@ -23,7 +23,7 @@ class UTM(Base):
     utm_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "utm_terms.csv"
     unit_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "unit_length_terms.csv"
     all_csvs: ClassVar[list[Path]] = [utm_csv, datum_csv, unit_csv]
-    replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
+    replace: ClassVar[dict[str, str]] = term_util.look_up_table(all_csvs, "replace")
     dir_: ClassVar[str] = """((north|east|south|west)(ing)?|[nesw])"""
     # ---------------------
 

diff --git a/traiter/pylib/term_util.py b/traiter/pylib/term_util.py
@@ -6,20 +6,30 @@
 from zipfile import ZipFile
 
 
-def term_data(
+def look_up_table(
     csv_path: Path | Iterable[Path],
     field: str,
     type_=None,
 ) -> dict[str, Any]:
+    """Build a pattern -> field-value dict from one or more term files.
+
+    csv_path: a single path or an iterable of paths; each is read with
+        read_terms().
+    field: the term key whose value is stored for each pattern.
+    type_: optional converter, passed through to term_patterns().
+
+    Files are merged in order with |=, so when two files share a pattern the
+    value from the later file wins.
+    """
+    # NOTE(review): a str path would also count as Iterable here and be walked
+    # character by character -- callers appear to pass Path objects; confirm.
     paths = csv_path if isinstance(csv_path, Iterable) else [csv_path]
-    type_ = type_ if type_ else str
     data = {}
     for path in paths:
         terms = read_terms(path)
-        for term in terms:
-            value = term.get(field)
-            if value not in (None, ""):
-                data[term["pattern"]] = type_(value)
+        data |= term_patterns(terms, field, type_)
+    return data
+
+
+def term_patterns(
+    terms: list[dict[str, Any]],
+    field: str,
+    type_=None,
+) -> dict[str, Any]:
+    """Map each term's "pattern" to its converted value for the given field.
+
+    terms: term dicts; each one that contributes must have a "pattern" key.
+    field: the key to read from every term.
+    type_: callable applied to each value; str is used when it is falsy.
+
+    Terms whose field is missing, None, or "" are skipped. When a pattern
+    occurs more than once, the last occurrence wins.
+    """
+    type_ = type_ if type_ else str
+    data = {}
+    for term in terms:
+        value = term.get(field)
+        # Skip absent and empty values so they never reach the converter
+        if value not in (None, ""):
+            data[term["pattern"]] = type_(value)
     return data
 
 
@@ -43,6 +53,13 @@ def get_labels(
 def delete_terms(terms: list, patterns: list[str] | str) -> list:
+    """Return the terms whose "pattern" is not one of the given patterns.
+
+    patterns may be a list of strings or a single string, which is split on
+    whitespace into individual patterns.
+    """
     patterns = patterns if isinstance(patterns, list) else patterns.split()
     terms = [t for t in terms if t["pattern"] not in patterns]
+    return terms
+
+
+def filter_labels(terms: list, keep: list[str] | str) -> list:
+    """Return only the terms whose "label" is one of the labels to keep.
+
+    keep may be a list of strings or a single string, which is split on
+    whitespace into individual labels.
+    """
+    keep = keep if isinstance(keep, list) else keep.split()
+    terms = [t for t in terms if t["label"] in keep]
+    return terms
 
 
 def labels_to_remove(