Add UUID & number traits, & modify elevation terms

rafelafrance · Mar 14, 2024 · d03ad72 · d03ad72
1 parent 94a4cb8
commit d03ad72
Show file tree

Hide file tree

Showing 7 changed files with 242 additions and 3 deletions.
diff --git a/tests/rules/test_uuid.py b/tests/rules/test_uuid.py
@@ -0,0 +1,49 @@
+import unittest
+
+from tests.setup import parse
+from traiter.pylib.rules.date_ import Date
+from traiter.pylib.rules.uuid import Uuid
+
+
+class TestUuid(unittest.TestCase):
+    """Verify UUID traits are parsed with the right text and offsets."""
+
+    def test_uuid_01(self):
+        # A bare UUID should come back as a single trait spanning the whole text.
+        text = "c701563b-dbd9-4500-184f-1ad61eb8da11"
+        expected = [
+            Uuid(
+                uuid="c701563b-dbd9-4500-184f-1ad61eb8da11",
+                trait="uuid",
+                start=0,
+                end=36,
+            ),
+        ]
+        self.assertEqual(parse(text), expected)
+
+    def test_uuid_02(self):
+        # Two UUIDs separated by a semicolon, embedded in a JSON-like record
+        # that also holds a date.
+        text = (
+            '{"created": "2014-10-29", "relatedresourceid": '
+            '"eeba8b10-040e-4477-a0a6-870102b56234;'
+            'abbf14f5-1a7c-48f6-8f2f-2a8af53c8c86"}'
+        )
+        created = Date(
+            trait="date",
+            start=13,
+            end=23,
+            date="2014-10-29",
+        )
+        first_id = Uuid(
+            uuid="eeba8b10-040e-4477-a0a6-870102b56234",
+            trait="uuid",
+            start=48,
+            end=84,
+        )
+        second_id = Uuid(
+            uuid="abbf14f5-1a7c-48f6-8f2f-2a8af53c8c86",
+            trait="uuid",
+            start=85,
+            end=121,
+        )
+        self.assertEqual(parse(text), [created, first_id, second_id])
diff --git a/traiter/pylib/pipeline.py b/traiter/pylib/pipeline.py
@@ -8,6 +8,7 @@
 from traiter.pylib.rules.lat_long import LatLong
 from traiter.pylib.rules.trs import TRS
 from traiter.pylib.rules.utm import UTM
+from traiter.pylib.rules.uuid import Uuid
 
 
 def build():
@@ -20,6 +21,7 @@ def build():
     nlp.add_pipe(sentence.SENTENCES, before="parser")
 
     Color.pipe(nlp)
+    Uuid.pipe(nlp)
     Date.pipe(nlp)
     Elevation.pipe(nlp)
     LatLong.pipe(nlp)

diff --git a/traiter/pylib/pipes/tokenizer.py b/traiter/pylib/pipes/tokenizer.py
@@ -4,6 +4,7 @@
 The default Spacy tokenizer works great for model-based parsing but sometimes causes
 complications for rule-based parsers.
 """
+
 import csv
 import re
 import string

diff --git a/traiter/pylib/rules/elevation.py b/traiter/pylib/rules/elevation.py
@@ -18,7 +18,7 @@
 @dataclass(eq=False)
 class Elevation(Base):
     # Class vars ----------
-    float_re: ClassVar[str] = r"^(\d[\d,.]+)\Z"
+    float_re: ClassVar[str] = r"^(\d[\d,.]+)$"
     all_units: ClassVar[list[str]] = ["metric_length", "imperial_length"]
     elevation_csv: ClassVar[Path] = (
         Path(__file__).parent / "terms" / "elevation_terms.csv"
@@ -65,6 +65,7 @@ def pipe(cls, nlp: Language):
             name="elevation_patterns",
             compiler=cls.elevation_compilers(),
         )
+        # add.debug_tokens(nlp)  # ##########################################
         add.cleanup_pipe(nlp, name="elevation_cleanup")
 
     @classmethod
@@ -81,8 +82,8 @@ def elevation_compilers(cls):
                     "-/to": {"LOWER": {"IN": [*const.DASH, "to", "_"]}, "OP": "+"},
                     "/": {"TEXT": {"IN": const.SLASH}},
                     "99": {"TEXT": {"REGEX": cls.float_re}},
-                    ":": {"TEXT": {"REGEX": rf"^{label_ender}+\Z"}},
-                    ",": {"TEXT": {"REGEX": rf"^{label_ender}+\Z"}},
+                    ":": {"TEXT": {"REGEX": rf"^{label_ender}+$"}},
+                    ",": {"TEXT": {"REGEX": rf"^{label_ender}+$"}},
                     "about": {"ENT_TYPE": "about_term"},
                     "label": {"ENT_TYPE": "elev_label"},
                     "m": {"ENT_TYPE": {"IN": cls.all_units}},

diff --git a/traiter/pylib/rules/number.py b/traiter/pylib/rules/number.py
@@ -0,0 +1,119 @@
+import re
+from dataclasses import dataclass
+
+from spacy import Language, registry
+
+from traiter.pylib import const as t_const
+from traiter.pylib.darwin_core import DarwinCore
+from traiter.pylib.pattern_compiler import Compiler
+from traiter.pylib.pipes import add
+from traiter.pylib.rules.base import Base
+from traiter.pylib.util import to_positive_float as as_float
+
+# Regex fragments used to build the token patterns below. They are unanchored;
+# the pattern decoders wrap them in ^...$ so each must match a whole token.
+# 1-4 digits, optionally followed by a dot and up to 3 more digits
+FLOAT_RE: str = r"\d{1,4}(\.\d{,3})?"
+# Exactly 3 digits, optionally followed by a dot and up to 3 more digits; used
+# for the group that follows a comma in the " 99 , 999.0 " pattern
+FLOAT3_RE: str = r"\d{3}(\.\d{,3})?"
+# 1-4 digits
+INT_RE: str = r"\d{1,4}"
+# A leading dot followed by 1-3 digits
+DEC_RE: str = r"\.\d{1,3}"
+
+# A fraction match with more integers than this (numerator + denominator) is
+# treated as also having a leading whole-number part
+FACT_LEN = 2
+
+# The Number pipe may be added to a pipeline more than once, so every call to
+# Number.pipe() bumps this counter and uses it as a suffix for the pipe names
+NUMBER_COUNT = 0  # Used to rename the Number pipe
+
+
+@dataclass(eq=False)
+class Number(Base):
+    """A numeric trait: a plain number or a (possibly mixed) fraction.
+
+    Matching is token based. The module-level regex fragments (INT_RE,
+    FLOAT_RE, ...) are anchored with ^...$ so each one must match a whole token.
+    """
+
+    # Parsed numeric value of the matched text
+    number: float = None
+    # Set to True by fract_match(); number_match() leaves it as None
+    is_fraction: bool = None
+
+    def to_dwc(self, dwc) -> DarwinCore:
+        """Add this trait to a Darwin Core record.
+
+        NOTE(review): add_dyn() is called without any fields, so the numeric
+        value itself is not passed along -- confirm that this is intentional.
+        """
+        return dwc.add_dyn()
+
+    @classmethod
+    def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
+        """Add the fraction and number trait pipes to the pipeline.
+
+        This pipe can be added several times, so the module-level NUMBER_COUNT
+        is incremented on every call and used as a suffix to keep the pipe
+        names unique. The fraction pipe is added before the number pipe.
+
+        Args:
+            nlp: The spaCy pipeline to extend.
+            _overwrite: Unused here.
+        """
+        global NUMBER_COUNT
+        NUMBER_COUNT += 1
+        add.trait_pipe(
+            nlp, name=f"fraction_{NUMBER_COUNT}", compiler=cls.fraction_patterns()
+        )
+        # add.debug_tokens(nlp)  # ###########################################
+
+        add.trait_pipe(
+            nlp, name=f"number_{NUMBER_COUNT}", compiler=cls.number_patterns()
+        )
+        # add.debug_tokens(nlp)  # ###########################################
+
+    @classmethod
+    def number_patterns(cls):
+        """Return the compilers that match plain integers and decimals."""
+        # Each fragment already carries its own quantifiers, so only the ^...$
+        # anchors are added. Appending a further "+" to a quantified fragment is
+        # a "multiple repeat" error before Python 3.11 and a possessive
+        # quantifier from 3.11 on; neither is wanted here.
+        decoder = {
+            ",": {"TEXT": {"IN": t_const.COMMA}},
+            "99.0": {"LOWER": {"REGEX": f"^{FLOAT_RE}$"}},
+            "999.0": {"LOWER": {"REGEX": f"^{FLOAT3_RE}$"}},
+            "99": {"LOWER": {"REGEX": f"^{INT_RE}$"}},
+            ".99": {"LOWER": {"REGEX": f"^{DEC_RE}$"}},
+        }
+        return [
+            Compiler(
+                label="number",
+                keep="number",
+                on_match="number_match",
+                decoder=decoder,
+                patterns=[
+                    " 99.0 ",
+                    " 99 , 999.0 ",
+                    " .99 ",
+                ],
+            ),
+        ]
+
+    @classmethod
+    def fraction_patterns(cls):
+        """Return the compilers that match simple and mixed fractions."""
+        # See number_patterns(): INT_RE is already quantified, so it is only
+        # anchored and not followed by an extra "+".
+        decoder = {
+            "/": {"TEXT": {"IN": t_const.SLASH}},
+            "99": {"LOWER": {"REGEX": f"^{INT_RE}$"}},
+        }
+        return [
+            Compiler(
+                label="number",
+                keep="number",
+                on_match="fract_match",
+                decoder=decoder,
+                patterns=[
+                    "    99 / 99 ",
+                    " 99 99 / 99 ",
+                ],
+            ),
+        ]
+
+    @classmethod
+    def number_match(cls, ent):
+        """Build a Number trait from a plain-number match.
+
+        The whole matched text is converted with to_positive_float(). The trait
+        and a "number" flag are also stored on the first token of the match.
+
+        Args:
+            ent: The matched span.
+
+        Returns:
+            The new Number trait.
+        """
+        number = as_float(ent.text)
+        trait = cls.from_ent(ent, number=number)
+        ent[0]._.trait = trait
+        ent[0]._.flag = "number"
+        return trait
+
+    @classmethod
+    def fract_match(cls, ent):
+        """Build a Number trait from a fraction match like "1/2" or "3 1/2".
+
+        Args:
+            ent: The matched span.
+
+        Returns:
+            The new Number trait with is_fraction set to True.
+
+        Raises:
+            RejectMatch: If the denominator is zero (or otherwise falsy), so
+                that text like "1/0" is dropped instead of raising
+                ZeroDivisionError in the middle of the pipeline.
+        """
+        # Imported locally because this module does not import reject_match
+        from traiter.pylib.pipes import reject_match
+
+        # Keep only the integer tokens; this skips the slash
+        numbers = [as_float(t.text) for t in ent if re.match(INT_RE, t.text)]
+
+        denominator = numbers[-1]
+        if not denominator:
+            raise reject_match.RejectMatch
+
+        number = numbers[-2] / denominator  # Calculate the fraction part
+        # Add in the whole number part, present only in the 3 integer pattern
+        number += numbers[0] if len(numbers) > FACT_LEN else 0.0
+
+        trait = cls.from_ent(ent, number=number, is_fraction=True)
+
+        ent[0]._.trait = trait
+        ent[0]._.flag = "number"
+
+        return trait
+
+
+@registry.misc("number_match")
+def number_match(ent):
+    """Registered "number_match" hook; delegates to Number.number_match()."""
+    trait = Number.number_match(ent)
+    return trait
+
+
+@registry.misc("fract_match")
+def fract_match(ent):
+    """Registered "fract_match" hook; delegates to Number.fract_match()."""
+    trait = Number.fract_match(ent)
+    return trait
diff --git a/traiter/pylib/rules/terms/elevation_terms.csv b/traiter/pylib/rules/terms/elevation_terms.csv
@@ -11,7 +11,16 @@ elev_label,cc.
 elev_label,el
 elev_label,el.
 elev_label,elev
+elev_label,elev g.t.
+elev_label,elev g . t.
+elev_label,elev g . t .
 elev_label,elev.
+elev_label,elev. g.t.
+elev_label,elev. g . t.
+elev_label,elev. g . t .
 elev_label,elevation
+elev_label,elevation g.t.
+elev_label,elevation g . t.
+elev_label,elevation g . t .
 elev_label,pass
 elev_label,pass.
diff --git a/traiter/pylib/rules/uuid.py b/traiter/pylib/rules/uuid.py
@@ -0,0 +1,58 @@
+import re
+from dataclasses import dataclass
+from typing import ClassVar
+
+from spacy import Language, registry
+
+from traiter.pylib import const as t_const
+from traiter.pylib.darwin_core import DarwinCore
+from traiter.pylib.pattern_compiler import Compiler
+from traiter.pylib.pipes import add, reject_match
+from traiter.pylib.rules.base import Base
+
+
+@dataclass(eq=False)
+class Uuid(Base):
+    """Trait for a UUID written as five dash-separated groups of hex digits."""
+
+    # Class vars ----------
+    # One hexadecimal digit
+    hx: ClassVar[str] = "[0-9A-Fa-f]"
+    # The full 8-4-4-4-12 layout, used to validate the text of a candidate match
+    whole: ClassVar[str] = rf"^{hx}{{8}}-{hx}{{4}}-{hx}{{4}}-{hx}{{4}}-{hx}{{12}}$"
+    # ---------------------
+
+    uuid: str = None
+
+    def to_dwc(self, dwc) -> DarwinCore:
+        """Pass the UUID text to dwc.add_dyn() under the name UUID."""
+        return dwc.add_dyn(UUID=self.uuid)
+
+    @classmethod
+    def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
+        """Add the UUID trait pipe to the pipeline; _overwrite is unused here."""
+        compilers = cls.uuid_patterns()
+        add.trait_pipe(nlp, name="uuid_patterns", compiler=compilers)
+        # add.debug_tokens(nlp)  # ###########################################
+
+    @classmethod
+    def uuid_patterns(cls):
+        """Return the compiler for the hex/dash token pattern.
+
+        The token pattern is deliberately loose: each "hex" slot accepts one or
+        more hex-only tokens of any length. uuid_match() then checks the exact
+        group lengths.
+        """
+        dash = {"TEXT": {"IN": t_const.DASH}}
+        hex_run = {"LOWER": {"REGEX": f"^{cls.hx}+$"}, "OP": "+"}
+        compiler = Compiler(
+            label="uuid",
+            keep="uuid",
+            on_match="uuid_match",
+            decoder={"-": dash, "hex": hex_run},
+            patterns=[
+                " hex - hex - hex - hex - hex ",
+            ],
+        )
+        return [compiler]
+
+    @classmethod
+    def uuid_match(cls, ent):
+        """Build a Uuid trait from a match, or reject it.
+
+        Raises:
+            RejectMatch: If the matched text does not fit the 8-4-4-4-12 layout.
+        """
+        text = ent.text
+        if re.search(cls.whole, text):
+            return cls.from_ent(ent, uuid=text)
+        raise reject_match.RejectMatch
+
+
+@registry.misc("uuid_match")
+def uuid_match(ent):
+    """Registered "uuid_match" hook; delegates to Uuid.uuid_match()."""
+    trait = Uuid.uuid_match(ent)
+    return trait