Skip to content

Commit

Permalink
Add UUID & number traits, & modify elevation terms
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Mar 14, 2024
1 parent 94a4cb8 commit d03ad72
Show file tree
Hide file tree
Showing 7 changed files with 242 additions and 3 deletions.
49 changes: 49 additions & 0 deletions tests/rules/test_uuid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import unittest

from tests.setup import parse
from traiter.pylib.rules.date_ import Date
from traiter.pylib.rules.uuid import Uuid


class TestUuid(unittest.TestCase):
    """Check that UUIDs are parsed into ``Uuid`` traits with correct offsets."""

    def test_uuid_01(self):
        """A bare UUID is the only trait and spans the whole input."""
        text = "c701563b-dbd9-4500-184f-1ad61eb8da11"
        expected = [
            Uuid(
                uuid="c701563b-dbd9-4500-184f-1ad61eb8da11",
                trait="uuid",
                start=0,
                end=36,
            ),
        ]
        self.assertEqual(parse(text), expected)

    def test_uuid_02(self):
        """Two ';'-separated UUIDs in a JSON-like record parse separately.

        The date earlier in the same record is still reported as a ``Date``.
        """
        text = (
            '{"created": "2014-10-29", "relatedresourceid": '
            '"eeba8b10-040e-4477-a0a6-870102b56234;'
            'abbf14f5-1a7c-48f6-8f2f-2a8af53c8c86"}'
        )
        created = Date(
            trait="date",
            start=13,
            end=23,
            date="2014-10-29",
        )
        first_uuid = Uuid(
            uuid="eeba8b10-040e-4477-a0a6-870102b56234",
            trait="uuid",
            start=48,
            end=84,
        )
        second_uuid = Uuid(
            uuid="abbf14f5-1a7c-48f6-8f2f-2a8af53c8c86",
            trait="uuid",
            start=85,
            end=121,
        )
        self.assertEqual(parse(text), [created, first_uuid, second_uuid])
2 changes: 2 additions & 0 deletions traiter/pylib/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from traiter.pylib.rules.lat_long import LatLong
from traiter.pylib.rules.trs import TRS
from traiter.pylib.rules.utm import UTM
from traiter.pylib.rules.uuid import Uuid


def build():
Expand All @@ -20,6 +21,7 @@ def build():
nlp.add_pipe(sentence.SENTENCES, before="parser")

Color.pipe(nlp)
Uuid.pipe(nlp)
Date.pipe(nlp)
Elevation.pipe(nlp)
LatLong.pipe(nlp)
Expand Down
1 change: 1 addition & 0 deletions traiter/pylib/pipes/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
The default Spacy tokenizer works great for model-based parsing but sometimes causes
complications for rule-based parsers.
"""

import csv
import re
import string
Expand Down
7 changes: 4 additions & 3 deletions traiter/pylib/rules/elevation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
@dataclass(eq=False)
class Elevation(Base):
# Class vars ----------
float_re: ClassVar[str] = r"^(\d[\d,.]+)\Z"
float_re: ClassVar[str] = r"^(\d[\d,.]+)$"
all_units: ClassVar[list[str]] = ["metric_length", "imperial_length"]
elevation_csv: ClassVar[Path] = (
Path(__file__).parent / "terms" / "elevation_terms.csv"
Expand Down Expand Up @@ -65,6 +65,7 @@ def pipe(cls, nlp: Language):
name="elevation_patterns",
compiler=cls.elevation_compilers(),
)
# add.debug_tokens(nlp) # ##########################################
add.cleanup_pipe(nlp, name="elevation_cleanup")

@classmethod
Expand All @@ -81,8 +82,8 @@ def elevation_compilers(cls):
"-/to": {"LOWER": {"IN": [*const.DASH, "to", "_"]}, "OP": "+"},
"/": {"TEXT": {"IN": const.SLASH}},
"99": {"TEXT": {"REGEX": cls.float_re}},
":": {"TEXT": {"REGEX": rf"^{label_ender}+\Z"}},
",": {"TEXT": {"REGEX": rf"^{label_ender}+\Z"}},
":": {"TEXT": {"REGEX": rf"^{label_ender}+$"}},
",": {"TEXT": {"REGEX": rf"^{label_ender}+$"}},
"about": {"ENT_TYPE": "about_term"},
"label": {"ENT_TYPE": "elev_label"},
"m": {"ENT_TYPE": {"IN": cls.all_units}},
Expand Down
119 changes: 119 additions & 0 deletions traiter/pylib/rules/number.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import re
from dataclasses import dataclass

from spacy import Language, registry

from traiter.pylib import const as t_const
from traiter.pylib.darwin_core import DarwinCore
from traiter.pylib.pattern_compiler import Compiler
from traiter.pylib.pipes import add
from traiter.pylib.rules.base import Base
from traiter.pylib.util import to_positive_float as as_float

# Regex fragments used to build the token patterns in Number's decoders.
# 1 to 4 digits, optionally followed by "." and 0 to 3 more digits.
FLOAT_RE: str = r"\d{1,4}(\.\d{,3})?"
# Exactly 3 digits with the same optional decimal part; used for the digit
# group that follows a comma in the " 99 , 999.0 " number pattern.
FLOAT3_RE: str = r"\d{3}(\.\d{,3})?"
# 1 to 4 digits with no decimal part.
INT_RE: str = r"\d{1,4}"
# A leading "." followed by 1 to 3 digits.
DEC_RE: str = r"\.\d{1,3}"

# How many numbers a plain fraction has (numerator and denominator); a fraction
# match with more numbers than this also has a leading whole-number part.
FACT_LEN = 2

# This pipe is used multiple times
# Incremented on every Number.pipe() call and embedded in the pipe names.
NUMBER_COUNT = 0 # Used to rename the Number pipe


@dataclass(eq=False)
class Number(Base):
    """A numeric trait parsed from text: a plain number or a fraction.

    Attributes:
        number: The parsed value; for fractions this is the evaluated result.
        is_fraction: Set to True by ``fract_match``; left as None otherwise.
    """

    number: float | None = None
    is_fraction: bool | None = None

    def to_dwc(self, dwc) -> DarwinCore:
        """Return the result of ``dwc.add_dyn()``; no fields are passed to it."""
        return dwc.add_dyn()

    @classmethod
    def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
        """Add the fraction and number trait pipes to ``nlp``.

        This may be called several times for one pipeline, so each call bumps
        the module-level ``NUMBER_COUNT`` and uses it to name the pipes.
        The fraction pipe is added before the number pipe.
        """
        global NUMBER_COUNT
        NUMBER_COUNT += 1
        add.trait_pipe(
            nlp, name=f"fraction_{NUMBER_COUNT}", compiler=cls.fraction_patterns()
        )
        # add.debug_tokens(nlp)  # debugging aid, normally disabled

        add.trait_pipe(
            nlp, name=f"number_{NUMBER_COUNT}", compiler=cls.number_patterns()
        )
        # add.debug_tokens(nlp)  # debugging aid, normally disabled

    @classmethod
    def number_patterns(cls):
        """Return the compilers that recognize plain numbers.

        The regexes are anchored with ``^``/``$`` and used as-is. The shared
        fragments already carry their own repetition counts, so appending
        another ``+`` would stack a second quantifier on them (a "multiple
        repeat" error before Python 3.11, a possessive quantifier after).
        """
        decoder = {
            ",": {"TEXT": {"IN": t_const.COMMA}},
            "99.0": {"LOWER": {"REGEX": f"^{FLOAT_RE}$"}},
            "999.0": {"LOWER": {"REGEX": f"^{FLOAT3_RE}$"}},
            "99": {"LOWER": {"REGEX": f"^{INT_RE}$"}},
            ".99": {"LOWER": {"REGEX": f"^{DEC_RE}$"}},
        }
        return [
            Compiler(
                label="number",
                keep="number",
                on_match="number_match",
                decoder=decoder,
                patterns=[
                    " 99.0 ",
                    " 99 , 999.0 ",
                    " .99 ",
                ],
            ),
        ]

    @classmethod
    def fraction_patterns(cls):
        """Return the compilers that recognize fractions like "1/2" or "1 1/2"."""
        decoder = {
            "/": {"TEXT": {"IN": t_const.SLASH}},
            "99": {"LOWER": {"REGEX": f"^{INT_RE}$"}},
        }
        return [
            Compiler(
                label="number",
                keep="number",
                on_match="fract_match",
                decoder=decoder,
                patterns=[
                    " 99 / 99 ",
                    " 99 99 / 99 ",
                ],
            ),
        ]

    @classmethod
    def number_match(cls, ent):
        """Build a Number trait from a matched plain-number entity.

        The trait is also stored on the entity's first token, which is flagged
        as "number".
        """
        number = as_float(ent.text)
        trait = cls.from_ent(ent, number=number)
        ent[0]._.trait = trait
        ent[0]._.flag = "number"
        return trait

    @classmethod
    def fract_match(cls, ent):
        """Build a Number trait from a matched fraction entity.

        The last two numeric tokens are the numerator and denominator; when a
        third numeric token precedes them it is added as the whole-number part.

        Raises:
            RejectMatch: If the denominator is zero (or could not be
                converted), since the fraction then has no value.
        """
        # Imported locally so this block does not depend on a file-level import.
        from traiter.pylib.pipes import reject_match

        numbers = [as_float(t.text) for t in ent if re.match(INT_RE, t.text)]

        denominator = numbers[-1]
        if not denominator:
            # Text such as "1/0" is not a usable fraction; drop the match
            # instead of crashing with ZeroDivisionError.
            raise reject_match.RejectMatch

        number = numbers[-2] / denominator  # Calculate the fraction part
        # Add in the whole number part
        number += numbers[0] if len(numbers) > FACT_LEN else 0.0

        trait = cls.from_ent(ent, number=number, is_fraction=True)

        ent[0]._.trait = trait
        ent[0]._.flag = "number"

        return trait


@registry.misc("number_match")
def number_match(ent):
    """Module-level hook, registered as "number_match", for Number.number_match."""
    trait = Number.number_match(ent)
    return trait


@registry.misc("fract_match")
def fract_match(ent):
    """Module-level hook, registered as "fract_match", for Number.fract_match."""
    trait = Number.fract_match(ent)
    return trait
9 changes: 9 additions & 0 deletions traiter/pylib/rules/terms/elevation_terms.csv
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,16 @@ elev_label,cc.
elev_label,el
elev_label,el.
elev_label,elev
elev_label,elev g.t.
elev_label,elev g . t.
elev_label,elev g . t .
elev_label,elev.
elev_label,elev. g.t.
elev_label,elev. g . t.
elev_label,elev. g . t .
elev_label,elevation
elev_label,elevation g.t.
elev_label,elevation g . t.
elev_label,elevation g . t .
elev_label,pass
elev_label,pass.
58 changes: 58 additions & 0 deletions traiter/pylib/rules/uuid.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import re
from dataclasses import dataclass
from typing import ClassVar

from spacy import Language, registry

from traiter.pylib import const as t_const
from traiter.pylib.darwin_core import DarwinCore
from traiter.pylib.pattern_compiler import Compiler
from traiter.pylib.pipes import add, reject_match
from traiter.pylib.rules.base import Base


@dataclass(eq=False)
class Uuid(Base):
    """A UUID trait in the 8-4-4-4-12 hexadecimal layout."""

    # Class vars ----------
    # A single hexadecimal digit, either case.
    hx: ClassVar[str] = "[0-9A-Fa-f]"
    # The complete UUID layout, anchored at both ends.
    whole: ClassVar[str] = rf"^{hx}{{8}}-{hx}{{4}}-{hx}{{4}}-{hx}{{4}}-{hx}{{12}}$"
    # ---------------------

    uuid: str = None

    def to_dwc(self, dwc) -> DarwinCore:
        """Record the UUID on ``dwc`` under the ``UUID`` key via ``add_dyn``."""
        return dwc.add_dyn(UUID=self.uuid)

    @classmethod
    def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
        """Add the UUID trait pipe, named "uuid_patterns", to ``nlp``."""
        compilers = cls.uuid_patterns()
        add.trait_pipe(nlp, name="uuid_patterns", compiler=compilers)
        # add.debug_tokens(nlp)  # debugging aid, normally disabled

    @classmethod
    def uuid_patterns(cls):
        """Return the compilers that find UUID candidates.

        The token pattern is loose (runs of hex tokens joined by dashes); the
        exact group lengths are checked afterwards in ``uuid_match``.
        """
        token_rules = {
            "-": {"TEXT": {"IN": t_const.DASH}},
            "hex": {"LOWER": {"REGEX": f"^{cls.hx}+$"}, "OP": "+"},
        }
        uuid_compiler = Compiler(
            label="uuid",
            keep="uuid",
            on_match="uuid_match",
            decoder=token_rules,
            patterns=[
                " hex - hex - hex - hex - hex ",
            ],
        )
        return [uuid_compiler]

    @classmethod
    def uuid_match(cls, ent):
        """Build a Uuid trait from a candidate entity.

        Raises:
            RejectMatch: If the entity text does not fit the ``whole`` pattern.
        """
        text = ent.text
        if re.search(cls.whole, text):
            return cls.from_ent(ent, uuid=text)
        raise reject_match.RejectMatch


@registry.misc("uuid_match")
def uuid_match(ent):
    """Module-level hook, registered as "uuid_match", for Uuid.uuid_match."""
    trait = Uuid.uuid_match(ent)
    return trait

0 comments on commit d03ad72

Please sign in to comment.