-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add UUID & number traits, & modify elevation terms
- Loading branch information
1 parent
94a4cb8
commit d03ad72
Showing
7 changed files
with
242 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
import unittest | ||
|
||
from tests.setup import parse | ||
from traiter.pylib.rules.date_ import Date | ||
from traiter.pylib.rules.uuid import Uuid | ||
|
||
|
||
class TestUuid(unittest.TestCase):
    """Check that UUID strings in text are parsed into ``Uuid`` traits."""

    def test_uuid_01(self):
        """A bare UUID is parsed as one trait covering characters 0-36."""
        text = "c701563b-dbd9-4500-184f-1ad61eb8da11"
        actual = parse(text)
        expected = [
            Uuid(uuid=text, trait="uuid", start=0, end=36),
        ]
        self.assertEqual(actual, expected)

    def test_uuid_02(self):
        """Two ';'-separated UUIDs in a JSON-like string follow a date."""
        text = (
            '{"created": "2014-10-29", "relatedresourceid": '
            '"eeba8b10-040e-4477-a0a6-870102b56234;'
            'abbf14f5-1a7c-48f6-8f2f-2a8af53c8c86"}'
        )
        actual = parse(text)

        created = Date(trait="date", start=13, end=23, date="2014-10-29")
        first_id = Uuid(
            uuid="eeba8b10-040e-4477-a0a6-870102b56234",
            trait="uuid",
            start=48,
            end=84,
        )
        second_id = Uuid(
            uuid="abbf14f5-1a7c-48f6-8f2f-2a8af53c8c86",
            trait="uuid",
            start=85,
            end=121,
        )
        self.assertEqual(actual, [created, first_id, second_id])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import re | ||
from dataclasses import dataclass | ||
|
||
from spacy import Language, registry | ||
|
||
from traiter.pylib import const as t_const | ||
from traiter.pylib.darwin_core import DarwinCore | ||
from traiter.pylib.pattern_compiler import Compiler | ||
from traiter.pylib.pipes import add | ||
from traiter.pylib.rules.base import Base | ||
from traiter.pylib.util import to_positive_float as as_float | ||
|
||
# Regular-expression fragments describing the numeric token shapes
FLOAT_RE: str = r"\d{1,4}(\.\d{,3})?"  # 1-4 digits, optional "." + up to 3 digits
FLOAT3_RE: str = r"\d{3}(\.\d{,3})?"  # exactly 3 digits, optional "." + up to 3 digits
INT_RE: str = r"\d{1,4}"  # 1-4 digits
DEC_RE: str = r"\.\d{1,3}"  # "." followed by 1-3 digits

# A fraction with more numbers than this also carries a whole-number part
FACT_LEN = 2

# This pipe is used multiple times
NUMBER_COUNT = 0  # Used to rename the Number pipe
|
||
|
||
@dataclass(eq=False)
class Number(Base):
    """A numeric value found in text, either a plain number or a fraction.

    Attributes are filled in by the match callbacks below:
    ``number`` holds the parsed value and ``is_fraction`` is set to True
    only by ``fract_match``.
    """

    number: float | None = None
    is_fraction: bool | None = None

    def to_dwc(self, dwc) -> DarwinCore:
        """Add this trait to ``dwc``; no dynamic fields are passed along."""
        return dwc.add_dyn()

    @classmethod
    def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
        """Add the fraction pipe and then the number pipe to ``nlp``.

        The pipe may be added more than once, so every call bumps the
        module-level ``NUMBER_COUNT`` and uses it as a suffix to keep the
        pipe names unique. ``_overwrite`` is accepted but not used here.
        """
        global NUMBER_COUNT
        NUMBER_COUNT += 1
        add.trait_pipe(
            nlp, name=f"fraction_{NUMBER_COUNT}", compiler=cls.fraction_patterns()
        )
        # add.debug_tokens(nlp)  # ###########################################

        add.trait_pipe(
            nlp, name=f"number_{NUMBER_COUNT}", compiler=cls.number_patterns()
        )
        # add.debug_tokens(nlp)  # ###########################################

    @classmethod
    def number_patterns(cls):
        """Return the compilers that match plain numbers like 99.0 or 9,999.0.

        The regex fragments already end in their own quantifier, so they are
        only anchored here. Appending another "+" would be a "multiple
        repeat" error before Python 3.11 and a possessive quantifier after.
        """
        decoder = {
            ",": {"TEXT": {"IN": t_const.COMMA}},
            "99.0": {"LOWER": {"REGEX": f"^{FLOAT_RE}$"}},
            "999.0": {"LOWER": {"REGEX": f"^{FLOAT3_RE}$"}},
            "99": {"LOWER": {"REGEX": f"^{INT_RE}$"}},
            ".99": {"LOWER": {"REGEX": f"^{DEC_RE}$"}},
        }
        return [
            Compiler(
                label="number",
                keep="number",
                on_match="number_match",
                decoder=decoder,
                patterns=[
                    " 99.0 ",
                    " 99 , 999.0 ",
                    " .99 ",
                ],
            ),
        ]

    @classmethod
    def fraction_patterns(cls):
        """Return the compilers that match fractions like 1/2 or 1 1/2."""
        decoder = {
            "/": {"TEXT": {"IN": t_const.SLASH}},
            # Anchored only: INT_RE carries its own {1,4} quantifier
            "99": {"LOWER": {"REGEX": f"^{INT_RE}$"}},
        }
        return [
            Compiler(
                label="number",
                keep="number",
                on_match="fract_match",
                decoder=decoder,
                patterns=[
                    " 99 / 99 ",
                    " 99 99 / 99 ",
                ],
            ),
        ]

    @classmethod
    def number_match(cls, ent):
        """Build a Number trait from the entity text and tag its first token.

        The trait is also stored on ``ent[0]._.trait`` and the token is
        flagged as "number".
        """
        number = as_float(ent.text)
        trait = cls.from_ent(ent, number=number)
        ent[0]._.trait = trait
        ent[0]._.flag = "number"
        return trait

    @classmethod
    def fract_match(cls, ent):
        """Build a Number trait from a fraction with an optional whole part.

        Raises:
            RejectMatch: if the denominator is zero (or missing), so text
                like "1/0" is discarded instead of raising
                ZeroDivisionError.
        """
        # Local import keeps this block self-contained
        from traiter.pylib.pipes import reject_match

        numbers = [as_float(t.text) for t in ent if re.match(INT_RE, t.text)]

        denominator = numbers[-1]
        if not denominator:
            raise reject_match.RejectMatch

        number = numbers[-2] / denominator  # Calculate the fraction part
        # Add in the whole number part
        number += numbers[0] if len(numbers) > FACT_LEN else 0.0

        trait = cls.from_ent(ent, number=number, is_fraction=True)

        ent[0]._.trait = trait
        ent[0]._.flag = "number"

        return trait
|
||
|
||
@registry.misc("number_match")
def number_match(ent):
    """Registered "number_match" callback; delegates to ``Number.number_match``."""
    trait = Number.number_match(ent)
    return trait
|
||
|
||
@registry.misc("fract_match")
def fract_match(ent):
    """Registered "fract_match" callback; delegates to ``Number.fract_match``."""
    trait = Number.fract_match(ent)
    return trait
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import re | ||
from dataclasses import dataclass | ||
from typing import ClassVar | ||
|
||
from spacy import Language, registry | ||
|
||
from traiter.pylib import const as t_const | ||
from traiter.pylib.darwin_core import DarwinCore | ||
from traiter.pylib.pattern_compiler import Compiler | ||
from traiter.pylib.pipes import add, reject_match | ||
from traiter.pylib.rules.base import Base | ||
|
||
|
||
@dataclass(eq=False)
class Uuid(Base):
    """A UUID found in text, stored verbatim in ``uuid``."""

    # Class vars ----------
    # One hexadecimal character
    hx: ClassVar[str] = "[0-9A-Fa-f]"
    # The full 8-4-4-4-12 layout, used to validate the matched text
    whole: ClassVar[str] = rf"^{hx}{{8}}-{hx}{{4}}-{hx}{{4}}-{hx}{{4}}-{hx}{{12}}$"
    # ---------------------

    uuid: str = None

    def to_dwc(self, dwc) -> DarwinCore:
        """Add the UUID to ``dwc`` as the dynamic field ``UUID``."""
        return dwc.add_dyn(UUID=self.uuid)

    @classmethod
    def pipe(cls, nlp: Language, _overwrite: list[str] | None = None):
        """Add the "uuid_patterns" trait pipe to ``nlp``.

        ``_overwrite`` is accepted but not used here.
        """
        compilers = cls.uuid_patterns()
        add.trait_pipe(nlp, name="uuid_patterns", compiler=compilers)
        # add.debug_tokens(nlp)  # ###########################################

    @classmethod
    def uuid_patterns(cls):
        """Return the compilers matching five dash-separated runs of hex tokens."""
        hex_token = {"LOWER": {"REGEX": f"^{cls.hx}+$"}, "OP": "+"}
        dash_token = {"TEXT": {"IN": t_const.DASH}}

        compiler = Compiler(
            label="uuid",
            keep="uuid",
            on_match="uuid_match",
            decoder={"-": dash_token, "hex": hex_token},
            patterns=[
                " hex - hex - hex - hex - hex ",
            ],
        )
        return [compiler]

    @classmethod
    def uuid_match(cls, ent):
        """Build a Uuid trait when the entity text fits the ``whole`` layout.

        Raises:
            RejectMatch: if the matched text does not have the full
                8-4-4-4-12 shape.
        """
        text = ent.text
        if re.search(cls.whole, text):
            return cls.from_ent(ent, uuid=ent.text)
        raise reject_match.RejectMatch
|
||
|
||
@registry.misc("uuid_match")
def uuid_match(ent):
    """Registered "uuid_match" callback; delegates to ``Uuid.uuid_match``."""
    trait = Uuid.uuid_match(ent)
    return trait