Skip to content

Commit

Permalink
Fixing vocabularies and parsers
Browse files Browse the repository at this point in the history
  • Loading branch information
rafelafrance committed Feb 26, 2024
1 parent f31b65a commit 476fd64
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 43 deletions.
26 changes: 3 additions & 23 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,23 @@
.spyproject/
.vscode/
.idea/
.mypy_cache/
.~*
*-journal

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Linters
.pylintrc
.ruff_cache/
.mypy_cache/

# Environment
.env
Expand All @@ -53,14 +44,3 @@ data/
temp/
*junk*
old/

# Test
.pytest_cache/

# Other code
args/
src/

# Editor files
.~*
*-journal
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ This repository is a library for other Traiter projects and is not designed to b

There are tests which you can run like so:
```bash
export MOCK_DATA=1; python -m unittest discover
export MOCK_DATA=1; python -m unittest discover
```

Please `export MOCK_DATA=0` before you run any scripts on real data.
13 changes: 13 additions & 0 deletions tests/rules/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,3 +297,16 @@ def test_date_22(self):
),
],
)

def test_date_23(self):
    # The month is written as a Roman numeral ("IV") between dash separators.
    actual = parse("Date 9-IV-1977")
    expected = [
        Date(date="1977-04-09", trait="date", start=0, end=14),
    ]
    self.assertEqual(actual, expected)
6 changes: 3 additions & 3 deletions tests/rules/test_lat_long.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def test_lat_long_06(self):
lat_long="""N 33 deg 27' 33", W 111 deg 56' 35\"""",
trait="lat_long",
start=0,
end=35,
end=36,
),
],
)
Expand Down Expand Up @@ -179,7 +179,7 @@ def test_lat_long_12(self):
parse("""Lat. 13.5° - 14°55'S Long. 60.2° - 61°50'W."""),
[
LatLong(
lat_long="""Lat. 13.5° -14° 55'S Long. 60.2° -61° 50' W.""",
lat_long="""Lat. 13.5° -14° 55'S Long. 60.2° -61° 50' W""",
trait="lat_long",
start=0,
end=43,
Expand Down Expand Up @@ -211,7 +211,7 @@ def test_lat_long_14(self):
lat_long="N41° 50.046’ W087° 54.172’",
trait="lat_long",
start=0,
end=26,
end=27,
),
],
)
Expand Down
5 changes: 2 additions & 3 deletions traiter/pylib/rules/date_.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@ class Date(Base):
# Class vars ----------
date_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "date_terms.csv"
month_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "month_terms.csv"
numeric_csv: ClassVar[Path] = Path(__file__).parent / "terms" / "numeric_terms.csv"
all_csvs: ClassVar[list[Path]] = [date_csv, month_csv, numeric_csv]
all_csvs: ClassVar[list[Path]] = [date_csv, month_csv]
sep: ClassVar[str] = "(.,;/_'-"
replace: ClassVar[dict[str, str]] = term_util.term_data(all_csvs, "replace")
# ---------------------
Expand Down Expand Up @@ -105,7 +104,7 @@ def date_match(cls, ent):
century_adjust = None

for token in ent:
# Get the numeric parts
# Get numeric parts, they're sometimes smashed together into 1 token
if re.match(rf"^[\d{cls.sep}]+$", token.text):
parts = [p for p in re.split(rf"[{cls.sep}]+", token.text) if p]
if parts:
Expand Down
28 changes: 16 additions & 12 deletions traiter/pylib/rules/lat_long.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,22 @@ def key(self):
def pipe(cls, nlp: Language):
    """Register the lat/long pipes on ``nlp`` in a fixed order.

    The order is: the term pipe built from ``cls.all_csvs``, the base
    lat/long trait patterns, the "plus" trait patterns (passed
    ``overwrite=["lat_long"]``), and finally the cleanup pipe.
    """
    add.term_pipe(nlp, name="lat_long_terms", path=cls.all_csvs)

    base_compiler = cls.lat_long_patterns()
    add.trait_pipe(nlp, name="lat_long_patterns", compiler=base_compiler)
    # add.debug_tokens(nlp)  # #############################################

    plus_compiler = cls.lat_long_plus_patterns()
    add.trait_pipe(
        nlp,
        name="lat_long_plus_patterns",
        overwrite=["lat_long"],
        compiler=plus_compiler,
    )
    # add.debug_tokens(nlp)  # #############################################

    add.cleanup_pipe(nlp, name="lat_long_cleanup")

@classmethod
def lat_long_patterns(cls):
decoder = {
"(": {"TEXT": {"IN": const.OPEN}},
")": {"TEXT": {"IN": const.CLOSE}},
",": {"TEXT": {"REGEX": r"^[,;._:]\Z"}},
",": {"TEXT": {"REGEX": r"^[,;._:]$"}},
"/": {"TEXT": {"IN": const.SLASH}},
"-": {"TEXT": {"IN": const.DASH}},
"'s": {"LOWER": "'s"},
Expand Down Expand Up @@ -109,21 +111,22 @@ def lat_long_patterns(cls):
patterns=[
(
"label* sp? [-]? 99.0 deg 99.0? min* 99.0? sec* ,* sp? "
" [-]? 99.0 deg 99.0? min* 99.0? sec* (? datum* )?"
" [-]? 99.0 deg 99.0? min* 99.0? sec* ,* (? datum* )?"
),
(
"label* sp? [-]? 99.0 deg* 99.0? min* 99.0? sec* dir ,* sp? "
" [-]? 99.0 deg* 99.0? min* 99.0? sec* dir "
" [-]? 99.0 deg* 99.0? min* 99.0? sec* dir ,* "
"(? datum* )?"
),
(
"label* sp? dir [-]? 99.0 deg* 99.0? min* 99.0? sec* ,* sp? "
" dir [-]? 99.0 deg* 99.0? min* 99.0? sec* "
" dir [-]? 99.0 deg* 99.0? min* 99.0? sec* ,* "
"(? datum* )?"
),
(
"key ,* [-]? 99.0 deg* 99.0? min* 99.0? sec* dir? ,* sp? "
"key ,* [-]? 99.0 deg* 99.0? min* 99.0? sec* dir? (? datum* )?"
"key ,* [-]? 99.0 deg* 99.0? min* 99.0? sec* dir? ,* "
"(? datum* )?"
),
(
"[-]? 99.0 deg* 99.0? min* 99.0? sec* dir? key ,* sp? "
Expand All @@ -133,11 +136,11 @@ def lat_long_patterns(cls):
"key ,* [-]? 99.0 deg* 99.0? min* 99.0? sec* dir? [-] sp? "
"99.0 deg* 99.0? min* 99.0? sec* dir? ,* sp? "
"key ,* [-]? 99.0 deg* 99.0? min* 99.0? sec* dir? [-] sp? "
"99.0 deg* 99.0? min* 99.0? sec* dir? (? datum* )?"
"99.0 deg* 99.0? min* 99.0? sec* dir? ,* (? datum* )?"
),
(
"label* sp? dir99.0 deg* 99.0? min* 99.0? sec* ,* sp? "
" dir99.0 deg* 99.0? min* 99.0? sec* (? datum* )?"
" dir99.0 deg* 99.0? min* 99.0? sec* ,* (? datum* )?"
),
(
"label* sp? [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* sp? "
Expand All @@ -146,10 +149,10 @@ def lat_long_patterns(cls):
),
(
"label* sp? [-]? 99.0 deg 99.0? min* -? 99.0? 's ,* sp? "
" [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? "
" [-]? 99.0 deg 99.0? min* -? 99.0? sec* dir? ,* "
"(? datum* )?"
),
"label* sp? [-]? 99.99 dir? ,* sp? [-]? 99.99 dir? (? datum* )?",
"label* sp? [-]? 99.99 dir? ,* sp? [-]? 99.99 dir? ,* (? datum* )?",
],
),
]
Expand All @@ -158,14 +161,14 @@ def lat_long_patterns(cls):
def lat_long_plus_patterns(cls):
decoder = {
"-": {"TEXT": {"IN": const.DASH}},
",": {"TEXT": {"REGEX": r"^[,;._:]\Z"}},
",": {"TEXT": {"REGEX": r"^[,;._:]$"}},
"(": {"TEXT": {"IN": const.OPEN}},
")": {"TEXT": {"IN": const.CLOSE}},
"datum": {"ENT_TYPE": "datum"},
"datum_label": {"ENT_TYPE": "datum_label"},
"m": {"ENT_TYPE": {"IN": ["metric_length", "imperial_length"]}},
"99.0": {"TEXT": {"REGEX": rf"^{cls.float_re}$"}},
"+99.0": {"TEXT": {"REGEX": r"^(±|\+|-)?\d+(\.\d+)?\Z"}},
"+99.0": {"TEXT": {"REGEX": r"^(±|\+|-)?\d+(\.\d+)?$"}},
"uncert": {"ENT_TYPE": "uncertain_label"},
"lat_long": {"ENT_TYPE": "lat_long"},
"[+]": {"TEXT": {"REGEX": cls.plus}},
Expand All @@ -179,7 +182,7 @@ def lat_long_plus_patterns(cls):
keep=["lat_long"],
decoder=decoder,
patterns=[
"lat_long+ datum_label+ ,* (? datum+ )?",
"lat_long+ ,* datum_label+ ,* (? datum+ )?",
"lat_long+ ,? uncert? ,? +99.0 m ,* datum_label* ,* (? datum* )?",
(
"lat_long+ ,? uncert? ,? [+]* 99.0 m ,* "
Expand All @@ -200,6 +203,7 @@ def format_coords(cls, frags):
coords = re.sub(r"\s(:)", r"\1", coords)
coords = re.sub(r"(?<=\d)([NESWnesw])", r" \1", coords)
coords = re.sub(r"-\s(?=\d)", r"-", coords)
coords = re.sub(r"\s*[,;._:]\s*$", "", coords)
return " ".join(coords.split())

@classmethod
Expand Down
12 changes: 12 additions & 0 deletions traiter/pylib/rules/terms/month_terms.csv
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,15 @@ month,SEPT,text,September
month,SOFT,text,September
month,SOFT.,text,September
month,september,lower,September
roman,i,lower,January
roman,ii,lower,February
roman,iii,lower,March
roman,iv,lower,April
roman,ix,lower,September
roman,v,lower,May
roman,vi,lower,June
roman,vii,lower,July
roman,viii,lower,August
roman,X,text,October
roman,xi,lower,November
roman,xii,lower,December
25 changes: 24 additions & 1 deletion traiter/pylib/rules/terms/name_terms.csv
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ not_name,biological,lower,
not_name,bitter,lower,
not_name,blvd,lower,
not_name,blvd.,lower,
not_name,botanic,lower,
not_name,botanical,lower,
not_name,bottom,lower,
not_name,boulevard,lower,
Expand All @@ -107,7 +108,6 @@ not_name,cooperative,lower,
not_name,coordinates,lower,
not_name,copyight,lower,
not_name,cottage,lower,
not_name,county,lower,
not_name,cultivation,lower,
not_name,data,lower,
not_name,database,lower,
Expand Down Expand Up @@ -301,6 +301,7 @@ not_name,tabletop,lower,
not_name,technical,lower,
not_name,temperate,lower,
not_name,ten.mile,lower,
not_name,tissue,lower,
not_name,trail,lower,
not_name,trailhead,lower,
not_name,training,lower,
Expand All @@ -323,6 +324,7 @@ not_name,voucher,lower,
not_name,wilderness,lower,
not_name,wildlife,lower,
not_name,zone,lower,
not_name_prefix,flora of,lower,
not_name_prefix,fort,lower,
not_name_prefix,project,lower,
not_name_prefix,project.,lower,
Expand All @@ -335,13 +337,34 @@ not_name_prefix,univ. of,lower,
not_name_prefix,university of,lower,
not_name_prefix,the university,lower,
not_name_prefix,the university of,lower,
not_name_suffix,area,lower,
not_name_suffix,barcode,lower,
not_name_suffix,biological,lower,
not_name_suffix,co,lower,
not_name_suffix,co.,lower,
not_name_suffix,collection,lower,
not_name_suffix,county,lower,
not_name_suffix,dr,lower,
not_name_suffix,drive,lower,
not_name_suffix,foundation,lower,
not_name_suffix,garden,lower,
not_name_suffix,gardens,lower,
not_name_suffix,lane,lower,
not_name_suffix,ln,lower,
not_name_suffix,mountain,lower,
not_name_suffix,mountains,lower,
not_name_suffix,mts,lower,
not_name_suffix,mts.,lower,
not_name_suffix,rd,lower,
not_name_suffix,rd.,lower,
not_name_suffix,reserve,lower,
not_name_suffix,road,lower,
not_name_suffix,sample,lower,
not_name_suffix,st,lower,
not_name_suffix,st.,lower,
not_name_suffix,street,lower,
not_name_suffix,unit,lower,
not_name_suffix,widerness,lower,
not_name_suffix,wilderness,lower,
last_prefix,del,lower,
last_prefix,der,lower,
last_prefix,la,lower,
Expand Down
1 change: 1 addition & 0 deletions traiter/pylib/rules/terms/us_location_terms.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1614,6 +1614,7 @@ us_county,ST MARY,text,,LA,
us_county,ST MARYS,text,,MD,
us_county,ST TAMMANY,text,,LA,
us_county,ST. BERNARD,text,,LA,
us_county,ST. BERNARDO,text,St. Bernard,LA,
us_county,ST. CHARLES,text,,LA MO,
us_county,ST. CLAIR,text,,AL IL MI MO,
us_county,ST. CROIX,text,,WI,
Expand Down

0 comments on commit 476fd64

Please sign in to comment.