Skip to content

Commit

Permalink
feat: Add ignore_script option to compare spoken languages
Browse files Browse the repository at this point in the history
  • Loading branch information
mtrd3v authored and georgkrause committed Nov 19, 2024
1 parent 677ad3f commit 51ad46b
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 17 deletions.
63 changes: 46 additions & 17 deletions langcodes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,14 +674,16 @@ def match_score(self, supported: 'Language') -> int:
)
return 100 - min(self.distance(supported), 100)

def distance(self, supported: 'Language') -> int:
def distance(self, supported: 'Language', ignore_script: bool = False) -> int:
"""
Suppose that `self` is the language that the user desires, and
`supported` is a language that is actually supported.
This method returns a number from 0 to 134 measuring the 'distance'
between the languages (lower numbers are better). This is not a
symmetric relation.
symmetric relation. If `ignore_script` is `True`, the script will
not be used in the comparison, possibly resulting in a smaller
'distance'.
The language distance is not really about the linguistic similarity or
history of the languages; instead, it's based largely on sociopolitical
Expand All @@ -703,25 +705,39 @@ def distance(self, supported: 'Language') -> int:
desired_triple = ('und', 'Zzzz', 'ZZ')
else:
desired_complete = self.prefer_macrolanguage().maximize()
desired_triple = (
desired_complete.language,
desired_complete.script,
desired_complete.territory,
)

if ignore_script:
desired_triple = (
desired_complete.language,
None,
desired_complete.territory,
)
else:
desired_triple = (
desired_complete.language,
desired_complete.script,
desired_complete.territory,
)

if (
supported.language is None
and supported.script is None
and supported.territory is None
):
supported_triple = ('und', 'Zzzz', 'ZZ')
supported_triple = ('und', 'Zzzz', 'ZZ')
else:
supported_complete = supported.prefer_macrolanguage().maximize()
supported_triple = (
supported_complete.language,
supported_complete.script,
supported_complete.territory,
)
if ignore_script:
supported_triple = (
supported_complete.language,
None,
supported_complete.territory,
)
else:
supported_triple = (
supported_complete.language,
supported_complete.script,
supported_complete.territory,
)

return tuple_distance_cached(desired_triple, supported_triple)

Expand Down Expand Up @@ -1648,7 +1664,7 @@ def tag_match_score(
return desired_ld.match_score(supported_ld)


def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int:
def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int:
"""
Tags that expand to the same thing when likely values are filled in get a
distance of 0.
Expand Down Expand Up @@ -1791,14 +1807,20 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language])
>>> tag_distance('ja', 'ja-Latn-US-hepburn')
54
If `ignore_script` is used, the script difference is ignored, so a smaller
distance may be found.
>>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True)
0
>>> # You can read the Shavian script, right?
>>> tag_distance('en', 'en-Shaw')
54
"""
desired_obj = Language.get(desired)
supported_obj = Language.get(supported)
return desired_obj.distance(supported_obj)
return desired_obj.distance(supported_obj, ignore_script)


def best_match(
Expand Down Expand Up @@ -1835,6 +1857,7 @@ def closest_match(
desired_language: Union[str, Language],
supported_languages: Sequence[str],
max_distance: int = 25,
ignore_script: bool = False,
) -> Tuple[str, int]:
"""
You have software that supports any of the `supported_languages`. You want
Expand All @@ -1853,6 +1876,9 @@ def closest_match(
value is 25, and raising it can cause data to be processed in significantly
the wrong language. The documentation for `tag_distance` describes the
distance values in more detail.
`ignore_script` makes the matching ignore scripts, allowing matches to be
found when they wouldn't otherwise be due to different scripts.
When there is a tie for the best matching language, the first one in the
tie will be used.
Expand All @@ -1871,6 +1897,9 @@ def closest_match(
>>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
('und', 1000)
>>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True)
('ja-Latn-hepburn', 0)
"""
desired_language = str(desired_language)

Expand All @@ -1884,7 +1913,7 @@ def closest_match(
return desired_language, 0

match_distances = [
(supported, tag_distance(desired_language, supported))
(supported, tag_distance(desired_language, supported, ignore_script))
for supported in supported_languages
]
match_distances = [
Expand Down
21 changes: 21 additions & 0 deletions langcodes/tests/test_issue_59.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from langcodes import closest_match


def test_language_less_than():
    """Match a bare language tag against a more specific supported tag.

    The desired tag 'pa' is looked up in a supported list containing only
    'pa-PK', with ``ignore_script=True``. The test passes as long as the
    first element of the result is not the "und" placeholder tag.
    """
    desired, supported = 'pa', 'pa-PK'
    result = closest_match(desired, [supported], ignore_script=True)
    # Echo the (tag, distance) pair so it shows up in captured test output.
    print(result)
    assert result[0] != "und"


def test_language_more_than():
    """Match a territory-qualified tag against a bare supported language tag.

    The desired tag 'pa-PK' is looked up in a supported list containing only
    'pa', with ``ignore_script=True``. The test passes as long as the first
    element of the result is not the "und" placeholder tag.
    """
    desired, supported = 'pa-PK', 'pa'
    result = closest_match(desired, [supported], ignore_script=True)
    # Echo the (tag, distance) pair so it shows up in captured test output.
    print(result)
    assert result[0] != "und"

0 comments on commit 51ad46b

Please sign in to comment.