From 51ad46bc385487cae936c94cbda460404d71e21d Mon Sep 17 00:00:00 2001 From: MTR Date: Fri, 26 Jul 2024 19:37:40 +0500 Subject: [PATCH] feat: Add ignore_script option to compare spoken languages --- langcodes/__init__.py | 63 +++++++++++++++++++++++--------- langcodes/tests/test_issue_59.py | 21 +++++++++++ 2 files changed, 67 insertions(+), 17 deletions(-) create mode 100644 langcodes/tests/test_issue_59.py diff --git a/langcodes/__init__.py b/langcodes/__init__.py index 7d78d3e6..09259588 100644 --- a/langcodes/__init__.py +++ b/langcodes/__init__.py @@ -674,14 +674,16 @@ def match_score(self, supported: 'Language') -> int: ) return 100 - min(self.distance(supported), 100) - def distance(self, supported: 'Language') -> int: + def distance(self, supported: 'Language', ignore_script: bool = False) -> int: """ Suppose that `self` is the language that the user desires, and `supported` is a language that is actually supported. This method returns a number from 0 to 134 measuring the 'distance' between the languages (lower numbers are better). This is not a - symmetric relation. + symmetric relation. If `ignore_script` is `True`, the script will + not be used in the comparison, possibly resulting in a smaller + 'distance'. 
The language distance is not really about the linguistic similarity or history of the languages; instead, it's based largely on sociopolitical @@ -703,25 +705,39 @@ def distance(self, supported: 'Language') -> int: desired_triple = ('und', 'Zzzz', 'ZZ') else: desired_complete = self.prefer_macrolanguage().maximize() - desired_triple = ( - desired_complete.language, - desired_complete.script, - desired_complete.territory, - ) - + if ignore_script: + desired_triple = ( + desired_complete.language, + None, + desired_complete.territory, + ) + else: + desired_triple = ( + desired_complete.language, + desired_complete.script, + desired_complete.territory, + ) + if ( supported.language is None and supported.script is None and supported.territory is None ): - supported_triple = ('und', 'Zzzz', 'ZZ') + supported_triple = ('und', 'Zzzz', 'ZZ') else: supported_complete = supported.prefer_macrolanguage().maximize() - supported_triple = ( - supported_complete.language, - supported_complete.script, - supported_complete.territory, - ) + if ignore_script: + supported_triple = ( + supported_complete.language, + None, + supported_complete.territory, + ) + else: + supported_triple = ( + supported_complete.language, + supported_complete.script, + supported_complete.territory, + ) return tuple_distance_cached(desired_triple, supported_triple) @@ -1648,7 +1664,7 @@ def tag_match_score( return desired_ld.match_score(supported_ld) -def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int: +def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int: """ Tags that expand to the same thing when likely values are filled in get a distance of 0. 
@@ -1791,6 +1807,12 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) >>> tag_distance('ja', 'ja-Latn-US-hepburn') 54 + + If `ignore_script` is used, the script difference is ignored and a smaller + difference with lower score will be found. + + >>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True) + 0 >>> # You can read the Shavian script, right? >>> tag_distance('en', 'en-Shaw') @@ -1798,7 +1820,7 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) """ desired_obj = Language.get(desired) supported_obj = Language.get(supported) - return desired_obj.distance(supported_obj) + return desired_obj.distance(supported_obj, ignore_script) def best_match( @@ -1835,6 +1857,7 @@ def closest_match( desired_language: Union[str, Language], supported_languages: Sequence[str], max_distance: int = 25, + ignore_script: bool = False, ) -> Tuple[str, int]: """ You have software that supports any of the `supported_languages`. You want @@ -1853,6 +1876,9 @@ def closest_match( value is 25, and raising it can cause data to be processed in significantly the wrong language. The documentation for `tag_distance` describes the distance values in more detail. + + `ignore_script` makes the matching ignore scripts, allowing matches to be + found when they wouldn't otherwise be due to different scripts. When there is a tie for the best matching language, the first one in the tie will be used. 
@@ -1871,6 +1897,9 @@ def closest_match( >>> closest_match('ja', ['ja-Latn-hepburn', 'en']) ('und', 1000) + + >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True) + ('ja-Latn-hepburn', 0) """ desired_language = str(desired_language) @@ -1884,7 +1913,7 @@ def closest_match( return desired_language, 0 match_distances = [ - (supported, tag_distance(desired_language, supported)) + (supported, tag_distance(desired_language, supported, ignore_script)) for supported in supported_languages ] match_distances = [ diff --git a/langcodes/tests/test_issue_59.py b/langcodes/tests/test_issue_59.py new file mode 100644 index 00000000..8879228d --- /dev/null +++ b/langcodes/tests/test_issue_59.py @@ -0,0 +1,21 @@ +from langcodes import closest_match + + +def test_language_less_than(): + spoken_language_1 = 'pa' + spoken_language_2 = 'pa-PK' + match = closest_match( + spoken_language_1, [spoken_language_2], ignore_script=True + ) + print(match) + assert match[0] != "und" + + +def test_language_more_than(): + spoken_language_1 = 'pa-PK' + spoken_language_2 = 'pa' + match = closest_match( + spoken_language_1, [spoken_language_2], ignore_script=True + ) + print(match) + assert match[0] != "und" \ No newline at end of file