feat: Add ignore_script option to compare spoken languages

georgkrause · Nov 19, 2024 · 51ad46b · 51ad46b
1 parent 677ad3f
commit 51ad46b
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 17 deletions.
diff --git a/langcodes/__init__.py b/langcodes/__init__.py
@@ -674,14 +674,16 @@ def match_score(self, supported: 'Language') -> int:
         )
         return 100 - min(self.distance(supported), 100)
 
-    def distance(self, supported: 'Language') -> int:
+    def distance(self, supported: 'Language', ignore_script: bool = False) -> int:
         """
         Suppose that `self` is the language that the user desires, and
         `supported` is a language that is actually supported.
 
         This method returns a number from 0 to 134 measuring the 'distance'
         between the languages (lower numbers are better). This is not a
-        symmetric relation.
+        symmetric relation. If `ignore_script` is `True`, the script will
+        not be used in the comparison, possibly resulting in a smaller
+        'distance'.
 
         The language distance is not really about the linguistic similarity or
         history of the languages; instead, it's based largely on sociopolitical
@@ -703,25 +705,39 @@ def distance(self, supported: 'Language') -> int:
             desired_triple = ('und', 'Zzzz', 'ZZ')
         else:
             desired_complete = self.prefer_macrolanguage().maximize()
-            desired_triple = (
-                desired_complete.language,
-                desired_complete.script,
-                desired_complete.territory,
-            )
-
+            if ignore_script:
+                desired_triple = (
+                    desired_complete.language,
+                    None,
+                    desired_complete.territory,
+                )
+            else:
+                desired_triple = (
+                    desired_complete.language,
+                    desired_complete.script,
+                    desired_complete.territory,
+                )
+
         if (
             supported.language is None
             and supported.script is None
             and supported.territory is None
         ):
-            supported_triple = ('und', 'Zzzz', 'ZZ')
+            supported_triple = ('und', 'Zzzz', 'ZZ')    
         else:
             supported_complete = supported.prefer_macrolanguage().maximize()
-            supported_triple = (
-                supported_complete.language,
-                supported_complete.script,
-                supported_complete.territory,
-            )
+            if ignore_script:
+               supported_triple = (
+                    supported_complete.language,
+                    None,
+                    supported_complete.territory,
+                )
+            else:
+                supported_triple = (
+                    supported_complete.language,
+                    supported_complete.script,
+                    supported_complete.territory,
+                )
 
         return tuple_distance_cached(desired_triple, supported_triple)
 
@@ -1648,7 +1664,7 @@ def tag_match_score(
     return desired_ld.match_score(supported_ld)
 
 
-def tag_distance(desired: Union[str, Language], supported: Union[str, Language]) -> int:
+def tag_distance(desired: Union[str, Language], supported: Union[str, Language], ignore_script: bool = False) -> int:
     """
     Tags that expand to the same thing when likely values are filled in get a
     distance of 0.
@@ -1791,14 +1807,20 @@ def tag_distance(desired: Union[str, Language], supported: Union[str, Language])
 
     >>> tag_distance('ja', 'ja-Latn-US-hepburn')
     54
+    
+    If `ignore_script` is `True`, the script difference is ignored, which can
+    result in a smaller distance.
+
+    >>> tag_distance('ja', 'ja-Latn-hepburn', ignore_script=True)
+    0
 
     >>> # You can read the Shavian script, right?
     >>> tag_distance('en', 'en-Shaw')
     54
     """
     desired_obj = Language.get(desired)
     supported_obj = Language.get(supported)
-    return desired_obj.distance(supported_obj)
+    return desired_obj.distance(supported_obj, ignore_script)
 
 
 def best_match(
@@ -1835,6 +1857,7 @@ def closest_match(
     desired_language: Union[str, Language],
     supported_languages: Sequence[str],
     max_distance: int = 25,
+    ignore_script: bool = False,
 ) -> Tuple[str, int]:
     """
     You have software that supports any of the `supported_languages`. You want
@@ -1853,6 +1876,9 @@ def closest_match(
     value is 25, and raising it can cause data to be processed in significantly
     the wrong language. The documentation for `tag_distance` describes the
     distance values in more detail.
+    
+    If `ignore_script` is `True`, scripts are ignored during matching, so a
+    match can be found where differing scripts would otherwise prevent one.
 
     When there is a tie for the best matching language, the first one in the
     tie will be used.
@@ -1871,6 +1897,9 @@ def closest_match(
 
     >>> closest_match('ja', ['ja-Latn-hepburn', 'en'])
     ('und', 1000)
+    
+    >>> closest_match('ja', ['ja-Latn-hepburn', 'en'], ignore_script=True)
+    ('ja-Latn-hepburn', 0)
     """
     desired_language = str(desired_language)
 
@@ -1884,7 +1913,7 @@ def closest_match(
         return desired_language, 0
 
     match_distances = [
-        (supported, tag_distance(desired_language, supported))
+        (supported, tag_distance(desired_language, supported, ignore_script))
         for supported in supported_languages
     ]
     match_distances = [

diff --git a/langcodes/tests/test_issue_59.py b/langcodes/tests/test_issue_59.py
@@ -0,0 +1,21 @@
+from langcodes import closest_match
+
+
+def test_language_less_than():
+    spoken_language_1 = 'pa'
+    spoken_language_2 = 'pa-PK'
+    match = closest_match(
+        spoken_language_1, [spoken_language_2], ignore_script=True
+    )
+    print(match)
+    assert match[0] != "und"
+
+
+def test_language_more_than():
+    spoken_language_1 = 'pa-PK'
+    spoken_language_2 = 'pa'
+    match = closest_match(
+        spoken_language_1, [spoken_language_2], ignore_script=True
+    )
+    print(match)
+    assert match[0] != "und"