From e5371d86c96136551629abc2bed8a1487e1322f5 Mon Sep 17 00:00:00 2001
From: bigfarts <bigfarts@punymail.com>
Date: Fri, 15 Apr 2022 02:15:01 -0700
Subject: [PATCH] Add line breaking rules for Japanese text. Fixes #1497.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows a line break after any kana character, unless the next character is a gyōtō kinsoku character (one that may not appear at the start of a line), in which case the line may not break at that point.

Also pedantically renamed is_chinese to is_cjk_ideograph, because the function matches Japanese kanji as well as Chinese characters and therefore also causes line breaks on kanji.
---
 epaint/src/text/text_layout.rs | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)
diff --git a/epaint/src/text/text_layout.rs b/epaint/src/text/text_layout.rs
index eacc09584a0..47faa2d38a7 100644
--- a/epaint/src/text/text_layout.rs
+++ b/epaint/src/text/text_layout.rs
@@ -194,8 +194,8 @@ fn line_break(
     let mut row_start_idx = 0;
     let mut non_empty_rows = 0;
 
-    for (i, glyph) in paragraph.glyphs.iter().enumerate() {
-        let potential_row_width = glyph.max_x() - row_start_x;
+    for i in 0..paragraph.glyphs.len() {
+        let potential_row_width = paragraph.glyphs[i].max_x() - row_start_x;
 
         if job.wrap.max_rows > 0 && non_empty_rows >= job.wrap.max_rows {
             break;
@@ -245,7 +245,7 @@ fn line_break(
             }
         }
 
-        row_break_candidates.add(i, glyph.chr);
+        row_break_candidates.add(i, &paragraph.glyphs[i..]);
     }
 
     if row_start_idx < paragraph.glyphs.len() {
@@ -716,6 +716,8 @@ struct RowBreakCandidates {
     space: Option<usize>,
     /// Logograms (single character representing a whole word) are good candidates for line break.
     logogram: Option<usize>,
+    /// Kana (Japanese hiragana and katakana) are line break candidates unless followed by a gyōtō kinsoku character.
+    kana: Option<usize>,
     /// Breaking at a dash is a super-
     /// good idea.
     dash: Option<usize>,
@@ -728,16 +730,19 @@ struct RowBreakCandidates {
 }
 
 impl RowBreakCandidates {
-    fn add(&mut self, index: usize, chr: char) {
+    fn add(&mut self, index: usize, glyphs: &[Glyph]) {
+        let chr = glyphs[0].chr;
         const NON_BREAKING_SPACE: char = '\u{A0}';
         if chr.is_whitespace() && chr != NON_BREAKING_SPACE {
             self.space = Some(index);
-        } else if is_chinese(chr) {
+        } else if is_cjk_ideograph(chr) {
             self.logogram = Some(index);
         } else if chr == '-' {
             self.dash = Some(index);
         } else if chr.is_ascii_punctuation() {
             self.punctuation = Some(index);
+        } else if is_kana(chr) && (glyphs.len() == 1 || !is_gyoto_kinsoku(glyphs[1].chr)) {
+            self.kana = Some(index);
         }
         self.any = Some(index);
     }
@@ -759,6 +764,7 @@ impl RowBreakCandidates {
             self.any
         } else {
             self.space
+                .or(self.kana)
                 .or(self.logogram)
                 .or(self.dash)
                 .or(self.punctuation)
@@ -768,12 +774,25 @@ impl RowBreakCandidates {
 }
 
 #[inline]
-fn is_chinese(c: char) -> bool {
+fn is_cjk_ideograph(c: char) -> bool {
     ('\u{4E00}' <= c && c <= '\u{9FFF}')
         || ('\u{3400}' <= c && c <= '\u{4DBF}')
         || ('\u{2B740}' <= c && c <= '\u{2B81F}')
 }
 
+#[inline]
+fn is_kana(c: char) -> bool {
+    ('\u{3040}' <= c && c <= '\u{309F}') // Hiragana block
+        || ('\u{30A0}' <= c && c <= '\u{30FF}') // Katakana block
+}
+
+#[inline]
+fn is_gyoto_kinsoku(c: char) -> bool {
+    // Gyōtō ("beginning of line") kinsoku characters are characters that Japanese kinsoku shori typesetting rules do not permit at the start of a line.
+    // The list of gyōtō kinsoku characters can be found at <https://en.wikipedia.org/wiki/Line_breaking_rules_in_East_Asian_languages#Characters_not_permitted_on_the_start_of_a_line>.
+    ")]｝〕〉》」』】〙〗〟'\"｠»ヽヾーァィゥェォッャュョヮヵヶぁぃぅぇぉっゃゅょゎゕゖㇰㇱㇲㇳㇴㇵㇶㇷㇸㇹㇺㇻㇼㇽㇾㇿ々〻‐゠–〜?!‼⁇⁈⁉・、:;,。.".contains(c)
+}
+
 // ----------------------------------------------------------------------------
 
 #[test]