From fc04b5ed704c0f7d3ca3d6f777ae511736e8b58d Mon Sep 17 00:00:00 2001 From: hippietrail Date: Mon, 17 Feb 2025 19:02:32 +0800 Subject: [PATCH 1/6] feat: capitalize multi-word country and capital city names There's some subjectivity about what's regarded as a country. Some names do not fit in either `proper_noun_capitalization_linters.rs` or `matcher.rs`. The former capitalizes every word, but some have one minor word that shouldn't be capitalized. I didn't check if that logic is already built in, though. The latter expects words separated by spaces only, but some use hyphens. Place names that have an uncapitalized word *and* a hyphen therefore don't fit either. There are some similar cases. We can use a hyphen in the former, but periods don't seem to work for places with names like "St. Foo". The latter is case-sensitive so I put in 3 case variants of each: all lower, first word caps but not the other words, all words caps including minor words which shouldn't be. For place names that have some other variant such as covering both space and hyphen or with and without accent, this adds up to a lot and it's easy to miss one. Plus any other combination of upper and lower case doesn't get flagged. A linter to handle the most common of these would be better. 
--- harper-core/src/linting/lint_group.rs | 9 +- harper-core/src/linting/matcher.rs | 105 +++++++ harper-core/src/linting/mod.rs | 5 +- .../proper_noun_capitalization_linters.rs | 277 +++++++++++++++++- 4 files changed, 383 insertions(+), 13 deletions(-) diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs index fc901c3f..1d77dd66 100644 --- a/harper-core/src/linting/lint_group.rs +++ b/harper-core/src/linting/lint_group.rs @@ -38,8 +38,9 @@ use super::plural_conjugate::PluralConjugate; use super::possessive_your::PossessiveYour; use super::pronoun_contraction::PronounContraction; use super::proper_noun_capitalization_linters::{ - AmazonNames, Americas, AppleNames, Australia, AzureNames, ChineseCommunistParty, GoogleNames, - Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, UnitedOrganizations, + AmazonNames, Americas, AppleNames, Australia, AzureNames, ChineseCommunistParty, Countries, + GoogleNames, Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, NationalCapitals, + UnitedOrganizations, }; use super::repeated_words::RepeatedWords; use super::sentence_capitalization::SentenceCapitalization; @@ -286,7 +287,9 @@ create_lint_group_config!( SpecialAttention => true, Everywhere => true, ThanOthers => true, - SupposedTo => true + SupposedTo => true, + Countries => true, + NationalCapitals => true ); impl Default for LintGroup { diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index 3dca3918..d759673f 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -242,6 +242,111 @@ impl Matcher { "World","War","Ii" => "World War II" }); + // countries and capitals with special casing or punctuation + triggers.extend(pt! 
{ + "andorra","la","vella" => "Andorra la Vella", + "Andorra","La","vella" => "Andorra la Vella", + "Andorra","La","Vella" => "Andorra la Vella", + "antigua","and","barbuda" => "Antigua and Barbuda", + "Antigua","and","barbuda" => "Antigua and Barbuda", + "Antigua","And","Barbuda" => "Antigua and Barbuda", + "bosnia and herzegovina" => "Bosnia and Herzegovina", + "Bosnia","and","herzegovina" => "Bosnia and Herzegovina", + "Bosnia","And","herzegovina" => "Bosnia and Herzegovina", + "democratic","republic","of","the","congo" => "Democratic Republic of the Congo", + "Democratic","republic","of","the","congo" => "Democratic Republic of the Congo", + "Democratic","Republic","Of","The","Congo" => "Democratic Republic of the Congo", + "guinea","bissau" => "Guinea-Bissau", + "Guinea","bissau" => "Guinea-Bissau", + "Guinea","Bissau" => "Guinea-Bissau", + "isle","of","man" => "Isle of Man", + "Isle","of","man" => "Isle of Man", + "Isle","Of","Man" => "Isle of Man", + "ndjamena" => "N'Djamena", + "Ndjamena" => "N'Djamena", + "n'djamena" => "N'Djamena", + "N'djamena" => "N'Djamena", + "port","au","prince" => "Port-au-Prince", + "Port","au","prince" => "Port-au-Prince", + "Port","Au","Prince" => "Port-au-Prince", + // port-au-prince won't work here because the left side has hyphens + // Port-au-prince ditto + // Port-Au-Prince ditto + "porto","novovo" => "Porto-Novo", + "Porto","novovo" => "Porto-Novo", + "saint","kitts","and","nevis" => "Saint Kitts and Nevis", + "Saint","kitts","and","nevis" => "Saint Kitts and Nevis", + "Saint","Kitts","And","Nevis" => "Saint Kitts and Nevis", + "saint","pierre","and","miqueleon" => "Saint Pierre and Miquelon", + "Saint","pierre","and","miquelon" => "Saint Pierre and Miquelon", + "Saint","Pierre","And","Miquelon" => "Saint Pierre and Miquelon", + "saint","vincent","and","the","grenadines" => "Saint Vincent and the Grenadines", + "Saint","vincent","and","the","grenadines" => "Saint Vincent and the Grenadines", + 
"Saint","Vincent","And","The","Grenadines" => "Saint Vincent and the Grenadines", + "st","georges" => "St. George's", + // "st.","georges" => "St. George's", + "st","george's" => "St. George's", + // "st.","george's" => "St. George's", + "St","georges" => "St. George's", + // "St.","georges" => "St. George's", + "St","george's" => "St. George's", + // "St.","george's" => "St. George's", + "St","Georges" => "St. George's", + // "St.","Georges" => "St. George's", + "St","George's" => "St. George's", + "trinidad","and","tobago" => "Trinidad and Tobago", + "Trinidad","and","tobago" => "Trinidad and Tobago", + "Trinidad","And","Tobago" => "Trinidad and Tobago" + }); + + // countries and capitals with accents and diacritics + triggers.extend(pt! { + "asuncion" => "Asunción", + "asunción" => "Asunción", + "Asuncion" => "Asunción", + "chisinau" => "Chișinău", + "chișinău" => "Chișinău", + "Chisinau" => "Chișinău", + "bogota" => "Bogotá", + "bogotá" => "Bogotá", + "Bogota" => "Bogotá", + "curacao" => "Curaçao", + "curaçao" => "Curaçao", + "curacao" => "Curaçao", + "lome" => "Lomé", + "lomé" => "Lomé", + "Lome" => "Lomé", + "male" => "Malé", + "malé" => "Malé", + "Male" => "Malé", + "nukualofa" => "Nukuʻalofa", + "Nukualofa" => "Nukuʻalofa", + "nuku'alofa" => "Nukuʻalofa", + "Nuku'alofa" => "Nukuʻalofa", + "reykjavik" => "Reykjavík", + "reykjavík" => "Reykjavík", + "Reykjavik" => "Reykjavík", + "san","jose" => "San José", + "san","josé" => "San José", + "San","jose" => "San José", + "sao","tome" => "São Tomé", + "são","tomé" => "São Tomé", + "Sao","Tome" => "São Tomé", + "sao","tome","and","principe" => "São Tomé and Príncipe", + "são","tomé","and","príncipe" => "São Tomé and Príncipe", + "Sao","Tome","and","Principe" => "São Tomé and Príncipe", + "Sao","Tome","And","Principe" => "São Tomé and Príncipe", + "torshavn" => "Tórshavn", + "tórshavn" => "Tórshavn", + "Torshavn" => "Tórshavn", + "turkiye" => "Türkiye", + "türkiye" => "Türkiye", + "Turkiye" => "Türkiye", + "yaounde" 
=> "Yaoundé", + "yaoundé" => "Yaoundé", + "Yaounde" => "Yaoundé" + }); + triggers.push(Rule { pattern: vec![pt!("L"), pt!(Period), pt!("L"), pt!(Period), pt!("M")], replace_with: vecword!("large language model"), diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index ce8bdc03..8390cd7a 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -94,8 +94,9 @@ pub use plural_conjugate::PluralConjugate; pub use possessive_your::PossessiveYour; pub use pronoun_contraction::PronounContraction; pub use proper_noun_capitalization_linters::{ - AmazonNames, Americas, AppleNames, Australia, AzureNames, ChineseCommunistParty, GoogleNames, - Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, UnitedOrganizations, + AmazonNames, Americas, AppleNames, Australia, AzureNames, ChineseCommunistParty, Countries, + GoogleNames, Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, NationalCapitals, + UnitedOrganizations, }; pub use repeated_words::RepeatedWords; pub use sentence_capitalization::SentenceCapitalization; diff --git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index 57c0fe18..abe95f83 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -160,10 +160,7 @@ create_linter_for!( ), Box::new(SequencePattern::default() .then(Box::new(EitherPattern::new(vec![ - Box::new(WordSet::all(&[ - "Johor", - "Kota" - ])), + Box::new(WordSet::all(&["Johor", "Kota"])), ]))) .then_whitespace() .t_aco("Bahru") @@ -177,10 +174,7 @@ create_linter_for!( .t_aco("Kuala") .then_whitespace() .then(Box::new(EitherPattern::new(vec![ - Box::new(WordSet::all(&[ - "Lumpur", - "Terengganu" - ])), + Box::new(WordSet::all(&["Lumpur", "Terengganu"])), ]))) ), Box::new(SequencePattern::default() @@ -192,6 +186,273 @@ create_linter_for!( "When referring to the states of Malaysia and 
their capitals, make sure to treat them as a proper noun." ); +create_linter_for!( + Countries, + EitherPattern::new(vec![ + // Grouped country names + // ... Islands + Box::new( + SequencePattern::default() + .then(Box::new(EitherPattern::new(vec![ + Box::new(WordSet::all(&["Cayman", "Falkland", "Marshall", "Solomon"])), + Box::new( + SequencePattern::default() + .then(Box::new(EitherPattern::new(vec![ + Box::new(SequencePattern::aco("British")), + Box::new( + SequencePattern::aco("United") + .then_whitespace() + .t_aco("States") + ), + ]))) + .then_whitespace() + .t_aco("Virgin") + ), + Box::new( + SequencePattern::aco("Northern") + .then_whitespace() + .t_aco("Mariana") + ) + ]))) + .then_whitespace() + .t_aco("Islands") + ), + // New ... + Box::new( + SequencePattern::aco("New") + .then_whitespace() + .then(Box::new(WordSet::all(&["Caledonia", "Zealand"]))) + ), + // Northern ... + Box::new( + SequencePattern::aco("Northern") + .then_whitespace() + .then(Box::new(WordSet::all(&["Cyprus", "Ireland"]))) + ), + // ... Republic + Box::new( + SequencePattern::default() + .then(Box::new(EitherPattern::new(vec![ + Box::new( + SequencePattern::aco("Central") + .then_whitespace() + .t_aco("African") + ), + Box::new(WordSet::all(&["Czech", "Dominican"])), + ]))) + .then_whitespace() + .t_aco("Republic") + ), + // Saint ... + Box::new( + SequencePattern::aco("Saint") + .then_whitespace() + .then(Box::new(WordSet::all(&["Helena", "Lucia", "Martin"]))) + ), + // South ... 
+ Box::new( + SequencePattern::aco("South") + .then_whitespace() + .then(Box::new(WordSet::all(&["Africa", "Ossetia", "Sudan"]))) + ), + // South Korea is under "Koreas" + // One-off country names + Box::new( + SequencePattern::aco("American") + .then_whitespace() + .t_aco("Samoa") + ), + // United Arab Emirates is under "United Organizations" + Box::new( + SequencePattern::aco("Burkina") + .then_whitespace() + .t_aco("Faso") + ), + Box::new( + SequencePattern::aco("Cape") + .then_whitespace() + .t_aco("Verde") + ), + Box::new( + SequencePattern::aco("Costa") + .then_whitespace() + .t_aco("Rica") + ), + Box::new( + SequencePattern::aco("East") + .then_whitespace() + .t_aco("Timor") + ), + Box::new( + SequencePattern::aco("El") + .then_whitespace() + .t_aco("Salvador") + ), + Box::new( + SequencePattern::aco("Equatorial") + .then_whitespace() + .t_aco("Guinea") + ), + // Box::new(SequencePattern::aco("Falkland").then_whitespace().t_aco("Islands")), + Box::new( + SequencePattern::aco("French") + .then_whitespace() + .t_aco("Polyynesia") + ), + Box::new(SequencePattern::aco("Guinea").then_hyphen().t_aco("Bissau")), + Box::new( + SequencePattern::aco("Ivory") + .then_whitespace() + .t_aco("Coast") + ), + Box::new(SequencePattern::aco("La").then_whitespace().t_aco("Paz")), + Box::new( + SequencePattern::aco("North") + .then_whitespace() + .t_aco("Macedonia") + ), + Box::new( + SequencePattern::aco("Papua") + .then_whitespace() + .t_aco("New") + .then_whitespace() + .t_aco("Guinea") + ), + Box::new( + SequencePattern::aco("Puerto") + .then_whitespace() + .t_aco("Rico") + ), + // Saint Kitts and Nevis has lowercase "and" + Box::new( + SequencePattern::aco("Sierra") + .then_whitespace() + .t_aco("Leone") + ), + Box::new( + SequencePattern::aco("Sint") + .then_whitespace() + .t_aco("Maarten") + ), + Box::new(SequencePattern::aco("Sri").then_whitespace().t_aco("Lanka")), + Box::new( + SequencePattern::aco("Saudi") + .then_whitespace() + .t_aco("Arabia") + ), + Box::new( + 
SequencePattern::aco("Western") + .then_whitespace() + .t_aco("Sahara") + ) + ]), + "When referring to Countries, make sure to treat it as a proper noun." +); + +create_linter_for!( + NationalCapitals, + EitherPattern::new(vec![ + // Grouped capital names + // ... City + Box::new( + SequencePattern::default() + .then(Box::new(EitherPattern::new(vec![ + Box::new(WordSet::all(&[ + "Belize", + "Guatemala", + "Kuwait", + "Mexico", + "Panama", + "Vatican" + ])), + Box::new( + SequencePattern::aco("Ho") + .then_whitespace() + .t_aco("Chi") + .then_whitespace() + .t_aco("Minh") + ) + ]))) + .then_whitespace() + .t_aco("City") + ), + // San ... + Box::new( + SequencePattern::aco("San") + .then_whitespace() + .then(Box::new(WordSet::all(&["Juan", "Marino", "Salvador"]))) + ), + // St. ... TODO the period should be optional but this doesn't match even when it's not optional + // Box::new( + // SequencePattern::aco("St") + // .then_period() + // .then_whitespace() + // .then(Box::new(WordSet::all(&["Helier", "John's", "Pierre"]))) + // ), + // ... 
Town + Box::new( + SequencePattern::default() + .then(Box::new(WordSet::all(&[ + "Cape", "George" // Cayman Islands + ]))) + .then_whitespace() + .t_aco("Town") + ), + // George Town is with "Cape Town" + // One-off capital names + Box::new(SequencePattern::aco("Abu").then_whitespace().t_aco("Dhabi")), + Box::new( + SequencePattern::aco("Addis") + .then_whitespace() + .t_aco("Ababa") + ), + // Andorra la Vella has lowercase "la" + Box::new( + SequencePattern::aco("Bandar") + .then_whitespace() + .t_aco("Seri") + .then_whitespace() + .t_aco("Begawan") + ), + Box::new( + SequencePattern::aco("Buenos") + .then_whitespace() + .t_aco("Aires") + ), + Box::new( + SequencePattern::aco("Diego") + .then_whitespace() + .t_aco("Garcia") + ), + // Guatemala City is with "Belize City" + // Ho Chi Minh City is with "Belize City" + // Kuala Lumpur is under "Malaysia" + // Mexico City is with "Belize City" + Box::new(SequencePattern::aco("New").then_whitespace().t_aco("Delhi")), + Box::new(SequencePattern::aco("Pago").then_whitespace().t_aco("Pago")), + Box::new( + SequencePattern::aco("Phnom") + .then_whitespace() + .t_aco("Penh") + ), + Box::new( + // Port-au-Prince can't be done here because "au" must not be capitalized + // Port of Spain can't be done here because "of" must not be capitalized + SequencePattern::aco("Port") + .then_whitespace() + .then(Box::new(WordSet::all(&["Louis", "Moresby", "Vila"]))) + ), + Box::new(SequencePattern::aco("Porto").then_hyphen().t_aco("Novo")), + Box::new(SequencePattern::aco("Santo").then_hyphen().t_aco("Domingo")), + Box::new( + SequencePattern::aco("The") + .then_whitespace() + .then(Box::new(WordSet::all(&["Bahamas", "Hague"]))) + ) + ]), + "When referring to national capitals, make sure to treat it as a proper noun." 
+); + create_linter_for!( ChineseCommunistParty, SequencePattern::aco("Chinese") From 402649f02a9e998966c9ba421827189a40793ea1 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Mon, 17 Feb 2025 21:27:48 +0800 Subject: [PATCH 2/6] fix: proper noun caps linter doesn't capitalize minor words --- harper-core/src/linting/matcher.rs | 29 +---- .../proper_noun_capitalization_linters.rs | 119 ++++++++++++++---- 2 files changed, 98 insertions(+), 50 deletions(-) diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index d759673f..442339b1 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -247,21 +247,9 @@ impl Matcher { "andorra","la","vella" => "Andorra la Vella", "Andorra","La","vella" => "Andorra la Vella", "Andorra","La","Vella" => "Andorra la Vella", - "antigua","and","barbuda" => "Antigua and Barbuda", - "Antigua","and","barbuda" => "Antigua and Barbuda", - "Antigua","And","Barbuda" => "Antigua and Barbuda", - "bosnia and herzegovina" => "Bosnia and Herzegovina", - "Bosnia","and","herzegovina" => "Bosnia and Herzegovina", - "Bosnia","And","herzegovina" => "Bosnia and Herzegovina", - "democratic","republic","of","the","congo" => "Democratic Republic of the Congo", - "Democratic","republic","of","the","congo" => "Democratic Republic of the Congo", - "Democratic","Republic","Of","The","Congo" => "Democratic Republic of the Congo", "guinea","bissau" => "Guinea-Bissau", "Guinea","bissau" => "Guinea-Bissau", "Guinea","Bissau" => "Guinea-Bissau", - "isle","of","man" => "Isle of Man", - "Isle","of","man" => "Isle of Man", - "Isle","Of","Man" => "Isle of Man", "ndjamena" => "N'Djamena", "Ndjamena" => "N'Djamena", "n'djamena" => "N'Djamena", @@ -274,15 +262,6 @@ impl Matcher { // Port-Au-Prince ditto "porto","novovo" => "Porto-Novo", "Porto","novovo" => "Porto-Novo", - "saint","kitts","and","nevis" => "Saint Kitts and Nevis", - "Saint","kitts","and","nevis" => "Saint Kitts and Nevis", - 
"Saint","Kitts","And","Nevis" => "Saint Kitts and Nevis", - "saint","pierre","and","miqueleon" => "Saint Pierre and Miquelon", - "Saint","pierre","and","miquelon" => "Saint Pierre and Miquelon", - "Saint","Pierre","And","Miquelon" => "Saint Pierre and Miquelon", - "saint","vincent","and","the","grenadines" => "Saint Vincent and the Grenadines", - "Saint","vincent","and","the","grenadines" => "Saint Vincent and the Grenadines", - "Saint","Vincent","And","The","Grenadines" => "Saint Vincent and the Grenadines", "st","georges" => "St. George's", // "st.","georges" => "St. George's", "st","george's" => "St. George's", @@ -293,10 +272,7 @@ impl Matcher { // "St.","george's" => "St. George's", "St","Georges" => "St. George's", // "St.","Georges" => "St. George's", - "St","George's" => "St. George's", - "trinidad","and","tobago" => "Trinidad and Tobago", - "Trinidad","and","tobago" => "Trinidad and Tobago", - "Trinidad","And","Tobago" => "Trinidad and Tobago" + "St","George's" => "St. George's" }); // countries and capitals with accents and diacritics @@ -319,6 +295,9 @@ impl Matcher { "male" => "Malé", "malé" => "Malé", "Male" => "Malé", + "noumea" => "Nouméa", + "nouméa" => "Nouméa", + "Noumea" => "Nouméa", "nukualofa" => "Nukuʻalofa", "Nukualofa" => "Nukuʻalofa", "nuku'alofa" => "Nukuʻalofa", diff --git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index abe95f83..09c30f4d 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -190,6 +190,16 @@ create_linter_for!( Countries, EitherPattern::new(vec![ // Grouped country names + // ... Guinea + Box::new( + SequencePattern::default() + .then(Box::new(EitherPattern::new(vec![ + Box::new(SequencePattern::aco("Equatorial")), + Box::new(SequencePattern::aco("Papua").then_whitespace().t_aco("New")), + ]))) + .then_whitespace() + .t_aco("Guinea") + ), // ... 
Islands Box::new( SequencePattern::default() @@ -262,7 +272,21 @@ create_linter_for!( .then_whitespace() .t_aco("Samoa") ), + Box::new( + SequencePattern::aco("Antigua") + .then_whitespace() + .t_aco("and") + .then_whitespace() + .t_aco("Barbuda") + ), // United Arab Emirates is under "United Organizations" + Box::new( + SequencePattern::aco("Bosnia") + .then_whitespace() + .t_aco("and") + .then_whitespace() + .t_aco("herzegovina") + ), Box::new( SequencePattern::aco("Burkina") .then_whitespace() @@ -278,6 +302,17 @@ create_linter_for!( .then_whitespace() .t_aco("Rica") ), + Box::new( + SequencePattern::aco("Democratic") + .then_whitespace() + .t_aco("republic") + .then_whitespace() + .t_aco("of") + .then_whitespace() + .t_aco("the") + .then_whitespace() + .t_aco("congo") + ), Box::new( SequencePattern::aco("East") .then_whitespace() @@ -288,42 +323,40 @@ create_linter_for!( .then_whitespace() .t_aco("Salvador") ), + // Box::new(SequencePattern::aco("Equatorial").then_whitespace().t_aco("Guinea")), Box::new( - SequencePattern::aco("Equatorial") + SequencePattern::aco("French") .then_whitespace() - .t_aco("Guinea") + .t_aco("Polynesia") ), - // Box::new(SequencePattern::aco("Falkland").then_whitespace().t_aco("Islands")), + Box::new(SequencePattern::aco("Guinea").then_hyphen().t_aco("Bissau")), Box::new( - SequencePattern::aco("French") + SequencePattern::aco("Isle") .then_whitespace() - .t_aco("Polyynesia") + .t_aco("of") + .then_whitespace() + .t_aco("Man") ), - Box::new(SequencePattern::aco("Guinea").then_hyphen().t_aco("Bissau")), Box::new( SequencePattern::aco("Ivory") .then_whitespace() .t_aco("Coast") ), - Box::new(SequencePattern::aco("La").then_whitespace().t_aco("Paz")), Box::new( SequencePattern::aco("North") .then_whitespace() .t_aco("Macedonia") ), Box::new( - SequencePattern::aco("Papua") - .then_whitespace() - .t_aco("New") + SequencePattern::aco("Puerto") .then_whitespace() - .t_aco("Guinea") + .t_aco("Rico") ), Box::new( - 
SequencePattern::aco("Puerto") + SequencePattern::aco("Saudi") .then_whitespace() - .t_aco("Rico") + .t_aco("Arabia") ), - // Saint Kitts and Nevis has lowercase "and" Box::new( SequencePattern::aco("Sierra") .then_whitespace() @@ -336,9 +369,11 @@ create_linter_for!( ), Box::new(SequencePattern::aco("Sri").then_whitespace().t_aco("Lanka")), Box::new( - SequencePattern::aco("Saudi") + SequencePattern::aco("Trinidad") .then_whitespace() - .t_aco("Arabia") + .t_aco("and") + .then_whitespace() + .t_aco("Tobago") ), Box::new( SequencePattern::aco("Western") @@ -376,6 +411,36 @@ create_linter_for!( .then_whitespace() .t_aco("City") ), + // Saint ... + Box::new( + SequencePattern::aco("Saint") + .then_whitespace() + .then(Box::new(EitherPattern::new(vec![ + Box::new( + SequencePattern::aco("Kitts") + .then_whitespace() + .t_aco("and") + .then_whitespace() + .t_aco("Nevis") + ), + Box::new( + SequencePattern::aco("Pierre") + .then_whitespace() + .t_aco("and") + .then_whitespace() + .t_aco("Miquelon") + ), + Box::new( + SequencePattern::aco("Vincent") + .then_whitespace() + .t_aco("and") + .then_whitespace() + .t_aco("the") + .then_whitespace() + .t_aco("Grenadines") + ) + ]))) + ), // San ... 
Box::new( SequencePattern::aco("San") @@ -398,7 +463,6 @@ create_linter_for!( .then_whitespace() .t_aco("Town") ), - // George Town is with "Cape Town" // One-off capital names Box::new(SequencePattern::aco("Abu").then_whitespace().t_aco("Dhabi")), Box::new( @@ -406,7 +470,7 @@ create_linter_for!( .then_whitespace() .t_aco("Ababa") ), - // Andorra la Vella has lowercase "la" + // Andorra la Vella can't be done here because "la" must not be capitalized Box::new( SequencePattern::aco("Bandar") .then_whitespace() @@ -419,15 +483,14 @@ create_linter_for!( .then_whitespace() .t_aco("Aires") ), + // Dar es Salaam can't be done here because "es" must not be capitalized Box::new( SequencePattern::aco("Diego") .then_whitespace() .t_aco("Garcia") ), - // Guatemala City is with "Belize City" - // Ho Chi Minh City is with "Belize City" // Kuala Lumpur is under "Malaysia" - // Mexico City is with "Belize City" + Box::new(SequencePattern::aco("La").then_whitespace().t_aco("Paz")), Box::new(SequencePattern::aco("New").then_whitespace().t_aco("Delhi")), Box::new(SequencePattern::aco("Pago").then_whitespace().t_aco("Pago")), Box::new( @@ -435,15 +498,21 @@ create_linter_for!( .then_whitespace() .t_aco("Penh") ), + // Port-au-Prince can't be done here because "au" must not be capitalized Box::new( - // Port-au-Prince can't be done here because "au" must not be capitalized - // Port of Spain can't be done here because "of" must not be capitalized SequencePattern::aco("Port") .then_whitespace() - .then(Box::new(WordSet::all(&["Louis", "Moresby", "Vila"]))) + .then(Box::new(EitherPattern::new(vec![ + Box::new(WordSet::all(&["Louis", "Moresby", "Vila"])), + Box::new(SequencePattern::aco("of").then_whitespace().t_aco("Spain")) + ]))) ), Box::new(SequencePattern::aco("Porto").then_hyphen().t_aco("Novo")), - Box::new(SequencePattern::aco("Santo").then_hyphen().t_aco("Domingo")), + Box::new( + SequencePattern::aco("Santo") + .then_whitespace() + .t_aco("Domingo") + ), Box::new( 
SequencePattern::aco("The") .then_whitespace() From 3e395238977ce66d63735ffccdf0873f701e6798 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Mon, 17 Feb 2025 21:37:07 +0800 Subject: [PATCH 3/6] fix: typos --- harper-core/src/linting/matcher.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index 442339b1..d3012f4d 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -245,7 +245,7 @@ impl Matcher { // countries and capitals with special casing or punctuation triggers.extend(pt! { "andorra","la","vella" => "Andorra la Vella", - "Andorra","La","vella" => "Andorra la Vella", + "Andorra","la","vella" => "Andorra la Vella", "Andorra","La","Vella" => "Andorra la Vella", "guinea","bissau" => "Guinea-Bissau", "Guinea","bissau" => "Guinea-Bissau", @@ -260,8 +260,9 @@ impl Matcher { // port-au-prince won't work here because the left side has hyphens // Port-au-prince ditto // Port-Au-Prince ditto - "porto","novovo" => "Porto-Novo", - "Porto","novovo" => "Porto-Novo", + "porto","novo" => "Porto-Novo", + "Porto","novo" => "Porto-Novo", + "Porto","Novo" => "Porto-Novo", "st","georges" => "St. George's", // "st.","georges" => "St. George's", "st","george's" => "St. 
George's", From 5ebd349fe3347299af09f66c786750428f1f5633 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Mon, 17 Feb 2025 21:40:32 +0800 Subject: [PATCH 4/6] chore: remove commented-out code --- harper-core/src/linting/proper_noun_capitalization_linters.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index 09c30f4d..742e1fbb 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -323,7 +323,6 @@ create_linter_for!( .then_whitespace() .t_aco("Salvador") ), - // Box::new(SequencePattern::aco("Equatorial").then_whitespace().t_aco("Guinea")), Box::new( SequencePattern::aco("French") .then_whitespace() From bd8d559e37218a04d16991dcf4ad994e80249537 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 18 Feb 2025 06:52:47 +0800 Subject: [PATCH 5/6] fix: capitalization and other fixes spotted by Elijah --- harper-core/dictionary.dict | 17 ++++++++++++++--- harper-core/src/linting/matcher.rs | 14 -------------- .../proper_noun_capitalization_linters.rs | 18 ++++++++++++++---- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/harper-core/dictionary.dict b/harper-core/dictionary.dict index 3ce7cbba..b339be37 100644 --- a/harper-core/dictionary.dict +++ b/harper-core/dictionary.dict @@ -683,7 +683,8 @@ Astoria/2M Astrakhan/21M AstroTurf/1M Asturias/2M -Asuncion/M +Asuncion/2M +Asunción/2M Aswan/2M Atacama/2M Atahualpa/M @@ -1271,6 +1272,7 @@ Boer/12SM Boethius/2M Bogart/2M Bogota/2M +Bogotá/2M Bohemia/21M Bohemian/152SM Bohr/2M @@ -2040,6 +2042,7 @@ Chiquita/M Chirico/2M Chisholm/2M Chisinau/2M +Chișinău/2M Chittagong/2M Chivas/M Chloe/2M @@ -2427,6 +2430,7 @@ Cunard/2M Cunningham/2M Cupid/2M Curacao/2M +Curaçao/2M Curie/2M Curitiba/2M Currier/2M @@ -5831,7 +5835,8 @@ Lollobrigida/2M Lombard/125M Lombardi/2M Lombardy/2M -Lome/M +Lome/2M 
+Lomé/2M Lompoc/2M Lon/2M London/2MRZ @@ -6126,6 +6131,7 @@ Maldives/2M Maldivian/15MS Maldonado/2M Male/2M +Malé/2M Mali/21M Malian/15SM Malibu/2M @@ -7250,7 +7256,8 @@ Nosferatu/1M Nostradamus/21M Nottingham/2M Nouakchott/2M -Noumea/M +Noumea/2M +Nouméa/2M Nov/2M Nova/21M Novartis/M @@ -8300,6 +8307,7 @@ Revlon/2M Rex/2M Reyes/2M Reykjavik/2M +Reykjavík/2M Reyna/M Reynaldo/M Reynolds/2M @@ -9813,6 +9821,7 @@ Torrance/2M Torrens/2M Torres/2M Torricelli/2M +Tórshavn/2M Tortola/2M Tortuga/2M Torvalds/2M @@ -9923,6 +9932,7 @@ Turkestan/2M Turkey/215M Turkic/52MS Turkish/25M +Türkiye/2M Turkmenistan/2M Turlock/2M Turner/21M @@ -10612,6 +10622,7 @@ Yank/1SM Yankee/14SM Yaobang/M Yaounde/2M +Yaoundé/2M Yaqui/12M Yaren/2 Yaroslavl/2M diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs index d3012f4d..1606cadf 100644 --- a/harper-core/src/linting/matcher.rs +++ b/harper-core/src/linting/matcher.rs @@ -279,51 +279,37 @@ impl Matcher { // countries and capitals with accents and diacritics triggers.extend(pt! 
{ "asuncion" => "Asunción", - "asunción" => "Asunción", "Asuncion" => "Asunción", "chisinau" => "Chișinău", - "chișinău" => "Chișinău", "Chisinau" => "Chișinău", "bogota" => "Bogotá", - "bogotá" => "Bogotá", "Bogota" => "Bogotá", "curacao" => "Curaçao", - "curaçao" => "Curaçao", "curacao" => "Curaçao", "lome" => "Lomé", - "lomé" => "Lomé", "Lome" => "Lomé", "male" => "Malé", - "malé" => "Malé", "Male" => "Malé", "noumea" => "Nouméa", - "nouméa" => "Nouméa", "Noumea" => "Nouméa", "nukualofa" => "Nukuʻalofa", "Nukualofa" => "Nukuʻalofa", "nuku'alofa" => "Nukuʻalofa", "Nuku'alofa" => "Nukuʻalofa", "reykjavik" => "Reykjavík", - "reykjavík" => "Reykjavík", "Reykjavik" => "Reykjavík", "san","jose" => "San José", - "san","josé" => "San José", "San","jose" => "San José", "sao","tome" => "São Tomé", - "são","tomé" => "São Tomé", "Sao","Tome" => "São Tomé", "sao","tome","and","principe" => "São Tomé and Príncipe", - "são","tomé","and","príncipe" => "São Tomé and Príncipe", "Sao","Tome","and","Principe" => "São Tomé and Príncipe", "Sao","Tome","And","Principe" => "São Tomé and Príncipe", "torshavn" => "Tórshavn", - "tórshavn" => "Tórshavn", "Torshavn" => "Tórshavn", "turkiye" => "Türkiye", - "türkiye" => "Türkiye", "Turkiye" => "Türkiye", "yaounde" => "Yaoundé", - "yaoundé" => "Yaoundé", "Yaounde" => "Yaoundé" }); diff --git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index 742e1fbb..176ef491 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -285,7 +285,7 @@ create_linter_for!( .then_whitespace() .t_aco("and") .then_whitespace() - .t_aco("herzegovina") + .t_aco("Herzegovina") ), Box::new( SequencePattern::aco("Burkina") @@ -305,13 +305,13 @@ create_linter_for!( Box::new( SequencePattern::aco("Democratic") .then_whitespace() - .t_aco("republic") + .t_aco("Republic") .then_whitespace() .t_aco("of") 
.then_whitespace() .t_aco("the") .then_whitespace() - .t_aco("congo") + .t_aco("Congo") ), Box::new( SequencePattern::aco("East") @@ -351,6 +351,15 @@ create_linter_for!( .then_whitespace() .t_aco("Rico") ), + Box::new( + SequencePattern::aco("São") + .then_whitespace() + .t_aco("Tomé") + .then_whitespace() + .t_aco("and") + .then_whitespace() + .t_aco("Príncipe") + ), Box::new( SequencePattern::aco("Saudi") .then_whitespace() @@ -444,7 +453,7 @@ create_linter_for!( Box::new( SequencePattern::aco("San") .then_whitespace() - .then(Box::new(WordSet::all(&["Juan", "Marino", "Salvador"]))) + .then(Box::new(WordSet::all(&["José", "Juan", "Marino", "Salvador"]))) ), // St. ... TODO the period should be optional but this doesn't match even when it's not optional // Box::new( @@ -512,6 +521,7 @@ create_linter_for!( .then_whitespace() .t_aco("Domingo") ), + Box::new(SequencePattern::aco("São").then_whitespace().t_aco("Tomé")), Box::new( SequencePattern::aco("The") .then_whitespace() From ceb14075d0cbf9dd649d941bd2d2782fad9800c5 Mon Sep 17 00:00:00 2001 From: hippietrail Date: Tue, 18 Feb 2025 07:04:52 +0800 Subject: [PATCH 6/6] `just format` --- harper-core/src/linting/lint_group.rs | 4 ++-- harper-core/src/linting/mod.rs | 4 ++-- harper-core/src/linting/proper_noun_capitalization_linters.rs | 4 +++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/harper-core/src/linting/lint_group.rs b/harper-core/src/linting/lint_group.rs index 71e05569..65c6444f 100644 --- a/harper-core/src/linting/lint_group.rs +++ b/harper-core/src/linting/lint_group.rs @@ -59,8 +59,8 @@ use super::proper_noun_capitalization_linters::PocketCastsNames; use super::proper_noun_capitalization_linters::TumblrNames; use super::proper_noun_capitalization_linters::{ AmazonNames, Americas, AppleNames, Australia, AzureNames, Canada, ChineseCommunistParty, - Countries, GoogleNames, Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, NationalCapitals, - UnitedOrganizations, + Countries, 
GoogleNames, Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, + NationalCapitals, UnitedOrganizations, }; use super::repeated_words::RepeatedWords; use super::sentence_capitalization::SentenceCapitalization; diff --git a/harper-core/src/linting/mod.rs b/harper-core/src/linting/mod.rs index 3d55bf12..f79cf1ff 100644 --- a/harper-core/src/linting/mod.rs +++ b/harper-core/src/linting/mod.rs @@ -124,8 +124,8 @@ pub use proper_noun_capitalization_linters::PocketCastsNames; pub use proper_noun_capitalization_linters::TumblrNames; pub use proper_noun_capitalization_linters::{ AmazonNames, Americas, AppleNames, Australia, AzureNames, Canada, ChineseCommunistParty, - Countries, GoogleNames, Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, NationalCapitals, - UnitedOrganizations, + Countries, GoogleNames, Holidays, Koreas, Malaysia, MetaNames, MicrosoftNames, + NationalCapitals, UnitedOrganizations, }; pub use repeated_words::RepeatedWords; pub use sentence_capitalization::SentenceCapitalization; diff --git a/harper-core/src/linting/proper_noun_capitalization_linters.rs b/harper-core/src/linting/proper_noun_capitalization_linters.rs index 9bc0d676..c24277dd 100644 --- a/harper-core/src/linting/proper_noun_capitalization_linters.rs +++ b/harper-core/src/linting/proper_noun_capitalization_linters.rs @@ -500,7 +500,9 @@ create_linter_for!( Box::new( SequencePattern::aco("San") .then_whitespace() - .then(Box::new(WordSet::all(&["José", "Juan", "Marino", "Salvador"]))) + .then(Box::new(WordSet::all(&[ + "José", "Juan", "Marino", "Salvador" + ]))) ), // St. ... TODO the period should be optional but this doesn't match even when it's not optional // Box::new(