Skip to content

Commit

Permalink
Merge pull request #703 from hippietrail/countries_and_capitals
Browse files Browse the repository at this point in the history
feat: capitalize multi-word country and capital city names
  • Loading branch information
elijah-potter authored Feb 24, 2025
2 parents fef85af + c332f06 commit 154d259
Show file tree
Hide file tree
Showing 3 changed files with 436 additions and 3 deletions.
17 changes: 14 additions & 3 deletions harper-core/dictionary.dict
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,8 @@ Astoria/2M
Astrakhan/21M
AstroTurf/1M
Asturias/2M
Asuncion/M
Asuncion/2M
Asunción/2M
Aswan/2M
Atacama/2M
Atahualpa/M
Expand Down Expand Up @@ -1273,6 +1274,7 @@ Boer/12SM
Boethius/2M
Bogart/2M
Bogota/2M
Bogotá/2M
Bohemia/21M
Bohemian/152SM
Bohr/2M
Expand Down Expand Up @@ -2045,6 +2047,7 @@ Chiquita/M
Chirico/2M
Chisholm/2M
Chisinau/2M
Chișinău/2M
Chittagong/2M
Chivas/M
Chloe/2M
Expand Down Expand Up @@ -2432,6 +2435,7 @@ Cunard/2M
Cunningham/2M
Cupid/2M
Curacao/2M
Curaçao/2M
Curie/2M
Curitiba/2M
Currier/2M
Expand Down Expand Up @@ -5849,7 +5853,8 @@ Lollobrigida/2M
Lombard/125M
Lombardi/2M
Lombardy/2M
Lome/M
Lome/2M
Lomé/2M
Lompoc/2M
Lon/2M
London/2MRZ
Expand Down Expand Up @@ -6144,6 +6149,7 @@ Maldives/29M
Maldivian/15MS
Maldonado/2M
Male/2M
Malé/2M
Mali/21M
Malian/15SM
Malibu/2M
Expand Down Expand Up @@ -7269,7 +7275,8 @@ Nosferatu/1M
Nostradamus/21M
Nottingham/2M
Nouakchott/2M
Noumea/M
Noumea/2M
Nouméa/2M
Nov/2M
Nova/21M
Novartis/M
Expand Down Expand Up @@ -8321,6 +8328,7 @@ Revlon/2M
Rex/2M
Reyes/2M
Reykjavik/2M
Reykjavík/2M
Reyna/M
Reynaldo/M
Reynolds/2M
Expand Down Expand Up @@ -9836,6 +9844,7 @@ Torrance/2M
Torrens/2M
Torres/2M
Torricelli/2M
Tórshavn/2M
Tortola/2M
Tortuga/2M
Torvalds/2M
Expand Down Expand Up @@ -9947,6 +9956,7 @@ Turkestan/2M
Turkey/215M
Turkic/52MS
Turkish/25M
Türkiye/2M
Turkmenistan/2M
Turlock/2M
Turner/21M
Expand Down Expand Up @@ -10639,6 +10649,7 @@ Yank/1SM
Yankee/14SM
Yaobang/M
Yaounde/2M
Yaoundé/2M
Yaqui/12M
Yaren/2
Yaroslavl/2M
Expand Down
71 changes: 71 additions & 0 deletions harper-core/src/linting/matcher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,77 @@ impl Matcher {
"World","War","Ii" => "World War II"
});

// Countries and capitals whose canonical spelling needs special casing or
// punctuation (a lowercase particle, a hyphen, an apostrophe, or a period).
// In each rule the comma-separated strings on the left are the words to match
// and the string on the right is the replacement text. Lowercase and
// capitalized input spellings are listed as separate rules.
triggers.extend(pt! {
"andorra","la","vella" => "Andorra la Vella",
"Andorra","la","vella" => "Andorra la Vella",
"Andorra","La","Vella" => "Andorra la Vella",
"guinea","bissau" => "Guinea-Bissau",
"Guinea","bissau" => "Guinea-Bissau",
"Guinea","Bissau" => "Guinea-Bissau",
"ndjamena" => "N'Djamena",
"Ndjamena" => "N'Djamena",
"n'djamena" => "N'Djamena",
"N'djamena" => "N'Djamena",
"port","au","prince" => "Port-au-Prince",
"Port","au","prince" => "Port-au-Prince",
"Port","Au","Prince" => "Port-au-Prince",
// The already-hyphenated inputs are intentionally not listed, because a
// pattern whose left side contains hyphens won't work here:
//   port-au-prince
//   Port-au-prince
//   Port-Au-Prince
"porto","novo" => "Porto-Novo",
"Porto","novo" => "Porto-Novo",
"Porto","Novo" => "Porto-Novo",
// The variants with a trailing period on "st."/"St." are left disabled below.
// NOTE(review): presumably a period cannot be part of a word pattern on the
// left side, like the hyphen case above; confirm before enabling them.
"st","georges" => "St. George's",
// "st.","georges" => "St. George's",
"st","george's" => "St. George's",
// "st.","george's" => "St. George's",
"St","georges" => "St. George's",
// "St.","georges" => "St. George's",
"St","george's" => "St. George's",
// "St.","george's" => "St. George's",
"St","Georges" => "St. George's",
// "St.","Georges" => "St. George's",
"St","George's" => "St. George's"
});

// Countries and capitals whose canonical spelling carries accents or other
// diacritics. Each rule maps an unaccented (ASCII) spelling on the left to the
// accented replacement text on the right. Lowercase and capitalized input
// spellings are listed as separate rules.
triggers.extend(pt! {
"asuncion" => "Asunción",
"Asuncion" => "Asunción",
"chisinau" => "Chișinău",
"Chisinau" => "Chișinău",
"bogota" => "Bogotá",
"Bogota" => "Bogotá",
"curacao" => "Curaçao",
// Capitalized variant: this entry was previously a second copy of the
// lowercase "curacao" rule, so "Curacao" was never corrected.
"Curacao" => "Curaçao",
"lome" => "Lomé",
"Lome" => "Lomé",
"male" => "Malé",
"Male" => "Malé",
"noumea" => "Nouméa",
"Noumea" => "Nouméa",
"nukualofa" => "Nukuʻalofa",
"Nukualofa" => "Nukuʻalofa",
"nuku'alofa" => "Nukuʻalofa",
"Nuku'alofa" => "Nukuʻalofa",
"reykjavik" => "Reykjavík",
"Reykjavik" => "Reykjavík",
// NOTE(review): unlike the multi-word rules for other names, there is no
// "San","Jose" rule here and no "Sao","tome" rule below; confirm whether
// those omissions are intentional.
"san","jose" => "San José",
"San","jose" => "San José",
"sao","tome" => "São Tomé",
"Sao","Tome" => "São Tomé",
"sao","tome","and","principe" => "São Tomé and Príncipe",
"Sao","Tome","and","Principe" => "São Tomé and Príncipe",
"Sao","Tome","And","Principe" => "São Tomé and Príncipe",
"torshavn" => "Tórshavn",
"Torshavn" => "Tórshavn",
"turkiye" => "Türkiye",
"Turkiye" => "Türkiye",
"yaounde" => "Yaoundé",
"Yaounde" => "Yaoundé"
});

triggers.push(Rule {
pattern: vec![pt!("L"), pt!(Period), pt!("L"), pt!(Period), pt!("M")],
replace_with: vecword!("large language model"),
Expand Down
Loading

0 comments on commit 154d259

Please sign in to comment.