Skip to content

Commit

Permalink
Transliterates Saint and Sainte in ES schema see pelias/schema#268
Browse files Browse the repository at this point in the history
  • Loading branch information
Joxit authored and orangejulius committed Feb 28, 2018
1 parent d187e7d commit 551f022
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 89 deletions.
18 changes: 8 additions & 10 deletions sanitizer/_city_name_standardizer.js
Original file line number Diff line number Diff line change
@@ -1,20 +1,18 @@
const _ = require('lodash');

// matches the abbreviations 'ft', 'mt', 'st', and 'ste' on word boundary
const mountSaintFort = /\b([fm]t|ste?)\b/g;
// matches 'ft', 'mt' on word boundary
const mountFort = /\b([fm]t)\b/g;

const transliterations = {
'mt': 'mount',
'ft': 'fort',
'st': 'saint',
'ste': 'sainte'
'ft': 'fort'
};

/**
 * Replacer callback for String.prototype.replace: looks up the expanded
 * form of a matched abbreviation in the `transliterations` table.
 *
 * Only the first argument (the matched substring) is used; any further
 * arguments supplied by `replace` (capture groups, offset, ...) are ignored.
 *
 * @param {string} match - the abbreviation matched by the regex
 * @returns {string|undefined} the expansion, or undefined when the table has no entry
 */
function transliterate(match) {
  const expansion = _.get(transliterations, match);
  return expansion;
}

// transliterate ft/mt/st/ste to fort/mount/saint/sainte, respectively
// transliterate ft/mt to fort/mount, respectively
function _sanitize(raw, clean) {
// error & warning messages
// this function doesn't add any error or warning messages
Expand All @@ -24,14 +22,14 @@ function _sanitize(raw, clean) {
if (!_.isEmpty(_.get(clean, 'parsed_text.city'))) {
// eg input: Ft. st Louis
// after 1. ft st louis
// after 2. fort saint louis
// after 3. fort saint louis
// after 2. fort st louis
// after 3. fort st louis

// 1. remove '.' that could abbreviate ft and mt (makes transliteration regex easier)
const periods_removed = _.toLower(clean.parsed_text.city).replace(/\b(mt|ft)\./g, '$1 ');

// 2. transliterate 'st'->'saint', etc
const transliterated = periods_removed.replace(mountSaintFort, transliterate);
// 2. transliterate 'ft'->'fort', etc
const transliterated = periods_removed.replace(mountFort, transliterate);

// 3. reduce every whitespace sequence that can occur when removing periods down to a single space
//    (the 'g' flag is required: without it only the first sequence is collapsed,
//    e.g. 'mt. ft. x' would end up as 'mount fort  x'), then drop trailing whitespace
const whitespace_normalized = _.trimEnd(transliterated.replace(/\s+/g, ' '));
Expand Down
82 changes: 3 additions & 79 deletions test/unit/sanitizer/_city_name_standardizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,82 +48,6 @@ module.exports.tests.text_parser = function(test, common) {

});

test('\'st\' should be expanded to \'saint\' wherever it appears in the city', function(t) {
const raw = {};

const clean = {
parsed_text: {
query: 'saint query value',
neighbourhood: 'saint neighbourhood value',
borough: 'saint borough value',
city: 'st city ST value St',
county: 'saint county value',
state: 'saint state value',
postalcode: 'saint postalcode value',
country: 'saint country value'
}
};

const expected_clean = {
parsed_text: {
query: 'saint query value',
neighbourhood: 'saint neighbourhood value',
borough: 'saint borough value',
city: 'saint city saint value saint',
county: 'saint county value',
state: 'saint state value',
postalcode: 'saint postalcode value',
country: 'saint country value'
}
};

const messages = sanitizer.sanitize(raw, clean);

t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();

});

test('\'ste\' should be expanded to \'sainte\' wherever it appears in the city', function(t) {
const raw = {};

const clean = {
parsed_text: {
query: 'sainte query value',
neighbourhood: 'sainte neighbourhood value',
borough: 'sainte borough value',
city: 'ste city STE value StE',
county: 'sainte county value',
state: 'sainte state value',
postalcode: 'sainte postalcode value',
country: 'sainte country value'
}
};

const expected_clean = {
parsed_text: {
query: 'sainte query value',
neighbourhood: 'sainte neighbourhood value',
borough: 'sainte borough value',
city: 'sainte city sainte value sainte',
county: 'sainte county value',
state: 'sainte state value',
postalcode: 'sainte postalcode value',
country: 'sainte country value'
}
};

const messages = sanitizer.sanitize(raw, clean);

t.deepEquals(clean, expected_clean);
t.deepEquals(messages.errors, [], 'no errors');
t.deepEquals(messages.warnings, [], 'no warnings');
t.end();

});

test('\'ft\' should be expanded to \'fort\' wherever it appears in the city', function(t) {
const raw = {};

Expand Down Expand Up @@ -200,18 +124,18 @@ module.exports.tests.text_parser = function(test, common) {

});

test('mixture of \'mt\', \'ft\', \'st\', and \'st\' should be expanded', function(t) {
test('mixture of \'mt\', \'ft\' should be expanded', function(t) {
const raw = {};

const clean = {
parsed_text: {
city: 'mt. ft st ste mt ft.'
city: 'mt. ft mt ft.'
}
};

const expected_clean = {
parsed_text: {
city: 'mount fort saint sainte mount fort'
city: 'mount fort mount fort'
}
};

Expand Down

0 comments on commit 551f022

Please sign in to comment.