From 0fe6e526c78e8b115371855a91d4b39ccb22098a Mon Sep 17 00:00:00 2001 From: Richard Boulton Date: Fri, 22 May 2015 10:44:00 +0100 Subject: [PATCH] Don't use stopwords in new ranking I don't think that stopwords are helping us currently; they force us to add workarounds for some cases (eg, "form AN"), and standard weighting measures should ensure that common words like stopwords aren't given undue prominence. If we find that stopwords are causing a problem with ranking, we should change the weighting algorithm to one that has better compensation for common words (such as BM25f). In order not to change the existing ranking, this indexes `searchable_text` fields additionally to a `.no_stop` sub-field. The `all_searchable_text` field isn't used by the existing ranking, so just remove stopwording from that field's default analyzer. --- config/schema/elasticsearch_schema.yml | 6 ++++++ config/schema/field_types.json | 7 +++++++ lib/query_components/text_query.rb | 6 +++--- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/config/schema/elasticsearch_schema.yml b/config/schema/elasticsearch_schema.yml index 26a573187..f6999653d 100644 --- a/config/schema/elasticsearch_schema.yml +++ b/config/schema/elasticsearch_schema.yml @@ -8,6 +8,12 @@ index: filter: [standard, lowercase, stop, stemmer_override, stemmer_english] char_filter: [normalize_quotes, strip_quotes] + searchable_text: + type: custom + tokenizer: standard + filter: [standard, lowercase, stemmer_override, stemmer_english] + char_filter: [normalize_quotes, strip_quotes] + # Analyzer used at index time for the .synonym variants of searchable # text fields. with_index_synonyms: diff --git a/config/schema/field_types.json b/config/schema/field_types.json index 7ffc3122d..b250bf9e4 100644 --- a/config/schema/field_types.json +++ b/config/schema/field_types.json @@ -39,6 +39,12 @@ "include_in_all": true, "copy_to": ["spelling_text", "all_searchable_text"], "fields": { + "no_stop": { + "type": "string", + "index": "analyzed", + "include_in_all": false, + "analyzer": "searchable_text" + }, "synonym": { "type": "string", "index": "analyzed", @@ -55,6 +61,7 @@ "es_config": { "type": "string", "index": "analyzed", + "analyzer": "searchable_text", "include_in_all": false, "fields": { "synonym": { diff --git a/lib/query_components/text_query.rb b/lib/query_components/text_query.rb index 36844bbe9..4ed51c5f7 100644 --- a/lib/query_components/text_query.rb +++ b/lib/query_components/text_query.rb @@ -77,7 +77,7 @@ def field_boosts_words # Return the highest weight found by looking for a word-based match in # individual fields MATCH_FIELDS.map { |field_name, boost| - match_query(field_name, search_term, boost: boost) + match_query("#{field_name}.no_stop", search_term, boost: boost) } end @@ -85,7 +85,7 @@ def field_boosts_phrase # Return the highest weight found by looking for a phrase match in # individual fields MATCH_FIELDS.map { |field_name, boost| - match_query(field_name, search_term, type: :phrase, boost: boost) + match_query("#{field_name}.no_stop", search_term, type: :phrase, boost: boost) } end @@ -93,7 +93,7 @@ def field_boosts_all_terms # Return the highest weight found by looking for a match of all terms # individual fields MATCH_FIELDS.map { |field_name, boost| - match_query(field_name, search_term, type: :boolean, operator: :and, boost: boost) + match_query("#{field_name}.no_stop", search_term, type: :boolean, operator: :and, boost: boost) } end