From 0fe6e526c78e8b115371855a91d4b39ccb22098a Mon Sep 17 00:00:00 2001
From: Richard Boulton <richard.boulton@digital.cabinet-office.gov.uk>
Date: Fri, 22 May 2015 10:44:00 +0100
Subject: [PATCH] Don't use stopwords in new ranking

I don't think that stopwords are helping us currently; they force us to
add workarounds for some cases (e.g. "form AN"), and standard weighting
measures should ensure that common words like stopwords aren't given
undue prominence.  If we find that keeping stopwords in the index causes
a problem with ranking, we should change the weighting algorithm to one
that has better compensation for common words (such as BM25F).

In order not to change the existing ranking, this additionally indexes
`searchable_text` fields into a `.no_stop` sub-field, which the new
ranking queries instead.  The `all_searchable_text` field isn't used by
the existing ranking, so this just removes stopwording from that field's
default analyzer.
---
 config/schema/elasticsearch_schema.yml | 6 ++++++
 config/schema/field_types.json         | 7 +++++++
 lib/query_components/text_query.rb     | 6 +++---
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/config/schema/elasticsearch_schema.yml b/config/schema/elasticsearch_schema.yml
index 26a573187..f6999653d 100644
--- a/config/schema/elasticsearch_schema.yml
+++ b/config/schema/elasticsearch_schema.yml
@@ -8,6 +8,12 @@ index:
           filter: [standard, lowercase, stop, stemmer_override, stemmer_english]
           char_filter: [normalize_quotes, strip_quotes]
 
+        searchable_text:
+          type: custom
+          tokenizer: standard
+          filter: [standard, lowercase, stemmer_override, stemmer_english]
+          char_filter: [normalize_quotes, strip_quotes]
+
         # Analyzer used at index time for the .synonym variants of searchable
         # text fields.
         with_index_synonyms:
diff --git a/config/schema/field_types.json b/config/schema/field_types.json
index 7ffc3122d..b250bf9e4 100644
--- a/config/schema/field_types.json
+++ b/config/schema/field_types.json
@@ -39,6 +39,12 @@
       "include_in_all": true,
       "copy_to": ["spelling_text", "all_searchable_text"],
       "fields": {
+        "no_stop": {
+          "type": "string",
+          "index": "analyzed",
+          "include_in_all": false,
+          "analyzer": "searchable_text"
+        },
         "synonym": {
           "type": "string",
           "index": "analyzed",
@@ -55,6 +61,7 @@
     "es_config": {
       "type": "string",
       "index": "analyzed",
+      "analyzer": "searchable_text",
       "include_in_all": false,
       "fields": {
         "synonym": {
diff --git a/lib/query_components/text_query.rb b/lib/query_components/text_query.rb
index 36844bbe9..4ed51c5f7 100644
--- a/lib/query_components/text_query.rb
+++ b/lib/query_components/text_query.rb
@@ -77,7 +77,7 @@ def field_boosts_words
       # Return the highest weight found by looking for a word-based match in
       # individual fields
       MATCH_FIELDS.map { |field_name, boost|
-        match_query(field_name, search_term, boost: boost)
+        match_query("#{field_name}.no_stop", search_term, boost: boost)
       }
     end
 
@@ -85,7 +85,7 @@ def field_boosts_phrase
       # Return the highest weight found by looking for a phrase match in
       # individual fields
       MATCH_FIELDS.map { |field_name, boost|
-        match_query(field_name, search_term, type: :phrase, boost: boost)
+        match_query("#{field_name}.no_stop", search_term, type: :phrase, boost: boost)
       }
     end
 
@@ -93,7 +93,7 @@ def field_boosts_all_terms
       # Return the highest weight found by looking for a match of all terms
       # individual fields
       MATCH_FIELDS.map { |field_name, boost|
-        match_query(field_name, search_term, type: :boolean, operator: :and, boost: boost)
+        match_query("#{field_name}.no_stop", search_term, type: :boolean, operator: :and, boost: boost)
       }
     end