From 5b29d1654abc5e8a014c2230da990ab2f91fb340 Mon Sep 17 00:00:00 2001
From: Gin
Date: Sat, 10 Aug 2019 21:05:01 +0800
Subject: [PATCH] Run anserini+BM25 baseline on PubMed and DBLP (#765)

* add script to convert and run pubmed and dblp

* update openresearch docs to add comparison tables; update key terms results
---
 docs/experiments-openresearch.md              | 109 +++++++++-
 .../convert_openresearch_to_whoosh_index.py   |  26 ++-
 .../convert_pubmed_dblp_to_anserini_format.py | 196 ++++++++++++++++++
 .../convert_pubmed_dblp_to_whoosh_index.py    |  47 +++++
 src/main/python/openresearch/retrieve.py      |  15 +-
 .../openresearch/retrieve_with_key_terms.py   |  16 +-
 .../python/openresearch/run_pubmed_dblp.sh    | 108 ++++++++++
 7 files changed, 502 insertions(+), 15 deletions(-)
 create mode 100644 src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
 create mode 100644 src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py
 create mode 100644 src/main/python/openresearch/run_pubmed_dblp.sh

diff --git a/docs/experiments-openresearch.md b/docs/experiments-openresearch.md
index 1072c2cc32..67df4e39e3 100644
--- a/docs/experiments-openresearch.md
+++ b/docs/experiments-openresearch.md
@@ -124,11 +124,11 @@ recall_1000 all 0.3628
 The output of using key terms in title and abstract as query should be:

 ```
-map all 0.0412
-recip_rank all 0.2521
-P_20 all 0.0546
-recall_20 all 0.0790
-recall_1000 all 0.2818
+map all 0.0528
+recip_rank all 0.2202
+P_20 all 0.0428
+recall_20 all 0.1022
+recall_1000 all 0.3344
 ```

@@ -139,6 +139,103 @@ The table below compares our BM25 results against Bhagavatula et al. (2018):
 | BM25 (Bhagavatula et al., 2018) | 0.058 | 0.218 |
 | BM25 (Anserini, Ours, title) | 0.063 | 0.244 |
 | BM25 (Anserini, Ours, title+abstract)| 0.095 | 0.351 |
-| BM25 (Anserini, Ours, key terms)| 0.065 | 0.251 |
+| BM25 (Anserini, Ours, key terms)| 0.060 | 0.220 |
+
+## Extra Baselines on PubMed and DBLP
+
+### PubMed and DBLP datasets
+
+Follow [citeomatic's repo](/~https://github.com/allenai/citeomatic/tree/44dc210c82515b5d4c5a96f5aafcb9b6e48206af) to download the necessary data.
+
+The steps are similar to running the baseline on OpenResearch. To run all three experiments on PubMed and DBLP quickly, run:
+
+`./src/main/python/openresearch/run_pubmed_dblp.sh -citeomatic_data <path_to_citeomatic_data> -output_folder <path_to_output_folder>`
+
+The results are as follows.
+
+The output of using PubMed title as query should be:
+
+```
+map all 0.1615
+recip_rank all 0.5844
+P_20 all 0.2034
+recall_20 all 0.1954
+recall_1000 all 0.6536
+f1_20 all 0.199
+```
+
+The output of using PubMed key terms from title and abstract as query should be:
+
+```
+map all 0.1637
+recip_rank all 0.5953
+P_20 all 0.2058
+recall_20 all 0.1969
+recall_1000 all 0.6041
+f1_20 all 0.201
+```
+
+The output of using PubMed title + abstract as query should be:
+
+```
+map all 0.2361
+recip_rank all 0.7208
+P_20 all 0.2726
+recall_20 all 0.2632
+recall_1000 all 0.7649
+f1_20 all 0.268
+```
+
+The output of using DBLP title as query should be:
+
+```
+map all 0.1056
+recip_rank all 0.4244
+P_20 all 0.1090
+recall_20 all 0.1721
+recall_1000 all 0.5511
+f1_20 all 0.133
+```
+
+The output of using DBLP key terms from title and abstract as query should be:
+
+```
+map all 0.1015
+recip_rank all 0.4254
+P_20 all 0.1059
+recall_20 all 0.1669
+recall_1000 all 0.5099
+f1_20 all 0.130
+```
+
+The output of using DBLP title + abstract as query should be:
+
+```
+map all 0.1687
+recip_rank all 0.5851
+P_20 all 0.1586
+recall_20 all 0.2511
+recall_1000 all 0.6913
+f1_20 all 0.194
+```
+
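+Note that `f1_20` is not a measure `trec_eval` prints by itself: it is the harmonic mean of the `P_20` and `recall_20` values above, computed as a small post-processing step. A minimal sketch (using the PubMed title-only numbers):
+
+```
+# F1@20 as the harmonic mean of P@20 and recall@20.
+def f1(p, r):
+    return 2 * p * r / (p + r) if p + r > 0 else 0.0
+
+print(round(f1(0.2034, 0.1954), 3))  # 0.199, matching f1_20 above
+```
+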
+The table below compares our BM25 results against Bhagavatula et al. (2018):
+
+**PubMed**
+
+| | F1@20 | MRR |
+|----------|:-------------:|------:|
+| BM25 (Bhagavatula et al., 2018) | 0.209 | 0.574 |
+| BM25 (Anserini, Ours, title) | 0.199 | 0.584 |
+| BM25 (Anserini, Ours, key terms)| 0.201 | 0.595 |
+| BM25 (Anserini, Ours, title+abstract)| 0.268 | 0.720 |
+
+**DBLP**
+
+| | F1@20 | MRR |
+|----------|:-------------:|------:|
+| BM25 (Bhagavatula et al., 2018) | 0.119 | 0.425 |
+| BM25 (Anserini, Ours, title) | 0.133 | 0.424 |
+| BM25 (Anserini, Ours, key terms)| 0.130 | 0.425 |
+| BM25 (Anserini, Ours, title+abstract)| 0.194 | 0.585 |
diff --git a/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py b/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
index a80a1f8f77..6f3ea0f714 100644
--- a/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
+++ b/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
@@ -9,6 +9,29 @@
 from whoosh.fields import *


+def get_id_years(file_paths):
+    print('Collecting paper ids and their publication years...')
+    id_years = []
+    for file_num, file_path in enumerate(file_paths):
+        with gzip.open(file_path) as f:
+            for line_num, line in enumerate(f):
+                obj = json.loads(line.strip())
+                doc_id = obj['id']
+                if 'year' not in obj:
+                    continue
+                year = int(obj['year'])
+
+                id_years.append((doc_id, year))
+                if line_num % 100000 == 0:
+                    print('Processed {} lines. Collected {} docs.'.format(
+                        line_num + 1, len(id_years)))
+
+    print('Sorting papers by year...')
+    id_years.sort(key=lambda x: x[1])
+    id_years = {id: year for id, year in id_years}
+    return id_years
+
+
 def create_dataset(args):
     print('Converting data...')

@@ -35,6 +58,8 @@ def create_dataset(args):
     whoosh_index = create_in(args.whoosh_index, schema)
     writer = whoosh_index.writer()

+    id_years = get_id_years(file_paths)
+    doc_ids = set(id_years.keys())
     line_num = 0
     start_time = time.time()
     for file_num, file_path in enumerate(file_paths):
@@ -42,7 +67,6 @@ def create_dataset(args):
             for line in f:
                 obj = json.loads(line.strip())
                 doc_id = obj['id']
-
                 writer.add_document(id=doc_id, title=obj['title'], abstract=obj['paperAbstract'])
                 line_num += 1
                 if line_num % 100000 == 0:
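A note on `get_id_years` above: because the `(id, year)` pairs are sorted by year before the dict comprehension, later entries overwrite earlier ones, so a paper id that appears more than once resolves to its most recent publication year. A tiny illustration with made-up ids:

```python
pairs = [('A', 2001), ('B', 1999), ('A', 1998)]
pairs.sort(key=lambda x: x[1])     # [('A', 1998), ('B', 1999), ('A', 2001)]
print({i: y for i, y in pairs})    # {'A': 2001, 'B': 1999}
```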
diff --git a/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py b/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
new file mode 100644
index 0000000000..4eb477a1f3
--- /dev/null
+++ b/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
@@ -0,0 +1,196 @@
+import argparse
+import json
+import os
+import time
+from collections import defaultdict
+
+
+def clean(text):
+    return text.replace('\n', ' ').replace('\t', ' ')
+
+
+def get_ids(start, end, year_ids):
+    result = []
+    for year in range(start, end+1):
+        result.extend(year_ids[year])
+    return set(result)
+
+
+def get_id_years(file_name, data_type):
+    print('Collecting paper ids and their publication years...')
+    year_ids = defaultdict(list)
+    with open(file_name) as f:
+        for line_num, line in enumerate(f):
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            if 'year' not in obj:
+                continue
+            year = int(obj['year'])
+
+            year_ids[year].append(doc_id)
+            if line_num % 1000000 == 0:
+                print('Processed {} lines. Collected ids for {} years.'.format(
+                    line_num + 1, len(year_ids)))
+
+    # Train/dev/test split boundaries by publication year.
+    train_ranges = {'dblp': (1966, 2007), 'pubmed': (1966, 2008)}
+    dev_ranges = {'dblp': (2008, 2008), 'pubmed': (2009, 2009)}
+    test_ranges = {'dblp': (2009, 2011), 'pubmed': (2010, 2013)}
+
+    train_ids = get_ids(train_ranges[data_type][0], train_ranges[data_type][1], year_ids)
+    dev_ids = get_ids(dev_ranges[data_type][0], dev_ranges[data_type][1], year_ids)
+    test_ids = get_ids(test_ranges[data_type][0], test_ranges[data_type][1], year_ids)
+
+    num_train = len(train_ids)
+    num_dev = len(dev_ids)
+    num_test = len(test_ids)
+
+    print('Collected {}, {}, {} papers for training, dev, and test sets.'.format(
+        num_train, num_dev, num_test))
+
+    return train_ids, dev_ids, test_ids, year_ids
+
+
+def create_dataset(args):
+    print('Converting data...')
+    queries_files = {}
+    qrels_files = {}
+    for set_name in ['train', 'dev', 'test']:
+        queries_filepath = os.path.join(
+            args.output_folder, 'queries.{}.tsv'.format(set_name))
+        qrels_filepath = os.path.join(
+            args.output_folder, 'qrels.{}'.format(set_name))
+        queries_files[set_name] = open(queries_filepath, 'w')
+        qrels_files[set_name] = open(qrels_filepath, 'w')
+
+    file_name = os.path.join(args.collection_path, 'corpus.json')
+
+    train_ids, dev_ids, test_ids, year_ids = get_id_years(
+        file_name=file_name, data_type=args.data_type)
+
+    doc_ids = train_ids | dev_ids | test_ids
+
+    # Write train_ids to file for later use as the candidate set.
+    candidates_file = open(os.path.join(args.output_folder, 'candidates.txt'), 'w')
+    for train_id in train_ids:
+        candidates_file.write(train_id + '\n')
+    candidates_file.close()
+
+    id_years = {}
+    for y in year_ids:
+        for i in year_ids[y]:
+            id_years[i] = y
+
+    n_docs = 0
+    file_index = 0
+    num_train = 0
+    num_dev = 0
+    num_test = 0
+    start_time = time.time()
+
+    with open(file_name) as f:
+        for line in f:
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            if doc_id not in doc_ids:
+                continue
+            if n_docs % args.max_docs_per_file == 0:
+                if n_docs > 0:
+                    output_jsonl_file.close()
+                output_path = os.path.join(
+                    args.output_folder, 'corpus/docs{:02d}.json'.format(file_index))
+                output_jsonl_file = open(output_path, 'w')
+                file_index += 1
+            doc_text = '[Title]: {} [Abstract]: {}'.format(
+                obj['title'], obj['abstract'])
+            doc_text = clean(doc_text)
+            output_dict = {'id': doc_id, 'contents': doc_text}
+            output_jsonl_file.write(json.dumps(output_dict) + '\n')
+            n_docs += 1
+
+            out_citations = obj['out_citations']
+
+            # Remove citations not in the corpus.
+            out_citations = [
+                out_citation for out_citation in out_citations
+                if out_citation in doc_ids
+            ]
+
+            # Remove self citations.
+            out_citations = [
+                out_citation for out_citation in out_citations
+                if out_citation != doc_id
+            ]
+
+            # Keep only citations with a known publication year that is no
+            # later than the citing paper's.
+            out_citations2 = []
+            for out_citation in out_citations:
+                if out_citation in id_years:
+                    if id_years[out_citation] <= obj['year']:
+                        out_citations2.append(out_citation)
+            out_citations = out_citations2
+
+            # Follow Bhagavatula et al.'s setting and restrict citation candidates to train_ids only.
+            out_citations = set(out_citations)
+            out_citations.intersection_update(train_ids)
+
+            # Skip papers with fewer than 10 out-citations.
+            if len(out_citations) < 10:
+                continue
+
+            if doc_id in train_ids:
+                set_name = 'train'
+                num_train += 1
+            elif doc_id in dev_ids:
+                set_name = 'dev'
+                num_dev += 1
+            elif doc_id in test_ids:
+                set_name = 'test'
+                num_test += 1
+
+            queries_file = queries_files[set_name]
+            qrels_file = qrels_files[set_name]
+
+            doc_title = obj['title']
+            doc_title = clean(doc_title)
+            if args.use_abstract_in_query:
+                doc_abstract = clean(obj['abstract'])
+                query = '[Title]: ' + doc_title + ' [Abstract]: ' + doc_abstract
+            else:
+                query = doc_title
+            queries_file.write('{}\t{}\n'.format(doc_id, query))
+            for out_citation in out_citations:
+                qrels_file.write('{} 0 {} 1\n'.format(doc_id, out_citation))
+
+    output_jsonl_file.close()
+
+    print('Examples: {} train, {} valid, {} test'.format(
+        num_train, num_dev, num_test))
+
+    # Close queries and qrels files.
+    for queries_file in queries_files.values():
+        queries_file.close()
+    for qrels_file in qrels_files.values():
+        qrels_file.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Converts a PubMed/DBLP corpus.json collection to '
+                    'Anserini\'s jsonl files.')
+    parser.add_argument('--collection_path', required=True,
+                        help='folder containing the corpus.json collection file')
+    parser.add_argument('--output_folder', required=True, help='output folder')
+    parser.add_argument('--max_docs_per_file', default=1000000, type=int,
+                        help='maximum number of documents in each jsonl file.')
+    parser.add_argument('--data_type', required=True, choices=['dblp', 'pubmed'],
+                        help='dblp or pubmed')
+    parser.add_argument('--use_abstract_in_query', action='store_true',
+                        help='If set, use title and abstract as the query; '
+                             'otherwise, use only the title.')
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.output_folder):
+        os.makedirs(args.output_folder)
+    if not os.path.exists(os.path.join(args.output_folder, 'corpus')):
+        os.makedirs(os.path.join(args.output_folder, 'corpus'))
+
+    create_dataset(args)
+    print('Done!')
\ No newline at end of file
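For reference, the converter above emits one tab-separated query line per retained paper (`doc_id<TAB>query text`) and one TREC qrels line per surviving out-citation (`doc_id 0 cited_doc_id 1`), so each cited paper counts as a relevant document for its citing paper's query. These are the `queries.test.tsv` and `qrels.test` files consumed by the retrieval and evaluation steps below.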
diff --git a/src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py b/src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py
new file mode 100644
index 0000000000..57a3351b9f
--- /dev/null
+++ b/src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py
@@ -0,0 +1,47 @@
+import argparse
+import json
+import os
+import time
+from whoosh.index import create_in
+from whoosh.fields import *
+
+
+def create_dataset(args):
+    print('Converting data...')
+
+    file_name = os.path.join(args.collection_path, 'corpus.json')
+
+    # We need a Whoosh index to do the key term extraction.
+    schema = Schema(title=TEXT,
+                    abstract=TEXT,
+                    id=ID(stored=True))
+    if os.path.exists(args.whoosh_index):
+        raise FileExistsError(
+            '{} already exists; remove it before indexing.'.format(args.whoosh_index))
+    os.mkdir(args.whoosh_index)
+    whoosh_index = create_in(args.whoosh_index, schema)
+    writer = whoosh_index.writer()
+
+    line_num = 0
+    start_time = time.time()
+    with open(file_name) as f:
+        for line in f:
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            writer.add_document(id=doc_id, title=obj['title'], abstract=obj['abstract'])
+            line_num += 1
+            if line_num % 100000 == 0:
+                print('{} lines whoosh indexed in {} seconds'.format(
+                    line_num, int(time.time() - start_time)))
+
+    writer.commit()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Converts a PubMed/DBLP corpus.json collection to a Whoosh index.')
+    parser.add_argument('--collection_path', required=True,
+                        help='folder containing the corpus.json collection file')
+    parser.add_argument('--whoosh_index', required=True, help='whoosh index folder')
+    args = parser.parse_args()
+
+    create_dataset(args)
+    print('Done!')
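As a sanity check, the index this script produces can be opened and queried for key terms the same way `retrieve_with_key_terms.py` uses it. A minimal sketch (the index path and sample text are placeholders):

```python
from whoosh.index import open_dir

ix = open_dir('whoosh_index')  # folder created by the script above
with ix.searcher() as searcher:
    # Top-3 key terms for a piece of text, scored against the 'title' field.
    terms = searcher.key_terms_from_text('title', 'citation recommendation with BM25', numterms=3)
    print([term for term, score in terms])
```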
diff --git a/src/main/python/openresearch/retrieve.py b/src/main/python/openresearch/retrieve.py
index 8fffb55de5..4e9ec7a7da 100644
--- a/src/main/python/openresearch/retrieve.py
+++ b/src/main/python/openresearch/retrieve.py
@@ -19,10 +19,11 @@
 import time

 # Pyjnius setup
+anserini_root = '.'
 import sys
 sys.path += ['src/main/python']
 from pyserini.setup import configure_classpath
-configure_classpath()
+configure_classpath(anserini_root)
 from jnius import autoclass
 JString = autoclass('java.lang.String')

@@ -31,6 +32,7 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Retrieve Open Research Passages.')
     parser.add_argument('--qid_queries', required=True, default='', help='query id - query mapping file')
+    parser.add_argument('--valid_docs', default='', help='file of valid doc ids, one per line')
     parser.add_argument('--output', required=True, default='', help='output file')
     parser.add_argument('--index', required=True, default='', help='index path')
     parser.add_argument('--hits', default=10, type=int, help='number of hits to retrieve')
@@ -44,6 +46,11 @@
     args = parser.parse_args()

+    # 'oc' = OpenResearch; 'pd' = PubMed/DBLP, where results are restricted
+    # to the candidate set written by the converter.
+    data_type = 'oc'
+    if args.valid_docs:
+        data_type = 'pd'
+        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))
+
     searcher = JSearcher(JString(args.index))
     searcher.setBM25Similarity(args.k1, args.b)
     print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
@@ -67,9 +74,9 @@
         rank = 0
         for i in range(len(hits)):
             doc_id = hits[i].docid
-            # We skip the doc that originated the query.
-            if doc_id == query_id:
-                continue
+            # Skip the doc that originated the query; for PubMed/DBLP, also skip docs outside the valid candidate set.
+            if doc_id == query_id or (data_type == 'pd' and doc_id not in valid_docs):
+                continue
             fout.write('{} Q0 {} {} {} Anserini\n'.format(
                 query_id, doc_id, rank + 1, hits[i].score))
             rank += 1
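This script (and `retrieve_with_key_terms.py` below) writes runs in the standard TREC format that `trec_eval` consumes: one line per retrieved document of the form `query_id Q0 doc_id rank score Anserini`, with ranks assigned after the skipped documents are filtered out.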
diff --git a/src/main/python/openresearch/retrieve_with_key_terms.py b/src/main/python/openresearch/retrieve_with_key_terms.py
index 3403614a20..1a0b40a4bf 100644
--- a/src/main/python/openresearch/retrieve_with_key_terms.py
+++ b/src/main/python/openresearch/retrieve_with_key_terms.py
@@ -6,10 +6,11 @@
 import time

 # Pyjnius setup
+anserini_root = '.'
 import sys
 sys.path += ['src/main/python']
 from pyserini.setup import configure_classpath
-configure_classpath()
+configure_classpath(anserini_root)
 from jnius import autoclass
 JString = autoclass('java.lang.String')

@@ -17,7 +18,8 @@

 def update_query_with_key_terms(query, whoosh_searcher):
-    title, abstract = query.split(' [SEP] ')
+    # Queries now look like '[Title]: ... [Abstract]: ...', so split on the
+    # abstract marker and strip the title marker.
+    title, abstract = query.split(' [Abstract]: ')
+    title = title.replace('[Title]: ', '')
     title_key_terms = ' '.join([t for t, _ in whoosh_searcher.key_terms_from_text('title', title, numterms=3)])
     abstract_key_terms = ' '.join([t for t, _ in whoosh_searcher.key_terms_from_text('abstract', abstract)])
     return title_key_terms + " " + abstract_key_terms

@@ -26,6 +28,7 @@ def update_query_with_key_terms(query, whoosh_searcher):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Retrieve Open Research Passages.')
     parser.add_argument('--qid_queries', required=True, default='', help='query id - query mapping file')
+    parser.add_argument('--valid_docs', default='', help='file of valid doc ids, one per line')
     parser.add_argument('--output', required=True, default='', help='output file')
     parser.add_argument('--index', required=True, default='', help='index path')
     parser.add_argument('--whoosh_index', required=True, default='', help='whoosh index path')
@@ -39,6 +42,11 @@ def update_query_with_key_terms(query, whoosh_searcher):
     args = parser.parse_args()

+    # 'oc' = OpenResearch; 'pd' = PubMed/DBLP, where results are restricted
+    # to the candidate set written by the converter.
+    data_type = 'oc'
+    if args.valid_docs:
+        data_type = 'pd'
+        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))
+
     searcher = JSearcher(JString(args.index))
     searcher.setBM25Similarity(args.k1, args.b)
     print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
@@ -70,8 +78,8 @@ def update_query_with_key_terms(query, whoosh_searcher):
         rank = 0
         for i in range(len(hits)):
             doc_id = hits[i].docid
-            # We skip the doc that originated the query.
-            if doc_id == query_id:
+            # Skip the doc that originated the query; for PubMed/DBLP, also skip docs outside the valid candidate set.
+            if doc_id == query_id or (data_type == 'pd' and doc_id not in valid_docs):
                 continue
             fout.write('{} Q0 {} {} {} Anserini\n'.format(
                 query_id, doc_id, rank + 1, hits[i].score))
diff --git a/src/main/python/openresearch/run_pubmed_dblp.sh b/src/main/python/openresearch/run_pubmed_dblp.sh
new file mode 100644
index 0000000000..794e0be9c6
--- /dev/null
+++ b/src/main/python/openresearch/run_pubmed_dblp.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# Minimal flag parser: each '-name value' pair on the command line is
+# exported as an environment variable $name; a flag without a value is
+# set to 'true'.
+until [ $# -eq 0 ]
+do
+  name=${1:1}; shift;
+  if [[ -z "$1" || $1 == -* ]] ; then eval "export $name=true"; else eval "export $name=$1"; shift; fi
+done
+
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/pubmed_title \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/pubmed \
+  --max_docs_per_file=1000000 \
+  --data_type pubmed
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/pubmed_title_abstract \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/pubmed \
+  --max_docs_per_file=1000000 \
+  --data_type pubmed \
+  --use_abstract_in_query
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/dblp_title \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/dblp \
+  --max_docs_per_file=1000000 \
+  --data_type dblp
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/dblp_title_abstract \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/dblp \
+  --max_docs_per_file=1000000 \
+  --data_type dblp \
+  --use_abstract_in_query
+
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/pubmed_title/corpus \
+  -index ${output_folder}/lucene-index-pubmed-title -optimize -storePositions -storeDocvectors -storeRawDocs
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/pubmed_title_abstract/corpus \
+  -index ${output_folder}/lucene-index-pubmed-title-abstract -optimize -storePositions -storeDocvectors -storeRawDocs
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/dblp_title/corpus \
+  -index ${output_folder}/lucene-index-dblp-title -optimize -storePositions -storeDocvectors -storeRawDocs
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/dblp_title_abstract/corpus \
+  -index ${output_folder}/lucene-index-dblp-title-abstract -optimize -storePositions -storeDocvectors -storeRawDocs
+
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py \
+  --collection_path ${citeomatic_data}/citeomatic-2018-02-12/comparison/pubmed \
+  --whoosh_index ${output_folder}/anserini_format/pubmed_title_abstract/whoosh_index
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py \
+  --collection_path ${citeomatic_data}/citeomatic-2018-02-12/comparison/dblp \
+  --whoosh_index ${output_folder}/anserini_format/dblp_title_abstract/whoosh_index
+
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-pubmed-title \
+  --qid_queries ${output_folder}/anserini_format/pubmed_title/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/pubmed_title/candidates.txt \
+  --output ${output_folder}/anserini_format/pubmed_title/run.test \
+  --hits 1000
+python ./src/main/python/openresearch/retrieve_with_key_terms.py \
+  --index ${output_folder}/lucene-index-pubmed-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/pubmed_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/pubmed_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/pubmed_title_abstract/run.keyterms.test \
+  --hits 1000 \
+  --whoosh_index ${output_folder}/anserini_format/pubmed_title_abstract/whoosh_index
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-pubmed-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/pubmed_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/pubmed_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/pubmed_title_abstract/run.test \
+  --hits 1000
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-dblp-title \
+  --qid_queries ${output_folder}/anserini_format/dblp_title/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/dblp_title/candidates.txt \
+  --output ${output_folder}/anserini_format/dblp_title/run.test \
+  --hits 1000
+python ./src/main/python/openresearch/retrieve_with_key_terms.py \
+  --index ${output_folder}/lucene-index-dblp-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/dblp_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/dblp_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/dblp_title_abstract/run.keyterms.test \
+  --hits 1000 \
+  --whoosh_index ${output_folder}/anserini_format/dblp_title_abstract/whoosh_index
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-dblp-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/dblp_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/dblp_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/dblp_title_abstract/run.test \
+  --hits 1000
+
+echo "pubmed title"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/pubmed_title/qrels.test ${output_folder}/anserini_format/pubmed_title/run.test
+echo "pubmed key terms from title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/pubmed_title_abstract/qrels.test ${output_folder}/anserini_format/pubmed_title_abstract/run.keyterms.test
+echo "pubmed title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/pubmed_title_abstract/qrels.test ${output_folder}/anserini_format/pubmed_title_abstract/run.test
+echo "dblp title"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/dblp_title/qrels.test ${output_folder}/anserini_format/dblp_title/run.test
+echo "dblp key terms from title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/dblp_title_abstract/qrels.test ${output_folder}/anserini_format/dblp_title_abstract/run.keyterms.test
+echo "dblp title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/dblp_title_abstract/qrels.test ${output_folder}/anserini_format/dblp_title_abstract/run.test
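A closing note on the script's argument handling: the `until [ $# -eq 0 ]` loop at the top exports each `-name value` pair as an environment variable, so an invocation like `./src/main/python/openresearch/run_pubmed_dblp.sh -citeomatic_data /path/to/citeomatic-data -output_folder /path/to/output` (both paths are placeholders) makes `${citeomatic_data}` and `${output_folder}` available to every command above.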