From 5b29d1654abc5e8a014c2230da990ab2f91fb340 Mon Sep 17 00:00:00 2001
From: Gin
Date: Sat, 10 Aug 2019 21:05:01 +0800
Subject: [PATCH] Run anserini+BM25 baseline on PubMed and DBLP (#765)

* add script to convert and run pubmed and dblp

* update openresearch docs to add comparison tables; update key terms results
---
 docs/experiments-openresearch.md              | 109 +++++++++-
 .../convert_openresearch_to_whoosh_index.py   |  26 ++-
 .../convert_pubmed_dblp_to_anserini_format.py | 196 ++++++++++++++++++
 .../convert_pubmed_dblp_to_whoosh_index.py    |  47 +++++
 src/main/python/openresearch/retrieve.py      |  15 +-
 .../openresearch/retrieve_with_key_terms.py   |  16 +-
 .../python/openresearch/run_pubmed_dblp.sh    | 108 ++++++++++
 7 files changed, 502 insertions(+), 15 deletions(-)
 create mode 100644 src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
 create mode 100644 src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py
 create mode 100644 src/main/python/openresearch/run_pubmed_dblp.sh

diff --git a/docs/experiments-openresearch.md b/docs/experiments-openresearch.md
index 1072c2cc32..67df4e39e3 100644
--- a/docs/experiments-openresearch.md
+++ b/docs/experiments-openresearch.md
@@ -124,11 +124,11 @@ recall_1000 all 0.3628
 The output of using key terms in title and abstract as query should be:

 ```
-map all 0.0412
-recip_rank all 0.2521
-P_20 all 0.0546
-recall_20 all 0.0790
-recall_1000 all 0.2818
+map all 0.0528
+recip_rank all 0.2202
+P_20 all 0.0428
+recall_20 all 0.1022
+recall_1000 all 0.3344
 ```

@@ -139,6 +139,103 @@ The table below compares our BM25 results against Bhagavatula et al. (2018):
 | BM25 (Bhagavatula et al., 2018) | 0.058 | 0.218 |
 | BM25 (Anserini, Ours, title) | 0.063 | 0.244 |
 | BM25 (Anserini, Ours, title+abstract)| 0.095 | 0.351 |
-| BM25 (Anserini, Ours, key terms)| 0.065 | 0.251 |
+| BM25 (Anserini, Ours, key terms)| 0.060 | 0.220 |
+
+## Extra Baselines on PubMed and DBLP
+
+### PubMed and DBLP datasets
+
+Follow [citeomatic's repo](/~https://github.com/allenai/citeomatic/tree/44dc210c82515b5d4c5a96f5aafcb9b6e48206af) to download the necessary data.
+
+The steps are similar to running the baseline on OpenResearch. To run all three experiments on PubMed and DBLP quickly, run:
+
+`./src/main/python/openresearch/run_pubmed_dblp.sh -citeomatic_data <path_to_citeomatic_data> -output_folder <path_to_output_folder>`
+
+The results are as follows.
+
+The output of using PubMed title as query should be:
+
+```
+map all 0.1615
+recip_rank all 0.5844
+P_20 all 0.2034
+recall_20 all 0.1954
+recall_1000 all 0.6536
+f1_20 all 0.199
+```
+
+The output of using PubMed key terms from title and abstract as query should be:
+
+```
+map all 0.1637
+recip_rank all 0.5953
+P_20 all 0.2058
+recall_20 all 0.1969
+recall_1000 all 0.6041
+f1_20 all 0.201
+```
+
+The output of using PubMed title + abstract as query should be:
+
+```
+map all 0.2361
+recip_rank all 0.7208
+P_20 all 0.2726
+recall_20 all 0.2632
+recall_1000 all 0.7649
+f1_20 all 0.268
+```
+
+The output of using DBLP title as query should be:
+
+```
+map all 0.1056
+recip_rank all 0.4244
+P_20 all 0.1090
+recall_20 all 0.1721
+recall_1000 all 0.5511
+f1_20 all 0.133
+```
+
+The output of using DBLP key terms from title and abstract as query should be:
+
+```
+map all 0.1015
+recip_rank all 0.4254
+P_20 all 0.1059
+recall_20 all 0.1669
+recall_1000 all 0.5099
+f1_20 all 0.130
+```
+
+The output of using DBLP title + abstract as query should be:
+
+```
+map all 0.1687
+recip_rank all 0.5851
+P_20 all 0.1586
+recall_20 all 0.2511
+recall_1000 all 0.6913
+f1_20 all 0.194
+```
+
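+Note that `f1_20` is not a measure `trec_eval` prints by itself: it is the harmonic mean of the `P_20` and `recall_20` values above, computed as a small post-processing step. A minimal sketch (using the PubMed title-only numbers):
+
+```
+# F1@20 as the harmonic mean of P@20 and recall@20.
+def f1(p, r):
+    return 2 * p * r / (p + r) if p + r > 0 else 0.0
+
+print(round(f1(0.2034, 0.1954), 3))  # 0.199, matching f1_20 above
+```
+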
+The table below compares our BM25 results against Bhagavatula et al. (2018):
+
+**PubMed**
+
+| | F1@20 | MRR |
+|----------|:-------------:|------:|
+| BM25 (Bhagavatula et al., 2018) | 0.209 | 0.574 |
+| BM25 (Anserini, Ours, title) | 0.199 | 0.584 |
+| BM25 (Anserini, Ours, key terms)| 0.201 | 0.595 |
+| BM25 (Anserini, Ours, title+abstract)| 0.268 | 0.720 |
+
+**DBLP**
+
+| | F1@20 | MRR |
+|----------|:-------------:|------:|
+| BM25 (Bhagavatula et al., 2018) | 0.119 | 0.425 |
+| BM25 (Anserini, Ours, title) | 0.133 | 0.424 |
+| BM25 (Anserini, Ours, key terms)| 0.130 | 0.425 |
+| BM25 (Anserini, Ours, title+abstract)| 0.194 | 0.585 |
diff --git a/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py b/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
index a80a1f8f77..6f3ea0f714 100644
--- a/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
+++ b/src/main/python/openresearch/convert_openresearch_to_whoosh_index.py
@@ -9,6 +9,29 @@
 from whoosh.fields import *


+def get_id_years(file_paths):
+    print('Collecting paper ids and their publication years...')
+    id_years = []
+    for file_num, file_path in enumerate(file_paths):
+        with gzip.open(file_path) as f:
+            for line_num, line in enumerate(f):
+                obj = json.loads(line.strip())
+                doc_id = obj['id']
+                if 'year' not in obj:
+                    continue
+                year = int(obj['year'])
+
+                id_years.append((doc_id, year))
+                if line_num % 100000 == 0:
+                    print('Processed {} lines. Collected {} docs.'.format(
+                        line_num + 1, len(id_years)))
+
+    print('Sorting papers by year...')
+    id_years.sort(key=lambda x: x[1])
+    id_years = {id: year for id, year in id_years}
+    return id_years
+
+
 def create_dataset(args):
     print('Converting data...')

@@ -35,6 +58,8 @@ def create_dataset(args):
     whoosh_index = create_in(args.whoosh_index, schema)
     writer = whoosh_index.writer()

+    id_years = get_id_years(file_paths)
+    doc_ids = set(id_years.keys())
     line_num = 0
     start_time = time.time()
     for file_num, file_path in enumerate(file_paths):
@@ -42,7 +67,6 @@ def create_dataset(args):
             for line in f:
                 obj = json.loads(line.strip())
                 doc_id = obj['id']
-
                 writer.add_document(id=doc_id, title=obj['title'], abstract=obj['paperAbstract'])
                 line_num += 1
                 if line_num % 100000 == 0:
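A note on `get_id_years` above: because the `(id, year)` pairs are sorted by year before the dict comprehension, later entries overwrite earlier ones, so a paper id that appears more than once resolves to its most recent publication year. A tiny illustration with made-up ids:

```python
pairs = [('A', 2001), ('B', 1999), ('A', 1998)]
pairs.sort(key=lambda x: x[1])     # [('A', 1998), ('B', 1999), ('A', 2001)]
print({i: y for i, y in pairs})    # {'A': 2001, 'B': 1999}
```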
diff --git a/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py b/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
new file mode 100644
index 0000000000..4eb477a1f3
--- /dev/null
+++ b/src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py
@@ -0,0 +1,196 @@
+import argparse
+import json
+import os
+import time
+from collections import defaultdict
+
+
+def clean(text):
+    return text.replace('\n', ' ').replace('\t', ' ')
+
+
+def get_ids(start, end, year_ids):
+    result = []
+    for year in range(start, end+1):
+        result.extend(year_ids[year])
+    return set(result)
+
+
+def get_id_years(file_name, data_type):
+    print('Collecting paper ids and their publication years...')
+    year_ids = defaultdict(list)
+    with open(file_name) as f:
+        for line_num, line in enumerate(f):
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            if 'year' not in obj:
+                continue
+            year = int(obj['year'])
+
+            year_ids[year].append(doc_id)
+            if line_num % 1000000 == 0:
+                print('Processed {} lines. Collected ids for {} years.'.format(
+                    line_num + 1, len(year_ids)))
+
+    # Train/dev/test split boundaries by publication year.
+    train_ranges = {'dblp': (1966, 2007), 'pubmed': (1966, 2008)}
+    dev_ranges = {'dblp': (2008, 2008), 'pubmed': (2009, 2009)}
+    test_ranges = {'dblp': (2009, 2011), 'pubmed': (2010, 2013)}
+
+    train_ids = get_ids(train_ranges[data_type][0], train_ranges[data_type][1], year_ids)
+    dev_ids = get_ids(dev_ranges[data_type][0], dev_ranges[data_type][1], year_ids)
+    test_ids = get_ids(test_ranges[data_type][0], test_ranges[data_type][1], year_ids)
+
+    num_train = len(train_ids)
+    num_dev = len(dev_ids)
+    num_test = len(test_ids)
+
+    print('Collected {}, {}, {} papers for training, dev, and test sets.'.format(
+        num_train, num_dev, num_test))
+
+    return train_ids, dev_ids, test_ids, year_ids
+
+
+def create_dataset(args):
+    print('Converting data...')
+    queries_files = {}
+    qrels_files = {}
+    for set_name in ['train', 'dev', 'test']:
+        queries_filepath = os.path.join(
+            args.output_folder, 'queries.{}.tsv'.format(set_name))
+        qrels_filepath = os.path.join(
+            args.output_folder, 'qrels.{}'.format(set_name))
+        queries_files[set_name] = open(queries_filepath, 'w')
+        qrels_files[set_name] = open(qrels_filepath, 'w')
+
+    file_name = os.path.join(args.collection_path, 'corpus.json')
+
+    train_ids, dev_ids, test_ids, year_ids = get_id_years(
+        file_name=file_name, data_type=args.data_type)
+
+    doc_ids = train_ids | dev_ids | test_ids
+
+    # Write train_ids to file for later use as the candidate set.
+    candidates_file = open(os.path.join(args.output_folder, 'candidates.txt'), 'w')
+    for train_id in train_ids:
+        candidates_file.write(train_id + '\n')
+    candidates_file.close()
+
+    id_years = {}
+    for y in year_ids:
+        for i in year_ids[y]:
+            id_years[i] = y
+
+    n_docs = 0
+    file_index = 0
+    num_train = 0
+    num_dev = 0
+    num_test = 0
+    start_time = time.time()
+
+    with open(file_name) as f:
+        for line in f:
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            if doc_id not in doc_ids:
+                continue
+            if n_docs % args.max_docs_per_file == 0:
+                if n_docs > 0:
+                    output_jsonl_file.close()
+                output_path = os.path.join(
+                    args.output_folder, 'corpus/docs{:02d}.json'.format(file_index))
+                output_jsonl_file = open(output_path, 'w')
+                file_index += 1
+            doc_text = '[Title]: {} [Abstract]: {}'.format(
+                obj['title'], obj['abstract'])
+            doc_text = clean(doc_text)
+            output_dict = {'id': doc_id, 'contents': doc_text}
+            output_jsonl_file.write(json.dumps(output_dict) + '\n')
+            n_docs += 1
+
+            out_citations = obj['out_citations']
+
+            # Remove citations not in the corpus.
+            out_citations = [
+                out_citation for out_citation in out_citations
+                if out_citation in doc_ids
+            ]
+
+            # Remove self citations.
+            out_citations = [
+                out_citation for out_citation in out_citations
+                if out_citation != doc_id
+            ]
+
+            # Keep only citations with a known publication year that is no
+            # later than the citing paper's.
+            out_citations2 = []
+            for out_citation in out_citations:
+                if out_citation in id_years:
+                    if id_years[out_citation] <= obj['year']:
+                        out_citations2.append(out_citation)
+            out_citations = out_citations2
+
+            # Follow Bhagavatula et al.'s setting and restrict citation candidates to train_ids only.
+            out_citations = set(out_citations)
+            out_citations.intersection_update(train_ids)
+
+            # Skip papers with fewer than 10 out-citations.
+            if len(out_citations) < 10:
+                continue
+
+            if doc_id in train_ids:
+                set_name = 'train'
+                num_train += 1
+            elif doc_id in dev_ids:
+                set_name = 'dev'
+                num_dev += 1
+            elif doc_id in test_ids:
+                set_name = 'test'
+                num_test += 1
+
+            queries_file = queries_files[set_name]
+            qrels_file = qrels_files[set_name]
+
+            doc_title = obj['title']
+            doc_title = clean(doc_title)
+            if args.use_abstract_in_query:
+                doc_abstract = clean(obj['abstract'])
+                query = '[Title]: ' + doc_title + ' [Abstract]: ' + doc_abstract
+            else:
+                query = doc_title
+            queries_file.write('{}\t{}\n'.format(doc_id, query))
+            for out_citation in out_citations:
+                qrels_file.write('{} 0 {} 1\n'.format(doc_id, out_citation))
+
+    output_jsonl_file.close()
+
+    print('Examples: {} train, {} valid, {} test'.format(
+        num_train, num_dev, num_test))
+
+    # Close queries and qrels files.
+    for queries_file in queries_files.values():
+        queries_file.close()
+    for qrels_file in qrels_files.values():
+        qrels_file.close()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Converts a PubMed/DBLP corpus.json collection to '
+                    'Anserini\'s jsonl files.')
+    parser.add_argument('--collection_path', required=True,
+                        help='folder containing the corpus.json collection file')
+    parser.add_argument('--output_folder', required=True, help='output folder')
+    parser.add_argument('--max_docs_per_file', default=1000000, type=int,
+                        help='maximum number of documents in each jsonl file.')
+    parser.add_argument('--data_type', required=True, choices=['dblp', 'pubmed'],
+                        help='dblp or pubmed')
+    parser.add_argument('--use_abstract_in_query', action='store_true',
+                        help='If set, use title and abstract as the query; '
+                             'otherwise, use only the title.')
+
+    args = parser.parse_args()
+
+    if not os.path.exists(args.output_folder):
+        os.makedirs(args.output_folder)
+    if not os.path.exists(os.path.join(args.output_folder, 'corpus')):
+        os.makedirs(os.path.join(args.output_folder, 'corpus'))
+
+    create_dataset(args)
+    print('Done!')
\ No newline at end of file
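For reference, the converter above emits one tab-separated query line per retained paper (`doc_id<TAB>query text`) and one TREC qrels line per surviving out-citation (`doc_id 0 cited_doc_id 1`), so each cited paper counts as a relevant document for its citing paper's query. These are the `queries.test.tsv` and `qrels.test` files consumed by the retrieval and evaluation steps below.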
diff --git a/src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py b/src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py
new file mode 100644
index 0000000000..57a3351b9f
--- /dev/null
+++ b/src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py
@@ -0,0 +1,47 @@
+import argparse
+import json
+import os
+import time
+from whoosh.index import create_in
+from whoosh.fields import *
+
+
+def create_dataset(args):
+    print('Converting data...')
+
+    file_name = os.path.join(args.collection_path, 'corpus.json')
+
+    # We need a Whoosh index to do the key term extraction.
+    schema = Schema(title=TEXT,
+                    abstract=TEXT,
+                    id=ID(stored=True))
+    if os.path.exists(args.whoosh_index):
+        raise FileExistsError(
+            '{} already exists; remove it before indexing.'.format(args.whoosh_index))
+    os.mkdir(args.whoosh_index)
+    whoosh_index = create_in(args.whoosh_index, schema)
+    writer = whoosh_index.writer()
+
+    line_num = 0
+    start_time = time.time()
+    with open(file_name) as f:
+        for line in f:
+            obj = json.loads(line.strip())
+            doc_id = obj['id']
+            writer.add_document(id=doc_id, title=obj['title'], abstract=obj['abstract'])
+            line_num += 1
+            if line_num % 100000 == 0:
+                print('{} lines whoosh indexed in {} seconds'.format(
+                    line_num, int(time.time() - start_time)))
+
+    writer.commit()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='Converts a PubMed/DBLP corpus.json collection to a Whoosh index.')
+    parser.add_argument('--collection_path', required=True,
+                        help='folder containing the corpus.json collection file')
+    parser.add_argument('--whoosh_index', required=True, help='whoosh index folder')
+    args = parser.parse_args()
+
+    create_dataset(args)
+    print('Done!')
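As a sanity check, the index this script produces can be opened and queried for key terms the same way `retrieve_with_key_terms.py` uses it. A minimal sketch (the index path and sample text are placeholders):

```python
from whoosh.index import open_dir

ix = open_dir('whoosh_index')  # folder created by the script above
with ix.searcher() as searcher:
    # Top-3 key terms for a piece of text, scored against the 'title' field.
    terms = searcher.key_terms_from_text('title', 'citation recommendation with BM25', numterms=3)
    print([term for term, score in terms])
```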
diff --git a/src/main/python/openresearch/retrieve.py b/src/main/python/openresearch/retrieve.py
index 8fffb55de5..4e9ec7a7da 100644
--- a/src/main/python/openresearch/retrieve.py
+++ b/src/main/python/openresearch/retrieve.py
@@ -19,10 +19,11 @@
 import time

 # Pyjnius setup
+anserini_root = '.'
 import sys
 sys.path += ['src/main/python']
 from pyserini.setup import configure_classpath
-configure_classpath()
+configure_classpath(anserini_root)
 from jnius import autoclass
 JString = autoclass('java.lang.String')

@@ -31,6 +32,7 @@
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Retrieve Open Research Passages.')
     parser.add_argument('--qid_queries', required=True, default='', help='query id - query mapping file')
+    parser.add_argument('--valid_docs', default='', help='file of valid doc ids, one per line')
     parser.add_argument('--output', required=True, default='', help='output file')
     parser.add_argument('--index', required=True, default='', help='index path')
     parser.add_argument('--hits', default=10, type=int, help='number of hits to retrieve')
@@ -44,6 +46,11 @@
     args = parser.parse_args()

+    # 'oc' = OpenResearch; 'pd' = PubMed/DBLP, where results are restricted
+    # to the candidate set written by the converter.
+    data_type = 'oc'
+    if args.valid_docs:
+        data_type = 'pd'
+        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))
+
     searcher = JSearcher(JString(args.index))
     searcher.setBM25Similarity(args.k1, args.b)
     print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
@@ -67,9 +74,9 @@
         rank = 0
         for i in range(len(hits)):
             doc_id = hits[i].docid
-            # We skip the doc that originated the query.
-            if doc_id == query_id:
-                continue
+            # Skip the doc that originated the query; for PubMed/DBLP, also skip docs outside the valid candidate set.
+            if doc_id == query_id or (data_type == 'pd' and doc_id not in valid_docs):
+                continue
             fout.write('{} Q0 {} {} {} Anserini\n'.format(
                 query_id, doc_id, rank + 1, hits[i].score))
             rank += 1
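This script (and `retrieve_with_key_terms.py` below) writes runs in the standard TREC format that `trec_eval` consumes: one line per retrieved document of the form `query_id Q0 doc_id rank score Anserini`, with ranks assigned after the skipped documents are filtered out.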
diff --git a/src/main/python/openresearch/retrieve_with_key_terms.py b/src/main/python/openresearch/retrieve_with_key_terms.py
index 3403614a20..1a0b40a4bf 100644
--- a/src/main/python/openresearch/retrieve_with_key_terms.py
+++ b/src/main/python/openresearch/retrieve_with_key_terms.py
@@ -6,10 +6,11 @@
 import time

 # Pyjnius setup
+anserini_root = '.'
 import sys
 sys.path += ['src/main/python']
 from pyserini.setup import configure_classpath
-configure_classpath()
+configure_classpath(anserini_root)
 from jnius import autoclass
 JString = autoclass('java.lang.String')

@@ -17,7 +18,8 @@

 def update_query_with_key_terms(query, whoosh_searcher):
-    title, abstract = query.split(' [SEP] ')
+    # Queries now look like '[Title]: ... [Abstract]: ...', so split on the
+    # abstract marker and strip the title marker.
+    title, abstract = query.split(' [Abstract]: ')
+    title = title.replace('[Title]: ', '')
     title_key_terms = ' '.join([t for t, _ in whoosh_searcher.key_terms_from_text('title', title, numterms=3)])
     abstract_key_terms = ' '.join([t for t, _ in whoosh_searcher.key_terms_from_text('abstract', abstract)])
     return title_key_terms + " " + abstract_key_terms

@@ -26,6 +28,7 @@ def update_query_with_key_terms(query, whoosh_searcher):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Retrieve Open Research Passages.')
     parser.add_argument('--qid_queries', required=True, default='', help='query id - query mapping file')
+    parser.add_argument('--valid_docs', default='', help='file of valid doc ids, one per line')
     parser.add_argument('--output', required=True, default='', help='output file')
     parser.add_argument('--index', required=True, default='', help='index path')
     parser.add_argument('--whoosh_index', required=True, default='', help='whoosh index path')
@@ -39,6 +42,11 @@ def update_query_with_key_terms(query, whoosh_searcher):
     args = parser.parse_args()

+    # 'oc' = OpenResearch; 'pd' = PubMed/DBLP, where results are restricted
+    # to the candidate set written by the converter.
+    data_type = 'oc'
+    if args.valid_docs:
+        data_type = 'pd'
+        valid_docs = set(open(args.valid_docs).read().strip().split('\n'))
+
     searcher = JSearcher(JString(args.index))
     searcher.setBM25Similarity(args.k1, args.b)
     print('Initializing BM25, setting k1={} and b={}'.format(args.k1, args.b))
@@ -70,8 +78,8 @@ def update_query_with_key_terms(query, whoosh_searcher):
         rank = 0
         for i in range(len(hits)):
             doc_id = hits[i].docid
-            # We skip the doc that originated the query.
-            if doc_id == query_id:
+            # Skip the doc that originated the query; for PubMed/DBLP, also skip docs outside the valid candidate set.
+            if doc_id == query_id or (data_type == 'pd' and doc_id not in valid_docs):
                 continue
             fout.write('{} Q0 {} {} {} Anserini\n'.format(
                 query_id, doc_id, rank + 1, hits[i].score))
diff --git a/src/main/python/openresearch/run_pubmed_dblp.sh b/src/main/python/openresearch/run_pubmed_dblp.sh
new file mode 100644
index 0000000000..794e0be9c6
--- /dev/null
+++ b/src/main/python/openresearch/run_pubmed_dblp.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+
+# Minimal flag parser: each '-name value' pair on the command line is
+# exported as an environment variable $name; a flag without a value is
+# set to 'true'.
+until [ $# -eq 0 ]
+do
+  name=${1:1}; shift;
+  if [[ -z "$1" || $1 == -* ]] ; then eval "export $name=true"; else eval "export $name=$1"; shift; fi
+done
+
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/pubmed_title \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/pubmed \
+  --max_docs_per_file=1000000 \
+  --data_type pubmed
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/pubmed_title_abstract \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/pubmed \
+  --max_docs_per_file=1000000 \
+  --data_type pubmed \
+  --use_abstract_in_query
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/dblp_title \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/dblp \
+  --max_docs_per_file=1000000 \
+  --data_type dblp
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_anserini_format.py \
+  --output_folder=${output_folder}/anserini_format/dblp_title_abstract \
+  --collection_path=${citeomatic_data}/citeomatic-2018-02-12/comparison/dblp \
+  --max_docs_per_file=1000000 \
+  --data_type dblp \
+  --use_abstract_in_query
+
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/pubmed_title/corpus \
+  -index ${output_folder}/lucene-index-pubmed-title -optimize -storePositions -storeDocvectors -storeRawDocs
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/pubmed_title_abstract/corpus \
+  -index ${output_folder}/lucene-index-pubmed-title-abstract -optimize -storePositions -storeDocvectors -storeRawDocs
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/dblp_title/corpus \
+  -index ${output_folder}/lucene-index-dblp-title -optimize -storePositions -storeDocvectors -storeRawDocs
+sh ./target/appassembler/bin/IndexCollection -collection JsonCollection \
+  -generator LuceneDocumentGenerator -threads 8 -input ${output_folder}/anserini_format/dblp_title_abstract/corpus \
+  -index ${output_folder}/lucene-index-dblp-title-abstract -optimize -storePositions -storeDocvectors -storeRawDocs
+
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py \
+  --collection_path ${citeomatic_data}/citeomatic-2018-02-12/comparison/pubmed \
+  --whoosh_index ${output_folder}/anserini_format/pubmed_title_abstract/whoosh_index
+python ./src/main/python/openresearch/convert_pubmed_dblp_to_whoosh_index.py \
+  --collection_path ${citeomatic_data}/citeomatic-2018-02-12/comparison/dblp \
+  --whoosh_index ${output_folder}/anserini_format/dblp_title_abstract/whoosh_index
+
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-pubmed-title \
+  --qid_queries ${output_folder}/anserini_format/pubmed_title/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/pubmed_title/candidates.txt \
+  --output ${output_folder}/anserini_format/pubmed_title/run.test \
+  --hits 1000
+python ./src/main/python/openresearch/retrieve_with_key_terms.py \
+  --index ${output_folder}/lucene-index-pubmed-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/pubmed_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/pubmed_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/pubmed_title_abstract/run.keyterms.test \
+  --hits 1000 \
+  --whoosh_index ${output_folder}/anserini_format/pubmed_title_abstract/whoosh_index
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-pubmed-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/pubmed_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/pubmed_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/pubmed_title_abstract/run.test \
+  --hits 1000
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-dblp-title \
+  --qid_queries ${output_folder}/anserini_format/dblp_title/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/dblp_title/candidates.txt \
+  --output ${output_folder}/anserini_format/dblp_title/run.test \
+  --hits 1000
+python ./src/main/python/openresearch/retrieve_with_key_terms.py \
+  --index ${output_folder}/lucene-index-dblp-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/dblp_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/dblp_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/dblp_title_abstract/run.keyterms.test \
+  --hits 1000 \
+  --whoosh_index ${output_folder}/anserini_format/dblp_title_abstract/whoosh_index
+python ./src/main/python/openresearch/retrieve.py \
+  --index ${output_folder}/lucene-index-dblp-title-abstract \
+  --qid_queries ${output_folder}/anserini_format/dblp_title_abstract/queries.test.tsv \
+  --valid_docs ${output_folder}/anserini_format/dblp_title_abstract/candidates.txt \
+  --output ${output_folder}/anserini_format/dblp_title_abstract/run.test \
+  --hits 1000
+
+echo "pubmed title"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/pubmed_title/qrels.test ${output_folder}/anserini_format/pubmed_title/run.test
+echo "pubmed key terms from title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/pubmed_title_abstract/qrels.test ${output_folder}/anserini_format/pubmed_title_abstract/run.keyterms.test
+echo "pubmed title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/pubmed_title_abstract/qrels.test ${output_folder}/anserini_format/pubmed_title_abstract/run.test
+echo "dblp title"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/dblp_title/qrels.test ${output_folder}/anserini_format/dblp_title/run.test
+echo "dblp key terms from title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/dblp_title_abstract/qrels.test ${output_folder}/anserini_format/dblp_title_abstract/run.keyterms.test
+echo "dblp title + abstract"
+./eval/trec_eval.9.0.4/trec_eval -mrecip_rank -mmap -mrecall.20,1000 -mP.20 \
+  ${output_folder}/anserini_format/dblp_title_abstract/qrels.test ${output_folder}/anserini_format/dblp_title_abstract/run.test
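A closing note on the script's argument handling: the `until [ $# -eq 0 ]` loop at the top exports each `-name value` pair as an environment variable, so an invocation like `./src/main/python/openresearch/run_pubmed_dblp.sh -citeomatic_data /path/to/citeomatic-data -output_folder /path/to/output` (both paths are placeholders) makes `${citeomatic_data}` and `${output_folder}` available to every command above.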