diff --git a/src/main/python/paragraph_indexing/README.md b/src/main/python/paragraph_indexing/README.md deleted file mode 100644 index 097ecd093b..0000000000 --- a/src/main/python/paragraph_indexing/README.md +++ /dev/null @@ -1,86 +0,0 @@ -# Paragraph Indexing - - - -## Segment - -Segment each raw document into paragraphs and dump them out into a separate .json file named with the DOCID, in JSON format, e.g. - -``` -[ - { - 'id':'{$DOCNO}.0001', - 'content':'content0001' - }, - { - 'id':'{$DOCNO}.0002', - 'content':'content0002' - } -] -``` - -This is done by calling `seg_${collection}.py`, where the supported collections so far are `robust04` and `core17`. - - - -### Example: - -Run - -``` -python seg_robust04.py \ - --input lucene-index.robust04.pos+docvectors+rawdocs.allDocids.txt.output.tar.gz \ - --output robust04.paragraphs/ -``` - -All documents will be segmented into paragraphs and stored in the folder `./robust04.paragraphs/`. - - - -### Input file - -The input raw documents should be a `tar.gz` file containing each document in a separate file named as DOCID. This file can be generated through the following commands (e.g. Robust04). - -Suppose you're under the Anserini directory. First, build the index: - -```bash -nohup sh target/appassembler/bin/IndexCollection -collection TrecCollection \ - -input /path/to/disk45/ -generator JsoupGenerator \ - -index lucene-index.robust04.pos+docvectors+rawdocs -threads 16 \ - -storePositions -storeDocvectors -storeRawDocs -optimize \ - >& log.robust04.pos+docvectors+rawdocs & -``` - -and then dump the raw documents by the following two steps: -1. dump all docids of the collection -2. 
feed the docids file to dump raw documents - -```bash -sh target/appassembler/bin/IndexUtils \ - -index lucene-index.robust04.pos+docvectors+rawdocs \ - -dumpAllDocids NONE && -sh target/appassembler/bin/IndexUtils \ - -index lucene-index.robust04.pos+docvectors+rawdocs \ - -dumpRawDocs lucene-index.robust04.pos+docvectors+rawdocs.allDocids.txt -``` - -and the output `tar.gz` file will be named as - -``` -lucene-index.robust04.pos+docvectors+rawdocs.allDocids.txt.output.tar.gz -``` - - - -## Paragraph Indexing - -The JSON files can be indexed using `JsonCollection` in Anserini. Run - -```bash -sh target/appassembler/bin/IndexCollection -collection JsonCollection \ - -input /path/to/robust04.paragraphs -generator LuceneDocumentGenerator \ - -index lucene-index.robust04.paragraphs.pos+docvectors+rawdocs -threads 16 \ - -storePositions -storeDocvectors -storeRawDocs -optimize -``` - -to index each paragraph for the Robust04 collection. `-input` should be the output folder of the paragraph segmentation. diff --git a/src/main/python/paragraph_indexing/__init__.py b/src/main/python/paragraph_indexing/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/src/main/python/paragraph_indexing/paraseg.py b/src/main/python/paragraph_indexing/paraseg.py deleted file mode 100644 index 84f8c0d049..0000000000 --- a/src/main/python/paragraph_indexing/paraseg.py +++ /dev/null @@ -1,258 +0,0 @@ -""" -Anserini: A toolkit for reproducible information retrieval research built on Lucene - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -""" - -class _ParaSegmenter(object): - """ The base class for all Paragraph Segmentation class """ - def __init__(self, bufferedreader, start_pattern_list=None): - self._br = bufferedreader - self._isstart = False - self._curline = None - self._paralist = [] - self._setup(start_pattern_list) - - def _setup(self, pattern_list): - """ Find the start of the first paragraph of input document - This can only be called once when initializing the object - - Args: - pattern_list(list): an indicator of starting a paragraph - - Effect: - self._isstart set to True if found, else remains False - If pattern_list is None, treated as started - """ - if pattern_list is None: - self._isstart = True - return - - while True: - self._curline = self._br.readline() - if not self._curline: - return - - if self._curline in pattern_list: - self._isstart = True - return - - def _isend(self, line): - """ An indicator of the a paragraph's end. The code in this base class indicates - the end of a documents, as the end of a document indicates the end of a paragraph as well. - - Args: - line(str): the line to be tested on - - Return: - (bool): if this line is a paragraph end - """ - if not line or line == b'': - return True - - return False - - def hasnextpara(self): - """ Check if there is a paragraph in this document - - If self._isstart == False after initialization, no useful information is contained in - this document, then return False - If reach the end of the doc, return False - - Return: - (bool) if this document has further content - - """ - if not self._isstart or not self._curline or self._curline == b'': - return False - return True - - def nextpara(self): - """ Two cases here: - - 1. There is a pattern indicating a new paragraph followed by self._curline, - In this case, after calling self.hasnextpara(), len(self._paralist) == 0. 
- In this case, also, subclasses should fill self._paralist in `self.hasnextpara()` - 2. There is only a pattern indicating the end of a paragraph, - so one has to readline until the end to see if it is a paragraph. - In this case, after calling self.hasnextpara() and return True, - len(self._paralist) > 0 - - Return: - str: A string contains a paragraph - """ - if not self._paralist: - while True: - self._curline = self._br.readline() - if self._isend(self._curline): - break - self._paralist.append(self._curline.decode('utf-8').strip()) - - parastr = ' '.join(self._paralist) - del self._paralist[:] - return parastr - - -class FBISParaSegmenter(_ParaSegmenter): - """ A Segmenter to segment documents in FBIS collection under Robust04. - - Args: - bufferedreader (io.BufferedReader): the buffered reader of a document. - """ - def __init__(self, bufferedreader): - start_pattern_list = [b'\n'] # start pattern by observation - - super(FBISParaSegmenter, self).__init__(bufferedreader, start_pattern_list) - self._linelimit = 50 # An empirical number to decide if this is end of paragraph - - def _isend(self, line): - if super(FBISParaSegmenter, self)._isend(line): - return True - - if line[-2:] == b'.\n' and len(line) < self._linelimit: - return True - - return False - - def hasnextpara(self): - if not super(FBISParaSegmenter, self).hasnextpara(): - return False - - while True: - self._curline = self._br.readline() - if self._isend(self._curline): - break - self._paralist.append(self._curline.decode('utf-8').strip()) - - if not self._paralist: - # Handle the following pattern - # b'\n' - # b'\n' - return False - - if len(self._curline) > 1: - # skip if self._curline == b'\n' - self._paralist.append(self._curline.decode('utf-8').strip()) - - return True - - -class FR94ParaSegmenter(_ParaSegmenter): - """ A Segmenter to segment documents in FR94 collection under Robust04. - - Args: - bufferedreader (io.BufferedReader): the buffered reader of a document. 
- """ - def __init__(self, bufferedreader): - self._start_pattern_list = [ - b'\n', - b'\n' - ] # start pattern by observation - super(FR94ParaSegmenter, self).__init__(bufferedreader, self._start_pattern_list) - - def _isend(self, line): - if super(FR94ParaSegmenter, self)._isend(line): - return True - if line[:4] == b'