Faster build and fxhashmap as vocab.
tomfran committed Jan 13, 2024
1 parent 8945d50 commit 770824d
Showing 6 changed files with 61 additions and 29 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -12,4 +12,5 @@ rand = "0.8"
 tokenizers = { version = "0.15.0", features = ["http"] }
 rust-stemmers = "1.2.0"
 rayon = "1.8.0"
-indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
+indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
+fxhash = "0.2.1"
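The new fxhash dependency provides FxHashMap, a drop-in replacement for std::collections::HashMap that swaps the default SipHash hasher for the much faster, non-cryptographic Fx hash; it gives up DoS resistance, which is fine for a local index vocabulary with short string keys. A minimal sketch of its use, relying only on the crate's documented API:

```rust
use fxhash::FxHashMap;

fn main() {
    // FxHashMap is std's HashMap with the Fx hasher plugged in, so the whole
    // HashMap API is available. Because of the custom hasher type parameter
    // it is constructed with `default()` rather than `new()`.
    let mut vocab: FxHashMap<String, u64> = FxHashMap::default();
    vocab.insert("inverted".to_string(), 0);
    vocab.insert("index".to_string(), 42);

    assert_eq!(vocab.get("index"), Some(&42));
}
```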
46 changes: 32 additions & 14 deletions README.md
@@ -3,24 +3,42 @@
 Search engine written in Rust, based on an inverted index on disk.
 
 ### Implementation status
-- [x] IO classes for writing and reading bit-streams;
-- [x] Text preprocessing:
-  - [x] Tokenization;
-  - [x] Stemming.
-- [ ] Index construction:
-  - [x] In-memory datasets index construction;
-  - [ ] Disk-based partial index construction and merging;
-  - [ ] Additional indexes to support things such as spelling correction.
-- [ ] Index queries:
-  - [ ] Boolean queries;
-  - [x] Tf-idf ranked retrieval;
-  - [x] Window computation.
+
+**IO**
+- [x] Classes for writing and reading bit-streams;
+- [ ] Proper strings writer and reader.
+
+**Text preprocessing**
+- [x] Tokenization;
+- [x] Stemming;
+- [ ] Parametrization at build time.
+
+**Index construction**
+- [x] In-memory datasets index construction;
+- [ ] Proper vocabulary and paths on disk;
+- [ ] Spelling correction index;
+- [ ] Disk-based partial index construction and merging.
+
+**Queries**
+- [x] Tf-idf ranked retrieval;
+- [x] Window computation;
+- [ ] Boolean queries;
+- [ ] Parallel scoring.
+
+**Evaluation**
+- [ ] Query speed;
+- [ ] Query quality;
+- [ ] Disk overhead.
 
 ### Crates in use
 - [stemmer-rs](/~https://github.com/lise-henry/stemmer-rs)
 - [tokenizers](/~https://github.com/huggingface/tokenizers)
 - [indicatif](/~https://github.com/console-rs/indicatif)
+- [fxhash](/~https://github.com/cbreeden/fxhash)
 
 ### References
-[Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
+[Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze
 
 ---
 
 *Feel free to get in touch to discuss the project!*
7 changes: 5 additions & 2 deletions src/index/builder.rs
@@ -35,7 +35,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
         .collect();
 
     let doc_id_mutex = Mutex::new(0);
-    let term_index_map = Mutex::new(BTreeMap::new());
+    let term_index_map = Mutex::new(HashMap::new());
 
     let postings: Mutex<Vec<PostingList>> = Mutex::new(Vec::new());
     let term_doc_map: Mutex<Vec<HashMap<u32, usize>>> = Mutex::new(Vec::new());
@@ -89,8 +89,11 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
         *doc_id += 1;
     });
 
+    let sorted_term_index_map: BTreeMap<String, usize> =
+        term_index_map.into_inner().unwrap().into_iter().collect();
+
     InMemoryIndex {
-        term_index_map: term_index_map.into_inner().unwrap(),
+        term_index_map: sorted_term_index_map,
         postings: postings.into_inner().unwrap(),
        document_lengths: document_lengths.into_inner().unwrap(),
     }
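The builder change follows a "hash while building, sort once at the end" pattern: inserts during the parallel pass go into a plain HashMap, which is cheaper per insert under the mutex, and a single collect into a BTreeMap recovers the lexicographic order the vocabulary writer needs. A simplified sketch of the idea, with illustrative names rather than the repository's actual API:

```rust
use rayon::prelude::*;
use std::collections::{BTreeMap, HashMap};
use std::sync::Mutex;

// Illustrative sketch of the commit's "hash now, sort once" pattern.
fn assign_term_ids(docs: &[Vec<String>]) -> BTreeMap<String, usize> {
    let term_index_map: Mutex<HashMap<String, usize>> = Mutex::new(HashMap::new());

    docs.par_iter().for_each(|doc| {
        let mut map = term_index_map.lock().unwrap();
        for term in doc {
            // O(1) average insert while building, instead of paying
            // BTreeMap's O(log n) tree walk on every term occurrence.
            let next_id = map.len();
            map.entry(term.clone()).or_insert(next_id);
        }
    });

    // A single O(n log n) collect restores the lexicographic order that
    // the on-disk vocabulary writer relies on.
    term_index_map.into_inner().unwrap().into_iter().collect()
}
```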
3 changes: 2 additions & 1 deletion src/index/mod.rs
@@ -4,6 +4,7 @@ mod postings;
 mod text;
 mod vocabulary;
 
+use fxhash::FxHashMap;
 use rust_stemmers::Stemmer;
 use std::collections::BTreeMap;
 use std::fmt::Display;
@@ -21,7 +22,7 @@ pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";
 
 pub struct Index {
     postings: BitsReader,
-    term_offset_map: BTreeMap<String, u64>,
+    term_offset_map: FxHashMap<String, u64>,
     doc_lenghts: Vec<u32>,
     tokenizer: Tokenizer,
     stemmer: Stemmer,
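Switching the Index field from BTreeMap to FxHashMap works because the query path only probes the vocabulary with exact, already tokenized and stemmed terms, so ordered iteration is unused; the trade-off is that range or prefix scans, which a future spelling-correction index might want, would need a separate structure. A sketch of the only lookup this access pattern needs, with an illustrative function name:

```rust
use fxhash::FxHashMap;

// Exact-match probe from a term to its postings offset on disk.
fn postings_offset(term_offset_map: &FxHashMap<String, u64>, term: &str) -> Option<u64> {
    term_offset_map.get(term).copied()
}
```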
6 changes: 4 additions & 2 deletions src/index/vocabulary.rs
@@ -1,5 +1,7 @@
 use std::collections::BTreeMap;
 
+use fxhash::FxHashMap;
+
 use crate::disk::{
     bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
     terms_writer::TermsWriter,
@@ -23,7 +25,7 @@ pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
     terms_writer.flush();
 }
 
-pub fn load_vocabulary(input_path: &str) -> BTreeMap<String, u64> {
+pub fn load_vocabulary(input_path: &str) -> FxHashMap<String, u64> {
     let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
     let terms_buffer = TermsReader::new(&terms_path).read_to_string();
 
@@ -38,7 +40,7 @@ pub fn load_vocabulary(input_path: &str) -> BTreeMap<String, u64> {
     let mut start_term_offset: usize = 0;
     let mut postings_offset = 0;
 
-    let mut res: BTreeMap<String, u64> = BTreeMap::new();
+    let mut res = FxHashMap::default();
 
     for _ in 0..num_terms {
         let term_length = lenghts_reader.read_gamma() as usize;
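In load_vocabulary, the terms are stored back to back in a single buffer, with each term's length gamma-coded in a side stream; the loop slices successive terms out of the buffer and records each term's postings offset. A hedged sketch of that loop, using plain slices in place of the repository's BitsReader and an illustrative signature:

```rust
use fxhash::FxHashMap;

// Sketch: `term_lengths` and `postings_lengths` stand in for values the real
// code reads via gamma decoding from the lengths stream.
fn load(terms_buffer: &str, term_lengths: &[usize], postings_lengths: &[u64]) -> FxHashMap<String, u64> {
    let mut res: FxHashMap<String, u64> = FxHashMap::default();
    let mut start_term_offset: usize = 0;
    let mut postings_offset: u64 = 0;

    for (term_length, postings_length) in term_lengths.iter().zip(postings_lengths) {
        // Slice the next term out of the concatenated buffer.
        let term = terms_buffer[start_term_offset..start_term_offset + term_length].to_string();
        res.insert(term, postings_offset);

        start_term_offset += term_length;
        postings_offset += postings_length;
    }

    res
}
```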
25 changes: 16 additions & 9 deletions src/main.rs
@@ -8,14 +8,20 @@ use search::query::QueryProcessor;
 
 use indicatif::HumanDuration;
 
-const NUM_RESULTS: usize = 10;
+const NUM_RESULTS: usize = 1000000;
 
-fn print_results(results: &[u32]) {
-    println!("\nSearch Results:");
-    for (i, doc_id) in results.iter().enumerate() {
-        println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
-    }
-    println!();
+fn print_results(results: &[u32], elapsed_time: Duration) {
+    // println!("\nSearch Results:");
+
+    // for (i, doc_id) in results.iter().enumerate() {
+    //     println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
+    // }
+
+    println!(
+        "\nFetched {} documents in {} ms",
+        results.len(),
+        elapsed_time.as_millis()
+    );
 }
 
 fn read_line(prompt: &str) -> String {
@@ -91,8 +97,9 @@
     loop {
         let query = read_line("> ");
 
+        let start_time = Instant::now();
         let results = q.query(&query, NUM_RESULTS);
-
-        print_results(&results);
+        let elapsed_time = start_time.elapsed();
+        print_results(&results, elapsed_time);
     }
 }
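The main-loop change brackets the query call with Instant::now() and elapsed(); together with the raised NUM_RESULTS (1,000,000) and the commented-out per-document printing, the loop now measures retrieval throughput rather than terminal I/O. A self-contained sketch of the same timing pattern, with a stand-in workload in place of q.query:

```rust
use std::time::Instant;

fn main() {
    let start_time = Instant::now();
    // Stand-in workload; in the repository this is `q.query(&query, NUM_RESULTS)`.
    let results: Vec<u32> = (0..1_000_000u32).collect();
    let elapsed_time = start_time.elapsed();

    println!(
        "\nFetched {} documents in {} ms",
        results.len(),
        elapsed_time.as_millis()
    );
}
```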
