diff --git a/Cargo.toml b/Cargo.toml
index b2788bc..5ad9a21 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -12,4 +12,5 @@ rand = "0.8"
 tokenizers = { version = "0.15.0", features = ["http"] }
 rust-stemmers = "1.2.0"
 rayon = "1.8.0"
-indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
\ No newline at end of file
+indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
+fxhash = "0.2.1"
diff --git a/README.md b/README.md
index f6b66c9..5892485 100644
--- a/README.md
+++ b/README.md
@@ -2,25 +2,43 @@
 
 Search engine written in Rust, based on an inverted index on disk.
 
-### Implementation status
-- [x] IO classes for writing and reading bit-streams;
-- [x] Text preprocessing:
-  - [x] Tokenization;
-  - [x] Stemming.
-- [ ] Index construction:
-  - [x] In-memory datasets index construction;
-  - [ ] Disk-based partial index construction and merging;
-  - [ ] Additional indexes to support things such as spelling correction.
-- [ ] Index queries:
-  - [ ] Boolean queries;
-  - [x] Tf-idf ranked retrieval;
-  - [x] Window computation.
-
-### Crates in use
+## Implementation status
+
+**IO**
+- [x] Classes for writing and reading bit-streams;
+- [ ] Proper strings writer and reader.
+
+**Text preprocessing**
+- [x] Tokenization;
+- [x] Stemming;
+- [ ] Parametrization at build time.
+
+**Index construction**
+- [x] In-memory datasets index construction;
+- [ ] Proper vocabulary and paths on disk;
+- [ ] Spelling correction index;
+- [ ] Disk-based partial index construction and merging.
+
+**Queries**
+- [x] Tf-idf ranked retrieval;
+- [x] Window computation;
+- [ ] Boolean queries;
+- [ ] Parallel scoring.
+
+**Evaluation**
+- [ ] Query speed;
+- [ ] Query quality;
+- [ ] Disk overhead.
+
+## Crates in use
 - [stemmer-rs](/~https://github.com/lise-henry/stemmer-rs)
 - [tokenizers](/~https://github.com/huggingface/tokenizers)
 - [indicatif](/~https://github.com/console-rs/indicatif)
+- [fxhash](/~https://github.com/cbreeden/fxhash)
+
+## References
+[Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze
+
+---
+
-
-### References
-[Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
\ No newline at end of file
+*Feel free to get in touch to discuss the project!*
\ No newline at end of file
diff --git a/src/index/builder.rs b/src/index/builder.rs
index 342b5c1..0317840 100644
--- a/src/index/builder.rs
+++ b/src/index/builder.rs
@@ -35,7 +35,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
         .collect();
 
     let doc_id_mutex = Mutex::new(0);
-    let term_index_map = Mutex::new(BTreeMap::new());
+    let term_index_map = Mutex::new(HashMap::new());
    let postings: Mutex<Vec<PostingList>> = Mutex::new(Vec::new());
    let term_doc_map: Mutex<Vec<HashMap<u32, usize>>> = Mutex::new(Vec::new());
 
@@ -89,8 +89,11 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
         *doc_id += 1;
     });
 
+    let sorted_term_index_map: BTreeMap<String, usize> =
+        term_index_map.into_inner().unwrap().into_iter().collect();
+
     InMemoryIndex {
-        term_index_map: term_index_map.into_inner().unwrap(),
+        term_index_map: sorted_term_index_map,
         postings: postings.into_inner().unwrap(),
         document_lengths: document_lengths.into_inner().unwrap(),
     }
diff --git a/src/index/mod.rs b/src/index/mod.rs
index 87dd534..7a39cc6 100644
--- a/src/index/mod.rs
+++ b/src/index/mod.rs
@@ -4,6 +4,7 @@ mod postings;
 mod text;
 mod vocabulary;
 
+use fxhash::FxHashMap;
 use rust_stemmers::Stemmer;
 use std::collections::BTreeMap;
 use std::fmt::Display;
@@ -21,7 +22,7 @@ pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";
 
 pub struct Index {
     postings: BitsReader,
-    term_offset_map: BTreeMap<String, u64>,
+    term_offset_map: FxHashMap<String, u64>,
     doc_lenghts: Vec<u32>,
     tokenizer: Tokenizer,
     stemmer: Stemmer,
diff --git a/src/index/vocabulary.rs b/src/index/vocabulary.rs
index e4cf590..e4a9c08 100644
--- a/src/index/vocabulary.rs
+++ b/src/index/vocabulary.rs
@@ -1,5 +1,7 @@
 use std::collections::BTreeMap;
 
+use fxhash::FxHashMap;
+
 use crate::disk::{
     bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
     terms_writer::TermsWriter,
@@ -23,7 +25,7 @@ pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
     terms_writer.flush();
 }
 
-pub fn load_vocabulary(input_path: &str) -> BTreeMap<String, u64> {
+pub fn load_vocabulary(input_path: &str) -> FxHashMap<String, u64> {
     let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
     let terms_buffer = TermsReader::new(&terms_path).read_to_string();
 
@@ -38,7 +40,7 @@
     let mut start_term_offset: usize = 0;
     let mut postings_offset = 0;
 
-    let mut res: BTreeMap<String, u64> = BTreeMap::new();
+    let mut res = FxHashMap::default();
 
     for _ in 0..num_terms {
         let term_length = lenghts_reader.read_gamma() as usize;
diff --git a/src/main.rs b/src/main.rs
index b6ecb5f..ee8bf8b 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -8,14 +8,20 @@ use search::query::QueryProcessor;
 
 use indicatif::HumanDuration;
 
-const NUM_RESULTS: usize = 10;
+const NUM_RESULTS: usize = 1_000_000;
 
-fn print_results(results: &[u32]) {
-    println!("\nSearch Results:");
-    for (i, doc_id) in results.iter().enumerate() {
-        println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
-    }
-    println!();
+fn print_results(results: &[u32], elapsed_time: Duration) {
+    // println!("\nSearch Results:");
+
+    // for (i, doc_id) in results.iter().enumerate() {
+    //     println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
+    // }
+
+    println!(
+        "\nFetched {} documents in {} ms",
+        results.len(),
+        elapsed_time.as_millis()
+    );
 }
 
 fn read_line(prompt: &str) -> String {
@@ -91,8 +97,9 @@ fn main() {
     loop {
         let query = read_line("> ");
 
+        let start_time = Instant::now();
         let results = q.query(&query, NUM_RESULTS);
-
-        print_results(&results);
+        let elapsed_time = start_time.elapsed();
+        print_results(&results, elapsed_time);
     }
 }