Faster build and fxhashmap as vocab.
tomfran committed Jan 13, 2024
1 parent 8945d50 commit 770824d
Showing 6 changed files with 61 additions and 29 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -12,4 +12,5 @@ rand = "0.8"
 tokenizers = { version = "0.15.0", features = ["http"] }
 rust-stemmers = "1.2.0"
 rayon = "1.8.0"
-indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
+indicatif = {version = "0.17.0", features = ["rayon", "improved_unicode"]}
+fxhash = "0.2.1"
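The new fxhash dependency provides FxHashMap, a drop-in replacement for std::collections::HashMap that swaps the default SipHash hasher for the much faster, non-cryptographic Fx hash; it gives up DoS resistance, which is fine for a local index vocabulary with short string keys. A minimal sketch of its use, relying only on the crate's documented API:

```rust
use fxhash::FxHashMap;

fn main() {
    // FxHashMap is std's HashMap with the Fx hasher plugged in, so the whole
    // HashMap API is available. Because of the custom hasher type parameter
    // it is constructed with `default()` rather than `new()`.
    let mut vocab: FxHashMap<String, u64> = FxHashMap::default();
    vocab.insert("inverted".to_string(), 0);
    vocab.insert("index".to_string(), 42);

    assert_eq!(vocab.get("index"), Some(&42));
}
```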
46 changes: 32 additions & 14 deletions README.md
@@ -3,24 +3,42 @@
 Search engine written in Rust, based on an inverted index on disk.
 
 ### Implementation status
-- [x] IO classes for writing and reading bit-streams;
-- [x] Text preprocessing:
-  - [x] Tokenization;
-  - [x] Stemming.
-- [ ] Index construction:
-  - [x] In-memory datasets index construction;
-  - [ ] Disk-based partial index construction and merging;
-  - [ ] Additional indexes to support things such as spelling correction.
-- [ ] Index queries:
-  - [ ] Boolean queries;
-  - [x] Tf-idf ranked retrieval;
-  - [x] Window computation.
+
+**IO**
+- [x] Classes for writing and reading bit-streams;
+- [ ] Proper strings writer and reader.
+
+**Text preprocessing**
+- [x] Tokenization;
+- [x] Stemming;
+- [ ] Parametrization at build time.
+
+**Index construction**
+- [x] In-memory datasets index construction;
+- [ ] Proper vocabulary and paths on disk;
+- [ ] Spelling correction index;
+- [ ] Disk-based partial index construction and merging.
+
+**Queries**
+- [x] Tf-idf ranked retrieval;
+- [x] Window computation;
+- [ ] Boolean queries;
+- [ ] Parallel scoring.
+
+**Evaluation**
+- [ ] Query speed;
+- [ ] Query quality;
+- [ ] Disk overhead.
 
 ### Crates in use
 - [stemmer-rs](/~https://github.com/lise-henry/stemmer-rs)
 - [tokenizers](/~https://github.com/huggingface/tokenizers)
 - [indicatif](/~https://github.com/console-rs/indicatif)
+- [fxhash](/~https://github.com/cbreeden/fxhash)
 
 ### References
-[Introduction to Information Retrieval - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze](https://nlp.stanford.edu/IR-book/information-retrieval-book.html)
+[Introduction to Information Retrieval](https://nlp.stanford.edu/IR-book/information-retrieval-book.html) - Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze
 
 ---
 
 *Feel free to get in touch to discuss the project!*
7 changes: 5 additions & 2 deletions src/index/builder.rs
@@ -35,7 +35,7 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
         .collect();
 
     let doc_id_mutex = Mutex::new(0);
-    let term_index_map = Mutex::new(BTreeMap::new());
+    let term_index_map = Mutex::new(HashMap::new());
 
     let postings: Mutex<Vec<PostingList>> = Mutex::new(Vec::new());
     let term_doc_map: Mutex<Vec<HashMap<u32, usize>>> = Mutex::new(Vec::new());
@@ -89,8 +89,11 @@ fn build_in_memory(input_dir: &str, tokenizer: &Tokenizer, stemmer: &Stemmer) ->
         *doc_id += 1;
     });
 
+    let sorted_term_index_map: BTreeMap<String, usize> =
+        term_index_map.into_inner().unwrap().into_iter().collect();
+
     InMemoryIndex {
-        term_index_map: term_index_map.into_inner().unwrap(),
+        term_index_map: sorted_term_index_map,
         postings: postings.into_inner().unwrap(),
        document_lengths: document_lengths.into_inner().unwrap(),
     }
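The builder change follows a "hash while building, sort once at the end" pattern: inserts during the parallel pass go into a plain HashMap, which is cheaper per insert under the mutex, and a single collect into a BTreeMap recovers the lexicographic order the vocabulary writer needs. A simplified sketch of the idea, with illustrative names rather than the repository's actual API:

```rust
use rayon::prelude::*;
use std::collections::{BTreeMap, HashMap};
use std::sync::Mutex;

// Illustrative sketch of the commit's "hash now, sort once" pattern.
fn assign_term_ids(docs: &[Vec<String>]) -> BTreeMap<String, usize> {
    let term_index_map: Mutex<HashMap<String, usize>> = Mutex::new(HashMap::new());

    docs.par_iter().for_each(|doc| {
        let mut map = term_index_map.lock().unwrap();
        for term in doc {
            // O(1) average insert while building, instead of paying
            // BTreeMap's O(log n) tree walk on every term occurrence.
            let next_id = map.len();
            map.entry(term.clone()).or_insert(next_id);
        }
    });

    // A single O(n log n) collect restores the lexicographic order that
    // the on-disk vocabulary writer relies on.
    term_index_map.into_inner().unwrap().into_iter().collect()
}
```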
3 changes: 2 additions & 1 deletion src/index/mod.rs
@@ -4,6 +4,7 @@ mod postings;
 mod text;
 mod vocabulary;
 
+use fxhash::FxHashMap;
 use rust_stemmers::Stemmer;
 use std::collections::BTreeMap;
 use std::fmt::Display;
@@ -21,7 +22,7 @@ pub const VOCABULARY_LENGHTS_EXTENSION: &str = ".term_lengths";
 
 pub struct Index {
     postings: BitsReader,
-    term_offset_map: BTreeMap<String, u64>,
+    term_offset_map: FxHashMap<String, u64>,
     doc_lenghts: Vec<u32>,
     tokenizer: Tokenizer,
     stemmer: Stemmer,
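Switching the Index field from BTreeMap to FxHashMap works because the query path only probes the vocabulary with exact, already tokenized and stemmed terms, so ordered iteration is unused; the trade-off is that range or prefix scans, which a future spelling-correction index might want, would need a separate structure. A sketch of the only lookup this access pattern needs, with an illustrative function name:

```rust
use fxhash::FxHashMap;

// Exact-match probe from a term to its postings offset on disk.
fn postings_offset(term_offset_map: &FxHashMap<String, u64>, term: &str) -> Option<u64> {
    term_offset_map.get(term).copied()
}
```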
6 changes: 4 additions & 2 deletions src/index/vocabulary.rs
@@ -1,5 +1,7 @@
 use std::collections::BTreeMap;
 
+use fxhash::FxHashMap;
+
 use crate::disk::{
     bits_reader::BitsReader, bits_writer::BitsWriter, terms_reader::TermsReader,
     terms_writer::TermsWriter,
@@ -23,7 +25,7 @@ pub fn write_vocabulary(vocab: &BTreeMap<String, usize>, output_path: &str) {
     terms_writer.flush();
 }
 
-pub fn load_vocabulary(input_path: &str) -> BTreeMap<String, u64> {
+pub fn load_vocabulary(input_path: &str) -> FxHashMap<String, u64> {
     let terms_path: String = input_path.to_string() + VOCABULARY_ALPHA_EXTENSION;
     let terms_buffer = TermsReader::new(&terms_path).read_to_string();
 
@@ -38,7 +40,7 @@ pub fn load_vocabulary(input_path: &str) -> BTreeMap<String, u64> {
     let mut start_term_offset: usize = 0;
     let mut postings_offset = 0;
 
-    let mut res: BTreeMap<String, u64> = BTreeMap::new();
+    let mut res = FxHashMap::default();
 
     for _ in 0..num_terms {
         let term_length = lenghts_reader.read_gamma() as usize;
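In load_vocabulary, the terms are stored back to back in a single buffer, with each term's length gamma-coded in a side stream; the loop slices successive terms out of the buffer and records each term's postings offset. A hedged sketch of that loop, using plain slices in place of the repository's BitsReader and an illustrative signature:

```rust
use fxhash::FxHashMap;

// Sketch: `term_lengths` and `postings_lengths` stand in for values the real
// code reads via gamma decoding from the lengths stream.
fn load(terms_buffer: &str, term_lengths: &[usize], postings_lengths: &[u64]) -> FxHashMap<String, u64> {
    let mut res: FxHashMap<String, u64> = FxHashMap::default();
    let mut start_term_offset: usize = 0;
    let mut postings_offset: u64 = 0;

    for (term_length, postings_length) in term_lengths.iter().zip(postings_lengths) {
        // Slice the next term out of the concatenated buffer.
        let term = terms_buffer[start_term_offset..start_term_offset + term_length].to_string();
        res.insert(term, postings_offset);

        start_term_offset += term_length;
        postings_offset += postings_length;
    }

    res
}
```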
25 changes: 16 additions & 9 deletions src/main.rs
@@ -8,14 +8,20 @@ use search::query::QueryProcessor;
 
 use indicatif::HumanDuration;
 
-const NUM_RESULTS: usize = 10;
+const NUM_RESULTS: usize = 1000000;
 
-fn print_results(results: &[u32]) {
-    println!("\nSearch Results:");
-    for (i, doc_id) in results.iter().enumerate() {
-        println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
-    }
-    println!();
+fn print_results(results: &[u32], elapsed_time: Duration) {
+    // println!("\nSearch Results:");
+
+    // for (i, doc_id) in results.iter().enumerate() {
+    //     println!("\t- {:3}. Doc ID: {}", i + 1, doc_id);
+    // }
+
+    println!(
+        "\nFetched {} documents in {} ms",
+        results.len(),
+        elapsed_time.as_millis()
+    );
 }
 
 fn read_line(prompt: &str) -> String {
@@ -91,8 +97,9 @@
     loop {
         let query = read_line("> ");
 
+        let start_time = Instant::now();
         let results = q.query(&query, NUM_RESULTS);
-
-        print_results(&results);
+        let elapsed_time = start_time.elapsed();
+        print_results(&results, elapsed_time);
     }
 }
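The main-loop change brackets the query call with Instant::now() and elapsed(); together with the raised NUM_RESULTS (1,000,000) and the commented-out per-document printing, the loop now measures retrieval throughput rather than terminal I/O. A self-contained sketch of the same timing pattern, with a stand-in workload in place of q.query:

```rust
use std::time::Instant;

fn main() {
    let start_time = Instant::now();
    // Stand-in workload; in the repository this is `q.query(&query, NUM_RESULTS)`.
    let results: Vec<u32> = (0..1_000_000u32).collect();
    let elapsed_time = start_time.elapsed();

    println!(
        "\nFetched {} documents in {} ms",
        results.len(),
        elapsed_time.as_millis()
    );
}
```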
