From ceea6f6cdf23e8ce2c9858c17bb578aa01cfcd6e Mon Sep 17 00:00:00 2001 From: Sean MacAvaney Date: Tue, 17 Sep 2024 14:22:38 +0100 Subject: [PATCH] Indexer vs InvertedIndexer --- python/README.md | 16 ++------- python/python/bmp/__init__.py | 2 +- python/src/lib.rs | 68 +++++++++++++++++++++++++++++++++-- src/index/inverted_index.rs | 18 ++++++++-- 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/python/README.md b/python/README.md index 2b936ca..6318641 100644 --- a/python/README.md +++ b/python/README.md @@ -35,20 +35,10 @@ ciff2bmp(ciff_file="/path/to/ciff", output="/path/to/index", bsize=32, compress_ ```python from bmp import Indexer -import string -import random indexer = Indexer('/path/to/index', bsize=32, compress_range=False) -terms = [(c, []) for c in string.ascii_letters] -for doc in range(10_000): - dvec = [] - for idx in range(random.randrange(1, 10)): - tf = random.randrange(1, 1000) - tok = random.randrange(len(terms)) - dvec.append((tok, tf)) - terms[tok][1].append((doc, tf)) - indexer.add_document(f'doc{doc}', dvec) -for term, postings in terms: - indexer.add_term(term, postings) +indexer.add_document('doc1', {'a': 1, 'b': 5, 'c': 8}) # docid, vector +indexer.add_document('doc2', {'a': 2, 'c': 1, 'd': 8, 'f': 2}) +... indexer.finish() ``` diff --git a/python/python/bmp/__init__.py b/python/python/bmp/__init__.py index 2d24761..ae1609d 100644 --- a/python/python/bmp/__init__.py +++ b/python/python/bmp/__init__.py @@ -1 +1 @@ -from bmp._bmp import ciff2bmp, search, Searcher, Indexer +from bmp._bmp import ciff2bmp, search, Searcher, InvertedIndexer, Indexer diff --git a/python/src/lib.rs b/python/src/lib.rs index 0c5eae4..6ecbd54 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -92,7 +92,7 @@ fn search( } #[pyclass] -struct Indexer { +struct InvertedIndexer { path: PathBuf, bsize: usize, compress_range: bool, @@ -101,11 +101,11 @@ struct Indexer { } #[pymethods] -impl Indexer { +impl InvertedIndexer { #[new] fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult { - Ok(Indexer { + Ok(InvertedIndexer { path: path, bsize: bsize, compress_range: compress_range, @@ -149,6 +149,67 @@ impl Indexer { } } +#[pyclass] +struct Indexer { + path: PathBuf, + bsize: usize, + compress_range: bool, + inv_builder: bmp::index::inverted_index::IndexBuilder, + fwd_builder: bmp::index::forward_index::ForwardIndexBuilder, + term_map: HashMap, +} + +#[pymethods] +impl Indexer { + + #[new] + fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult { + Ok(Indexer { + path: path, + bsize: bsize, + compress_range: compress_range, + inv_builder: bmp::index::inverted_index::IndexBuilder::new(0, bsize), + fwd_builder: bmp::index::forward_index::ForwardIndexBuilder::new(0), + term_map: HashMap::new(), + }) + } + + fn add_document( + &mut self, + doc_id: String, + vector: HashMap, + ) -> PyResult<()> { + let doc_idx = self.inv_builder.insert_document(&doc_id); + let mut int_vector: Vec<(u32, u32)> = Vec::new(); + for (term, weight) in &vector { + if !self.term_map.contains_key(term) { + self.term_map.insert(term.clone(), self.term_map.len() as u32); + self.inv_builder.insert_term(term, Vec::new()); + } + let term_idx = self.term_map[term]; + self.inv_builder.push_posting(term_idx, doc_idx, *weight); + int_vector.push((term_idx, *weight)) + } + self.fwd_builder.insert_document(int_vector); + Ok(()) + } + + fn finish( + &mut self, + ) -> PyResult<()> { + let builder = std::mem::replace(&mut self.inv_builder, bmp::index::inverted_index::IndexBuilder::new(0, 0)); + let inverted_index = builder.build(self.compress_range); + let forward_index = self.fwd_builder.build(); + let b_forward_index = bmp::index::forward_index::fwd2bfwd(&forward_index, self.bsize); + let file = std::fs::File::create(self.path.clone()).expect("Failed to create file"); + let writer = std::io::BufWriter::new(file); + // Serialize the index directly into a file using bincode + bincode::serialize_into(writer, &(&inverted_index, &b_forward_index)) + .expect("Failed to serialize"); + Ok(()) + } +} + /// A Python module implemented in Rust. The name of this function must match /// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to /// import the module. @@ -157,6 +218,7 @@ fn _bmp(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(ciff2bmp, m)?)?; m.add_function(wrap_pyfunction!(search, m)?)?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; Ok(()) } diff --git a/src/index/inverted_index.rs b/src/index/inverted_index.rs index f79e1a7..9b07ff5 100644 --- a/src/index/inverted_index.rs +++ b/src/index/inverted_index.rs @@ -93,8 +93,14 @@ impl IndexBuilder { self.terms.push(term.to_string()); } - pub fn insert_document(&mut self, name: &str) { + pub fn push_posting(&mut self, term_id: u32, doc_id: u32, tf: u32) { + self.posting_lists[term_id as usize].push((doc_id, tf)); + } + + pub fn insert_document(&mut self, name: &str) -> u32 { + let doc_id = self.documents.len(); self.documents.push(name.to_string()); + return doc_id as u32; } fn compress(data: &[u8]) -> Vec { @@ -152,8 +158,14 @@ impl IndexBuilder { .collect(); let mut build = MapBuilder::memory(); - self.terms.iter().enumerate().for_each(|(index, term)| { - let _ = build.insert(term, index as u64); + + let mut indexed_terms: Vec<(usize, &String)> = self.terms.iter().enumerate().collect(); + + // Sort the terms lexicographically while keeping the original indices + indexed_terms.sort_by(|a, b| a.1.cmp(b.1)); + + indexed_terms.iter().for_each(|(index, term)| { + let _ = build.insert(term, *index as u64); }); Index {