Skip to content

Commit

Permalink
Indexer vs InvertedIndexer
Browse files Browse the repository at this point in the history
  • Loading branch information
seanmacavaney committed Sep 17, 2024
1 parent f4245b7 commit ceea6f6
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 20 deletions.
16 changes: 3 additions & 13 deletions python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,20 +35,10 @@ ciff2bmp(ciff_file="/path/to/ciff", output="/path/to/index", bsize=32, compress_

```python
from bmp import Indexer
import string
import random
indexer = Indexer('/path/to/index', bsize=32, compress_range=False)
terms = [(c, []) for c in string.ascii_letters]
for doc in range(10_000):
dvec = []
for idx in range(random.randrange(1, 10)):
tf = random.randrange(1, 1000)
tok = random.randrange(len(terms))
dvec.append((tok, tf))
terms[tok][1].append((doc, tf))
indexer.add_document(f'doc{doc}', dvec)
for term, postings in terms:
indexer.add_term(term, postings)
indexer.add_document('doc1', {'a': 1, 'b': 5, 'c': 8}) # docid, vector
indexer.add_document('doc2', {'a': 2, 'c': 1, 'd': 8, 'f': 2})
...
indexer.finish()
```

Expand Down
2 changes: 1 addition & 1 deletion python/python/bmp/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from bmp._bmp import ciff2bmp, search, Searcher, Indexer
from bmp._bmp import ciff2bmp, search, Searcher, InvertedIndexer, Indexer
68 changes: 65 additions & 3 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ fn search(
}

#[pyclass]
struct Indexer {
struct InvertedIndexer {
path: PathBuf,
bsize: usize,
compress_range: bool,
Expand All @@ -101,11 +101,11 @@ struct Indexer {
}

#[pymethods]
impl Indexer {
impl InvertedIndexer {

#[new]
fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult<Self> {
Ok(Indexer {
Ok(InvertedIndexer {
path: path,
bsize: bsize,
compress_range: compress_range,
Expand Down Expand Up @@ -149,6 +149,67 @@ impl Indexer {
}
}

/// Python-facing indexer that builds an inverted index and a forward index
/// side by side from documents given as `{term: weight}` maps.
///
/// Term ids are assigned internally the first time a term string is seen, so
/// callers never have to manage integer term ids themselves.
#[pyclass]
struct Indexer {
    /// Destination file that `finish` creates and serializes the index into.
    path: PathBuf,
    /// Block size; passed to the inverted-index builder at construction and to
    /// `fwd2bfwd` when the forward index is converted in `finish`.
    bsize: usize,
    /// Passed unchanged to `IndexBuilder::build` in `finish`.
    compress_range: bool,
    /// Accumulates document names, terms and posting lists.
    inv_builder: bmp::index::inverted_index::IndexBuilder,
    /// Accumulates one `(term_id, weight)` vector per document.
    fwd_builder: bmp::index::forward_index::ForwardIndexBuilder,
    /// Maps each term string to the integer id assigned on first sight
    /// (ids are handed out sequentially, starting at 0).
    term_map: HashMap<String, u32>,
}

#[pymethods]
impl Indexer {

#[new]
fn py_new(path: PathBuf, bsize: usize, compress_range: bool) -> PyResult<Self> {
Ok(Indexer {
path: path,
bsize: bsize,
compress_range: compress_range,
inv_builder: bmp::index::inverted_index::IndexBuilder::new(0, bsize),
fwd_builder: bmp::index::forward_index::ForwardIndexBuilder::new(0),
term_map: HashMap::new(),
})
}

fn add_document(
&mut self,
doc_id: String,
vector: HashMap<String, u32>,
) -> PyResult<()> {
let doc_idx = self.inv_builder.insert_document(&doc_id);
let mut int_vector: Vec<(u32, u32)> = Vec::new();
for (term, weight) in &vector {
if !self.term_map.contains_key(term) {
self.term_map.insert(term.clone(), self.term_map.len() as u32);
self.inv_builder.insert_term(term, Vec::new());
}
let term_idx = self.term_map[term];
self.inv_builder.push_posting(term_idx, doc_idx, *weight);
int_vector.push((term_idx, *weight))
}
self.fwd_builder.insert_document(int_vector);
Ok(())
}

fn finish(
&mut self,
) -> PyResult<()> {
let builder = std::mem::replace(&mut self.inv_builder, bmp::index::inverted_index::IndexBuilder::new(0, 0));
let inverted_index = builder.build(self.compress_range);
let forward_index = self.fwd_builder.build();
let b_forward_index = bmp::index::forward_index::fwd2bfwd(&forward_index, self.bsize);
let file = std::fs::File::create(self.path.clone()).expect("Failed to create file");
let writer = std::io::BufWriter::new(file);
// Serialize the index directly into a file using bincode
bincode::serialize_into(writer, &(&inverted_index, &b_forward_index))
.expect("Failed to serialize");
Ok(())
}
}

/// A Python module implemented in Rust. The name of this function must match
/// the `lib.name` setting in the `Cargo.toml`, else Python will not be able to
/// import the module.
Expand All @@ -157,6 +218,7 @@ fn _bmp(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_function(wrap_pyfunction!(ciff2bmp, m)?)?;
m.add_function(wrap_pyfunction!(search, m)?)?;
m.add_class::<Searcher>()?;
m.add_class::<InvertedIndexer>()?;
m.add_class::<Indexer>()?;
Ok(())
}
18 changes: 15 additions & 3 deletions src/index/inverted_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,14 @@ impl IndexBuilder {
self.terms.push(term.to_string());
}

pub fn insert_document(&mut self, name: &str) {
/// Appends the posting `(doc_id, tf)` to the posting list of term `term_id`.
///
/// # Panics
///
/// Panics if `term_id` is out of bounds for `posting_lists`.
pub fn push_posting(&mut self, term_id: u32, doc_id: u32, tf: u32) {
    let postings = &mut self.posting_lists[term_id as usize];
    postings.push((doc_id, tf));
}

/// Registers a document name and returns the id assigned to it.
///
/// Ids are the document's position in insertion order, starting at 0.
///
/// # Panics
///
/// Panics if the number of documents already stored does not fit in a `u32`;
/// a plain `as u32` cast would silently wrap and hand out an id that aliases
/// an earlier document.
pub fn insert_document(&mut self, name: &str) -> u32 {
    // Compute the id before pushing so a failed conversion leaves `documents` untouched.
    let doc_id = <u32 as std::convert::TryFrom<usize>>::try_from(self.documents.len())
        .expect("document count exceeds u32::MAX; document id cannot be represented");
    self.documents.push(name.to_string());
    doc_id
}

fn compress(data: &[u8]) -> Vec<crate::index::posting_list::CompressedBlock> {
Expand Down Expand Up @@ -152,8 +158,14 @@ impl IndexBuilder {
.collect();

let mut build = MapBuilder::memory();
self.terms.iter().enumerate().for_each(|(index, term)| {
let _ = build.insert(term, index as u64);

let mut indexed_terms: Vec<(usize, &String)> = self.terms.iter().enumerate().collect();

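// Sort the terms lexicographically while keeping their original indices, so that
// keys reach the map builder in sorted order (presumably required by `MapBuilder::insert`).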
indexed_terms.sort_by(|a, b| a.1.cmp(b.1));

indexed_terms.iter().for_each(|(index, term)| {
let _ = build.insert(term, *index as u64);
});

Index {
Expand Down

0 comments on commit ceea6f6

Please sign in to comment.