Skip to content

Commit

Permalink
DiskBMP
Browse files Browse the repository at this point in the history
  • Loading branch information
amallia committed Jul 29, 2024
1 parent 3d697bf commit 9426240
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 30 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ serde = { version = "1.0.193", features = ["derive"] }
fst = "0.4.7"
num-integer = "0.1.45"
rayon = "1.8.0"
memmap2 = "0.4"

[build-dependencies]
protobuf-codegen-pure = "2.22"
43 changes: 41 additions & 2 deletions bin/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@ use anyhow::Result;
use bmp::query::cursors_from_queries;
use bmp::search::b_search;
use bmp::util::to_trec;
use memmap2::Mmap;
use std::fs::{File, OpenOptions};
use std::path::PathBuf;
use structopt::StructOpt;

use std::io::BufReader;

// Function to perform the search for each query and return the results

#[derive(Debug, StructOpt)]
Expand Down Expand Up @@ -33,14 +37,49 @@ fn main() -> Result<()> {

// 1. Load the index
eprintln!("Loading the index");
let (index, bfwd) = bmp::index::from_file(args.index)?;
let mut inv_filename = PathBuf::new();
inv_filename.push(&args.index);
inv_filename.set_extension("inv");
let file = File::open(inv_filename)?;
let reader = BufReader::new(file);
let index = bincode::deserialize_from(reader)?;
// let (index, bfwd) = bmp::index::from_file(inv_filename)?;
let mut fwd_filename = PathBuf::new();
fwd_filename.push(&args.index);
fwd_filename.set_extension("fwd");
let file = OpenOptions::new().read(true).open(fwd_filename)?;
let mmap = unsafe { Mmap::map(&file)? };

let block_size_offset = mmap.len() - std::mem::size_of::<usize>();
eprintln!("mmap.len() {}", mmap.len());
let block_size: usize = bincode::deserialize(&mmap[block_size_offset..]).unwrap();
// Read the length of the offsets vector from the end of the file
let offsets_len_offset = block_size_offset - std::mem::size_of::<u64>();
let offsets_len: u64 =
bincode::deserialize(&mmap[offsets_len_offset..block_size_offset]).unwrap();
eprintln!("offsets_len {}", offsets_len);

// Read the offsets vector from the file
let offsets_offset = offsets_len_offset - offsets_len as usize * std::mem::size_of::<u64>() - 8;
eprintln!("offsets_len_offset {}", offsets_len_offset);
eprintln!("offsets_offset {}", offsets_offset);

let offsets: Vec<u64> =
bincode::deserialize(&mmap[offsets_offset..offsets_len_offset]).unwrap();
eprintln!("offsets.len() {}", offsets.len());

// 2. Load the queries
eprintln!("Loading the queries");
let (q_ids, cursors) = cursors_from_queries(args.queries, &index);

// let object_index = 1; // For example, access the second object
// let deserialized_object = get_object_at_index(&mmap, &offsets, object_index);
// eprintln!("{:?}", deserialized_object); // Output: [(2, [(50, 60), (70, 80)])]

eprintln!("Performing query processing");
let results = b_search(cursors, &bfwd, args.k, args.alpha, args.beta);
let results = b_search(
cursors, &mmap, &offsets, block_size, args.k, args.alpha, args.beta,
);

eprintln!("Exporting TREC run");
// 4. Log results into TREC format
Expand Down
33 changes: 25 additions & 8 deletions src/ciff/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use protobuf::CodedInputStream;
use std::fmt;
use std::fs::File;
use std::io::BufWriter;
use std::io::{Seek, SeekFrom};
use std::io::{Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};

use crate::index::forward_index::ForwardIndexBuilder;
Expand Down Expand Up @@ -209,10 +209,6 @@ fn convert_to_bmp(input: &Path, output: &Path, bsize: usize, compress_range: boo
fwd_builder.insert_posting_list(term_id as u32, &posting_list);
progress.inc(1);
}
// for (term_id, posting_list) in inverted_index.posting_lists().iter().enumerate() {
// fwd_builder.insert_posting_list(term_id as u32, posting_list);
// progress.inc(1);
// }
progress.finish();
eprintln!("Converting to blocked forward index");

Expand All @@ -231,11 +227,32 @@ fn convert_to_bmp(input: &Path, output: &Path, bsize: usize, compress_range: boo
"avg docs per term: {}",
tot_avg_docs / b_forward_index.data.len() as f32
);
let file = File::create(output).expect("Failed to create file");
let mut inv_filename = PathBuf::new();
inv_filename.push(output);
inv_filename.set_extension("inv");
let file = File::create(inv_filename).expect("Failed to create file");
let writer = BufWriter::new(file);
// Serialize the index directly into a file using bincode
bincode::serialize_into(writer, &(&inverted_index, &b_forward_index))
.expect("Failed to serialize");
bincode::serialize_into(writer, &inverted_index).expect("Failed to serialize");
let mut fwd_filename = PathBuf::new();
fwd_filename.push(output);
fwd_filename.set_extension("fwd");
let mut file = File::create(fwd_filename).expect("Failed to create file");
let mut offsets = Vec::new();
for object in &b_forward_index.data {
let serialized_data = bincode::serialize(object).unwrap();
offsets.push(file.seek(SeekFrom::Current(0))?);
file.write_all(&serialized_data)?;
}
let offsets_data = bincode::serialize(&offsets).unwrap();
file.write_all(&offsets_data)?;
let offsets_len = offsets.len() as u64;

let offsets_len_data = bincode::serialize(&offsets_len).unwrap();
file.write_all(&offsets_len_data)?;

let block_size_data = bincode::serialize(&b_forward_index.block_size).unwrap();
file.write_all(&block_size_data)?;

Ok(())
}
53 changes: 33 additions & 20 deletions src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,34 @@ use crate::query::cursor::{RangeMaxScore, RangeMaxScoreCursor};
use crate::query::live_block;
use crate::query::topk_heap::TopKHeap;
use crate::util::progress_bar;
use memmap2::Mmap;
use std::arch::x86_64::_mm_prefetch;
use std::time::Instant;

// Function to get an object at a specific index
fn get_object_at_index(mmap: &Mmap, offsets: &[u64], index: usize) -> Vec<(u16, Vec<(u8, u8)>)> {
let offset = offsets[index];
let slice = &mmap[offset as usize..];
bincode::deserialize(slice).unwrap()
}

/// Run block-max search over all `queries`, returning one top-k heap per query.
///
/// This is a thin convenience wrapper that forwards every argument to
/// [`b_search_verbose`] with verbosity fixed to `true`.
///
/// * `queries` — one vector of posting-list cursors per query.
/// * `mmap` — memory-mapped `.fwd` file containing the serialized block
///   entries (read back via `get_object_at_index`).
/// * `offsets` — byte offset of each block entry within `mmap`.
/// * `block_size` — block size recorded in the `.fwd` footer; presumably
///   the number of documents per block — confirm against the index writer.
/// * `k` — number of top results to retain per query.
/// * `alpha`, `terms_r` — tuning parameters forwarded unchanged; their
///   semantics are defined by `b_search_verbose`.
pub fn b_search(
queries: Vec<Vec<PostingListIterator>>,
mmap: &Mmap,
offsets: &[u64],
block_size: usize,
k: usize,
alpha: f32,
terms_r: f32,
) -> Vec<TopKHeap<u16>> {
b_search_verbose(queries, mmap, offsets, block_size, k, alpha, terms_r, true)
}

pub fn b_search_verbose(
mut queries: Vec<Vec<PostingListIterator>>,
forward_index: &BlockForwardIndex,
mmap: &Mmap,
offsets: &[u64],
block_size: usize,
k: usize,
alpha: f32,
terms_r: f32,
Expand Down Expand Up @@ -74,12 +86,12 @@ pub fn b_search_verbose(
true => live_block::compute_upper_bounds(
&query_ranges_compressed,
&query_weights,
forward_index.data.len(),
offsets.len(),
),
false => live_block::compute_upper_bounds_raw(
&query_ranges_raw,
&query_weights,
forward_index.data.len(),
offsets.len(),
),
};

Expand All @@ -101,26 +113,27 @@ pub fn b_search_verbose(
});

let (mut current_ub, mut current_block) = ub_iter.next().unwrap();
unsafe {
_mm_prefetch(
forward_index.data.as_ptr().add(*current_block as usize) as *const i8,
std::arch::x86_64::_MM_HINT_T0,
);
}
// unsafe {
// _mm_prefetch(
// forward_index.data.as_ptr().add(*current_block as usize) as *const i8,
// std::arch::x86_64::_MM_HINT_T0,
// );
// }
for (next_ub, next_block) in ub_iter {
unsafe {
_mm_prefetch(
forward_index.data.as_ptr().add(*next_block as usize) as *const i8,
std::arch::x86_64::_MM_HINT_T0,
);
}
// unsafe {
// _mm_prefetch(
// forward_index.data.as_ptr().add(*next_block as usize) as *const i8,
// std::arch::x86_64::_MM_HINT_T0,
// );
// }

let offset = *current_block as usize * forward_index.block_size;
let offset = *current_block as usize * block_size;

let res = block_score(
&query_vec,
&forward_index.data[*current_block as usize],
forward_index.block_size,
&get_object_at_index(&mmap, &offsets, *current_block as usize),
// &forward_index.data[*current_block as usize],
block_size,
);

for (doc_id, &score) in res.iter().enumerate() {
Expand Down

0 comments on commit 9426240

Please sign in to comment.