From 557c9bdca2bcbcc03ba4190b27eeebb0c454a621 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 15:05:40 +0200 Subject: [PATCH] use the bias as intended --- src/distance/angular.rs | 6 ++--- src/distance/binary_quantized_angular.rs | 6 ++--- src/distance/binary_quantized_euclidean.rs | 28 +++++++++++++++++----- src/distance/binary_quantized_manhattan.rs | 26 ++++++++++++++++---- src/distance/dot_product.rs | 4 ++-- src/distance/euclidean.rs | 6 ++--- src/distance/manhattan.rs | 4 ++-- src/distance/mod.rs | 12 ++++------ src/node.rs | 10 +++++--- src/reader.rs | 4 ++-- src/writer.rs | 6 ++--- 11 files changed, 70 insertions(+), 42 deletions(-) diff --git a/src/distance/angular.rs b/src/distance/angular.rs index 2e2520f6..a41cce8b 100644 --- a/src/distance/angular.rs +++ b/src/distance/angular.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; - use bytemuck::{Pod, Zeroable}; use rand::Rng; @@ -67,7 +65,7 @@ impl Distance for Angular { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means(rng, children, true)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); @@ -75,7 +73,7 @@ impl Distance for Angular { let mut normal = Leaf { header: NodeHeaderAngular { norm: 0.0 }, vector: unaligned_vector }; Self::normalize(&mut normal); - Ok(normal.vector) + Ok(normal) } fn margin_no_header( diff --git a/src/distance/binary_quantized_angular.rs b/src/distance/binary_quantized_angular.rs index f21d29e9..a7c9776a 100644 --- a/src/distance/binary_quantized_angular.rs +++ b/src/distance/binary_quantized_angular.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; - use bytemuck::{Pod, Zeroable}; use rand::Rng; @@ -72,7 +70,7 @@ impl Distance for BinaryQuantizedAngular { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means::(rng, children, true)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); @@ -83,7 +81,7 @@ impl Distance for BinaryQuantizedAngular { }; Self::normalize(&mut normal); - Ok(normal.vector) + Ok(normal) } fn margin_no_header( diff --git a/src/distance/binary_quantized_euclidean.rs b/src/distance/binary_quantized_euclidean.rs index 2d8e5c2f..152aca44 100644 --- a/src/distance/binary_quantized_euclidean.rs +++ b/src/distance/binary_quantized_euclidean.rs @@ -1,9 +1,7 @@ -use std::borrow::Cow; - use bytemuck::{Pod, Zeroable}; use rand::Rng; -use super::{two_means_binary_quantized as two_means, Euclidean}; +use super::{two_means_binary_quantized as two_means, Euclidean, NodeHeaderEuclidean}; use crate::distance::Distance; use crate::node::Leaf; use crate::parallel::ImmutableSubsetLeafs; @@ -59,17 +57,35 @@ impl Distance for BinaryQuantizedEuclidean { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means::(rng, children, false)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); let mut normal = Leaf { - header: NodeHeaderBinaryQuantizedEuclidean { bias: 0.0 }, + header: NodeHeaderBinaryQuantizedEuclidean::zeroed(), vector: UnalignedVector::from_slice(&vector), }; Self::normalize(&mut normal); - Ok(Cow::Owned(normal.vector.into_owned())) + normal.header.bias = normal + .vector + .iter() + .zip( + UnalignedVector::::from_slice( + &node_p.vector.iter().collect::>(), + ) + .iter(), + ) + .zip( + UnalignedVector::::from_slice( + &node_q.vector.iter().collect::>(), + ) + .iter(), + ) + .map(|((n, p), q)| -n * (p + q) / 2.0) + .sum(); + + Ok(normal.into_owned()) } fn margin(p: &Leaf, q: &Leaf) -> f32 { diff --git a/src/distance/binary_quantized_manhattan.rs b/src/distance/binary_quantized_manhattan.rs index 8576918d..d3a0d45e 100644 --- a/src/distance/binary_quantized_manhattan.rs +++ b/src/distance/binary_quantized_manhattan.rs @@ -3,7 +3,7 @@ use std::borrow::Cow; use bytemuck::{Pod, Zeroable}; use rand::Rng; -use super::{two_means_binary_quantized as two_means, Manhattan}; +use super::{two_means_binary_quantized as two_means, Manhattan, NodeHeaderManhattan}; use crate::distance::Distance; use crate::node::Leaf; use crate::parallel::ImmutableSubsetLeafs; @@ -63,17 +63,35 @@ impl Distance for BinaryQuantizedManhattan { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means::(rng, children, false)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); let mut normal = Leaf { - header: NodeHeaderBinaryQuantizedManhattan { bias: 0.0 }, + header: NodeHeaderBinaryQuantizedManhattan::zeroed(), vector: UnalignedVector::from_slice(&vector), }; Self::normalize(&mut normal); - Ok(Cow::Owned(normal.vector.into_owned())) + normal.header.bias = normal + .vector + .iter() + .zip( + UnalignedVector::::from_slice( + &node_p.vector.iter().collect::>(), + ) + .iter(), + ) + .zip( + UnalignedVector::::from_slice( + &node_q.vector.iter().collect::>(), + ) + .iter(), + ) + .map(|((n, p), q)| -n * (p + q) / 2.0) + .sum(); + + Ok(normal.into_owned()) } fn margin(p: &Leaf, q: &Leaf) -> f32 { diff --git a/src/distance/dot_product.rs b/src/distance/dot_product.rs index 6a948c98..49263183 100644 --- a/src/distance/dot_product.rs +++ b/src/distance/dot_product.rs @@ -90,7 +90,7 @@ impl Distance for DotProduct { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means(rng, children, true)?; let vector: Vec = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); @@ -101,7 +101,7 @@ impl Distance for DotProduct { normal.header.extra_dim = node_p.header.extra_dim - node_q.header.extra_dim; Self::normalize(&mut normal); - Ok(normal.vector) + Ok(normal) } fn margin(p: &Leaf, q: &Leaf) -> f32 { diff --git a/src/distance/euclidean.rs b/src/distance/euclidean.rs index ccf8ee4f..8407f3bd 100644 --- a/src/distance/euclidean.rs +++ b/src/distance/euclidean.rs @@ -1,5 +1,3 @@ -use std::borrow::Cow; - use bytemuck::{Pod, Zeroable}; use rand::Rng; @@ -50,7 +48,7 @@ impl Distance for Euclidean { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means(rng, children, false)?; let vector: Vec<_> = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); @@ -68,7 +66,7 @@ impl Distance for Euclidean { .map(|((n, p), q)| -n * (p + q) / 2.0) .sum(); - Ok(normal.vector) + Ok(normal) } fn margin(p: &Leaf, q: &Leaf) -> f32 { diff --git a/src/distance/manhattan.rs b/src/distance/manhattan.rs index ae4ba4d5..c10ef9c2 100644 --- a/src/distance/manhattan.rs +++ b/src/distance/manhattan.rs @@ -53,7 +53,7 @@ impl Distance for Manhattan { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>> { + ) -> heed::Result> { let [node_p, node_q] = two_means(rng, children, false)?; let vector: Vec<_> = node_p.vector.iter().zip(node_q.vector.iter()).map(|(p, q)| p - q).collect(); @@ -71,7 +71,7 @@ impl Distance for Manhattan { .map(|((n, p), q)| -n * (p + q) / 2.0) .sum(); - Ok(normal.vector) + Ok(normal) } fn margin(p: &Leaf, q: &Leaf) -> f32 { diff --git a/src/distance/mod.rs b/src/distance/mod.rs index 36d55b97..91d9e348 100644 --- a/src/distance/mod.rs +++ b/src/distance/mod.rs @@ -30,7 +30,7 @@ mod dot_product; mod euclidean; mod manhattan; -fn new_leaf(vec: Vec) -> Leaf<'static, D> { +pub fn new_leaf(vec: Vec) -> Leaf<'static, D> { let vector = UnalignedVector::from_vec(vec); Leaf { header: D::new_header(&vector), vector } } @@ -97,7 +97,7 @@ pub trait Distance: Send + Sync + Sized + Clone + fmt::Debug + 'static { fn create_split<'a, R: Rng>( children: &'a ImmutableSubsetLeafs, rng: &mut R, - ) -> heed::Result>>; + ) -> heed::Result>; fn margin(p: &Leaf, q: &Leaf) -> f32 { Self::margin_no_header(&p.vector, &q.vector) @@ -108,12 +108,8 @@ pub trait Distance: Send + Sync + Sized + Clone + fmt::Debug + 'static { q: &UnalignedVector, ) -> f32; - fn side( - normal_plane: &UnalignedVector, - node: &Leaf, - rng: &mut R, - ) -> Side { - let dot = Self::margin_no_header(&node.vector, normal_plane); + fn side(normal_plane: &Leaf, node: &Leaf, rng: &mut R) -> Side { + let dot = Self::margin(normal_plane, node); if dot > 0.0 { Side::Right } else if dot < 0.0 { diff --git a/src/node.rs b/src/node.rs index 9ae866a4..eac8c84a 100644 --- a/src/node.rs +++ b/src/node.rs @@ -115,7 +115,7 @@ impl fmt::Debug for ItemIds<'_> { pub struct SplitPlaneNormal<'a, D: Distance> { pub left: NodeId, pub right: NodeId, - pub normal: Cow<'a, UnalignedVector>, + pub normal: Leaf<'a, D>, } impl fmt::Debug for SplitPlaneNormal<'_, D> { @@ -153,7 +153,8 @@ impl<'a, D: Distance> BytesEncode<'a> for NodeCodec { bytes.push(SPLIT_PLANE_NORMAL_TAG); bytes.extend_from_slice(&left.to_bytes()); bytes.extend_from_slice(&right.to_bytes()); - bytes.extend_from_slice(normal.as_bytes()); + bytes.extend_from_slice(bytes_of(&normal.header)); + bytes.extend_from_slice(normal.vector.as_bytes()); } Node::Descendants(Descendants { descendants }) => { bytes.push(DESCENDANTS_TAG); @@ -179,8 +180,11 @@ impl<'a, D: Distance> BytesDecode<'a> for NodeCodec { [SPLIT_PLANE_NORMAL_TAG, bytes @ ..] => { let (left, bytes) = NodeId::from_bytes(bytes); let (right, bytes) = NodeId::from_bytes(bytes); + let (header_bytes, remaining) = bytes.split_at(size_of::()); + let header = pod_read_unaligned(header_bytes); + let vector = UnalignedVector::::from_bytes(remaining)?; Ok(Node::SplitPlaneNormal(SplitPlaneNormal { - normal: UnalignedVector::::from_bytes(bytes)?, + normal: Leaf { header, vector }, left, right, })) diff --git a/src/reader.rs b/src/reader.rs index 658146d7..b0834077 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -102,7 +102,7 @@ impl<'t, D: Distance> Reader<'t, D> { Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => { let left = recursive_depth(rtxn, database, index, left)?; let right = recursive_depth(rtxn, database, index, right)?; - let is_zero_normal = normal.is_zero() as usize; + let is_zero_normal = normal.vector.is_zero() as usize; Ok(TreeStats { depth: 1 + left.depth.max(right.depth), @@ -258,7 +258,7 @@ impl<'t, D: Distance> Reader<'t, D> { } } Node::SplitPlaneNormal(SplitPlaneNormal { normal, left, right }) => { - let margin = D::margin_no_header(&normal, &query_leaf.vector); + let margin = D::margin(&normal, &query_leaf); queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Left)), left)); queue.push((OrderedFloat(D::pq_distance(dist, margin, Side::Right)), right)); } diff --git a/src/writer.rs b/src/writer.rs index 1bcedeaa..56a6ff36 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -10,7 +10,7 @@ use rayon::iter::repeatn; use rayon::prelude::*; use roaring::RoaringBitmap; -use crate::distance::Distance; +use crate::distance::{new_leaf, Distance}; use crate::internals::{KeyCodec, Side}; use crate::item_iter::ItemIter; use crate::node::{Descendants, ItemIds, Leaf, SplitPlaneNormal}; @@ -563,7 +563,7 @@ impl Writer { let mut left_ids = RoaringBitmap::new(); let mut right_ids = RoaringBitmap::new(); - if normal.is_zero() { + if normal.vector.is_zero() { randomly_split_children(rng, to_insert, &mut left_ids, &mut right_ids); } else { for leaf in to_insert { @@ -730,7 +730,7 @@ impl Writer { let mut children_left = RoaringBitmap::new(); let mut children_right = RoaringBitmap::new(); randomly_split_children(rng, item_indices, &mut children_left, &mut children_right); - UnalignedVector::reset(&mut normal); + UnalignedVector::reset(&mut normal.vector); (children_left, children_right) } else {