Skip to content

Commit

Permalink
Add support for dot_product nn search
Browse files Browse the repository at this point in the history
  • Loading branch information
ffont committed Dec 11, 2024
1 parent 534d66e commit b2d7e8f
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 11 deletions.
34 changes: 24 additions & 10 deletions utils/search/backends/solr555pysolr.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,9 +91,19 @@

# Maps a supported similarity vector size (number of dimensions) to the base name of the
# Solr field used for vectors of that size. Sizes missing from this map are treated as
# unsupported by the code that looks them up with .get(...).
SOLR_VECTOR_FIELDS_DIMENSIONS_MAP = {
100: 'sim_vector100',
512: 'sim_vector512',
}


def get_solr_dense_vector_search_field_name(dimensions, l2_norm=False):
    """Return the Solr dense vector field name for vectors of the given size.

    The base field name is looked up in SOLR_VECTOR_FIELDS_DIMENSIONS_MAP using
    ``dimensions`` as the key. When ``l2_norm`` is truthy, the ``_l2`` suffix is
    appended to that base name.

    Returns None when ``dimensions`` has no entry in the map.
    """
    field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(dimensions)
    if field_name is None:
        # Unsupported vector size: callers check for None and skip indexing/searching
        return None
    return f'{field_name}_l2' if l2_norm else field_name


SOLR_SOUND_FACET_DEFAULT_OPTIONS = {
'limit': 5,
'type': 'terms',
Expand Down Expand Up @@ -255,21 +265,29 @@ def add_similarity_vectors_to_documents(self, sound_objects, documents):
sound_objects_dict = {s.id: s for s in sound_objects}
for analyzer_name, config_options in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS.items():
# If we should index similarity data, add it to the documents
vector_solr_field_type = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None)
if vector_solr_field_type is None:
vector_field_name = get_solr_dense_vector_search_field_name(config_options['vector_size'], config_options.get('l2_norm', False))
if vector_field_name is None:
# If the vector size is not supported, then we can't index the vectors generated by the requested analyzer
continue
for sa in SoundAnalysis.objects.filter(sound_id__in=sound_ids, analyzer=analyzer_name, analysis_status="OK"):
similarity_vectors_per_analyzer_per_sound=[]
data = sa.get_analysis_data_from_file()
if data is not None:
if data.get(config_options['vector_property_name'], None) is not None:
vector_data =data[config_options['vector_property_name']][0:config_options['vector_size']]

if config_options.get('l2_norm', False):
# Normalize the vector to have unit length
norm = math.sqrt(sum([v*v for v in vector_data]))
if norm > 0:
vector_data = [v/norm for v in vector_data]

sim_vector_document_data = {
'content_type': SOLR_DOC_CONTENT_TYPES['similarity_vector'],
'analyzer': sa.analyzer,
'timestamp_start': 0, # This will be used in the future if analyzers generate multiple sound vectors
'timestamp_end': -1, # This will be used in the future if analyzers generate multiple sound vectors
vector_solr_field_type: data[config_options['vector_property_name']][0:config_options['vector_size']]
vector_field_name: vector_data
}
# Because we still want to be able to group by pack when matching sim vector documents (sound child documents),
# we add the grouping_pack field here as well. In the future we might be able to optimize this if we can tell solr
Expand Down Expand Up @@ -542,19 +560,15 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', fi
query.set_query('')
if similar_to_analyzer in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS:
# Similarity search will find documents close to a target vector. This will match "child" sound documents (of content_type "similarity vector")
config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[similar_to_analyzer]
vector = None
if isinstance(similar_to, list):
vector = similar_to # we allow vectors to be passed directly
vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(len(vector), None)
else:
# similar_to should be a sound_id
sound = Sound.objects.get(id=similar_to)
config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[similar_to_analyzer]
vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None)
vector = get_similarity_search_target_vector(sound.id, analyzer=similar_to_analyzer)
if vector is not None:
vector = vector[0:config_options['vector_size']] # Make sure the vector has the right size (just in case)

vector = get_similarity_search_target_vector(sound.id, analyzer=similar_to_analyzer)
vector_field_name = get_solr_dense_vector_search_field_name(config_options['vector_size'], config_options.get('l2_norm', False))
if vector is not None and vector_field_name is not None:
max_similar_sounds = similar_to_max_num_sounds # Max number of results for similarity search search. Filters are applied before the similarity search, so this number will usually be the total number of results (unless filters are more restrictive)
serialized_vector = ','.join([str(n) for n in vector])
Expand Down
2 changes: 1 addition & 1 deletion utils/search/search_sounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def get_sound_similarity_from_search_engine_query(query_params, analyzer_name=se

# Update query params to get similarity vectors of the first
config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[analyzer_name]
vector_field_name = utils.search.backends.solr555pysolr.SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'])
# config_options is a dict taken from settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS, so the optional 'l2_norm' flag must be read with .get() (calling the dict itself raises TypeError)
vector_field_name = utils.search.backends.solr555pysolr.get_solr_dense_vector_search_field_name(config_options['vector_size'], config_options.get('l2_norm', False))
query_params.update({
'facets': None,
'current_page': current_page if current_page is not None else query_params['current_page'],
Expand Down
7 changes: 7 additions & 0 deletions utils/search/solr9/cores/freesound/conf/schema.xml
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,12 @@
performing any ranking -->
<similarity class="solr.BooleanSimilarityFactory"/>
</fieldType>

<!-- Define different types of dense vector fields, including "dot product" versions that expect l^2 normalized vectors -->
<fieldType name="knn_vector100" class="solr.DenseVectorField" vectorDimension="100" similarityFunction="euclidean" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
<fieldType name="knn_vector100_l2" class="solr.DenseVectorField" vectorDimension="100" similarityFunction="dot_product" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
<fieldType name="knn_vector512" class="solr.DenseVectorField" vectorDimension="512" similarityFunction="euclidean" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>
<fieldType name="knn_vector512_l2" class="solr.DenseVectorField" vectorDimension="512" similarityFunction="dot_product" knnAlgorithm="hnsw" hnswMaxConnections="10" hnswBeamWidth="40"/>

<field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
<field name="_text_" type="text_general" multiValued="true" indexed="true" stored="false"/>
Expand Down Expand Up @@ -243,6 +248,8 @@
<field name="_nest_parent_" type="string" indexed="true" stored="true" />
<field name="_nest_path_" type="_nest_path_"/>
<field name="sim_vector100" type="knn_vector100" indexed="true" stored="true" required="false"/>
<field name="sim_vector100_l2" type="knn_vector100_l2" indexed="true" stored="true" required="false"/>
<field name="sim_vector512_l2" type="knn_vector512_l2" indexed="true" stored="true" required="false"/>
<field name="analyzer" type="string" indexed="true" stored="true" required="false" />
<field name="timestamp_start" type="pdouble" indexed="true" stored="true" required="false" />
<field name="timestamp_end" type="pdouble" indexed="true" stored="true" required="false" />
Expand Down

0 comments on commit b2d7e8f

Please sign in to comment.