diff --git a/utils/search/backends/solr555pysolr.py b/utils/search/backends/solr555pysolr.py index 54526bee6..2a7154365 100644 --- a/utils/search/backends/solr555pysolr.py +++ b/utils/search/backends/solr555pysolr.py @@ -91,9 +91,19 @@ SOLR_VECTOR_FIELDS_DIMENSIONS_MAP = { 100: 'sim_vector100', + 512: 'sim_vector512', } +def get_solr_dense_vector_search_field_name(dimensions, l2_norm=False): + base_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(dimensions, None) + if base_field_name is None: + return None + if l2_norm: + return f'{base_field_name}_l2' + return base_field_name + + SOLR_SOUND_FACET_DEFAULT_OPTIONS = { 'limit': 5, 'type': 'terms', @@ -255,8 +265,8 @@ def add_similarity_vectors_to_documents(self, sound_objects, documents): sound_objects_dict = {s.id: s for s in sound_objects} for analyzer_name, config_options in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS.items(): # If we should index similarity data, add it to the documents - vector_solr_field_type = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None) - if vector_solr_field_type is None: + vector_field_name = get_solr_dense_vector_search_field_name(config_options['vector_size'], config_options.get('l2_norm', False)) + if vector_field_name is None: # If the vector size is not supported, then we can't index the vectors generated by the requested analyzer continue for sa in SoundAnalysis.objects.filter(sound_id__in=sound_ids, analyzer=analyzer_name, analysis_status="OK"): @@ -264,12 +274,20 @@ def add_similarity_vectors_to_documents(self, sound_objects, documents): data = sa.get_analysis_data_from_file() if data is not None: if data.get(config_options['vector_property_name'], None) is not None: + vector_data = data[config_options['vector_property_name']][0:config_options['vector_size']] + + if config_options.get('l2_norm', False): + # Normalize the vector to have unit length + norm = math.sqrt(sum([v*v for v in vector_data])) + if norm > 0: + vector_data = [v/norm for v in 
vector_data] + sim_vector_document_data = { 'content_type': SOLR_DOC_CONTENT_TYPES['similarity_vector'], 'analyzer': sa.analyzer, 'timestamp_start': 0, # This will be used in the future if analyzers generate multiple sound vectors 'timestamp_end': -1, # This will be used in the future if analyzers generate multiple sound vectors - vector_solr_field_type: data[config_options['vector_property_name']][0:config_options['vector_size']] + vector_field_name: vector_data } # Because we still want to be able to group by pack when matching sim vector documents (sound child documents), # we add the grouping_pack field here as well. In the future we might be able to optimize this if we can tell solr @@ -542,19 +560,15 @@ def search_sounds(self, textual_query='', query_fields=None, query_filter='', fi query.set_query('') if similar_to_analyzer in settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS: # Similarity search will find documents close to a target vector. This will match "child" sound documents (of content_type "similarity vectpor") + config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[similar_to_analyzer] vector = None if isinstance(similar_to, list): vector = similar_to # we allow vectors to be passed directly - vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(len(vector), None) else: # similar_to should be a sound_id sound = Sound.objects.get(id=similar_to) - config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[similar_to_analyzer] - vector_field_name = SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size'], None) - vector = get_similarity_search_target_vector(sound.id, analyzer=similar_to_analyzer) - if vector is not None: - vector = vector[0:config_options['vector_size']] # Make sure the vector has the right size (just in case) - + vector = get_similarity_search_target_vector(sound.id, analyzer=similar_to_analyzer) + vector_field_name = get_solr_dense_vector_search_field_name(config_options['vector_size'], 
config_options.get('l2_norm', False)) if vector is not None and vector_field_name is not None: max_similar_sounds = similar_to_max_num_sounds # Max number of results for similarity search search. Filters are applied before the similarity search, so this number will usually be the total number of results (unless filters are more restrictive) serialized_vector = ','.join([str(n) for n in vector]) diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py index ec177ebcf..a5f39d523 100644 --- a/utils/search/search_sounds.py +++ b/utils/search/search_sounds.py @@ -175,7 +175,7 @@ def get_sound_similarity_from_search_engine_query(query_params, analyzer_name=se # Update query params to get similarity vectors of the first config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[analyzer_name] - vector_field_name = utils.search.backends.solr555pysolr.SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size']) + vector_field_name = utils.search.backends.solr555pysolr.get_solr_dense_vector_search_field_name(config_options['vector_size'], config_options.get('l2_norm', False)) query_params.update({ 'facets': None, 'current_page': current_page if current_page is not None else query_params['current_page'], diff --git a/utils/search/solr9/cores/freesound/conf/schema.xml b/utils/search/solr9/cores/freesound/conf/schema.xml index d8d3a0875..c0eef10f1 100644 --- a/utils/search/solr9/cores/freesound/conf/schema.xml +++ b/utils/search/solr9/cores/freesound/conf/schema.xml @@ -176,7 +176,12 @@ performing any ranking --> + + + + + @@ -243,6 +248,8 @@ + +