diff --git a/DEVELOPERS.md b/DEVELOPERS.md index 7e4a6aa4d..a3f001329 100644 --- a/DEVELOPERS.md +++ b/DEVELOPERS.md @@ -71,6 +71,7 @@ Currently, we only use the following custom permissions: * `tickets.can_moderate` (in `Ticket` model, used to allow sound moderation) * `forum.can_moderate_forum` (in `Post` model, used to allow forum moderation) * `sounds.can_describe_in_bulk` (in `BulkUploadProgress` model, used to allow bulk upload for users who don't meet the other common requirements) +* `accounts.can_beta_test` (in `Profile` model, used to allow using beta search features) ### URLs that include a username @@ -131,6 +132,33 @@ creating `DeletedSound` objects in the `sounds-models.on_delete_sound` function signal of the `Sound` model. +### Adding new search options in the search page + +The available options for searching and filtering sounds in the search page are managed using a `SearchQueryProcessor` +object (implemented in `/utils/search/search_query_processor.py`). The `SearchQueryProcessor` class is used to parse and +process search query information from a Django `request` object, and compute a number of useful items for displaying search +information in templates, constructing search URLs, and preparing search options to be passed to the backend search engine. + +To add a new option to the search page, a new member of a specific `SearchOption` class should be added to the `SearchQueryProcessor` +class (see `SearchQueryProcessor` definition for examples). There are a number of already existing types of `SearchOption`s +as you can see by looking at the search options which are already implemented in `SearchQueryProcessor`. If the newly added search +option implies doing some calculations for determining the `query_params` to be sent to the `search_sounds` function of the search +engine backend, this should be done in the `SearchQueryProcessor.as_query_params` method. 
+ +Adding a new search option to `SearchQueryProcessor` will make the option work with the search engine backend and with search URLs, +but it will NOT automatically add the option to the form in the search page. This will need to be done manually by adding the +search option in the desired place in `templates/search/search.html` (see how other search options are implemented for inspiration, +there is a `display_search_option` templatetag which will facilitate things in most cases). + +All this will add the search option to the user interface and send corresponding information to the search backend. For example, +if the new search option should apply a filter in the search backend of some `new_property`, this will be handled by the `SearchQueryProcessor`. +However, it is expected that this `new_property` has been added to the search engine schema and indexed properly, otherwise there +will be errors when running the queries. + +Please have a look at the documentation of `SearchQueryProcessor` and the various `SearchOption` classes to get a better +understanding of how all this works. + + ### Search Engine Backends The way in which Freesound communicates with a search engine to search for sounds and forum posts is abstracted through @@ -149,7 +177,6 @@ the implementation of a search backend. You can run it like: Please read carefully the documentation of the management command to better understand how it works and how is it doing the testing. 
- ### Freesound analysis pipeline In February 2022 we released a refactoring of the analysis pipeline that allows us to more easily incorporate new audio diff --git a/accounts/migrations/0041_alter_profile_options.py b/accounts/migrations/0041_alter_profile_options.py new file mode 100644 index 000000000..7a2cda03b --- /dev/null +++ b/accounts/migrations/0041_alter_profile_options.py @@ -0,0 +1,17 @@ +# Generated by Django 3.2.23 on 2024-02-23 22:08 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('accounts', '0040_auto_20230328_1205'), + ] + + operations = [ + migrations.AlterModelOptions( + name='profile', + options={'ordering': ('-user__date_joined',), 'permissions': (('can_beta_test', 'Show beta features to that user.'),)}, + ), + ] diff --git a/accounts/models.py b/accounts/models.py index a5272f023..6e1b18cfd 100644 --- a/accounts/models.py +++ b/accounts/models.py @@ -226,7 +226,7 @@ def get_user_sounds_in_search_url(self): return f'{reverse("sounds-search")}?f=username:"{ self.user.username }"&s=Date+added+(newest+first)&g=0' def get_user_packs_in_search_url(self): - return f'{reverse("sounds-search")}?f=username:"{ self.user.username }"&s=Date+added+(newest+first)&g=1&only_p=1' + return f'{reverse("sounds-search")}?f=username:"{ self.user.username }"&s=Date+added+(newest+first)&g=1&dp=1' def get_latest_packs_for_profile_page(self): latest_pack_ids = Pack.objects.select_related().filter(user=self.user, num_sounds__gt=0).exclude(is_deleted=True) \ @@ -649,6 +649,9 @@ def get_stats_for_profile_page(self): class Meta: ordering = ('-user__date_joined', ) + permissions = ( + ("can_beta_test", "Show beta features to that user."), + ) class GdprAcceptance(models.Model): diff --git a/accounts/tests/test_views.py b/accounts/tests/test_views.py index 2c6cd9e6a..612d0884a 100644 --- a/accounts/tests/test_views.py +++ b/accounts/tests/test_views.py @@ -262,14 +262,14 @@ def test_sounds_response(self): 
reverse('pack-downloaders', kwargs={'username': user.username, "pack_id": self.pack.id}) + '?ajax=1') self.assertEqual(resp.status_code, 200) - @mock.patch('search.views.perform_search_engine_query') + @mock.patch('tags.views.perform_search_engine_query') def test_tags_response(self, perform_search_engine_query): perform_search_engine_query.return_value = (create_fake_perform_search_engine_query_results_tags_mode(), None) # 200 response on tags page access resp = self.client.get(reverse('tags')) self.assertEqual(resp.status_code, 200) - self.assertEqual(resp.context['tags_mode'], True) + self.assertEqual(resp.context['sqp'].tags_mode_active(), True) def test_packs_response(self): # 302 response (note that since BW, there will be a redirect to the search page in between) diff --git a/accounts/urls.py b/accounts/urls.py index 564f2968c..7b42fc756 100644 --- a/accounts/urls.py +++ b/accounts/urls.py @@ -27,7 +27,7 @@ import bookmarks.views as bookmarks import follow.views as follow import apiv2.views as api -from utils.urlpatterns import redirect_inline +from utils.url import redirect_inline diff --git a/clustering/__init__.py b/clustering/__init__.py index f862c5e1b..e69de29bb 100644 --- a/clustering/__init__.py +++ b/clustering/__init__.py @@ -1,23 +0,0 @@ -# -# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA -# -# Freesound is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# Freesound is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . 
-# -# Authors: -# See AUTHORS file. -# - -# strings used for communicating the state of the clustering process -CLUSTERING_RESULT_STATUS_PENDING = "pending" -CLUSTERING_RESULT_STATUS_FAILED = "failed" diff --git a/clustering/clustering.py b/clustering/clustering.py index ccee9b738..c7f4891e8 100644 --- a/clustering/clustering.py +++ b/clustering/clustering.py @@ -33,21 +33,15 @@ import six from time import time -from . import clustering_settings as clust_settings - -# The following packages are only needed if the running process is configured to be a Celery worker. -# We avoid importing them in appservers to avoid having to install unneeded dependencies. -if settings.IS_CELERY_WORKER: - import community as com - import numpy as np - import networkx as nx - from networkx.readwrite import json_graph - from networkx.algorithms.community import k_clique_communities, greedy_modularity_communities - from sklearn import metrics - from sklearn.feature_selection import mutual_info_classif - from sklearn.neighbors import kneighbors_graph - - from .features_store import FeaturesStore +import community as com +import numpy as np +import networkx as nx +from networkx.readwrite import json_graph +from networkx.algorithms.community import k_clique_communities, greedy_modularity_communities +from sklearn import metrics +from sklearn.feature_selection import mutual_info_classif +from sklearn.neighbors import kneighbors_graph + logger = logging.getLogger('clustering') @@ -65,8 +59,6 @@ class ClusteringEngine(object): method. Moreover, a few unsued alternative methods for performing some intermediate steps are left here for developement and research purpose. 
""" - def __init__(self): - self.feature_store = FeaturesStore() def _prepare_clustering_result_and_reference_features_for_evaluation(self, partition): """Formats the clustering classes and some reference features in order to then estimate how good is the @@ -157,6 +149,9 @@ def _evaluation_metrics(self, partition): """ # we compute the evaluation metrics only if some reference features are available for evaluation # we return None when they are not available not to break the following part of the code + ''' + # NOTE: the following code is commented because the reference features are not available in the current version of the code + # If in the future we wan to perform further evaluation, we should re-implement some of these functions if clust_settings.REFERENCE_FEATURES in clust_settings.AVAILABLE_FEATURES: reference_features, clusters = self._prepare_clustering_result_and_reference_features_for_evaluation(partition) ami = np.average(mutual_info_classif(reference_features, clusters, discrete_features=True)) @@ -165,6 +160,8 @@ def _evaluation_metrics(self, partition): return ami, ss, ci else: return None, None, None + ''' + return None, None, None def _ratio_intra_community_edges(self, graph, communities): """Computes the ratio of the number of intra-community (cluster) edges to the total number of edges in the cluster. @@ -212,55 +209,13 @@ def _point_centralities(self, graph, communities): node_community_centralities = {k: old_div(v,max(d.values())) for d in communities_centralities for k, v in d.items()} return node_community_centralities - - def _save_results_to_file(self, query_params, features, graph_json, sound_ids, modularity, - num_communities, ratio_intra_community_edges, ami, ss, ci, communities): - """Saves a json file to disk containing the clustering results information listed below. - This is used when developing the clustering method. The results and the evaluation metrics are made accessible - for post-analysis. 
- - Args: - query_params (str): string representing the query parameters submited by the user to the search engine. - features (str): name of the features used for clustering. - graph_json: (dict) NetworkX graph representation of sounds data in node-link format that is suitable for JSON - serialization. - sound_ids (List[Int]): list of the sound ids. - modularity (float): modularity of the graph partition. - num_communities (Int): number of communities (clusters). - ratio_intra_community_edges (List[Float]): intra-community edges ratio. - ami (Numpy.float): Average Mutual Information score. - ss (Numpy.float): Silhouette Coefficient score. - ci (Numpy.float): Calinski and Harabaz Index score. - communities (List[List[Int]]): List storing Lists containing the Sound ids that are in each community (cluster). - """ - if clust_settings.SAVE_RESULTS_FOLDER: - result = { - 'query_params' : query_params, - 'sound_ids': sound_ids, - 'num_clusters': num_communities, - 'graph': graph_json, - 'features': features, - 'modularity': modularity, - 'ratio_intra_community_edges': ratio_intra_community_edges, - 'average_mutual_information': ami, - 'silouhette_coeff': ss, - 'calinski_harabaz_score': ci, - 'communities': communities - } - with open(os.path.join( - clust_settings.SAVE_RESULTS_FOLDER, - f'{query_params}.json' - ), 'w') as f: - json.dump(result, f) - - def create_knn_graph(self, sound_ids_list, features=clust_settings.DEFAULT_FEATURES): + def create_knn_graph(self, sound_ids_list, similarity_vectors_map): """Creates a K-Nearest Neighbors Graph representation of the given sounds. Args: sound_ids_list (List[str]): list of sound ids. - features (str): name of the features to be used for nearest neighbors computation. - Available features are listed in the clustering settings file. + similarity_vectors_map (Dict{int:List[float]}): dictionary with the similarity feature vectors for each sound. Returns: (nx.Graph): NetworkX graph representation of sounds. 
@@ -272,58 +227,21 @@ def create_knn_graph(self, sound_ids_list, features=clust_settings.DEFAULT_FEATU # neighbors for small collections, while limiting it for larger collections, which ensures low-computational complexity. k = int(np.ceil(np.log2(len(sound_ids_list)))) - sound_features, sound_ids_out = self.feature_store.return_features(sound_ids_list) + features = [] + sound_ids_out = [] + for sound_id, feature_vector in similarity_vectors_map.items(): + features.append(feature_vector) + sound_ids_out.append(sound_id) + sound_features = np.array(features).astype('float32') + A = kneighbors_graph(sound_features, k) for idx_from, (idx_to, distance) in enumerate(zip(A.indices, A.data)): idx_from = int(idx_from / k) - if distance < clust_settings.MAX_NEIGHBORS_DISTANCE: + if distance < settings.CLUSTERING_MAX_NEIGHBORS_DISTANCE: graph.add_edge(sound_ids_out[idx_from], sound_ids_out[idx_to]) # Remove isolated nodes graph.remove_nodes_from(list(nx.isolates(graph))) - - return graph - - def create_common_nn_graph(self, sound_ids_list, features=clust_settings.DEFAULT_FEATURES): - """Creates a Common Nearest Neighbors Graph representation of the given sounds. - - Args: - sound_ids_list (List[str]): list of sound ids. - features (str): name of the features to be used for nearest neighbors computation. - Available features are listed in the clustering settings file. - - Returns: - (nx.Graph): NetworkX graph representation of sounds. 
- """ - # first create a knn graph - knn_graph = self.create_knn_graph(sound_ids_list, features=features) - - # create the common nn graph - graph = nx.Graph() - graph.add_nodes_from(knn_graph.nodes) - - for i, node_i in enumerate(knn_graph.nodes): - for j, node_j in enumerate(knn_graph.nodes): - if j > i: - num_common_neighbors = len(set(knn_graph.neighbors(node_i)).intersection(knn_graph.neighbors(node_j))) - if num_common_neighbors > 0: - graph.add_edge(node_i, node_j, weight=num_common_neighbors) - - # keep only k most weighted edges - k = int(np.ceil(np.log2(len(graph.nodes)))) - # we iterate through the node ids and get all its corresponding edges using graph[node] - # there seem to be no way to get node_id & edges in the for loop. - for node in graph.nodes: - ordered_neighbors = sorted(list(six.iteritems(graph[node])), key=lambda x: x[1]['weight'], reverse=True) - try: - neighbors_to_remove = [neighbor_distance[0] for neighbor_distance in ordered_neighbors[k:]] - graph.remove_edges_from([(node, neighbor) for neighbor in neighbors_to_remove]) - except IndexError: - pass - - # Remove isolated nodes - graph.remove_nodes_from(list(nx.isolates(graph))) - return graph def cluster_graph(self, graph): @@ -349,7 +267,7 @@ def cluster_graph(self, graph): modularity = com.modularity(partition , graph) return partition, num_communities, communities, modularity - + def cluster_graph_overlap(self, graph, k=5): """Applies overlapping community detection in the given graph. @@ -371,7 +289,7 @@ def cluster_graph_overlap(self, graph, k=5): partition = {sound_id: cluster_id for cluster_id, cluster in enumerate(communities) for sound_id in cluster} return partition, num_communities, communities, None - + def remove_lowest_quality_cluster(self, graph, partition, communities, ratio_intra_community_edges): """Removes the lowest quality cluster in the given graph. 
@@ -404,13 +322,13 @@ def remove_lowest_quality_cluster(self, graph, partition, communities, ratio_int partition[snd] -= 1 return graph, partition, communities, ratio_intra_community_edges - def cluster_points(self, query_params, features, sound_ids): + def cluster_points(self, query_params, sound_ids, similarity_vectors_map): """Applies clustering on the requested sounds using the given features name. Args: query_params (str): string representing the query parameters submited by the user to the search engine. - features (str): name of the features used for clustering the sounds. sound_ids (List[int]): list containing the ids of the sound to cluster. + similarity_vectors_map (Dict{int:List[float]}): dictionary with the similarity feature vectors for each sound. Returns: Dict: contains the resulting clustering classes and the graph in node-link format suitable for JSON serialization. @@ -420,17 +338,17 @@ def cluster_points(self, query_params, features, sound_ids): logger.info('Request clustering of {} points: {} ... 
from the query "{}"' .format(len(sound_ids), ', '.join(sound_ids[:20]), json.dumps(query_params))) - graph = self.create_knn_graph(sound_ids, features=features) + graph = self.create_knn_graph(sound_ids, similarity_vectors_map=similarity_vectors_map) if len(graph.nodes) == 0: # the graph does not contain any node - return {'error': False, 'result': None, 'graph': None} + return {'clusters': None, 'graph': None} partition, num_communities, communities, modularity = self.cluster_graph(graph) ratio_intra_community_edges = self._ratio_intra_community_edges(graph, communities) # Discard low quality cluster if there are more than NUM_MAX_CLUSTERS clusters - num_exceeding_clusters = num_communities - clust_settings.NUM_MAX_CLUSTERS + num_exceeding_clusters = num_communities - settings.CLUSTERING_NUM_MAX_CLUSTERS if num_exceeding_clusters > 0: for _ in range(num_exceeding_clusters): graph, partition, communities, ratio_intra_community_edges = self.remove_lowest_quality_cluster( @@ -459,8 +377,4 @@ def cluster_points(self, query_params, features, sound_ids): # Export graph as json graph_json = json_graph.node_link_data(graph) - # Save results to file if SAVE_RESULTS_FOLDER is configured in clustering settings - self._save_results_to_file(query_params, features, graph_json, sound_ids, modularity, - num_communities, ratio_intra_community_edges, ami, ss, ci, communities) - - return {'error': False, 'result': communities, 'graph': graph_json} + return {'clusters': communities, 'graph': graph_json} diff --git a/clustering/clustering_settings.py b/clustering/clustering_settings.py deleted file mode 100644 index 2e63cccbc..000000000 --- a/clustering/clustering_settings.py +++ /dev/null @@ -1,72 +0,0 @@ -# -# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA -# -# Freesound is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# 
License, or (at your option) any later version. -# -# Freesound is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# -# Authors: -# See AUTHORS file. -# - - -# Directory where the Gaia dataset index files are located. -INDEX_DIR = '/freesound-data/clustering_index/' - -# Configuration of the features used for clustering or evaluation. -# We define here for each features a json file index file, and possible additonal info. -# The minimum requirement is to have one available set of features and set it as the default features -# used for clustering (see variable bellow). -AVAILABLE_FEATURES = { - # AudioSet Features (feature vector of the frame of max energy) - 'AUDIOSET_FEATURES': { - 'DATASET_FILE': 'AS_features_max_nrg.json' - }, - # tag-based features used as reference features (Bag of Words - LDA) - 'TAG_DERIVED_FEATURES': None, -} - -# Default features used for clustering -DEFAULT_FEATURES = 'AUDIOSET_FEATURES' - -# Key of AVAILABLE_FEATURES used for evaluating the clustering results -# Typically tag-derived features -REFERENCE_FEATURES = None - -# Maximum number of results to cluster -MAX_RESULTS_FOR_CLUSTERING = 1000 - -# Cache settings -# One day timeout for keeping clustering results. The cache timer is reset when the clustering is -# requested so that popular queries that are performed once a day minimum will always stay in cache -# and won't be recomputed. -CLUSTERING_CACHE_TIME = 24*60*60*1 -# One minute timeout for keeping the pending state. When a clustering is being performed async in a -# Celery worker, we consider the clustering as pending for only 1 minute. This may be useful if a -# worker task got stuck. 
There should be a settings in celery to stop a worker task if it is running -# for too long. -CLUSTERING_PENDING_CACHE_TIME = 60*1 - -# Folder for saving the clustering results with evaluation (dev/debug/research purpose) -SAVE_RESULTS_FOLDER = None - -# Limit of distance when creating Nearest Neighbors graph -MAX_NEIGHBORS_DISTANCE = 20 - -# Number of sound examples extracted per cluster for cluster facet sound preview -NUM_SOUND_EXAMPLES_PER_CLUSTER_FACET = 7 - -# Number of most common tags extracted per cluster for clustering facet name -NUM_TAGS_SHOWN_PER_CLUSTER_FACET = 3 - -# Number of maximum clusters to show to the user -NUM_MAX_CLUSTERS = 8 diff --git a/clustering/features_store.py b/clustering/features_store.py deleted file mode 100644 index 51915ce2d..000000000 --- a/clustering/features_store.py +++ /dev/null @@ -1,81 +0,0 @@ -# -# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA -# -# Freesound is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# Freesound is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# -# Authors: -# See AUTHORS file. -# - -from __future__ import absolute_import - -from builtins import zip -from builtins import str -from builtins import object -import json -import logging -import os - -from . 
import clustering_settings as clust_settings -import numpy as np -import redis -from django.conf import settings - -logger = logging.getLogger('clustering') - - -class RedisStore(object): - def __init__(self): - self.r = redis.StrictRedis( - host=settings.REDIS_HOST, port=settings.REDIS_PORT, db=settings.AUDIO_FEATURES_REDIS_STORE_ID) - - def set_feature(self, sound_id, feature): - self.r.set(str(sound_id), json.dumps(feature)) - - def get_feature(self, sound_id): - feature = self.r.get(str(sound_id)) - if feature: - return json.loads(feature) - - def set_features(self, d): - self.r.mset({k: json.dumps(v) for k, v in d.items()}) - - def get_features(self, sound_ids): - return self.r.mget(sound_ids) - - -class FeaturesStore(object): - """Method for storing and retrieving audio features - """ - def __init__(self): - self.redis = RedisStore() - self.__load_features() - - def __load_features(self): - self.AS_features = json.load(open(os.path.join( - clust_settings.INDEX_DIR, - clust_settings.AVAILABLE_FEATURES[clust_settings.DEFAULT_FEATURES]['DATASET_FILE'] - ), 'r')) - self.redis.set_features(self.AS_features) - - def return_features(self, sound_ids): - features = [] - sound_ids_out = [] - output = self.redis.get_features(sound_ids) - for sound_id, feature in zip(sound_ids, output): - if feature: - features.append(json.loads(feature)) - sound_ids_out.append(sound_id) - - return np.array(features).astype('float32'), sound_ids_out diff --git a/clustering/interface.py b/clustering/interface.py deleted file mode 100644 index ccbbca602..000000000 --- a/clustering/interface.py +++ /dev/null @@ -1,140 +0,0 @@ -# -# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA -# -# Freesound is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. 
-# -# Freesound is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# -# Authors: -# See AUTHORS file. -# -from __future__ import absolute_import -from builtins import str -from django.conf import settings -from django.core.cache import caches - -from .clustering_settings import DEFAULT_FEATURES, MAX_RESULTS_FOR_CLUSTERING -from freesound.celery import app as celery_app -from utils.encryption import create_hash -from utils.search.search_sounds import perform_search_engine_query, search_prepare_parameters -from . import CLUSTERING_RESULT_STATUS_PENDING, CLUSTERING_RESULT_STATUS_FAILED - -cache_clustering = caches["clustering"] - - -def get_sound_ids_from_search_engine_query(query_params): - """Performs Solr query and returns results as a list of sound ids. - - This method performs a single query to Solr with a very big page size argument so all results are - returned at once. A very big page size will make the clustering take a lot of time to be performed. - The number of results to retrieve is defined in the clustering settings file as MAX_RESULTS_FOR_CLUSTERING. - - Args: - query_params (dict): contains the query parameters to replicate the user query. - - Returns - List[int]: list containing the ids of the retrieved sounds. - """ - # We set include_facets to False in order to reduce the amount of data that search engine will return. 
- query_params.update({ - 'current_page': 1, - 'num_sounds': MAX_RESULTS_FOR_CLUSTERING, - }) - results, _ = perform_search_engine_query(query_params) - resultids = [d.get("id") for d in results.docs] - - return resultids - - -def cluster_sound_results(request, features=DEFAULT_FEATURES): - """Performs clustering on the search results of the given search request with the requested features. - - This is the main entry to the clustering method. It will either get the clustering results from cache, - or compute it (and store it in cache). When needed, the clustering will be performed async by a celery - worker. - - Args: - request (HttpRequest): request associated with the search query submitted by the user. - features (str): name of the features to be used for clustering. The available features are defined in the - clustering settings file. - - Returns: - Dict: contains either the state of the clustering ('pending' or 'failed') or the resulting clustering classes - and the graph in node-link format suitable for JSON serialization. - """ - query_params, _, extra_vars = search_prepare_parameters(request) - # We change filter_query to filter_query_non_facets in order to ensure that the clustering is always - # done on the non faceted filtered results. Without that, people directly requesting a facet filtered - # page would have a clustering performed on filtered results. 
- query_params['query_filter'] = extra_vars['filter_query_non_facets'] - cache_key = 'cluster-results-{textual_query}-{query_filter}-{sort}-{group_by_pack}'\ - .format(**query_params).replace(' ', '') - cache_key += f"-{str(query_params['query_fields'])}" - cache_key += f'-{features}' - cache_key_hashed = hash_cache_key(cache_key) - - # check if result is in cache - result = cache_clustering.get(cache_key_hashed) - - if result and result not in (CLUSTERING_RESULT_STATUS_PENDING, CLUSTERING_RESULT_STATUS_FAILED): - result.update({'finished': True, 'error': False}) - return result - - elif result == CLUSTERING_RESULT_STATUS_PENDING: - return {'finished': False, 'error': False} - - elif result == CLUSTERING_RESULT_STATUS_FAILED: - return {'finished': False, 'error': True} - - else: - # if not in cache, query solr and perform clustering - sound_ids = get_sound_ids_from_search_engine_query(query_params) - - # launch clustering with celery async task - celery_app.send_task('cluster_sounds', kwargs={ - 'cache_key_hashed': cache_key_hashed, - 'sound_ids': sound_ids, - 'features': features - }, queue='clustering') - - return {'finished': False, 'error': False} - - -def get_ids_in_cluster(request, requested_cluster_id): - """Get the sound ids in the requested cluster. Used for applying a filter by id when using a cluster facet. - """ - try: - requested_cluster_id = int(requested_cluster_id) - 1 - - # results are cached in clustering_utilities, available features are defined in the clustering settings file. - result = cluster_sound_results(request, features=DEFAULT_FEATURES) - results = result['result'] - - sounds_from_requested_cluster = results[int(requested_cluster_id)] - - except ValueError: - return [] - except IndexError: - return [] - except KeyError: - # If the clustering is not in cache the 'result' key won't exist - # This means that the clustering computation will be triggered asynchronously. - # Moreover, the applied clustering filter will have no effect. 
- # Somehow, we should inform the user that the clustering results were not available yet, and that - # he should try again later to use a clustering facet. - return [] - - return sounds_from_requested_cluster - - -def hash_cache_key(key): - return create_hash(key, limit=32) diff --git a/clustering/tasks.py b/clustering/tasks.py index d9d79dc23..0558f882c 100644 --- a/clustering/tasks.py +++ b/clustering/tasks.py @@ -18,54 +18,27 @@ # See AUTHORS file. # -from __future__ import absolute_import - from django.conf import settings -from django.core.cache import caches from celery import shared_task from celery import Task -import logging from .clustering import ClusteringEngine -from .clustering_settings import CLUSTERING_CACHE_TIME, CLUSTERING_PENDING_CACHE_TIME -from . import CLUSTERING_RESULT_STATUS_PENDING, CLUSTERING_RESULT_STATUS_FAILED - -logger = logging.getLogger('clustering') - -cache_clustering = caches["clustering"] - class ClusteringTask(Task): """ Task Class used for defining the clustering engine only required in celery workers """ def __init__(self): - if settings.IS_CELERY_WORKER: - self.engine = ClusteringEngine() + self.engine = ClusteringEngine() -@shared_task(name="cluster_sounds", base=ClusteringTask) -def cluster_sounds(cache_key_hashed, sound_ids, features): - """ Triggers the clustering of the sounds given as argument with the specified features. - - This is the task that is used for clustering the sounds of a search result asynchronously with Celery. +@shared_task(name="cluster_sounds", base=ClusteringTask, queue=settings.CELERY_CLUSTERING_TASK_QUEUE_NAME) +def cluster_sounds(cache_key, sound_ids, similarity_vectors_map=None): + """ Triggers the clustering of the sounds given as argument with the provided similarity vectors. The clustering result is stored in cache using the hashed cache key built with the query parameters. Args: - cache_key_hashed (str): hashed key for storing/retrieving the results in cache. 
+ cache_key (str): hashed key for storing/retrieving the results in cache. sound_ids (List[int]): list containing the ids of the sound to cluster. - features (str): name of the features used for clustering the sounds (defined in the clustering settings file). + similarity_vectors_map (Dict{int:List[float]}): dictionary with the similarity feature vectors for each sound. """ - # store pending state in cache - cache_clustering.set(cache_key_hashed, CLUSTERING_RESULT_STATUS_PENDING, CLUSTERING_PENDING_CACHE_TIME) - - try: - # perform clustering - result = cluster_sounds.engine.cluster_points(cache_key_hashed, features, sound_ids) - - # store result in cache - cache_clustering.set(cache_key_hashed, result, CLUSTERING_CACHE_TIME) - - except Exception as e: - # delete pending state if exception raised during clustering - cache_clustering.set(cache_key_hashed, CLUSTERING_RESULT_STATUS_FAILED, CLUSTERING_PENDING_CACHE_TIME) - logger.info("Exception raised while clustering sounds", exc_info=True) + return cluster_sounds.engine.cluster_points(cache_key, sound_ids, similarity_vectors_map=similarity_vectors_map) diff --git a/docker-compose.yml b/docker-compose.yml index fe87562d6..ce59cd8bc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -80,7 +80,7 @@ services: context: ./ dockerfile: ./docker/Dockerfile.workers_web init: true - command: celery -A freesound worker --concurrency=2 -l info -Q async_tasks_queue,sound_processing_queue,sound_analysis_old_queue + command: celery -A freesound worker --concurrency=2 -l info -Q async_tasks_queue,sound_processing_queue,sound_analysis_old_queue,clustering_queue volumes: - .:/code - ./freesound-data/:/freesound-data @@ -149,25 +149,6 @@ services: depends_on: - rabbitmq - # Clustering worker (not using the previous one as this has some specific requirements) - worker_clustering: - profiles: ["all"] - build: - context: ./ - dockerfile: ./docker/Dockerfile.clustering - init: true - command: celery -A freesound worker -l info -Q 
clustering - volumes: - - .:/code - - ./freesound-data/:/freesound-data - - ./freesound-data/clustering_index:/freesound-data/clustering_index - depends_on: - - rabbitmq - - redis - environment: - - ENV_CELERY_WORKER=1 - - FS_USER_ID - # Similarity http server similarity: profiles: ["all"] diff --git a/docker/Dockerfile.clustering b/docker/Dockerfile.clustering index a04588cfe..c37511187 100644 --- a/docker/Dockerfile.clustering +++ b/docker/Dockerfile.clustering @@ -2,20 +2,10 @@ FROM freesound:2023-07 RUN apt-get update \ && apt-get install -y --no-install-recommends \ - libqt4-dev \ libyaml-dev \ - swig \ libsndfile1-dev \ && rm -rf /var/lib/apt/lists/* -# Gaia - https://github.com/MTG/gaia -RUN git clone https://github.com/MTG/gaia.git /tmp/gaia \ - && cd /tmp/gaia \ - && git checkout v2.4.5 \ - && ./waf configure --with-python-bindings \ - && ./waf \ - && ./waf install \ - && cd / && rm -r /tmp/gaia RUN mkdir /code RUN mkdir /gaia_index diff --git a/freesound/settings.py b/freesound/settings.py index 8e1c33cb4..712b4227a 100644 --- a/freesound/settings.py +++ b/freesound/settings.py @@ -648,7 +648,6 @@ SEARCH_ENGINE_NUM_SIMILAR_SOUNDS_PER_QUERY = 500 USE_SEARCH_ENGINE_SIMILARITY = False -SEARCH_ALLOW_DISPLAY_RESULTS_IN_MAP = True MAX_SEARCH_RESULTS_IN_MAP_DISPLAY = 10000 # This is the maximum number of sounds that will be shown when using "display results in map" mode # ------------------------------------------------------------------------------- @@ -736,16 +735,29 @@ # Search results clustering # NOTE: celery configuration is set after the local settings import -# Environment variables -# '1' indicates that a process is running as a celery worker. -# We get it from environment variable to avoid the need of a specific settings file for celery workers. -# We enable the imports of clustering dependencies only in celery workers. 
-IS_CELERY_WORKER = os.getenv('ENV_CELERY_WORKER', None) == "1" +MAX_RESULTS_FOR_CLUSTERING = 1000 -# Determines whether to use or not the clustering feature. -# Set to False by default (to be overwritten in local_settings.py) -# When activated, Enables to do js calls & html clustering facets rendering -ENABLE_SEARCH_RESULTS_CLUSTERING = False +# One day timeout for keeping clustering results. The cache timer is reset when the clustering is +# requested so that popular queries that are performed once a day minimum will always stay in cache +# and won't be recomputed. +CLUSTERING_CACHE_TIME = 24*60*60*1 + +# Limit of distance when creating Nearest Neighbors graph +CLUSTERING_MAX_NEIGHBORS_DISTANCE = 20 + +# Number of sound examples extracted per cluster for cluster facet sound preview +NUM_SOUND_EXAMPLES_PER_CLUSTER = 7 + +# Number of most common tags extracted per cluster for clustering facet name +NUM_TAGS_SHOWN_PER_CLUSTER = 3 + +# Number of maximum clusters to show to the user +CLUSTERING_NUM_MAX_CLUSTERS = 8 + +# Timeout for returning clustering results to the user +CLUSTERING_TASK_TIMEOUT = 30 + +CLUSTERING_SIMILARITY_ANALYZER = FSDSINET_ANALYZER_NAME # ------------------------------------------------------------------------------- # Rate limiting @@ -887,6 +899,7 @@ CELERY_RESULT_SERIALIZER = 'json' CELERY_ASYNC_TASKS_QUEUE_NAME = 'async_tasks_queue' CELERY_SOUND_PROCESSING_QUEUE_NAME = 'sound_processing_queue' +CELERY_CLUSTERING_TASK_QUEUE_NAME = 'clustering_queue' # ------------------------------------------------------------------------------- diff --git a/freesound/static/bw-frontend/src/components/asyncSection.js b/freesound/static/bw-frontend/src/components/asyncSection.js index 76343a4dc..7cdf54a16 100644 --- a/freesound/static/bw-frontend/src/components/asyncSection.js +++ b/freesound/static/bw-frontend/src/components/asyncSection.js @@ -15,11 +15,13 @@ const prepareAsyncSections = (container) => { } else { // Unexpected errors happened while 
processing request: show toast showToast('Unexpected errors occurred while loading some of the content of this page. Please try again later...') + element.innerHTML = ''; } }; req.onerror = () => { - // Unexpected errors happened while processing request: show toast + // Unexpected errors happened while processing request: show toast and clear async element showToast('Unexpected errors occurred while loading some of the content of this page. Please try again later...') + element.innerHTML = ''; }; // Send the form diff --git a/freesound/static/bw-frontend/src/components/select.js b/freesound/static/bw-frontend/src/components/select.js index 550b562a7..a519bf64f 100644 --- a/freesound/static/bw-frontend/src/components/select.js +++ b/freesound/static/bw-frontend/src/components/select.js @@ -121,6 +121,10 @@ function makeSelect(container) { buttonElement.className = 'select-dropdown__button select-dropdown__button--' + i; buttonElement.setAttribute('data-value', ''); buttonElement.setAttribute('type', 'button'); + if (el.getAttribute('disabled') !== null){ + buttonElement.setAttribute('disabled', 'disabled'); + buttonElement.classList.add('opacity-020'); + } spanElement.className = 'select-dropdown select-dropdown--' + i; iElement.className = 'zmdi bw-icon-chevron-up bw-select__chevron'; ulElement.className = 'select-dropdown__list select-dropdown__list--' + i; diff --git a/freesound/static/bw-frontend/src/pages/search.js b/freesound/static/bw-frontend/src/pages/search.js index 6f73a7178..cb2cb3756 100644 --- a/freesound/static/bw-frontend/src/pages/search.js +++ b/freesound/static/bw-frontend/src/pages/search.js @@ -4,10 +4,10 @@ import navbar from "../components/navbar"; // Main search input box behaviour const searchInputBrowse = document.getElementById('search-input-browse'); -const tagsModeInput = document.getElementById('tags-mode'); -const tagsMode = tagsModeInput.value == '1'; const searchInputBrowsePlaceholder = searchInputBrowse.getAttribute("placeholder"); 
const removeSearchInputValueBrowse = document.getElementById('remove-content-search'); +const advancedSearchOptionsDiv = document.getElementById('advanced-search-options'); +const tagsMode = location.pathname.indexOf('/browse/tags/') > -1; const updateRemoveSearchInputButtonVisibility = (searchInputElement) => { if (searchInputElement.value.length) { @@ -47,7 +47,7 @@ const searchFormIsVisible = () => { let heroRect; if (advancedSearchOptionsIsVisible()){ // If advanced search options is expanded, use that as heroRect to check if search form is visible - heroRect = advanced_search_options_div.getBoundingClientRect() + heroRect = advancedSearchOptionsDiv.getBoundingClientRect() } else { if (!tagsMode){ heroRect = searchInputBrowse.getBoundingClientRect() @@ -74,84 +74,35 @@ const checkShouldShowSearchInNavbar = throttle(() => { window.addEventListener('scroll', checkShouldShowSearchInNavbar) -/* - ADVANCED SEARCH STUFF - The functions below correspond to the javascript bits for handling the advanced search options - The JS code is old and probably doing things in wrong ways (and more complex that it should) - This should be completely refactored, but to avoid changes in backend and for compatibility between - BeastWhoosh and Nightingale interfaces, we leave everything as is for now (just with small updates to - avoid using JQuery). 
-*/ +// Advanced search options behaviour -var search_form_element = document.getElementById('search_form'); -var search_page_navbar_form = document.getElementById('search-page-navbar-form'); -var advanced_search_options_div = document.getElementById('advanced-search-options'); -var advanced_search_hidden_field = document.getElementById('advanced_search_hidden'); -var toggle_advanced_search_options_element = document.getElementById('toggle_advanced_search_options'); -var filter_query_element = document.getElementById('filter_query'); -var filter_duration_min_element = document.getElementById('filter_duration_min'); -var filter_duration_max_element = document.getElementById('filter_duration_max'); -var filter_is_geotagged_element = document.getElementById('filter_is_geotagged'); -var filter_in_remix_group_element = document.getElementById('filter_in_remix_group'); -var sort_by_element = document.getElementById('sort-by'); -var group_by_pack_element = document.getElementById('group_by_pack'); -var only_sounds_with_pack_element = document.getElementById('only_sounds_with_pack'); -var use_compact_mode_element = document.getElementById('use_compact_mode'); -var use_map_mode_element = document.getElementById('use_map_mode'); - -function update_hidden_compact_mode_element() { - var hiddenElement = document.getElementById('use_compact_mode_hidden'); - if (use_compact_mode_element.checked) { - hiddenElement.value = "1"; - } else { - hiddenElement.value = "0"; - } -} - -update_hidden_compact_mode_element() -use_compact_mode_element.addEventListener('change', function() { - update_hidden_compact_mode_element() -}) - -function update_hidden_map_mode_element() { - var hiddenElement = document.getElementById('use_map_mode_hidden'); - if (use_map_mode_element.checked) { - hiddenElement.value = "1"; - } else { - hiddenElement.value = "0"; - } -} - -update_hidden_map_mode_element() -use_map_mode_element.addEventListener('change', function() { - update_hidden_map_mode_element() -}) 
+const toggleAdvancedSearchOptionsElement = document.getElementById('toggle_advanced_search_options'); function advancedSearchOptionsIsVisible() { - return advanced_search_hidden_field.value === "1"; + return advancedSearchOptionsDiv.dataset.visible === "1"; } function updateToggleAdvancedSearchOptionsText() { if (advancedSearchOptionsIsVisible()){ - toggle_advanced_search_options_element.innerHTML = 'Hide advanced search options'; + toggleAdvancedSearchOptionsElement.innerHTML = 'Hide advanced search options'; } else { - toggle_advanced_search_options_element.innerHTML = 'Show advanced search options'; + toggleAdvancedSearchOptionsElement.innerHTML = 'Show advanced search options'; } } function showAdvancedSearchOptions() { - advanced_search_hidden_field.value = "1"; - advanced_search_options_div.style.display = 'block'; + advancedSearchOptionsDiv.dataset.visible = "1"; + advancedSearchOptionsDiv.style.display = 'block'; updateToggleAdvancedSearchOptionsText(); } function hideAdvancedSearchOptions() { - advanced_search_hidden_field.value = "0"; - advanced_search_options_div.style.display = 'none'; + advancedSearchOptionsDiv.dataset.visible = "0"; + advancedSearchOptionsDiv.style.display = 'none'; updateToggleAdvancedSearchOptionsText(); } @@ -163,196 +114,75 @@ function toggleAdvancedSearchOptions(){ } } -toggle_advanced_search_options_element.addEventListener('click', toggleAdvancedSearchOptions); +toggleAdvancedSearchOptionsElement.addEventListener('click', toggleAdvancedSearchOptions); -function set_hidden_grouping_value(){ - - var hiddenElement = document.getElementById('group_by_pack_hidden'); - if (group_by_pack_element.checked) { - hiddenElement.value = "1"; - } else { - hiddenElement.value = ""; - } -} +// Track changes in advanced search options -function set_hidden_only_sounds_with_pack_value(){ - var element = document.getElementById('only_sounds_with_pack'); - var hiddenElement = document.getElementById('only_sounds_with_pack_hidden'); - if 
(element.checked) { - hiddenElement.value = "1"; - } else { - hiddenElement.value = ""; - } -} - -// Return the value of a filter given its name -// If filter has a range, optional "range" parameter must be set to "min or "max" -function getFilterValue(name, range) -{ - if (!range) { range = "min"} - - var filter_query_element = document.getElementById('filter_query'); - var value = filter_query_element.value; - var position_value = value.search(name) + (name + ":").length - if (value.search((name + ":")) !== -1) - { - if (value[position_value] === "[") // Is range (with spaces) - { - var aux_value = value.substring(position_value + 1) - var position_end = position_value + aux_value.search("]") + 2 - - var range_string = value.substring(position_value + 1, position_end -1) // Without [ ] - var parts = range_string.split(" ") - if (range === "min"){ - return parts[0] - } else if (range === "max") { - return parts[2] - } - } - else if (value[position_value] === "\"") // Is string (with spaces) - { - aux_value = value.substring(position_value + 1) - position_end = position_value + aux_value.search("\"") + 2 - return value.substring(position_value, position_end) +let initialAdvancedSearchInputValues = undefined; // NOTE: this is filled out in onDocumentReady function +const serializeAdvanceSearchOptionsInputsData = () => { + const values = []; + advancedSearchOptionsDiv.getElementsByTagName("input").forEach(inputElement => { + if (inputElement.type == "hidden"){ + // Don't include hidden elements as only the visible items are necessary + } else if (inputElement.type == "checkbox"){ + values.push(inputElement.checked); + } else { + values.push(inputElement.value); } - else // Is number or normal text (without spaces) - { - aux_value = value.substring(position_value + 1) - if (aux_value.search(" ") !== -1){ - position_end = position_value + aux_value.search(" ") + 1 - } else { - position_end = value.length - } - return value.substring(position_value, position_end) - } - 
} else { - return "" - } + }); + return values.join(","); } -// Remove a filter given the full tag ex: type:aiff, pack:"pack name" -function removeFilter(tag) -{ - var filter_query_element = document.getElementById('filter_query'); - var value = filter_query_element.value; - var cleaned = value.replace(tag + " ", "").replace(tag, "").trim(); - filter_query_element.value = cleaned; +const advancedSearchOptionsHaveChangedSinceLastQuery = () => { + const currentAdvancedSearchInputValues = serializeAdvanceSearchOptionsInputsData(); + return initialAdvancedSearchInputValues != currentAdvancedSearchInputValues; } -function onDocumentReady(){ - // Fill advanced search fields that were passed through the f parameter - // Duration - - if (getFilterValue("duration","min") === ""){ - filter_duration_min_element.value = "0"; - } else { - filter_duration_min_element.value = getFilterValue("duration","min"); - } - - if (getFilterValue("duration","max") === ""){ - filter_duration_max_element.value = "*"; - } else { - filter_duration_max_element.value = getFilterValue("duration","max"); - } - - // Geotagged - if (getFilterValue("is_geotagged") === "1"){ - filter_is_geotagged_element.checked = true; - } - - // Remix filter - if (getFilterValue("in_remix_group") === "1"){ - // NOTE we only check "is_remix" and don't check "was_remixed" because these will go together - filter_in_remix_group_element.checked = true; - } - - // Update the text of the button to toggle advanced search options panel - updateToggleAdvancedSearchOptionsText(); - - // Store values of advanced search filters so later we can check if they were modified - initialAdvancedSearchInputValues = serializeAdvanceSearchOptionsInputsData(); +const onAdvancedSearchOptionsInputsChange = () => { + document.getElementById('avanced-search-apply-button').disabled = !advancedSearchOptionsHaveChangedSinceLastQuery(); } -document.addEventListener('DOMContentLoaded', onDocumentReady); - -function addAdvancedSearchOptionsFilters() 
-{ - // Remove previously existing advanced options filters (will be replaced by current ones) - var existing_duration_filter = "duration:[" + getFilterValue("duration","min") + " TO " + getFilterValue("duration","max") + "]"; - removeFilter(existing_duration_filter); - removeFilter("is_geotagged:1"); - removeFilter("in_remix_group:1"); - - // if advanced options is activated add all updated filters - if (advanced_search_hidden_field.value === "1") - { - // Create and add new filter with all the advanced options - var filter = ""; - - // Duration filter - var duration_min = parseFloat(filter_duration_min_element.value); - var duration_max = parseFloat(filter_duration_max_element.value); - - if ((duration_min >= 0.0) || (duration_max >= 0.0)) { - var duration_filter = ""; - if ((duration_min >= 0.0) && (duration_max >= 0.0)) { // Both min and max have been set - if (duration_max < duration_min) { - // interchange values if duration_min > duration_max - var duration_aux = duration_min; - duration_min = duration_max; - duration_max = duration_aux; - } - duration_filter = "duration:[" + duration_min + " TO " + duration_max + "]"; - } else if (duration_min >= 0.0) { // Only minimum has been set - duration_filter = "duration:[" + duration_min + " TO *]"; - } else if (duration_max >= 0.0) { // Only maximum has been set - duration_filter = "duration:[* TO " + duration_max + "]"; - } - filter = filter + duration_filter; - } +advancedSearchOptionsDiv.getElementsByTagName("input").forEach(inputElement => { + inputElement.addEventListener('change', evt => { + onAdvancedSearchOptionsInputsChange(); + }); + inputElement.addEventListener('input', evt => { + onAdvancedSearchOptionsInputsChange(); + }); +}); - // Is geotagged filter - if (filter_is_geotagged_element.checked){ - if (filter !== ""){ - filter = filter + " "; - } - filter = filter + "is_geotagged:1"; - } +// Other sutff: form submission, navbar search form, hidden checkboxes etc. 
- // Is remix filter - if (filter_in_remix_group_element.checked){ - if (filter !== ""){ - filter = filter + " "; - } - filter = filter + "in_remix_group:1"; - } +var searchFormElement = document.getElementById('search_form'); - // Update general filter with the advanced options filter - var value = filter_query_element.value; - if (value !== ""){ - filter_query_element.value = value + " " + filter; - } else { - filter_query_element.value = filter; - } - } +searchFormElement.getElementsByClassName('bw-checkbox').forEach(checkbox => { + const hiddenCheckbox = document.createElement('input'); + hiddenCheckbox.type = 'hidden'; + hiddenCheckbox.name = checkbox.name; + checkbox.name = ''; // remove name attribute so checkbox is not submitted (the hidden input will be submitted instead) + hiddenCheckbox.value = checkbox.checked ? '1' : '0'; + checkbox.addEventListener('change', evt => { // Update hidden checkbox value when checkbox is changed + hiddenCheckbox.value = checkbox.checked ? '1' : '0'; + }); + checkbox.parentNode.appendChild(hiddenCheckbox); +}); + +// Make the search select element submit the form when changed +var sortByElement = document.getElementById('id_sort_by'); +if (sortByElement !== null){ + sortByElement.addEventListener('change', function() { + searchFormElement.submit(); + }) } -search_form_element.addEventListener('submit', function() { - addAdvancedSearchOptionsFilters(); -}) - -sort_by_element.addEventListener('change', function() { - addAdvancedSearchOptionsFilters(); - search_form_element.submit(); -}) - -group_by_pack_element.addEventListener('change', function() { - set_hidden_grouping_value(); -}) - -only_sounds_with_pack_element.addEventListener('change', function() { - set_hidden_only_sounds_with_pack_value(); +// Make radio cluster elements submit the form when changed +document.getElementsByName('cid').forEach(radio => { + radio.addEventListener('change', (evt) => { + setTimeout(() => { + searchFormElement.submit(); + }, 100); // Give 
it a little time to update the radio widget before submitting + }); }) document.body.addEventListener('keydown', evt => { @@ -360,60 +190,31 @@ document.body.addEventListener('keydown', evt => { if(evt.keyCode === ENTER_KEY){ // If ENTER key is pressed and search form is visible, trigger form submission if (searchFormIsVisible()){ - addAdvancedSearchOptionsFilters(); - search_form_element.submit(); + searchFormElement.submit(); } } }) -if (search_page_navbar_form !== null){ - search_page_navbar_form.addEventListener('submit', function(evt){ +var searchPageNavbarForm = document.getElementById('search-page-navbar-form'); +if (searchPageNavbarForm !== null){ + searchPageNavbarForm.addEventListener('submit', function(evt){ // Prevent default form submission if (evt.preventDefault) evt.preventDefault(); // Copy input element contents to the main input element and do submission of the main form instead of the navbar one const searchInputBrowseNavbar = document.getElementById('search-input-browse-navbar'); searchInputBrowse.value = searchInputBrowseNavbar.value; - addAdvancedSearchOptionsFilters(); - search_form_element.submit(); + searchFormElement.submit(); // It is also needed to return false to prevent default form submission return false; }) } -// Enable/disable "apply adbanced search filters" when filters are modified - -const serializeAdvanceSearchOptionsInputsData = () => { - const values = []; - advanced_search_options_div.getElementsByTagName("input").forEach(inputElement => { - if (inputElement.type == "hidden"){ - // Don't include hidden elements as only the visible items are necessary - } else if (inputElement.type == "checkbox"){ - values.push(inputElement.checked); - } else { - values.push(inputElement.value); - } - }); - return values.join(","); -} - -let initialAdvancedSearchInputValues = undefined; // NOTE: this is filled out in onDocumentReady function - -const advancedSearchOptionsHaveChangedSinceLastQuery = () => { - const 
currentAdvancedSearchInputValues = serializeAdvanceSearchOptionsInputsData(); - return initialAdvancedSearchInputValues != currentAdvancedSearchInputValues; -} - -const onAdvancedSearchOptionsInputsChange = () => { - document.getElementById('avanced-search-apply-button').disabled = !advancedSearchOptionsHaveChangedSinceLastQuery(); +function onDocumentReady(){ + // Update the text of the button to toggle advanced search options panel + updateToggleAdvancedSearchOptionsText(); + // Store values of advanced search filters so later we can check if they were modified + initialAdvancedSearchInputValues = serializeAdvanceSearchOptionsInputsData(); } - -advanced_search_options_div.getElementsByTagName("input").forEach(inputElement => { - inputElement.addEventListener('change', evt => { - onAdvancedSearchOptionsInputsChange(); - }); - inputElement.addEventListener('input', evt => { - onAdvancedSearchOptionsInputsChange(); - }); -}); \ No newline at end of file +document.addEventListener('DOMContentLoaded', onDocumentReady); \ No newline at end of file diff --git a/freesound/static/bw-frontend/styles/pages/search.scss b/freesound/static/bw-frontend/styles/pages/search.scss index 96aafed0c..37ca81a75 100644 --- a/freesound/static/bw-frontend/styles/pages/search.scss +++ b/freesound/static/bw-frontend/styles/pages/search.scss @@ -56,15 +56,6 @@ } } -.bw-search__advanced-search-filter-section { - - padding-top: $small-spacing; - - .bw-search__filter-section-name > span { - font-size: 18px; - } -} - .bw-search__player-small { flex: 0 0 120px; } @@ -145,7 +136,7 @@ margin-left: 20px; } -.bw-search__filter-duration { +.bw-search__filter-range { color: $navy-grey; font-size: 14px; @@ -153,13 +144,13 @@ margin-left: 12px; } - .bw-search_input-duration { + .bw-search_input-range { padding: 16px 13px; border: 1px solid $navy-light-grey; background-color: $background-input; border-radius: 5px; font-size: 14px; - max-width: 85px; + max-width: 65px; &::-webkit-calendar-picker-indicator 
{ display: none; @@ -172,6 +163,23 @@ } } +.bw-search_input { + border: 1px solid $border-input; + color: $black; + background-color: $background-input; + padding: 10px 20px; + border-radius: 5px; + + &::placeholder { + color: $navy-light-grey; + } + + &:focus { + background-color: $white; + border: 1px solid $black; + } +} + .browse__search-overview-sorter { display: flex; align-items: center; diff --git a/freesound/urls.py b/freesound/urls.py index 26ad58917..11152e57b 100644 --- a/freesound/urls.py +++ b/freesound/urls.py @@ -99,7 +99,7 @@ path('contact/', support.views.contact, name="contact"), path('search/', search.views.search, name='sounds-search'), - path('clustering_facet/', search.views.clustering_facet, name='clustering-facet'), + path('search/clusters_section/', search.views.clusters_section, name='clusters-section'), path('clustered_graph/', search.views.clustered_graph, name='clustered-graph-json'), path('query_suggestions/', search.views.query_suggestions, name='query-suggestions'), diff --git a/geotags/tests.py b/geotags/tests.py index 30778961a..c3eaa009b 100644 --- a/geotags/tests.py +++ b/geotags/tests.py @@ -94,5 +94,5 @@ def test_browse_geotags_case_insensitive(self): def test_browse_geotags_for_query(self): resp = self.client.get(reverse('geotags-query') + f'?q=barcelona') - check_values = {'query_description': 'barcelona'} + check_values = {'query_description': '"barcelona"'} self.check_context(resp.context, check_values) diff --git a/geotags/views.py b/geotags/views.py index ffe0ccfa2..208faa279 100644 --- a/geotags/views.py +++ b/geotags/views.py @@ -34,9 +34,9 @@ from django.views.decorators.clickjacking import xframe_options_exempt from accounts.models import Profile -from search.views import search_prepare_parameters from sounds.models import Sound, Pack from utils.logging_filters import get_client_ip +from utils.search.search_query_processor import SearchQueryProcessor from utils.search.search_sounds import perform_search_engine_query 
from utils.username import redirect_if_old_username_or_404, raise_404_if_user_is_deleted @@ -48,27 +48,6 @@ def log_map_load(map_type, num_geotags, request): 'map_type': map_type, 'num_geotags': num_geotags, 'ip': get_client_ip(request)})) -def update_query_params_for_map_query(query_params, preserve_facets=False): - # Force is_geotagged filter to be present - if query_params['query_filter']: - if 'is_geotagged' not in query_params['query_filter']: - query_params['query_filter'] = query_params['query_filter'] + ' is_geotagged:1' - else: - query_params['query_filter'] = 'is_geotagged:1' - # Force one single page with "all" results, and don't group by pack - query_params.update({ - 'current_page': 1, - 'num_sounds': settings.MAX_SEARCH_RESULTS_IN_MAP_DISPLAY, - 'group_by_pack': False, - 'only_sounds_with_pack': False, - 'field_list': ['id', 'score', 'geotag'] - }) - if not preserve_facets: - # No need to compute facets for the bytearray, but it might be needed for the main query - if 'facets' in query_params: - del query_params['facets'] - - def generate_bytearray(sound_queryset_or_list): # sounds as bytearray packed_sounds = io.BytesIO() @@ -169,8 +148,11 @@ def geotags_for_query_barray(request): results_docs = cache.get(results_cache_key) else: # Otherwise, perform a search query to get the results - query_params, _, _ = search_prepare_parameters(request) - update_query_params_for_map_query(query_params) + sqp = SearchQueryProcessor(request) + query_params = sqp.as_query_params() + if 'facets' in query_params: + # No need to compute facets for bytearray query + del query_params['facets'] results, _ = perform_search_engine_query(query_params) results_docs = results.docs @@ -283,20 +265,6 @@ def for_pack(request, username, pack_id): def for_query(request): tvars = _get_geotags_query_params(request) request_parameters_string = request.get_full_path().split('?')[-1] - q = request.GET.get('q', None) - if q == '': - q = None - f = request.GET.get('f', None) - 
query_description = '' - if q is None and f is None: - query_description = 'Empty query' - elif q is not None and f is not None: - query_description = f'{q} (some filters applied)' - else: - if q is not None: - query_description = q - if f is not None: - query_description = f'Empty query with some filters applied' tvars.update({ 'tag': None, 'username': None, @@ -305,7 +273,7 @@ def for_query(request): 'query_params': request_parameters_string, 'query_params_encoded': urllib.parse.quote(request_parameters_string), 'query_search_page_url': reverse('sounds-search') + f'?{request_parameters_string}', - 'query_description': query_description, + 'query_description': SearchQueryProcessor(request).get_textual_description(), 'url': reverse('geotags-for-query-barray') + f'?{request_parameters_string}', }) return render(request, 'geotags/geotags.html', tvars) diff --git a/requirements.in b/requirements.in index c1b929c73..d2b0d8080 100644 --- a/requirements.in +++ b/requirements.in @@ -14,7 +14,6 @@ django-extensions==3.1.5 django-modeladmin-reorder==0.3.1 django-multiupload==0.6.1 django-oauth-toolkit==2.2.0 -oauthlib django-object-actions==4.1.0 django-ratelimit==3.0.1 django-recaptcha==3.0.0 @@ -32,24 +31,29 @@ future~=0.18.2 graypy==0.2.12 gunicorn==21.2.0 ipython==8.14.0 +jinja2==3.0.3 # This version needed for sphinx to not raise errors +luqum==0.13.0 mapbox==0.18.1 markdown==3.4.1 -networkx==1.5 +networkx==3.2.1 numpy==1.24.3 +oauthlib +openpyxl==3.1.0 # for reading .xlsx files (but not .xls) Pillow==9.5.0 pip-tools==7.1.0 psycopg2-binary==2.9.6 PyJWT==2.6.0 pyparsing==2.4.7 -pysolr==3.10.0b1 pysndfile==1.4.4 +pysolr==3.10.0b1 +python-louvain==0.16 # community detection in clustering pytz==2023.3 PyYAML==6.0.1 redis==3.2.0 +scikit-learn==1.4.1.post1 # clustering +scipy==1.12.0 # clustering sentry-sdk[django]~=1.31 Sphinx==1.6.3 stripe==2.28.1 xlrd==2.0.1 # for reading .xls files (but not .xlsx) -openpyxl==3.1.0 # for reading .xlsx files (but not .xls) zenpy==1.1.3 
-jinja2==3.0.3 # This version needed for sphinx to not raise errors diff --git a/requirements.txt b/requirements.txt index 37b90ce6f..cfad9dab8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile +# pip-compile requirements.in # akismet==1.0.1 # via -r requirements.in @@ -10,8 +10,6 @@ alabaster==0.7.13 # via sphinx amqp==5.2.0 # via kombu -appnope==0.1.3 - # via ipython asgiref==3.7.2 # via django asttokens==2.4.1 @@ -176,10 +174,14 @@ jmespath==1.0.1 # via # boto3 # botocore +joblib==1.3.2 + # via scikit-learn jwcrypto==1.5.1 # via django-oauth-toolkit kombu==5.3.4 # via celery +luqum==0.13.0 + # via -r requirements.in mapbox==0.18.1 # via -r requirements.in markdown==3.4.1 @@ -190,12 +192,17 @@ matplotlib-inline==0.1.6 # via ipython msgpack==1.0.7 # via cachecontrol -networkx==1.5 - # via -r requirements.in +networkx==3.2.1 + # via + # -r requirements.in + # python-louvain numpy==1.24.3 # via # -r requirements.in # pysndfile + # python-louvain + # scikit-learn + # scipy oauthlib==3.2.2 # via # -r requirements.in @@ -220,6 +227,8 @@ pillow==9.5.0 # via -r requirements.in pip-tools==7.1.0 # via -r requirements.in +ply==3.11 + # via luqum polyline==2.0.1 # via mapbox prompt-toolkit==3.0.43 @@ -259,6 +268,8 @@ python-dateutil==2.8.2 # freezegun # mapbox # zenpy +python-louvain==0.16 + # via -r requirements.in pytz==2023.3 # via # -r requirements.in @@ -284,6 +295,12 @@ requests==2.31.0 # zenpy s3transfer==0.6.2 # via boto3 +scikit-learn==1.4.1.post1 + # via -r requirements.in +scipy==1.12.0 + # via + # -r requirements.in + # scikit-learn sentry-sdk[django]==1.39.1 # via -r requirements.in sgmllib3k==1.0.0 @@ -314,6 +331,8 @@ stack-data==0.6.3 # via ipython stripe==2.28.1 # via -r requirements.in +threadpoolctl==3.3.0 + # via scikit-learn toml==0.10.2 # via autopep8 tomli==2.0.1 diff --git a/requirements_clustering.txt 
b/requirements_clustering.txt deleted file mode 100644 index 31dc09c66..000000000 --- a/requirements_clustering.txt +++ /dev/null @@ -1,5 +0,0 @@ -networkx==2.2 -python-louvain==0.13 -scikit-learn==0.19.1 -scipy==0.18.1 -six diff --git a/search/templatetags/search.py b/search/templatetags/search.py index b847f8a7a..541851ea1 100644 --- a/search/templatetags/search.py +++ b/search/templatetags/search.py @@ -21,18 +21,30 @@ from django import template from django.conf import settings -from urllib.parse import quote_plus from sounds.models import License +from utils.search import search_query_processor_options from utils.tags import annotate_tags register = template.Library() @register.inclusion_tag('search/facet.html', takes_context=True) -def display_facet(context, flt, facet, facet_type, title=""): - facet = annotate_tags([dict(name=f[0], count=f[1]) for f in facet if f[0] != "0"], - sort="name", small_size=0.7, large_size=2.0) +def display_facet(context, facet_name): + sqp = context['sqp'] + facets = context['facets'] + facet_type = {'tag': 'cloud', 'username': 'cloud'}.get(facet_name, 'list') + facet_title = { + 'tag': 'Related tags', + 'username': 'Related users', + 'grouping_pack': 'Packs', + 'license': 'Licenses' + }.get(facet_name, facet_name.capitalize()) + if facet_name in facets: + facet = annotate_tags([dict(value=f[0], count=f[1]) for f in facets[facet_name] if f[0] != "0"], + sort="value", small_size=0.7, large_size=2.0) + else: + facet = [] # If the filter is grouping_pack and there are elements which do not contain the character "_" means that # these sounds do not belong to any pack (as grouping pack values should by "packId_packName" if there is a pack @@ -41,81 +53,80 @@ def display_facet(context, flt, facet, facet_type, title=""): # the element name is a single number that does not contain the character "_" # We add the extra Free Cultural Works license facet - if flt == 'license': + if facet_name == 'license': fcw_count = 0 only_fcw_in_facet = 
True for element in facet: - if element['name'].lower() == 'attribution' or element['name'].lower() == 'creative commons 0': + if element['value'].lower() == 'attribution' or element['value'].lower() == 'creative commons 0': fcw_count += element['count'] else: only_fcw_in_facet = False if fcw_count and not only_fcw_in_facet: facet.append({ - 'name': settings.FCW_FILTER_VALUE, + 'value': settings.FCW_FILTER_VALUE, 'count': fcw_count, 'size': 1.0, }) - - filtered_facet = [] - filter_query = quote_plus(context['filter_query']) + + # Remove "no pack" elements from pack facet (no pack elements are those in which "grouping pack" only has the sound id and not any pack id/name) + if facet_name == "grouping_pack": + facet = [element for element in facet if '_' in element['value']] + for element in facet: - if flt == "grouping_pack": - if element['name'].count("_") > 0: - # We also modify the display name to remove the id - element['display_name'] = element['name'][element['name'].find("_")+1:] - else: - # If facet element belongs to "grouping pack" filter but does not have the "_" character in it, it - # means this corresponds to the "no pack" grouping which we don't want to show as a facet element. - continue - elif element['name'] == settings.FCW_FILTER_VALUE: - element['display_name'] = "Approved for Free Cultural Works" - elif flt == 'license': + # Set display values (the values as they'll be shown in the UI) + if facet_name == "grouping_pack": + # Modify the display name to remove the pack id + element['display_value'] = element['value'][element['value'].find("_")+1:] + elif element['value'] == settings.FCW_FILTER_VALUE: + element['display_value'] = "Approved for Free Cultural Works" + elif facet_name == 'license': # License field in solr is case insensitive and will return facet names in lowercase. # We need to properly capitalize them to use official CC license names. 
- element['display_name'] = element['name'].title().replace('Noncommercial', 'NonCommercial') + element['display_value'] = element['value'].title().replace('Noncommercial', 'NonCommercial') else: - element['display_name'] = element['name'] + # In all other cases, use the value as is for display purposes + element['display_value'] = element['value'] - if element['name'] == settings.FCW_FILTER_VALUE: - # If adding the FCW filter (which has more complex logic) don't wrap the filter in " as it breaks the syntax parsing - element['params'] = f"{filter_query} {flt}:{quote_plus(element['name'])}" + # Set the URL to add facet values as filters + if element["value"].startswith('('): + # If the filter value is a "complex" operation , don't wrap it in quotes + filter_str = f'{facet_name}:{element["value"]}' + elif element["value"].isdigit(): + # If the filter value is a digit, also don't wrap it in quotes + filter_str = f'{facet_name}:{element["value"]}' else: - element['params'] = f"{filter_query} {flt}:\"{quote_plus(element['name'])}\"" - - element['id'] = f"{flt}--{quote_plus(element['name'])}" - element['add_filter_url'] = '.?advanced={}&g={}&only_p={}&q={}&f={}&s={}&w={}'.format( - context['advanced'], - context['group_by_pack_in_request'], - context['only_sounds_with_pack'], - context['search_query'], - element['params'], - context['sort'] if context['sort'] is not None else '', - context['weights'] or '' - ) - if context['similar_to'] is not None: - element['add_filter_url'] += '&similar_to={}'.format(context['similar_to']) - if context['use_map_mode'] == True: - element['add_filter_url'] += '&mm=1' - filtered_facet.append(element) - - # We sort the facets by count. 
Also, we apply an opacity filter on "could" type pacets - if filtered_facet: - filtered_facet = sorted(filtered_facet, key=lambda x: x['count'], reverse=True) - max_count = max([element['count'] for element in filtered_facet]) - for element in filtered_facet: + # Otherwise wrap in quotes + filter_str = f'{facet_name}:"{element["value"]}"' + element['add_filter_url'] = sqp.get_url(add_filters=[filter_str]) + + # We sort the facets by count. Also, we apply an opacity filter on "cloud" type facets + if facet: + facet = sorted(facet, key=lambda x: x['count'], reverse=True) + max_count = max([element['count'] for element in facet]) + for element in facet: element['weight'] = element['count'] / max_count # We also add icons to license facets - if flt == 'license': - for element in filtered_facet: - if element['name'] != settings.FCW_FILTER_VALUE: - element['icon'] = License.bw_cc_icon_name_from_license_name(element['display_name']) + if facet_name == 'license': + for element in facet: + if element['value'] != settings.FCW_FILTER_VALUE: + element['icon'] = License.bw_cc_icon_name_from_license_name(element['display_value']) else: element['icon'] = 'fcw' - context.update({ - "facet": filtered_facet, - "type": facet_type, - "filter": flt, - "title": title - }) - return context + + return {'type': facet_type, 'title': facet_title, 'facet': facet} + + +@register.inclusion_tag('search/search_option.html', takes_context=True) +def display_search_option(context, option_name, widget=None): + sqp = context['sqp'] + option = sqp.options[option_name] + if widget is None: + # If a widget is not provided as a parameter, use a sensible default + widget = { + search_query_processor_options.SearchOptionBool: 'checkbox', + search_query_processor_options.SearchOptionStr: 'text', + search_query_processor_options.SearchOptionChoice: 'select', + }.get(type(option), 'text') + label = option.label if option.label else option_name.capitalize().replace('_', ' ') + return {'option': option, 
'option_name': option_name, 'label': label, 'widget': widget} \ No newline at end of file diff --git a/search/tests.py b/search/tests.py index 388a55337..227b07523 100644 --- a/search/tests.py +++ b/search/tests.py @@ -18,14 +18,19 @@ # See AUTHORS file. # +from django.contrib.auth.models import User from django.core.cache import cache -from django.test import TestCase +from django.conf import settings +from django.test import TestCase, RequestFactory from django.test.utils import skipIf, override_settings from django.urls import reverse +from utils.search import search_query_processor from sounds.models import Sound from utils.search import SearchResults, SearchResultsPaginator from utils.test_helpers import create_user_and_sounds +from utils.url import ComparableUrl from unittest import mock +from django.contrib.auth.models import AnonymousUser def create_fake_search_engine_results(): @@ -90,7 +95,7 @@ def return_successful_clustering_results(sound_id_1, sound_id_2, sound_id_3, sou 'multigraph': False }, 'finished': True, - 'result': [ + 'clusters': [ [ sound_id_1, sound_id_2 @@ -100,12 +105,12 @@ def return_successful_clustering_results(sound_id_1, sound_id_2, sound_id_3, sou sound_id_4 ], ], - 'error':False + 'cluster_ids': [23, 24], + 'cluster_names': ['tag1 tag2 tag3', 'tag1 tag2 tag3'], + 'example_sounds_data': [['a'], ['b', 'c']], } -pending_clustering_results = {'finished': False, 'error': False} - -failed_clustering_results = {'finished': False, 'error': True} +failed_clustering_results = None def create_fake_perform_search_engine_query_response(num_results=15): @@ -163,12 +168,12 @@ def test_search_page_num_queries(self, perform_search_engine_query): # Now check number of queries when displaying results as packs (i.e., searching for packs) cache.clear() with self.assertNumQueries(6): - self.client.get(reverse('sounds-search') + '?only_p=1') + self.client.get(reverse('sounds-search') + '?dp=1') # Also check packs when displaying in grid mode cache.clear() 
with self.assertNumQueries(6): - self.client.get(reverse('sounds-search') + '?only_p=1&cm=1') + self.client.get(reverse('sounds-search') + '?dp=1&cm=1') with override_settings(USE_SEARCH_ENGINE_SIMILARITY=False): # When not using search engine similarity, there'll be one less query performed as similarity state is retrieved directly from sound object @@ -176,31 +181,12 @@ def test_search_page_num_queries(self, perform_search_engine_query): # Now check number of queries when displaying results as packs (i.e., searching for packs) cache.clear() with self.assertNumQueries(5): - self.client.get(reverse('sounds-search') + '?only_p=1') + self.client.get(reverse('sounds-search') + '?dp=1') # Also check packs when displaying in grid mode cache.clear() with self.assertNumQueries(5): - self.client.get(reverse('sounds-search') + '?only_p=1&cm=1') - - @mock.patch('search.views.perform_search_engine_query') - def test_search_page_with_filters(self, perform_search_engine_query): - perform_search_engine_query.return_value = self.perform_search_engine_query_response - - # 200 response on sound search page access - resp = self.client.get(reverse('sounds-search'), {"f": 'grouping_pack:"Clutter" tag:"acoustic-guitar"'}) - self.assertEqual(resp.status_code, 200) - - # In this case we check if a non valid filter is applied it should be ignored. 
- # grouping_pack it shouldn't be in filter_query_split, since is a not valid filter - self.assertEqual(resp.context['filter_query_split'][0]['name'], 'tag:"acoustic-guitar"') - self.assertEqual(len(resp.context['filter_query_split']), 1) - - resp = self.client.get(reverse('sounds-search'), {"f": 'grouping_pack:"19894_Clutter" tag:"acoustic-guitar"'}) - # Now we check if two valid filters are applied, then they are present in filter_query_split - # Which means they are going to be displayed - self.assertEqual(resp.status_code, 200) - self.assertEqual(len(resp.context['filter_query_split']), 2) + self.client.get(reverse('sounds-search') + '?dp=1&cm=1') class SearchResultClustering(TestCase): @@ -217,44 +203,34 @@ def setUp(self): self.sound_id_preview_urls = sound_id_preview_urls self.successful_clustering_results = return_successful_clustering_results(*sound_ids) - self.pending_clustering_results = pending_clustering_results + self.num_sounds_clustering_results = [2, 2] self.failed_clustering_results = failed_clustering_results - @skipIf(True, "Clustering not yet enabled in BW") - @mock.patch('search.views.cluster_sound_results') - def test_successful_search_result_clustering_view(self, cluster_sound_results): - cluster_sound_results.return_value = self.successful_clustering_results - resp = self.client.get(reverse('clustering-facet')) + @mock.patch('search.views.get_num_sounds_per_cluster') + @mock.patch('search.views.get_clusters_for_query') + def test_successful_search_result_clustering_view(self, get_clusters_for_query, get_num_sounds_per_cluster): + get_clusters_for_query.return_value = self.successful_clustering_results + get_num_sounds_per_cluster.return_value = self.num_sounds_clustering_results + resp = self.client.get(reverse('clusters-section')) # 200 status code & use of clustering facets template self.assertEqual(resp.status_code, 200) - self.assertTemplateUsed(resp, 'search/clustering_facet.html') + self.assertTemplateUsed(resp, 
'search/clustering_results.html') # check cluster's content - # 2 sounds per clusters - # 3 most used tags in the cluster 'tag1 tag2 tag3' - # context variable cluster_id_num_results_tags_sound_examples: [(, , , ), ...] - self.assertEqual(resp.context['cluster_id_num_results_tags_sound_examples'], [ - (0, 2, 'tag1 tag2 tag3', self.sound_id_preview_urls[:2]), - (1, 2, 'tag1 tag2 tag3', self.sound_id_preview_urls[2:]) + self.assertEqual(resp.context['clusters_data'], [ + (23, 2, 'tag1 tag2 tag3', ['a']), + (24, 2, 'tag1 tag2 tag3', ['b', 'c']) ]) - @skipIf(True, "Clustering not yet enabled in BW") - @mock.patch('search.views.cluster_sound_results') - def test_pending_search_result_clustering_view(self, cluster_sound_results): - cluster_sound_results.return_value = self.pending_clustering_results - resp = self.client.get(reverse('clustering-facet')) - - # 200 status code & JSON response content - self.assertEqual(resp.status_code, 200) - self.assertJSONEqual(resp.content, {'status': 'pending'}) - - @skipIf(True, "Clustering not yet enabled in BW") - @mock.patch('search.views.cluster_sound_results') - def test_failed_search_result_clustering_view(self, cluster_sound_results): - cluster_sound_results.return_value = self.failed_clustering_results - resp = self.client.get(reverse('clustering-facet')) + @mock.patch('search.views.get_num_sounds_per_cluster') + @mock.patch('search.views.get_clusters_for_query') + def test_failed_search_result_clustering_view(self, get_clusters_for_query, get_num_sounds_per_cluster): + get_clusters_for_query.return_value = self.failed_clustering_results + get_num_sounds_per_cluster.return_value = self.num_sounds_clustering_results + resp = self.client.get(reverse('clusters-section')) # 200 status code & JSON response content self.assertEqual(resp.status_code, 200) - self.assertJSONEqual(resp.content, {'status': 'failed'}) + self.assertTemplateUsed(resp, 'search/clustering_results.html') + self.assertEqual(resp.context['clusters_data'], None) 
diff --git a/search/views.py b/search/views.py index 2ea1cfea6..e49896c21 100644 --- a/search/views.py +++ b/search/views.py @@ -21,10 +21,7 @@ import datetime import json import logging -import re -import uuid import sentry_sdk -from collections import defaultdict, Counter from django.core.cache import cache from django.conf import settings @@ -34,146 +31,70 @@ import forum import sounds -import geotags -from clustering.clustering_settings import DEFAULT_FEATURES, NUM_SOUND_EXAMPLES_PER_CLUSTER_FACET, \ - NUM_TAGS_SHOWN_PER_CLUSTER_FACET -from clustering.interface import cluster_sound_results, get_sound_ids_from_search_engine_query from forum.models import Post from utils.encryption import create_hash +from utils.clustering_utilities import get_clusters_for_query, get_num_sounds_per_cluster, \ + cluster_data_is_fully_available, get_clustering_data_for_graph_display from utils.logging_filters import get_client_ip from utils.ratelimit import key_for_ratelimiting, rate_per_ip -from utils.search.search_sounds import perform_search_engine_query, search_prepare_parameters, \ - split_filter_query, should_use_compact_mode, contains_active_advanced_search_filters -from utils.search import get_search_engine, SearchEngineException, SearchResultsPaginator +from utils.search import get_search_engine, SearchEngineException, SearchResultsPaginator, search_query_processor +from utils.search.search_sounds import perform_search_engine_query, allow_beta_search_features + search_logger = logging.getLogger("search") -def search_view_helper(request, tags_mode=False): - query_params, advanced_search_params_dict, extra_vars = search_prepare_parameters(request) +def search_view_helper(request): + # Process request data with the SearchQueryProcessor + sqp = search_query_processor.SearchQueryProcessor(request) - # Check if there was a filter parsing error - if extra_vars['parsing_error']: - search_logger.info(f"Query filter parsing error. 
filter: {request.GET.get('f', '')}") - extra_vars.update({'error_text': 'There was an error while searching, is your query correct?'}) - return extra_vars + # Check if there was a filter parsing error and return error if so + if sqp.errors: + search_logger.info(f"Errors in SearchQueryProcessor: {sqp.errors}") + return {'error_text': 'There was an error while searching, is your query correct?'} - # Get the url query params for later sending it to the clustering engine (this is only used with the clustering feature) - url_query_params_string = request.META['QUERY_STRING'] + # Update compact mode preference if user has explicitly specified a different value than the preference + if request.user.is_authenticated: + option = sqp.options['grid_mode'] + if option.set_in_request: + request_preference = option.value + user_preference = request.user.profile.use_compact_mode + if request_preference != user_preference: + request.user.profile.use_compact_mode = request_preference + request.user.profile.save() - # Get a "split" version of the filter which is used to display filters in UI and for some other checks (see below) - filter_query_split = split_filter_query(query_params['query_filter'], extra_vars['parsed_filters'], extra_vars['cluster_id']) - - # Get tags taht are being used in filters (this is used later to remove them from the facet and also for tags mode) - tags_in_filter = [] - for filter_data in filter_query_split: - if filter_data['name'].startswith('tag:'): - tag = filter_data['name'].replace('tag:', '') - if tag.startswith('"'): - # If tag name has quotes, remove them - tag = tag[1:-1] - tags_in_filter.append(tag) - - # Process tags mode stuff - initial_tagcloud = None - if tags_mode: - # In tags mode, we increase the size of the tags facet so we include more related tags - query_params['facets'][settings.SEARCH_SOUNDS_FIELD_TAGS]['limit'] = 50 - - # If no tags are in filter, we are "starting" tag-based browsing so display the initial tagcloud - if not 
tags_in_filter: - initial_tagcloud = cache.get('initial_tagcloud') - if initial_tagcloud is None: - # If tagcloud is not cached, make a query to retrieve it and save it to cache - results, _ = perform_search_engine_query(dict( - textual_query='', - query_filter= "*:*", - num_sounds=1, - facets={settings.SEARCH_SOUNDS_FIELD_TAGS: {'limit': 100}}, - group_by_pack=True, - group_counts_as_one_in_facets=False, - )) - initial_tagcloud = [dict(name=f[0], count=f[1], browse_url=reverse('tags', args=[f[0]])) for f in results.facets["tag"]] - cache.set('initial_tagcloud', initial_tagcloud, 60 * 60 * 12) # cache for 12 hours - return { - 'tags_mode': True, - 'tags_in_filter': tags_in_filter, - 'initial_tagcloud': initial_tagcloud, - } - - # In the tvars section we pass the original group_by_pack value to avoid it being set to false if there is a pack filter (see search_prepare_parameters) - # This is so that we keep track of the original setting of group_by_pack before the filter was applied, and so that if the pack filter is removed, we can - # automatically revert to the previous group_by_pack setting. Also, we compute "disable_group_by_pack_option" so that when we have changed the real - # group_by_pack because there is a pack filter, we can grey out the option in the search form. 
Similar thing we do for only_sounds_with_pack as also - # it does not make sense when filtering by pack - group_by_pack_in_request = request.GET.get("g", "1") == "1" - only_sounds_with_pack_in_request = request.GET.get("only_p", "0") == "1" - disable_group_by_pack_option = 'pack:' in query_params['query_filter'] or only_sounds_with_pack_in_request - disable_only_sounds_by_pack_option= 'pack:' in query_params['query_filter'] - only_sounds_with_pack = "1" if query_params['only_sounds_with_pack'] else "" - if only_sounds_with_pack: - # If displaying search results as packs, include 3 sounds per pack group in the results so we can display these sounds as selected sounds in the - # display_pack templatetag - query_params['num_sounds_per_pack_group'] = 3 - - # Parpare variables for map view - disable_display_results_in_grid_option = False - map_bytearray_url = '' - use_map_mode = settings.SEARCH_ALLOW_DISPLAY_RESULTS_IN_MAP and request.GET.get("mm", "0") == "1" - map_mode_query_results_cache_key = None + # Prepare variables for map view (prepare some URLs for loading sounds and providing links to map) open_in_map_url = None - if use_map_mode: - # Prepare some URLs for loading sounds and providing links to map + map_mode_query_results_cache_key = None + map_bytearray_url = '' + if sqp.map_mode: current_query_params = request.get_full_path().split("?")[-1] open_in_map_url = reverse('geotags-query') + f'?{current_query_params}' map_mode_query_results_cache_key = f'map-query-results-{create_hash(current_query_params, 10)}' map_bytearray_url = reverse('geotags-for-query-barray') + f'?key={map_mode_query_results_cache_key}' - # Update some query parameters and options to adapt to map mode - disable_group_by_pack_option = True - disable_only_sounds_by_pack_option = True - disable_display_results_in_grid_option = True - geotags.views.update_query_params_for_map_query(query_params, preserve_facets=True) - - tvars = { - 'error_text': None, - 'filter_query': 
query_params['query_filter'], - 'filter_query_split': filter_query_split, - 'search_query': query_params['textual_query'], - 'similar_to': query_params['similar_to'], - 'group_by_pack_in_request': "1" if group_by_pack_in_request else "", - 'disable_group_by_pack_option': disable_group_by_pack_option, - 'only_sounds_with_pack': only_sounds_with_pack, - 'only_sounds_with_pack_in_request': "1" if only_sounds_with_pack_in_request else "", - 'disable_only_sounds_by_pack_option': disable_only_sounds_by_pack_option, - 'use_compact_mode': should_use_compact_mode(request), - 'disable_display_results_in_grid_option': disable_display_results_in_grid_option, - 'advanced': extra_vars['advanced'], - 'sort': query_params['sort'], - 'sort_options': [(option, option) for option in settings.SEARCH_SOUNDS_SORT_OPTIONS_WEB], - 'filter_query_link_more_when_grouping_packs': extra_vars['filter_query_link_more_when_grouping_packs'], - 'current_page': query_params['current_page'], - 'url_query_params_string': url_query_params_string, - 'cluster_id': extra_vars['cluster_id'], - 'clustering_on': settings.ENABLE_SEARCH_RESULTS_CLUSTERING, - 'weights': extra_vars['raw_weights_parameter'], - 'initial_tagcloud': initial_tagcloud, - 'tags_mode': tags_mode, - 'tags_in_filter': tags_in_filter, - 'has_advanced_search_settings_set': contains_active_advanced_search_filters(request, query_params, extra_vars), - 'advanced_search_closed_on_load': settings.ADVANCED_SEARCH_MENU_ALWAYS_CLOSED_ON_PAGE_LOAD, - 'allow_map_mode': settings.SEARCH_ALLOW_DISPLAY_RESULTS_IN_MAP, - 'use_map_mode': use_map_mode, - 'map_bytearray_url': map_bytearray_url, - 'open_in_map_url': open_in_map_url, - 'max_search_results_map_mode': settings.MAX_SEARCH_RESULTS_IN_MAP_DISPLAY - } - tvars.update(advanced_search_params_dict) + # Prepare variables for clustering + get_clusters_url = None + clusters_data = None + if sqp.compute_clusters_active() and allow_beta_search_features(request): + if cluster_data_is_fully_available(sqp): + # 
If clustering data for the current query is fully available, we can get it directly + clusters_data = _get_clusters_data_helper(sqp) + else: + # Otherwise pass the url where the cluster data will be fetched asynchronously from + get_clusters_url = reverse('clusters-section') + f'?{request.get_full_path().split("?")[-1]}' + + # If in tags mode and no tags in filter, return before making the query as we'll make + # the initial tagcloud in tags.views.tags view and no need to make any further query here + if sqp.tags_mode_active() and not sqp.get_tags_in_filters(): + return {'sqp': sqp} # sqp will be needed in tags.views.tags view - try: + # Run the query and post-process the results + try: + query_params = sqp.as_query_params() results, paginator = perform_search_engine_query(query_params) - if not use_map_mode: - if not only_sounds_with_pack: + if not sqp.map_mode_active(): + if not sqp.display_as_packs_active(): resultids = [d.get("id") for d in results.docs] resultsounds = sounds.models.Sound.objects.bulk_query_id(resultids) allsounds = {} @@ -186,6 +107,11 @@ def search_view_helper(request, tags_mode=False): docs = [doc for doc in results.docs if doc["id"] in allsounds] for d in docs: d["sound"] = allsounds[d["id"]] + + # Add URLs to "more from this pack" in the result object so these are easily accessible in the template + for d in docs: + if d.get("n_more_in_group") and d["sound"].pack_id is not None: + d["more_from_this_pack_url"] = sqp.get_url(add_filters=[f'grouping_pack:"{d["sound"].pack_id}_{d["sound"].pack_name}"']) else: resultspackids = [] sound_ids_for_pack_id = {} @@ -204,6 +130,7 @@ def search_view_helper(request, tags_mode=False): docs = [d for d in results.docs if int(d.get("group_name").split('_')[0]) in allpacks] for d in docs: d["pack"] = allpacks[int(d.get("group_name").split('_')[0])] + d["more_from_this_pack_url"] = sqp.get_url(add_filters=[f'grouping_pack:"{d["pack"].id}_{d["pack"].name}"']) else: # In map we configure the search query to 
already return geotags data. Here we collect all this data # and save it to the cache so we can collect it in the 'geotags_for_query_barray' view which prepares @@ -221,173 +148,92 @@ def search_view_helper(request, tags_mode=False): 'username': request.user.username, 'page': query_params['current_page'], 'sort': query_params['sort'], - 'group_by_pack': query_params['group_by_pack'], - 'advanced': json.dumps(advanced_search_params_dict) if extra_vars['advanced'] == "1" else "", + 'url': sqp.get_url(), + 'tags_mode': sqp.tags_mode_active(), 'query_time': results.q_time })) # For the facets of fields that could have mulitple values (i.e. currently, only "tags" facet), make - # sure to remove the filters for the corresponding facet field thar are already active (so we remove + # sure to remove the filters for the corresponding facet field that are already active (so we remove # redundant information) - if tags_in_filter: - if 'tag' in results.facets: - results.facets['tag'] = [(tag, count) for tag, count in results.facets['tag'] if tag not in tags_in_filter] - - tvars.update({ + if 'tag' in results.facets: + results.facets['tag'] = [(tag, count) for tag, count in results.facets['tag'] if tag not in sqp.get_tags_in_filters()] + + # Compile template variables + return { + 'sqp': sqp, + 'error_text': None, + 'current_page': query_params['current_page'], + 'has_advanced_search_settings_set': sqp.contains_active_advanced_search_options(), + 'advanced_search_closed_on_load': settings.ADVANCED_SEARCH_MENU_ALWAYS_CLOSED_ON_PAGE_LOAD, + 'map_bytearray_url': map_bytearray_url, + 'open_in_map_url': open_in_map_url, + 'max_search_results_map_mode': settings.MAX_SEARCH_RESULTS_IN_MAP_DISPLAY, + 'get_clusters_url': get_clusters_url, + 'clusters_data': clusters_data, 'paginator': paginator, 'page': paginator.page(query_params['current_page']), 'docs': docs, 'facets': results.facets, 'non_grouped_number_of_results': results.non_grouped_number_of_results, - }) + 
'show_beta_search_options': allow_beta_search_features(request), + } except SearchEngineException as e: search_logger.info(f'Search error: query: {str(query_params)} error {e}') sentry_sdk.capture_exception(e) # Manually capture exception so it has mroe info and Sentry can organize it properly - tvars.update({'error_text': 'There was an error while searching, is your query correct?'}) + return {'error_text': 'There was an error while searching, is your query correct?'} except Exception as e: search_logger.info(f'Could probably not connect to Solr - {e}') sentry_sdk.capture_exception(e) # Manually capture exception so it has mroe info and Sentry can organize it properly - tvars.update({'error_text': 'The search server could not be reached, please try again later.'}) - - return tvars + return {'error_text': 'The search server could not be reached, please try again later.'} @ratelimit(key=key_for_ratelimiting, rate=rate_per_ip, group=settings.RATELIMIT_SEARCH_GROUP, block=True) def search(request): - tvars = search_view_helper(request, tags_mode=False) - template = 'search/search.html' if request.GET.get("ajax", "") != "1" else 'search/search_ajax.html' - return render(request, template, tvars) + tvars = search_view_helper(request) + return render(request, 'search/search.html', tvars) -def clustering_facet(request): - """Triggers the computation of the clustering, returns the state of processing or the clustering facet. - """ - # pass the url query params for later sending it to the clustering engine - url_query_params_string = request.META['QUERY_STRING'] - # remove existing cluster facet filter from the params since the returned cluster facets will include - # their correspondinng cluster_id query parameter (done in the template) - url_query_params_string = re.sub(r"(&cluster_id=[0-9]*)", "", url_query_params_string) - - result = cluster_sound_results(request, features=DEFAULT_FEATURES) - - # check if computation is finished. If not, send computation state. 
- if result['finished']: - if result['result'] is not None: - results = result['result'] - num_clusters = len(results) - else: - return JsonResponse({'status': 'failed'}, safe=False) - elif result['error']: - return JsonResponse({'status': 'failed'}, safe=False) - else: - return JsonResponse({'status': 'pending'}, safe=False) - - # check if facet filters are present in the search query - # if yes, filter sounds from clusters - query_params, _, extra_vars = search_prepare_parameters(request) - if extra_vars['has_facet_filter']: - sound_ids_filtered = get_sound_ids_from_search_engine_query(query_params) - results = [[sound_id for sound_id in cluster if int(sound_id) in sound_ids_filtered] - for cluster in results] - - num_sounds_per_cluster = [len(cluster) for cluster in results] - partition = {sound_id: cluster_id for cluster_id, cluster in enumerate(results) for sound_id in cluster} - - # label clusters using most occuring tags - sound_instances = sounds.models.Sound.objects.bulk_query_id(list(map(int, list(partition.keys())))) - sound_tags = {sound.id: sound.tag_array for sound in sound_instances} - cluster_tags = defaultdict(list) - - # extract tags for each clusters and do not use query terms for labeling clusters - query_terms = {t.lower() for t in request.GET.get('q', '').split(' ')} - for sound_id, tags in sound_tags.items(): - cluster_tags[partition[str(sound_id)]] += [t.lower() for t in tags if t.lower() not in query_terms] - - # count 3 most occuring tags - # we iterate with range(len(results)) to ensure that we get the right order when iterating through the dict - cluster_most_occuring_tags = [ - [tag for tag, _ in Counter(cluster_tags[cluster_id]).most_common(NUM_TAGS_SHOWN_PER_CLUSTER_FACET)] - if cluster_tags[cluster_id] else [] - for cluster_id in range(len(results)) - ] - most_occuring_tags_formatted = [ - ' '.join(sorted(most_occuring_tags)) - for most_occuring_tags in cluster_most_occuring_tags - ] - - # extract sound examples for each cluster - 
sound_ids_examples_per_cluster = [ - list(map(int, cluster_sound_ids[:NUM_SOUND_EXAMPLES_PER_CLUSTER_FACET])) - for cluster_sound_ids in results - ] - sound_ids_examples = [item for sublist in sound_ids_examples_per_cluster for item in sublist] - sound_urls = { - sound.id: sound.locations()['preview']['LQ']['ogg']['url'] - for sound in sound_instances - if sound.id in sound_ids_examples - } - sound_url_examples_per_cluster = [ - [(sound_id, sound_urls[sound_id]) for sound_id in cluster_sound_ids] - for cluster_sound_ids in sound_ids_examples_per_cluster - ] - - return render(request, 'search/clustering_facet.html', { - 'results': partition, - 'url_query_params_string': url_query_params_string, - 'cluster_id_num_results_tags_sound_examples': list(zip( - list(range(num_clusters)), - num_sounds_per_cluster, - most_occuring_tags_formatted, - sound_url_examples_per_cluster - )), - }) +def _get_clusters_data_helper(sqp): + # Get main cluster data + results = get_clusters_for_query(sqp) + if results is None: + return None + + # Get the number of sounds per cluster + # This number depends on the facet filters which are applied AFTER the main clustering. + # See get_num_sounds_per_cluster for more details. + num_sounds_per_cluster = get_num_sounds_per_cluster(sqp, results['clusters']) + + # Return a list with information for each cluster + # Note that this information DOES NOT include the actual sound IDs per cluster.
+ return list(zip( + results.get('cluster_ids', []), # cluster ID + num_sounds_per_cluster, # Num sounds + results.get('cluster_names', []), # Cluster name + results.get('example_sounds_data', []) # Example sounds + )) + + +def clusters_section(request): + sqp = search_query_processor.SearchQueryProcessor(request) + clusters_data = _get_clusters_data_helper(sqp) + if clusters_data is None: + return render(request, 'search/clustering_results.html', {'clusters_data': None}) + return render(request, 'search/clustering_results.html', {'sqp': sqp, 'clusters_data': clusters_data}) def clustered_graph(request): """Returns the clustered sound graph representation of the search results. """ - result = cluster_sound_results(request, features=DEFAULT_FEATURES) - graph = result['graph'] - - # check if facet filters are present in the search query - # if yes, filter nodes and links from the graph - query_params, _, extra_vars = search_prepare_parameters(request) - if extra_vars['has_facet_filter']: - nodes = graph['nodes'] - links = graph['links'] - graph['nodes'] = [] - graph['links'] = [] - sound_ids_filtered = get_sound_ids_from_search_engine_query(query_params) - for node in nodes: - if int(node['id']) in sound_ids_filtered: - graph['nodes'].append(node) - for link in links: - if int(link['source']) in sound_ids_filtered and int(link['target']) in sound_ids_filtered: - graph['links'].append(link) - - results = sounds.models.Sound.objects.bulk_query_id([int(node['id']) for node in graph['nodes']]) - - sound_metadata = {} - for sound in results: - sound_locations = sound.locations() - sound_metadata.update( - {sound.id: ( - sound_locations['preview']['LQ']['ogg']['url'], - sound.original_filename, - ' '.join(sound.tag_array), - reverse("sound", args=(sound.username, sound.id)), - sound_locations['display']['wave']['M']['url'], - )} - ) - - for node in graph['nodes']: - node['url'] = sound_metadata[int(node['id'])][0] - node['name'] = sound_metadata[int(node['id'])][1] - 
node['tags'] = sound_metadata[int(node['id'])][2] - node['sound_page_url'] = sound_metadata[int(node['id'])][3] - node['image_url'] = sound_metadata[int(node['id'])][4] - + # TODO: this view is currently not used in the new UI, but we could add a modal in the + # clustering section to show results in a graph. + sqp = search_query_processor.SearchQueryProcessor(request) + results = get_clusters_for_query(sqp) + if results is None or 'graph' not in results: # no clustering data available, or a result without a graph (e.g. {'clusters': None}) + return JsonResponse(json.dumps({'error': True}), safe=False) + graph = get_clustering_data_for_graph_display(sqp, results['graph']) return JsonResponse(json.dumps(graph), safe=False) diff --git a/sounds/management/commands/create_remix_groups.py b/sounds/management/commands/create_remix_groups.py index 69de15789..503c5b517 100644 --- a/sounds/management/commands/create_remix_groups.py +++ b/sounds/management/commands/create_remix_groups.py @@ -24,7 +24,7 @@ from django.core.management.base import BaseCommand from django.db import connection -from networkx import nx +import networkx as nx from sounds.models import Sound, RemixGroup @@ -63,14 +63,15 @@ def handle(self, *args, **options): dg = _create_nodes(dg) # 4) Find weakly connected components (single direction) - subgraphs = nx.weakly_connected_component_subgraphs(dg) + subgraphs = nx.weakly_connected_components(dg) # 5) delete all remixgroup objects to recalculate RemixGroup.objects.all().delete() # 6) Loop through all connected graphs in the dataset and create the groups n_groups_created = 0 - for sg in subgraphs: + for sg_nodes in subgraphs: + sg = dg.subgraph(sg_nodes).copy() _create_and_save_remixgroup(sg, RemixGroup()) n_groups_created += 1 @@ -80,19 +81,19 @@ def handle(self, *args, **options): def _create_nodes(dg): for node in dg.nodes(): sound = Sound.objects.get(id=node) - dg.add_node(node, {'date': sound.created, - 'nodeName': sound.original_filename, - 'username': sound.user.username, - 'sound_url_mp3': sound.locations()['preview']['LQ']['mp3']['url'], - 'sound_url_ogg':
sound.locations()['preview']['LQ']['ogg']['url'], - 'waveform_url': sound.locations()['display']['wave']['M']['url']}) + dg.add_node(node, **{'date': sound.created, + 'nodeName': sound.original_filename, + 'username': sound.user.username, + 'sound_url_mp3': sound.locations()['preview']['LQ']['mp3']['url'], + 'sound_url_ogg': sound.locations()['preview']['LQ']['ogg']['url'], + 'waveform_url': sound.locations()['display']['wave']['M']['url']}) return dg def _create_and_save_remixgroup(sg, remixgroup): # print ' ========================================= ' # add to list the subgraphs(connected components) with the extra data - node_list = sg.nodes(data=True) + node_list = list(sg.nodes(data=True)) # pp(node_list) # sort by date (holds all subgraph nodes sorted by date) @@ -111,7 +112,7 @@ def _create_and_save_remixgroup(sg, remixgroup): links = [] remixgroup.save() # need to save to have primary key before ManyToMany # FIXME: no idea why nx.weakly_connected_components(sg) return list in list... 
- remixgroup.sounds.set(set(nx.weakly_connected_components(sg)[0])) + remixgroup.sounds.set(max(nx.weakly_connected_components(sg), key=len)) for sound in remixgroup.sounds.all(): sound.invalidate_template_caches() @@ -141,5 +142,5 @@ def _create_and_save_remixgroup(sg, remixgroup): "\"nodes\": " + json.dumps(nodes) + "," \ "\"links\": " + json.dumps(links) + "}" - remixgroup.networkx_data = json.dumps(dict(nodes=sg.nodes(), edges=sg.edges())) + remixgroup.networkx_data = json.dumps(dict(nodes=list(sg.nodes()), edges=list(sg.edges()))) remixgroup.save() diff --git a/sounds/views.py b/sounds/views.py index 385415a1b..552c4f705 100644 --- a/sounds/views.py +++ b/sounds/views.py @@ -149,7 +149,7 @@ def random(request): def packs(request): - return HttpResponseRedirect(reverse('sounds-search') + '?s=Date+added+(newest+first)&g=1&only_p=1') + return HttpResponseRedirect(reverse('sounds-search') + '?s=Date+added+(newest+first)&g=1&dp=1') def front_page(request): diff --git a/tags/templatetags/tags.py b/tags/templatetags/tags.py index e9463f1d9..8118f3114 100644 --- a/tags/templatetags/tags.py +++ b/tags/templatetags/tags.py @@ -41,7 +41,7 @@ def join_tags_include(list, include): @register.inclusion_tag('molecules/bw_follow_tags_widget.html', takes_context=True) def bw_follow_tags_widget(context): request = context['request'] - slash_tag = "/".join(context['tags_in_filter']) + slash_tag = "/".join(context['sqp'].get_tags_in_filters()) follow_tags_url = '' unfollow_tags_url = '' show_unfollow_button = False diff --git a/tags/views.py b/tags/views.py index ed80ceac2..444f60f89 100644 --- a/tags/views.py +++ b/tags/views.py @@ -20,12 +20,15 @@ import logging +from django.conf import settings +from django.core.cache import cache from django.http import Http404, HttpResponsePermanentRedirect, HttpResponseRedirect from django.shortcuts import render from django.urls import reverse from search.views import search_view_helper from tags.models import Tag, FS1Tag +from 
utils.search.search_sounds import perform_search_engine_query search_logger = logging.getLogger("search") @@ -51,8 +54,26 @@ def tags(request, multiple_tags=None): return HttpResponseRedirect(f"{reverse('tags')}?f={search_filter}") else: - # Share same view code as for the search view, but set "tags mode" on - tvars = search_view_helper(request, tags_mode=True) + # Share same view code as for the search view, but "tags mode" will be on + tvars = search_view_helper(request) + + # If there are no tags in filter, get initial tagcloud and add it to tvars + if 'sqp' in tvars and not tvars['sqp'].get_tags_in_filters(): + initial_tagcloud = cache.get('initial_tagcloud') + if initial_tagcloud is None: + # If tagcloud is not cached, make a query to retrieve it and save it to cache + results, _ = perform_search_engine_query(dict( + textual_query='', + query_filter= "*:*", + num_sounds=1, + facets={settings.SEARCH_SOUNDS_FIELD_TAGS: {'limit': 200}}, + group_by_pack=True, + group_counts_as_one_in_facets=False, + )) + initial_tagcloud = [dict(name=f[0], count=f[1], browse_url=reverse('tags', args=[f[0]])) for f in results.facets["tag"]] + cache.set('initial_tagcloud', initial_tagcloud, 60 * 60 * 12) # cache for 12 hours + tvars.update({'initial_tagcloud': initial_tagcloud}) + return render(request, 'search/search.html', tvars) diff --git a/templates/molecules/navbar_search_page.html b/templates/molecules/navbar_search_page.html index f75a526f4..6b8d05705 100644 --- a/templates/molecules/navbar_search_page.html +++ b/templates/molecules/navbar_search_page.html @@ -11,7 +11,7 @@
- +
diff --git a/templates/search/clustering_results.html b/templates/search/clustering_results.html new file mode 100644 index 000000000..166324a4a --- /dev/null +++ b/templates/search/clustering_results.html @@ -0,0 +1,12 @@ +{% if clusters_data and clusters_data|length > 0 %} +
+
+ {% for cluster_id, num_sounds, cluster_name, sound_examples in clusters_data %} +
+ +
{{ num_sounds }} sound{{ num_sounds|pluralize }}
+
+ {% endfor %} +
+
+{% endif %} \ No newline at end of file diff --git a/templates/search/facet.html b/templates/search/facet.html index e1e8adc82..91af229c6 100644 --- a/templates/search/facet.html +++ b/templates/search/facet.html @@ -1,50 +1,26 @@ {% load bw_templatetags %} - +{% if facet and facet|length > 1%}
-
- {{ title }} - {% comment %} - - - - {% endcomment %} -
- {% ifequal type "checkbox" %} {% comment %}This type of facet is no used so far{% endcomment %} -
    - {% for f in facet %} -
  • - -
  • - {% endfor %} -
- {% endifequal %} +
{{ title }}
{% ifequal type "list" %} - + {% endifequal %} {% ifequal type "cloud" %} -
- {% for f in facet %} - {% bw_tag f.display_name 1 '' f.add_filter_url f.weight %} - {% endfor %} -
+
+ {% for f in facet %} + {% bw_tag f.display_value 1 '' f.add_filter_url f.weight %} + {% endfor %} +
{% endifequal %}
- +{% endif %} diff --git a/templates/search/search.html b/templates/search/search.html index 87de30986..7137a3119 100644 --- a/templates/search/search.html +++ b/templates/search/search.html @@ -10,7 +10,7 @@ {% block title %}Search{% endblock %} -{% block navbar %}{% if not tags_mode %}{% include 'molecules/navbar_search_page.html' %}{% else %}{% include 'molecules/navbar.html' %}{% endif %}{% endblock %} +{% block navbar %}{% if not sqp.tags_mode_active %}{% include 'molecules/navbar_search_page.html' %}{% else %}{% include 'molecules/navbar.html' %}{% endif %}{% endblock %} {% block content %} @@ -32,25 +32,33 @@

Choose a tag to start browsing {% else %} -
- -
+ + {% comment %}main search input section and hidden fields{% endcomment %} +
-
- - - {% if similar_to %}{% endif %} - {% comment %}This is used so that we can know from JS whether we are in tags mode or not{% endcomment %} +
+ +
{% bw_icon 'close' %}
+ + {% if not sqp.options.field_weights.is_default_value %}{% display_search_option "field_weights" "hidden" %}{% endif %} + {% if not sqp.options.similar_to.is_default_value %}{% display_search_option "similar_to" "hidden" %}{% endif %}
- - {% if tags_mode %} + {% if sqp.tags_mode_active %} + {% comment %}"sounds tagged as" label with follow/unfollow buttons{% endcomment %}

Sounds tagged as - {% for tag in tags_in_filter %} + {% for tag in sqp.get_tags_in_filters %} {{ tag }}{% if not forloop.last %}·{% endif %} {% endfor %}

@@ -58,190 +66,135 @@

{% endif %} -
+ {% comment %}search navbar{% endcomment %}
- {% if not only_sounds_with_pack %} -
{% if non_grouped_number_of_results > 0 %}{{ non_grouped_number_of_results|bw_intcomma }}{% else %}{{ paginator.count|bw_intcomma }}{% endif %} sound{{ non_grouped_number_of_results|pluralize }}
+ {% comment %}number of results{% endcomment %} +
+ {% if not sqp.display_as_packs_active %} + {% if non_grouped_number_of_results > 0 %}{{ non_grouped_number_of_results|bw_intcomma }}{% else %}{{ paginator.count|bw_intcomma }}{% endif %} sound{{ non_grouped_number_of_results|pluralize }} {% else %} -
{{ paginator.count|bw_intcomma }} pack{{ paginator.count|pluralize }}
+ {{ paginator.count|bw_intcomma }} pack{{ paginator.count|pluralize }} {% endif %} - {% comment %} -
Sounds
- {% endcomment %} - {% comment %} - This section of the UI show allow to choose between different search modes: - sounds and packs. However, this is currently not implemented in the backend so we don't - enable this feature. Instead, we add here the element to toggle advanced search options. - In the future we might need to redesign that. - {% endcomment %} +
+ {% comment %}advanced search toggle{% endcomment %}
{% if has_advanced_search_settings_set %}·{% endif %}
+ {% comment %}sorting options{% endcomment %}
-
- Sort by: - {% if not similar_to %} - +
+ {% if not sqp.similar_to_active %} + {% display_search_option "sort_by" %} {% else %} - + {{ sqp.options.sort_by.label }}: + Similarity to target {% endif %}
-
-
- - -
-
- Search in + {% comment %}advanced search options{% endcomment %} +
+
+ {% comment %}first row of advanced search options{% endcomment %} +
+ {% comment %}left section{% endcomment %} +
+
+ {{ sqp.options.search_in.label }} +
+
+ {% for option in sqp.options.search_in.get_choices_annotated_with_selection %} +
+ +
+ {% endfor %} +
-
-
-
    -
  • - -
  • -
  • - -
  • -
  • - -
  • -
+ {% comment %}middle section{% endcomment %} +
+
+ {{ sqp.options.duration.label }}
-
-
    -
  • - -
  • -
  • - -
  • -
  • - -
  • -
+
+ - seconds
-
- -
-
- Duration + {% comment %}right section{% endcomment %} +
+
+ Other +
+
+
+ {% display_search_option "is_geotagged" %} +
+
+ {% display_search_option "is_remix" %} +
+
+ {% display_search_option "group_by_pack" %} +
+
+ {% display_search_option "display_as_packs" %} +
+
+ {% display_search_option "grid_mode" %} +
+
+ {% display_search_option "map_mode" %} +
+
-
- - seconds +
+ {% if show_beta_search_options %} +
+
+
+ Beta Search Options +
- -
-
- Other +
+
+
{% display_search_option "compute_clusters" %}
+
{% display_search_option "similar_to" %}
-
    -
  • - -
  • -
  • - -
  • -
  • - -
  • -
  • - -
  • -
  • - -
  • - {% if allow_map_mode %} -
  • - -
  • - {% endif %} -
+
+ {% endif %}
+ {% comment %}apply button{% endcomment %}
+ {% comment %}cluster results section{% endcomment %} + {% if sqp.compute_clusters_active %} + {% if clusters_data %} + {% include 'search/clustering_results.html' %} + {% else %} + {% if get_clusters_url %} +
+
+
+ +
+
+
+ {% endif %} + {% endif %} + {% endif %}
{% endif %} @@ -251,86 +204,81 @@

- + {% comment %}facets{% endcomment%} -
- + {% comment %}search results{% endcomment %}
- {% if filter_query_split %} + {% comment %}filters{% endcomment %} + {% with sqp.get_filters_data_to_display_in_search_results_page as filters_data %} + {% if filters_data %} {% endif %} - {% if not use_map_mode %} + {% endwith %} + {% comment %}map{% endcomment %} + {% if sqp.map_mode_active %} +
+
Loading map...
+
+
+ +
+ {% if paginator.count > max_search_results_map_mode %} +
+

{% bw_icon 'notification' %} Note that only the first {{ max_search_results_map_mode|bw_intcomma }} search results are shown on the map

+
+ {% endif %} +
+
+ {% else %} + {% comment %}list/grid of sounds{% endcomment %}
{% if paginator.count > 0 %} - {% if use_compact_mode %} + {% if sqp.grid_mode_active %}
{% for result in docs %}
- {% if not only_sounds_with_pack %} - {% display_sound_small result.sound %} - {% if result.n_more_in_group and result.sound.pack_id is not None %} -

- {% bw_icon 'plus' %} See {{result.n_more_in_group|add:1|bw_intcomma}} result{{ result.n_more_in_group|add:1|pluralize }} from pack: {{ result.sound.pack_name|truncate_string:35 }} -

- {% endif %} + {% if sqp.display_as_packs_active %} + {% display_pack result.pack %} +

+ {% bw_icon 'plus' %} See {{result.n_more_in_group|add:1|bw_intcomma}} result{{ result.n_more_in_group|add:1|pluralize }} from pack +

{% else %} - {% display_pack result.pack %} -

- {% bw_icon 'plus' %} See {{result.n_more_in_group|add:1|bw_intcomma}} result{{ result.n_more_in_group|add:1|pluralize }} from pack -

+ {% display_sound_small result.sound %} + {% if result.more_from_this_pack_url %} +

+ {% bw_icon 'plus' %} See {{result.n_more_in_group|add:1|bw_intcomma}} result{{ result.n_more_in_group|add:1|pluralize }} from pack: {{ result.sound.pack_name|truncate_string:35 }} +

+ {% endif %} {% endif %}
{% endfor %} @@ -338,18 +286,18 @@

{% else %} {% for result in docs %}
- {% if not only_sounds_with_pack %} - {% display_sound_middle result.sound %} - {% if result.n_more_in_group and result.sound.pack_id is not None %} - - {% endif %} + {% if sqp.display_as_packs_active %} + {% display_pack_big result.pack %} + {% else %} - {% display_pack_big result.pack %} - + {% display_sound_middle result.sound %} + {% if result.more_from_this_pack_url %} + + {% endif %} {% endif %} {% if not forloop.last %}
@@ -367,26 +315,6 @@
No results... 😟
{% bw_paginator paginator page current_page request "sound" non_grouped_number_of_results %}
- {% else %} -
-
Loading map...
-
-
- -
- {% if paginator.count > max_search_results_map_mode %} -
-

{% bw_icon 'notification' %} Note that only the first {{ max_search_results_map_mode|bw_intcomma }} search results are shown on the map

-
- {% endif %} -
-
{% endif %}

diff --git a/templates/search/search_option.html b/templates/search/search_option.html new file mode 100644 index 000000000..d97d1fea0 --- /dev/null +++ b/templates/search/search_option.html @@ -0,0 +1,20 @@ +{% if widget == 'checkbox' %} + +{% elif widget == 'text' %} +: + +{% elif widget == 'select' %} +{{ label }}: + +{% elif widget == 'hidden' %} + +{% endif %} \ No newline at end of file diff --git a/tickets/tests.py b/tickets/tests.py index cc737c58c..3f024c998 100644 --- a/tickets/tests.py +++ b/tickets/tests.py @@ -180,14 +180,14 @@ def setUp(self): TicketTests.setUp(self) self.ticket = self._create_assigned_ticket() - def _perform_action(self, action): + @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') + def _perform_action(self, action, post_moderation_assigned_tickets_task): return self.client.post(reverse('tickets-moderation-assigned', args=[self.test_moderator.id]), { 'action': action, 'message': '', 'ticket': self.ticket.id, 'is_explicit': IS_EXPLICIT_KEEP_USER_PREFERENCE_KEY}) - @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') @mock.patch('sounds.models.delete_sounds_from_search_engine') - def test_delete_ticket_from_queue(self, delete_sound_solr, post_moderation_assigned_tickets_task): + def test_delete_ticket_from_queue(self, delete_sound_solr): resp = self._perform_action('Delete') self.assertIn(resp.status_code, [200, 302]) # This test is reused, and the response code is different in each case @@ -197,9 +197,8 @@ def test_delete_ticket_from_queue(self, delete_sound_solr, post_moderation_assig self.assertEqual(self.ticket.status, TICKET_STATUS_CLOSED) self.assertIsNone(self.ticket.sound) - @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') @mock.patch('general.tasks.whitelist_user.delay') - def test_whitelist_from_queue(self, whitelist_task, post_moderation_assigned_tickets_task): + def test_whitelist_from_queue(self, whitelist_task): self._perform_action('Whitelist') 
whitelist_task.assert_called_once_with(ticket_ids=[self.ticket.id]) @@ -213,20 +212,17 @@ def _assert_ticket_and_sound_fields(self, status, assignee, moderation_state): else: self.assertEqual(self.ticket.assignee, assignee) - @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') - def test_approve_ticket_from_queue(self, post_moderation_assigned_tickets_task): + def test_approve_ticket_from_queue(self): resp = self._perform_action('Approve') self.assertIn(resp.status_code, [200, 302]) # This test is reused, and the response code is different in each case self._assert_ticket_and_sound_fields(TICKET_STATUS_CLOSED, self.test_moderator, 'OK') - @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') - def test_return_ticket_from_queue(self, post_moderation_assigned_tickets_task): + def test_return_ticket_from_queue(self): resp = self._perform_action('Return') self.assertIn(resp.status_code, [200, 302]) # This test is reused, and the response code is different in each case self._assert_ticket_and_sound_fields(TICKET_STATUS_NEW, None, 'PE') - @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') - def test_defer_ticket_from_queue(self, post_moderation_assigned_tickets_task): + def test_defer_ticket_from_queue(self): resp = self._perform_action('Defer') self.assertIn(resp.status_code, [200, 302]) # This test is reused, and the response code is different in each case self._assert_ticket_and_sound_fields(TICKET_STATUS_DEFERRED, self.test_moderator, 'PE') @@ -234,7 +230,9 @@ def test_defer_ticket_from_queue(self, post_moderation_assigned_tickets_task): class TicketTestsFromTicketViewOwn(TicketTestsFromQueue): """Ticket state changes in a response to actions from ticket inspection page for own ticket""" - def _perform_action(self, action): + + @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') + def _perform_action(self, action, post_moderation_assigned_tickets): return self.client.post(reverse('tickets-ticket', 
args=[self.ticket.key]), { 'ss-action': action}) @@ -245,7 +243,8 @@ def setUp(self): TicketTests.setUp(self) self.ticket = self._create_ticket(self.sound, self.test_user) - def _perform_action(self, action): + @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') + def _perform_action(self, action, post_moderation_assigned_tickets): return self.client.post(reverse('tickets-ticket', args=[self.ticket.key]), { 'ss-action': action}) @@ -257,10 +256,12 @@ def setUp(self): TicketTests.setUp(self) self.ticket = self._create_assigned_ticket() - def _perform_action(self, action, is_explicit_flag_key): + @mock.patch('general.tasks.post_moderation_assigned_tickets.delay') + def _perform_action(self, action, is_explicit_flag_key, post_moderation_assigned_tickets): return self.client.post(reverse('tickets-moderation-assigned', args=[self.test_moderator.id]), { 'action': action, 'message': '', 'ticket': self.ticket.id, 'is_explicit': is_explicit_flag_key}) + def test_keep_is_explicit_preference_for_explicit_sound(self): """Test that when approving a sound marked as 'is_explicit' it continues to be marked as such the moderator chooses to preserve author's preference on the flag diff --git a/utils/clustering_utilities.py b/utils/clustering_utilities.py new file mode 100644 index 000000000..539d0e149 --- /dev/null +++ b/utils/clustering_utilities.py @@ -0,0 +1,205 @@ +# +# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA +# +# Freesound is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Freesound is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. 
+# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# Authors: +# See AUTHORS file. +# + +from collections import defaultdict, Counter +import random + +import celery +from django.conf import settings +from django.core.cache import caches +from django.urls import reverse + +from clustering.tasks import cluster_sounds +import sounds +from utils.search.search_sounds import get_sound_similarity_from_search_engine_query, get_sound_ids_from_search_engine_query + + +cache_clustering = caches["clustering"] + + +def get_clusters_for_query(sqp, compute_if_not_in_cache=True): + # Note we don't include facet filters in the generated params because we want to apply clustering "before" the rest of the facets + # (so clustering only depends on the search options specified in the form, but not the facet filters) + query_params = sqp.as_query_params(exclude_facet_filters=True) + + # get result from cache or perform clustering + cache_key = sqp.get_clustering_data_cache_key() + results = cache_clustering.get(cache_key, None) + if results is None and compute_if_not_in_cache: + # First get the similarity vectors for the first settings.MAX_RESULTS_FOR_CLUSTERING results from the query + similarity_vectors_map = get_sound_similarity_from_search_engine_query( + query_params, + analyzer_name=settings.CLUSTERING_SIMILARITY_ANALYZER, + num_sounds=settings.MAX_RESULTS_FOR_CLUSTERING, + current_page=1) + sound_ids = list(similarity_vectors_map.keys()) + if sound_ids: + # Now launch the clustering celery task + # Note that we launch the task synchronously (i.e. we block here until the task finishes). This is because this + # view will be loaded asynchronously from the search page, and the clustering task should only take a few seconds. + # If for some reason the clustering task takes longer and a timeout error is raised, that is fine as we'll simply + # not show the clustering section.
+ async_task_result = cluster_sounds.apply_async(kwargs={ + 'cache_key': cache_key, + 'sound_ids': sound_ids, + 'similarity_vectors_map': similarity_vectors_map + }) + try: + results = async_task_result.get(timeout=settings.CLUSTERING_TASK_TIMEOUT) # Will raise exception if task takes too long + except celery.exceptions.TimeoutError as e: + # Cancel the task so it stops running (or it never starts) + async_task_result.revoke(terminate=True) + if results['clusters'] is not None: + # Generate cluster summaries (cluster names and sound examples) + clusters = results['clusters'] + partition = {sound_id: cluster_id for cluster_id, cluster in enumerate(clusters) for sound_id in cluster} + + # label clusters using most occuring tags + sound_instances = sounds.models.Sound.objects.bulk_query_id(list(map(int, list(partition.keys())))) + sound_tags = {sound.id: sound.tag_array for sound in sound_instances} + cluster_tags = defaultdict(list) + + # extract tags for each clusters and do not use query terms for labeling clusters + query_terms = {t.lower() for t in sqp.options['query'].value.split(' ')} + for sound_id, tags in sound_tags.items(): + cluster_tags[partition[str(sound_id)]] += [t.lower() for t in tags if t.lower() not in query_terms] + + # count 3 most occuring tags + # we iterate with range(len(clusters)) to ensure that we get the right order when iterating through the dict + cluster_most_occuring_tags = [ + [tag for tag, _ in Counter(cluster_tags[cluster_id]).most_common(settings.NUM_TAGS_SHOWN_PER_CLUSTER)] + if cluster_tags[cluster_id] else [] + for cluster_id in range(len(clusters)) + ] + most_occuring_tags_formatted = [ + ' '.join(sorted(most_occuring_tags)) + for most_occuring_tags in cluster_most_occuring_tags + ] + results['cluster_names'] = most_occuring_tags_formatted + + # select sound examples for each cluster + sound_ids_examples_per_cluster = [ + list(map(int, cluster_sound_ids[:settings.NUM_SOUND_EXAMPLES_PER_CLUSTER])) + for cluster_sound_ids in 
clusters + ] + sound_ids_examples = [item for sublist in sound_ids_examples_per_cluster for item in sublist] + # TODO: collect some metadata for the sound examples and pass it to the template so we can display/play them + example_sounds_data = range(len(sound_ids_examples)) + results['example_sounds_data'] = example_sounds_data + + # Generate random IDs for the clusters that will be used to identify them + cluster_ids = [random.randint(0, 99999) for _ in range(len(clusters))] + results['cluster_ids'] = cluster_ids + else: + # If no sounds to cluster, set to None + results = {'clusters': None} + + # Save results in cache + cache_clustering.set(cache_key, results, settings.CLUSTERING_CACHE_TIME) + return results + + +def get_clustering_data_for_graph_display(sqp, initial_graph): + cache_key = sqp.get_clustering_data_cache_key(include_filters_from_facets=True) + '-graph_display' + graph = cache_clustering.get(cache_key, None) + if graph is None: + # If graph data is not in cache, we need to generate it + # To compute the graph we need to know which sounds are still part of the set of results AFTER the + # facet filters have been applied. To get this information we need to make a query to the search engine. 
+ + # check if facet filters are present in the search query + # if yes, filter nodes and links from the graph + graph = initial_graph + query_params = sqp.as_query_params() + if len(sqp.non_option_filters): + nodes = graph['nodes'] + links = graph['links'] + graph['nodes'] = [] + graph['links'] = [] + sound_ids_filtered = get_sound_ids_from_search_engine_query(query_params, num_sounds=settings.MAX_RESULTS_FOR_CLUSTERING, current_page=1) + for node in nodes: + if int(node['id']) in sound_ids_filtered: + graph['nodes'].append(node) + for link in links: + if int(link['source']) in sound_ids_filtered and int(link['target']) in sound_ids_filtered: + graph['links'].append(link) + + results = sounds.models.Sound.objects.bulk_query_id([int(node['id']) for node in graph['nodes']]) + sound_metadata = {} + for sound in results: + sound_locations = sound.locations() + sound_metadata.update( + {sound.id: ( + sound_locations['preview']['LQ']['ogg']['url'], + sound.original_filename, + ' '.join(sound.tag_array), + reverse("sound", args=(sound.username, sound.id)), + sound_locations['display']['wave']['M']['url'], + )} + ) + + for node in graph['nodes']: + node['url'] = sound_metadata[int(node['id'])][0] + node['name'] = sound_metadata[int(node['id'])][1] + node['tags'] = sound_metadata[int(node['id'])][2] + node['sound_page_url'] = sound_metadata[int(node['id'])][3] + node['image_url'] = sound_metadata[int(node['id'])][4] + cache_clustering.set(cache_key, graph, settings.CLUSTERING_CACHE_TIME) + return graph + + +def get_num_sounds_per_cluster(sqp, clusters): + cache_key = sqp.get_clustering_data_cache_key(include_filters_from_facets=True) + '-num_sounds' + num_sounds_per_cluster = cache_clustering.get(cache_key, None) + if num_sounds_per_cluster is None: + if clusters: + # To compute the number of sounds per cluster we need to know which sounds are still part of the set of results AFTER the + # facet filters have been applied. 
To get this information we need to make a query to the search engine. + query_params = sqp.as_query_params() + if len(sqp.non_option_filters): + sound_ids_filtered = get_sound_ids_from_search_engine_query(query_params, num_sounds=settings.MAX_RESULTS_FOR_CLUSTERING, current_page=1) + clusters = [[sound_id for sound_id in cluster if int(sound_id) in sound_ids_filtered] + for cluster in clusters] + num_sounds_per_cluster = [len(cluster) for cluster in clusters] + else: + num_sounds_per_cluster = [] + cache_clustering.set(cache_key, num_sounds_per_cluster, settings.CLUSTERING_CACHE_TIME) + return num_sounds_per_cluster + + +def cluster_data_is_fully_available(sqp): + cache_key = sqp.get_clustering_data_cache_key() + if cache_clustering.get(cache_key, None) is None: + return False + cache_key_num_sounds = sqp.get_clustering_data_cache_key(include_filters_from_facets=True) + '-num_sounds' + if cache_clustering.get(cache_key_num_sounds, None) is None: + return False + return True + + +def get_ids_in_cluster(cache_key, cluster_id): + results = cache_clustering.get(cache_key, None) + if results is not None: + try: + cluster_index = results['cluster_ids'].index(cluster_id) + return results['clusters'][cluster_index] + except (IndexError, ValueError) as e: + pass + return [] diff --git a/utils/search/lucene_parser.py b/utils/search/lucene_parser.py deleted file mode 100644 index c531ad4a8..000000000 --- a/utils/search/lucene_parser.py +++ /dev/null @@ -1,142 +0,0 @@ -# -# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA -# -# Freesound is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. 
-# -# Freesound is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# -# Authors: -# See AUTHORS file. -# -import collections - -import pyparsing as pp -from pyparsing import pyparsing_common as ppc - - -pp.ParserElement.enablePackrat() - -COLON, LBRACK, RBRACK, LBRACE, RBRACE, TILDE, CARAT = list(map(pp.Literal, ":[]{}~^")) -LPAR, RPAR = list(map(pp.Literal, "()")) -and_, or_, not_, to_ = list(map(pp.CaselessKeyword, "AND OR NOT TO".split())) -keyword = and_ | or_ | not_ | to_ - -expression = pp.Forward() - -valid_word = pp.Regex( - r'([a-zA-Z0-9*_+.-]|\\\\|\\([+\-!(){}\[\]^"~*?:]|\|\||&&))+' -).setName("word") -valid_word.setParseAction( - lambda t: t[0].replace("\\\\", chr(127)).replace("\\", "").replace(chr(127), "\\") -) - -string = pp.QuotedString('"', unquoteResults=False) -alphanums_plus = pp.alphanums + '_' -float_nums = pp.nums + '.' 
-alphanum_float_plus_minus_star = alphanums_plus + float_nums + '+' + '-' + '*' - -required_modifier = pp.Literal("+")("required") -prohibit_modifier = pp.Literal("-")("prohibit") -integer = ppc.integer() -proximity_modifier = pp.Group(TILDE + integer("proximity")) -number = ppc.fnumber() -fuzzy_modifier = TILDE + pp.Optional(number, default=0.5)("fuzzy") - -term = pp.Forward().setName("field") -field_name = valid_word().setName("fieldname") -incl_range_search = pp.Group(LBRACK - term("lower") + to_ + term("upper") + RBRACK) -excl_range_search = pp.Group(LBRACE - term("lower") + to_ + term("upper") + RBRACE) -range_search = incl_range_search("incl_range") | excl_range_search("excl_range") -boost = CARAT - number("boost") - -geotag_filter = pp.Literal("'{!") + pp.Word(' ' + '=' + ',' + alphanum_float_plus_minus_star) + pp.Literal("}'") -string_expr = pp.Group(string + proximity_modifier) | string -word_expr = pp.Group(valid_word + fuzzy_modifier) | valid_word -term << ( - pp.Optional(field_name("field") + COLON) - + (word_expr | string_expr | range_search | pp.Group(LPAR + expression + RPAR)) - + pp.Optional(boost) -) -term.setParseAction(lambda t: [t] if "field" in t or "boost" in t else None) - -expression << pp.infixNotation( - pp.Group(term | geotag_filter), - [ - (required_modifier | prohibit_modifier, 1, pp.opAssoc.RIGHT), - ((not_ | "!").setParseAction(lambda: "NOT"), 1, pp.opAssoc.RIGHT), - ((and_ | "&&").setParseAction(lambda: "AND"), 2, pp.opAssoc.LEFT), - ( - pp.Optional(or_ | "||").setName("or"), - 2, - pp.opAssoc.LEFT, - ), - ], -) - - -def flatten(l): - for el in l: - if isinstance(el, collections.abc.Iterable) and not isinstance(el, str): - yield from flatten(el) - else: - # for range filter with TO, we manually add the mandatory spaces in the parsed output - if el == 'TO': - yield ' ' + el + ' ' - else: - yield el - - -def flatten_sub(l): - return [list(flatten(sub)) for sub in l] - - -def parse_query_filter_string(filter_query): - """Parse the query 
filter string containing field names and values. - - This is useful for for being able to manipulate different filters and removing filters coming - from facets (which is needed for applying clustering without being affected by filtering facets). - Additionally it removes filters that contain empty values. - - Example: - f = " duration:[1 TO *] is_geotagged:1 tag:dog" - parse_query_filter_string(f) - -> [['duration', ':', '[', '1', ' ', 'TO', ' ', '*', ']'], - ['is_geotagged', ':', '1'], - ['tag', ':', 'dog']] - - Args: - filter_query (str): query filter string from a user submitted search query. - - Returns: - List[List[str]]: list containing lists of filter fields' names and values - """ - if filter_query: - try: - filter_list_str = expression.parseString(filter_query).asList()[0] - except pp.ParseSyntaxException: - return [] - - # check if not nested meaning there is only one filter - # if yes, make it nested to treat it the same way as if there were several filters - if isinstance(filter_list_str[0], str): - filter_list_str = [filter_list_str] - - # we flatten the sub lists contained in the parsed output - filter_list_str = flatten_sub(filter_list_str) - - # remove empty filter values - filter_list_str = [ - filter_str for filter_str in filter_list_str if filter_str[-1] != ":" - ] - return filter_list_str - else: - return [] diff --git a/utils/search/search_query_processor.py b/utils/search/search_query_processor.py new file mode 100644 index 000000000..2758a0527 --- /dev/null +++ b/utils/search/search_query_processor.py @@ -0,0 +1,571 @@ +# +# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA +# +# Freesound is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
#
# Freesound is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Authors:
#     See AUTHORS file.
#


import copy
import json
import urllib
import urllib.parse

from django.conf import settings
from django.urls import reverse
from django.utils.http import urlencode
import luqum.exceptions
import luqum.tree
from luqum.parser import parser
from luqum.pretty import prettify

from utils.clustering_utilities import get_ids_in_cluster, get_clusters_for_query
from utils.encryption import create_hash
from utils.search.backends.solr555pysolr import FIELD_NAMES_MAP
from utils.search.search_sounds import allow_beta_search_features
from .search_query_processor_options import SearchOptionStr, SearchOptionChoice, \
    SearchOptionInt, SearchOptionBool, SearchOptionRange, SearchOptionMultipleChoice, \
    SearchOption, SearchOptionBoolElementInPath, SearchOptionFieldWeights


def _get_value_to_apply_group_by_pack(self):
    """Compute the value to apply for the "group_by_pack" option (``self`` is the search option object).

    Returns False if the query has a "grouping_pack" filter, True if "display_as_packs" is enabled, False if
    "map_mode" is enabled (checked in that order), and the value of the option otherwise.
    """
    if self.sqp.has_filter_with_name('grouping_pack'):
        return False
    elif self.sqp.get_option_value_to_apply('display_as_packs'):
        return True
    elif self.sqp.get_option_value_to_apply('map_mode'):
        return False
    return self.value


class SearchQueryProcessor(object):
    """The SearchQueryProcessor class is used to parse and process search query information from a request object and
    compute a number of useful items for displaying search information in templates, constructing search URLs, and
    preparing search options to be passed to the backend search engine.
    """
    request = None
    errors = ''

    query = SearchOptionStr(
        advanced=False,
        query_param_name='q',
        should_be_disabled=lambda option: bool(option.sqp.get_option_value_to_apply('similar_to')))
    sort_by = SearchOptionChoice(
        advanced=False,
        query_param_name='s',
        label='Sort',
        choices=[(option, option) for option in settings.SEARCH_SOUNDS_SORT_OPTIONS_WEB],
        should_be_disabled=lambda option: bool(option.sqp.get_option_value_to_apply('similar_to')),
        get_default_value=lambda option: settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST if option.sqp.get_option_value_to_apply('query') == '' else settings.SEARCH_SOUNDS_SORT_DEFAULT)
    page = SearchOptionInt(
        advanced=False,
        query_param_name='page',
        value_default=1,
        get_value_to_apply=lambda option: 1 if option.sqp.get_option_value_to_apply('map_mode') else option.value)
    search_in = SearchOptionMultipleChoice(
        query_param_name_prefix='si',
        label='Search in',
        value_default=[],
        choices=[
            (settings.SEARCH_SOUNDS_FIELD_TAGS, 'Tags'),
            (settings.SEARCH_SOUNDS_FIELD_NAME, 'Sound name'),
            (settings.SEARCH_SOUNDS_FIELD_DESCRIPTION, 'Description'),
            (settings.SEARCH_SOUNDS_FIELD_PACK_NAME, 'Pack name'),
            (settings.SEARCH_SOUNDS_FIELD_ID, 'Sound ID'),
            (settings.SEARCH_SOUNDS_FIELD_USER_NAME, 'Username')],
        should_be_disabled=lambda option: option.sqp.get_option_value_to_apply('tags_mode') or bool(option.sqp.get_option_value_to_apply('similar_to')))
    duration = SearchOptionRange(
        query_param_min='d0',
        query_param_max='d1',
        search_engine_field_name='duration',
        label='Duration',
        value_default=['0', '*'])
    is_geotagged = SearchOptionBool(
        query_param_name='ig',
        search_engine_field_name='is_geotagged',
        label='Only geotagged sounds',
        help_text='Only find sounds that have geolocation information',
        should_be_disabled=lambda option: option.sqp.get_option_value_to_apply('map_mode'),
        get_value_to_apply=lambda option: True if option.sqp.get_option_value_to_apply('map_mode') else option.value)
    is_remix = SearchOptionBool(
        query_param_name='r',
        search_engine_field_name='in_remix_group',
        label='Only remix sounds',
        help_text='Only find sounds that are a remix of other sounds or have been remixed')
    group_by_pack = SearchOptionBool(
        query_param_name='g',
        label='Group sounds by pack',
        help_text='Group search results so that multiple sounds of the same pack only represent one item',
        value_default=True,
        get_value_to_apply=_get_value_to_apply_group_by_pack,
        should_be_disabled=lambda option: option.sqp.has_filter_with_name('grouping_pack') or option.sqp.get_option_value_to_apply('display_as_packs') or option.sqp.get_option_value_to_apply('map_mode'))
    display_as_packs = SearchOptionBool(
        advanced=False,
        query_param_name='dp',
        label='Display results as packs',
        help_text='Display search results as packs rather than individual sounds',
        get_value_to_apply=lambda option: False if option.sqp.has_filter_with_name('grouping_pack') else option.value,
        should_be_disabled=lambda option: option.sqp.has_filter_with_name('grouping_pack') or option.sqp.get_option_value_to_apply('map_mode'))
    grid_mode = SearchOptionBool(
        advanced=False,
        query_param_name='cm',
        label='Display results in grid',
        help_text='Display search results in a grid so that more sounds are visible per search results page',
        get_default_value=lambda option: option.request.user.profile.use_compact_mode if option.request.user.is_authenticated else False,
        should_be_disabled=lambda option: option.sqp.get_option_value_to_apply('map_mode'))
    map_mode = SearchOptionBool(
        advanced=False,
        query_param_name='mm',
        label='Display results in map',
        help_text='Display search results in a map')
    tags_mode = SearchOptionBoolElementInPath(
        advanced=False,
        element_in_path='/browse/tags/')
    similar_to = SearchOptionStr(
        query_param_name='st')
    compute_clusters = SearchOptionBool(
        query_param_name='cc',
        label='Cluster results by sound similarity')
    cluster_id = SearchOptionInt(
        advanced=False,
        query_param_name='cid',
        get_value_to_apply=lambda option: -1 if not option.sqp.get_option_value_to_apply('compute_clusters') else option.value)
    field_weights = SearchOptionFieldWeights(
        query_param_name='w'
    )

    def __init__(self, request, facets=None):
        """Initializes the SearchQueryProcessor object by parsing data from the request and setting up search options.

        Args:
            request (django.http.HttpRequest): request object from which to parse search options
            facets (dict, optional): dictionary with facet options to be used in the search. If not provided, default
                facets will be used. Default is None.
        """

        # Store the request and the facets argument as it will be used later
        self.request = request
        if facets is None:
            # The default facets are a dictionary of dictionaries and as_query_params modifies the nested
            # dictionaries (it sets the 'limit' of the tags facet), so a deep copy is needed to avoid mutating
            # the original setting
            self.facets = copy.deepcopy(settings.SEARCH_SOUNDS_DEFAULT_FACETS)
        else:
            self.facets = facets

        # Put all SearchOption objects in a self.options dictionary so we can easily iterate them and we can access them through self.options attribute
        # In this way SearchOption objects are accessible in a similar way as Django form fields are accessible in form objects
        # NOTE: even though we add references to the SearchOption objects in the self.options dictionary, we don't actually remove these references from
        # the SearchQueryProcessor "root". Ideally we should remove these references from the root object to avoid confusion.
        self.options = {}
        for member in dir(self):
            candidate = getattr(self, member)
            if isinstance(candidate, SearchOption):
                self.options[member] = candidate

        # Get filter and parse it. Make sure it is iterable (even if it only has one element)
        self.f = urllib.parse.unquote(request.GET.get('f', '')).strip()
        self.f_parsed = []
        if self.f:
            try:
                f_parsed = parser.parse(self.f)
            except luqum.exceptions.ParseError as e:
                self.errors = f"Filter parsing error: {e}"
            else:
                if isinstance(f_parsed, luqum.tree.SearchField):
                    self.f_parsed = [f_parsed]
                else:
                    self.f_parsed = f_parsed.children

        # Remove duplicate filters if any (keeping the first occurrence and the original order)
        f_parsed_no_duplicates = []
        for node in self.f_parsed:
            if node not in f_parsed_no_duplicates:
                f_parsed_no_duplicates.append(node)
        self.f_parsed = f_parsed_no_duplicates

        # Implement compatibility with old URLs in which "duration"/"is remix"/"is geotagged" options were passed as raw filters.
        # If any of these filters are present, we parse them to get their values and modify the request to simulate the data being
        # passed in the new expected way (through request parameters). If present, we also remove these filters from the f_parsed object.
        bool_options = [self.options['is_remix'], self.options['is_geotagged']]
        bool_field_names = [option.search_engine_field_name for option in bool_options]
        duration_option = self.options['duration']
        values_to_update = {}
        remaining_nodes = []
        for node in self.f_parsed:
            if isinstance(node, luqum.tree.SearchField):
                if node.name in bool_field_names:
                    values_to_update[node.name] = str(node.expr) == '1'
                    continue
                # A duration filter is only converted if it is a range (i.e. node.expr is of type luqum.tree.Range). Other
                # duration expressions (e.g. "duration:5") have no low/high bounds; they are left in f_parsed instead of
                # raising an error when reading the bounds.
                if node.name == duration_option.search_engine_field_name and isinstance(node.expr, luqum.tree.Range):
                    values_to_update[node.name] = [str(node.expr.low), str(node.expr.high)]
                    continue
            remaining_nodes.append(node)
        self.f_parsed = remaining_nodes

        if values_to_update:
            # request.GET is copied before being modified, as the original code did
            self.request.GET = self.request.GET.copy()
            for option in bool_options:
                if option.search_engine_field_name in values_to_update:
                    self.request.GET[option.query_param_name] = '1' if values_to_update[option.search_engine_field_name] else '0'
            if duration_option.search_engine_field_name in values_to_update:
                duration_min, duration_max = values_to_update[duration_option.search_engine_field_name]
                self.request.GET[duration_option.query_param_min] = duration_min
                self.request.GET[duration_option.query_param_max] = duration_max

        # Pass the reference to the SearchQueryProcessor object to all search options, and load the search option values from the request
        for option in self.options.values():
            option.set_search_query_processor(self)
            option.load_value()

        # Some of the filters included in the search query (in f_parsed) might belong to filters which are added by SearchOption objects, but some others might
        # be filters added by search facets or "raw filters" directly added to the URL by the user. Some methods of the SearchQueryProcessor need to know which
        # filters belong to search options, so we pre-compute the list of non-option filters here as a list of (field,value) tuples. For example, if
        # a query has the filter "f=is_geotagged:1 samplerate:44100", self.non_option_filters will be [('samplerate', '44100')] as "is_geotagged" is a filter managed
        # by a search option, but "samplerate" is a facet filter and not managed by a search option.
        search_engine_field_names_used_in_options = [
            option.search_engine_field_name for option in self.options.values() if hasattr(option, 'search_engine_field_name')]
        self.non_option_filters = [
            (node.name, str(node.expr))
            for node in self.f_parsed
            if isinstance(node, luqum.tree.SearchField) and node.name not in search_engine_field_names_used_in_options
        ]

    # Filter-related methods

    def get_active_filters(self, include_filters_from_options=True,
                           include_non_option_filters=True,
                           include_filters_from_facets=True,
                           extra_filters=None,
                           ignore_filters=None):
        """Returns a list of all filters which are active in the query in a ["field:value", "field:value", ...] format. This method
        also allows to add extra filters to the list or ignore some of the existing filters.

        Args:
            include_filters_from_options (bool, optional): If True, filters from search options will be included. Default is True.
            include_non_option_filters (bool, optional): If True, filters from non-option filters will be included. Default is True.
            include_filters_from_facets (bool, optional): If True, filters from search facets will be included. Note that if
                include_non_option_filters is set to False, include_filters_from_facets will have no effect as facet filters are part of
                non-option filters. Default is True.
            extra_filters (list, optional): List of extra filters to be added. Each filter should be a string in the format "field:value",
                e.g.: extra_filters=["tag:tagname"]. Default is None.
            ignore_filters (list, optional): List of filters to be ignored. Each filter should be a string in the format "field:value",
                e.g.: ignore_filters=["tag:tagname"]. Default is None.

        Returns:
            list: active filters as "field:value" strings. Ignored filters are removed before the extra filters are added.
        """
        # Create initial list of the active filters according to the types of filters that are requested to be included
        ff = []
        if include_filters_from_options:
            for option in self.options.values():
                fit = option.as_filter()
                if fit is not None:
                    ff.append(fit)
        if include_non_option_filters:
            if include_filters_from_facets:
                excluded_field_names = set()
            else:
                # Facet filters must be left out. The search engine field names of the facets are the same for all
                # filters, so they are computed only once.
                excluded_field_names = {FIELD_NAMES_MAP[f] for f in self.facets.keys()}
            for field_name, value in self.non_option_filters:
                if field_name not in excluded_field_names:
                    ff.append(f'{field_name}:{value}')

        # Remove ignored filters
        if ignore_filters is not None:
            ff = [f for f in ff if f not in ignore_filters]

        # Add extra filter
        if extra_filters is not None:
            ff += extra_filters
        return ff

    def get_num_active_filters(self, include_filters_from_options=True,
                               include_non_option_filters=True,
                               include_filters_from_facets=True,
                               extra_filters=None,
                               ignore_filters=None):
        """Returns the number of active filters in the query. This method has the same parameters of self.get_active_filters.
        """
        return len(self.get_active_filters(include_filters_from_options=include_filters_from_options, include_non_option_filters=include_non_option_filters,
                                           include_filters_from_facets=include_filters_from_facets, extra_filters=extra_filters, ignore_filters=ignore_filters))

    def get_filters_data_to_display_in_search_results_page(self):
        """Returns a list of filters to be displayed in the search results page. Each element in the list is a list with [field, value, remove_url], where
        field is the name of the field, value is the value of the filter, and remove_url is the URL that should be followed to remove the filter from the query.
        """
        filters_data = []
        for name, value in self.non_option_filters:
            filter_data = [name, value, self.get_url(remove_filters=[f'{name}:{value}'])]
            if name == 'grouping_pack':
                # There is a special case for the grouping_pack filter in which we only want to display the name of the pack and not the ID
                # (i.e. what comes after the first "_" of the value, keeping the opening quote if the value is quoted)
                pack_name = value[value.find("_") + 1:]
                filter_data[0] = 'pack'
                filter_data[1] = '"' + pack_name if value.startswith('"') else pack_name
            filters_data.append(filter_data)

        cluster_id = self.get_option_value_to_apply('cluster_id')
        if cluster_id > -1:
            # If a cluster ID filter is present, we also add it to the list of removable filters. Note that clustering
            # results can be None, and results stored for queries with no sounds to cluster do not include 'cluster_ids'.
            cluster_results = get_clusters_for_query(self)
            cluster_ids = (cluster_results or {}).get('cluster_ids') or []
            if cluster_id in cluster_ids:
                cluster_number = cluster_ids.index(cluster_id) + 1
                filters_data.append(['cluster', f'#{cluster_number}', self.get_url().replace(f'cid={cluster_id}', 'cid=-1')])

        return filters_data

    def has_filter_with_name(self, filter_name):
        """Returns True if the parsed filter has a filter with the given name.
        """
        return any(
            isinstance(node, luqum.tree.SearchField) and node.name == filter_name
            for node in self.f_parsed
        )

    def get_tags_in_filters(self):
        """Returns a list of tags that are being used in the filters. E.g.: ["tag1", "tag2"]
        """
        tags_in_filter = []
        for field, value in self.non_option_filters:
            if field == 'tag':
                # startswith/endswith are used (instead of indexing) so that an empty value does not raise an error
                if value.startswith('"') and value.endswith('"'):
                    value = value[1:-1]  # Remove quotes
                tags_in_filter.append(value)
        return tags_in_filter

    def get_filter_string_for_search_engine(self,
                                            include_filters_from_options=True,
                                            include_non_option_filters=True,
                                            include_filters_from_facets=True,
                                            extra_filters=None,
                                            ignore_filters=None):
        """Returns a filter string with the proper format to be used by the search engine. This method has the same parameters of self.get_active_filters
        to indicate which filters should or should not be included. By default all filters are included. An example of a filter string returned by that
        method could be something like: 'duration:[0.25 TO 20] tag:"tag1" is_geotagged:1 (id:1 OR id:2 OR id:3) tag:"tag2"'
        """
        ff = self.get_active_filters(include_filters_from_options=include_filters_from_options, include_non_option_filters=include_non_option_filters,
                                     include_filters_from_facets=include_filters_from_facets, extra_filters=extra_filters, ignore_filters=ignore_filters)
        return ' '.join(ff)

    def get_filter_string_for_url(self, extra_filters=None, ignore_filters=None):
        """Returns a filter string to be used in search URLs. Note that filters which are managed by SearchOption objects must not be included here as
        these are added to URLs as query parameters. Note that this method also includes the "extra_filters" and "ignore_filters" parameters from
        self.get_active_filters as this is useful to create URLs to add or remove filters."""
        return self.get_filter_string_for_search_engine(include_filters_from_options=False, extra_filters=extra_filters, ignore_filters=ignore_filters)

    # Other util methods

    def contains_active_advanced_search_options(self):
        """Returns true if the query has any active options which belong to the "advanced search" panel
        Also returns true if the query has active undocumented options which are hidden in the advanced
        search panel but that are allowed as "power user" options
        """
        return any(
            option.advanced and option.set_in_request and not option.is_default_value
            for option in self.options.values()
        )

    def get_clustering_data_cache_key(self, include_filters_from_facets=False):
        """Generates a cache key used to store clustering results in the cache. Note that the key excludes facet filters
        by default because clusters are computed on the subset of results BEFORE applying the facet filters (this is by
        design to avoid recomputing clusters when changing facets). However, the key can be generated including facets as
        well because in some occasions we want to store clustering-related data which does depend on the facet filters which
        are applied after the main clustering computation.

        Args:
            include_filters_from_facets (bool): If True, the key will include filters from facets as well. Default is False.
                Filters that are included in facets correspond to the facet fields defined in self.facets, which defaults to
                settings.SEARCH_SOUNDS_DEFAULT_FACETS.

        Returns:
            str: Cache key for the clustering data
        """
        query_filter = self.get_filter_string_for_search_engine(include_filters_from_facets=include_filters_from_facets)
        key = f'cluster-results-{self.get_option_value_to_apply("query")}-' + \
              f'{query_filter}-{self.get_option_value_to_apply("sort_by")}-' + \
              f'{self.get_option_value_to_apply("similar_to")}-' + \
              f'{self.get_option_value_to_apply("group_by_pack")}'
        return create_hash(key, limit=32)

    def get_textual_description(self):
        """Returns a textual description of the search query, e.g.: '"cat" with 2 filters'
        """
        textual_query = self.get_option_value_to_apply('query')
        query_description = f'"{textual_query}"' if textual_query else 'Empty query'
        num_filters = self.get_num_active_filters()
        if num_filters:
            query_description += f' with {num_filters} filter{"" if num_filters == 1 else "s"}'
        return query_description

    def print(self):
        """Prints the SearchQueryProcessor object in a somewhat human readable format
        """
        print('\nSEARCH QUERY')
        print('f_parsed:')
        print(prettify(self.f_parsed))
        if self.errors:
            print('errors:')
            print(self.errors)
        print('options:')
        for name, option in self.options.items():
            print('-', name, option)
        if self.non_option_filters:
            print('non_option_filters:')
            for field_name, value in self.non_option_filters:
                print('-', f'{field_name}={value}')

    # NOTE: the class definition continues after this point (as_query_params and the methods that follow it)
+ + Args: + exclude_facet_filters (bool, optional): If True, facet filters will not be used to create the query_params dict. Default is False. + This is useful as part of the clustering features for which we want to make a query which ignores the facet filters provided in the URL. + + Returns: + dict: Dictionary with the query parameters to be used by the SearchEngine.search_sounds method. + """ + + # Filter field weights by "search in" options + field_weights = self.get_option_value_to_apply('field_weights') + search_in_value = self.get_option_value_to_apply('search_in') + if search_in_value: + field_weights = {field: weight for field, weight in field_weights.items() if field in search_in_value} + + # Number of sounds + if self.get_option_value_to_apply('display_as_packs'): + # When displaying results as packs, always return the same number regardless of the compact mode setting + # This because returning a large number of packs makes the search page very slow + # If we optimize pack search, this should be removed + num_sounds = settings.SOUNDS_PER_PAGE + else: + num_sounds = settings.SOUNDS_PER_PAGE if not self.get_option_value_to_apply('grid_mode') else settings.SOUNDS_PER_PAGE_COMPACT_MODE + + # Clustering + only_sounds_within_ids = [] + if allow_beta_search_features(self.request): + cluster_id = self.get_option_value_to_apply('cluster_id') + if cluster_id > -1: + only_sounds_within_ids = get_ids_in_cluster(self.get_clustering_data_cache_key(), cluster_id) + + # Facets + facets = self.facets + if self.get_option_value_to_apply('tags_mode'): + facets[settings.SEARCH_SOUNDS_FIELD_TAGS]['limit'] = 50 + + # Number of sounds per pack group + num_sounds_per_pack_group = 1 + if self.get_option_value_to_apply('display_as_packs'): + # If displaying search results as packs, include 3 sounds per pack group in the results so we can display these sounds as selected sounds in the + # display_pack templatetag + num_sounds_per_pack_group = 3 + + # Process similar_to parameter 
to convert it to a list if a vector is passed instead of a sound ID + similar_to = self.get_option_value_to_apply('similar_to') + if similar_to != '': + # If it starts with '[', then we assume this is a serialized vector passed as target for similarity + if similar_to.startswith('['): + similar_to = json.loads(similar_to) + else: + # Otherwise, we assume it is a sound id and we pass it as integer + similar_to = int(similar_to) + else: + similar_to = None + + return dict( + textual_query=self.get_option_value_to_apply('query'), + query_fields=field_weights, + query_filter=self.get_filter_string_for_search_engine(include_filters_from_facets=not exclude_facet_filters), + field_list=['id', 'score'] if not self.get_option_value_to_apply('map_mode') else ['id', 'score', 'geotag'], + current_page=self.get_option_value_to_apply('page'), + num_sounds=num_sounds if not self.get_option_value_to_apply('map_mode') else settings.MAX_SEARCH_RESULTS_IN_MAP_DISPLAY, + sort=self.get_option_value_to_apply('sort_by'), + group_by_pack=self.get_option_value_to_apply('group_by_pack') or self.get_option_value_to_apply('display_as_packs'), + num_sounds_per_pack_group=num_sounds_per_pack_group, + facets=facets, + only_sounds_with_pack=self.get_option_value_to_apply('display_as_packs'), + only_sounds_within_ids=only_sounds_within_ids, + similar_to=similar_to + ) + + def get_url(self, add_filters=None, remove_filters=None): + """Returns the URL of the search page (or tags page, see below) corresponding to the current parameters loaded in the SearchQueryProcessor. + This method also has the "add_filters" and "remove_filters" parameters, which will return the URL to the search page corresponding to the + current parameters loaded in the SearchQueryProcessor BUT with some filters added or removed. + + Args: + add_filters (list, optional): List of filters to be added. Each filter should be a string in the format "field:value", + e.g.: add_filters=["tag:tagname"]. Default is None. 
+ remove_filters (list, optional): List of filters to be ignored. Each filter should be a string in the format "field:value", + e.g.: remove_filters=["tag:tagname"]. Default is None. + """ + # Decide the base url (if in the tags page, we'll use the base URL for tags, otherwise we use the one for the normal search page) + if self.get_option_value_to_apply('tags_mode'): + base_url = reverse("tags") + else: + base_url = reverse("sounds-search") + + # Add query parameters from search options + parameters_to_add = {} + for option in self.options.values(): + if option.set_in_request and not option.is_default_value: + params_for_url = option.as_URL_params() + if params_for_url is not None: + parameters_to_add.update(params_for_url) + + # Add filter parameter + # Also pass extra filters to be added and/or filters to be removed when making the URL + filter_for_url = self.get_filter_string_for_url(extra_filters=add_filters, ignore_filters=remove_filters) + if filter_for_url: + parameters_to_add['f'] = filter_for_url + encoded_params = urlencode(parameters_to_add) + if encoded_params: + return f'{base_url}?{encoded_params}' + else: + return base_url + + # Some util methods to access option values more easily + + def get_option_value_to_apply(self, option_name): + option = self.options[option_name] + return option.value_to_apply + + def tags_mode_active(self): + return self.options['tags_mode'].value_to_apply + + def similar_to_active(self): + return self.options['similar_to'].value_to_apply + + def compute_clusters_active(self): + return self.options['compute_clusters'].value_to_apply + + def display_as_packs_active(self): + return self.options['display_as_packs'].value_to_apply + + def grid_mode_active(self): + return self.options['grid_mode'].value_to_apply + + def map_mode_active(self): + return self.options['map_mode'].value_to_apply + + \ No newline at end of file diff --git a/utils/search/search_query_processor_options.py b/utils/search/search_query_processor_options.py 
new file mode 100644 index 000000000..b4a0f025e --- /dev/null +++ b/utils/search/search_query_processor_options.py @@ -0,0 +1,398 @@ +# +# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA +# +# Freesound is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Freesound is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# Authors: +# See AUTHORS file. +# + +from django.conf import settings + + +class SearchOption(object): + """Base class to process and hold information about a search option (e.g. a filter or a sort option) of a search query. + SearchOption objects parse option data from a request object, and are later able to translate such data into a search + engine filter and/or a request parameter depending on the type of option. The class is meant to be subclassed to implement + particular types of search options (e.g. boolean, integer, string, etc.). The SearchOption object can also include logic + for determining when a specific option should be enabled or disabled based on the data of the search query (including the + values of other search options). + """ + sqp = None # SearchQueryProcessor object that holds this option object + value = None # Value of the option as a valid Python type (e.g. bool, int, str, list, etc.) 
+ set_in_request = None # Stores whether or not the option data is present in the search request + + def __init__(self, + advanced=True, + label='', + help_text='', + search_engine_field_name=None, + query_param_name=None, + value_default=None, + get_default_value=None, + should_be_disabled=None, + get_value_to_apply=None): + """Initialize the SearchOption object. + + Args: + advanced (bool, optional): Whether this option is part of the advanced search options (defaults to True). + label (str, optional): Label to be used in the frontend template when displaying the option. + help_text (str, optional): Help text to be used in the frontend template when displaying the option. + search_engine_field_name (str, optional): Field name of the search engine index corresponding to this option (can be None). + query_param_name (str, optional): Name to represent this option in the URL query parameters (can be None). + value_default (any, optional): Value of the option to be used when not set in the request (as valid Python type). Note that + this value can be overridden if passing the get_default_value optional parameter. + get_default_value (function, optional): A function returning the default value (as a valid Python type) that the option should take + if not set in the request. The function will be passed the SearchOption itself as an argument. + should_be_disabled (function, optional): Function to determine if the option should be disabled based on the data of the search query. + The function will be passed the SearchOption itself as an argument. + get_value_to_apply (function, optional): Function to determine the value to be used when applying the option in the search engine. + The function will be passed the SearchOption itself as an argument. 
+ """ + self.advanced = advanced + self.label = label + self.help_text = help_text + self.search_engine_field_name = search_engine_field_name + self.query_param_name = query_param_name + if value_default is not None: + self.value_default = value_default + if get_default_value is not None: + self.get_default_value = get_default_value + if should_be_disabled is not None: + self.should_be_disabled = should_be_disabled + if get_value_to_apply is not None: + self.get_value_to_apply = get_value_to_apply + + def set_search_query_processor(self, sqp): + """Set the SearchQueryProcessor object that holds this option object. The sqp parameter is a SearchQueryProcessor object + that allows SearchOption objects to access the request object and the value of other search options.""" + self.sqp = sqp + + @property + def request(self): + """Property to access the request object from the SearchQueryProcessor object in a convenient way.""" + return self.sqp.request + + def load_value(self): + """Sets the value of the option based on the data present in the request. If the option is not present in the request, + the default value is used. The set_in_request attribute is also set to True or False depending on whether the option + is present in the request or not. + """ + value_from_request = self.get_value_from_request() + if value_from_request is not None: + self.set_in_request = True + self.value = value_from_request + else: + self.set_in_request = False + self.value = self.default_value + + def get_value_from_request(self): + """Return the value of the option as a valid Python type (e.g. bool, int, str, list, etc.) based on the data present + in the search request. Must return None if the option is not passed in the request. This method is expected to be + implemented in the subclasses of SearchOption.""" + raise NotImplementedError + + @property + def default_value(self): + """Returns the default value of the search option as a valid Python type (e.g. bool, int, str, list, etc.) 
using the + self.get_default_value member passed as an argument to SearchOption or the existing self.value_default property.""" + if hasattr(self, 'get_default_value'): + return self.get_default_value(self) + else: + return self.value_default + + @property + def is_default_value(self): + """Returns True if the parsed value of the option is the same as the default value, returns False otherwise.""" + return self.value == self.default_value + + @property + def disabled(self): + """Returns True if the search option is disabled and false otherwise. If the method self.should_be_disabled has been set in + the object constructor, then it is used to compute the disabled property. When a search option is disabled, users will not + be able to edit their values when rendered in the search form. Other than that, the SearchOption is treated as any other + option and will be used when computing query params for the search engine.""" + if hasattr(self, 'should_be_disabled'): + return self.should_be_disabled(self) + else: + return False + + def format_value(self, value): + """Returns a string representation of the value passed as argument to be used in search engine parameters or as a URL parameter. + This method must be subclassed to implement meaningful conversions from the passed Python-type value to a string. + """ + raise NotImplementedError + + @property + def value_to_apply(self): + """Returns the value of the option to be used when applying the option in the search engine. 
By default, this method returns + the same value which is stored after reading the SearchOption value from the request, but some SearchOptions might use this + method to implement additional logic for computing the value to be used in the search engine (for example, if the actual value + to be used should change depending on the value of other search options).""" + if hasattr(self, 'get_value_to_apply'): + return self.get_value_to_apply(self) + else: + return self.value + + def as_filter(self): + """Returns a string to be used as a search engine filter for applying the search option. If this method returns None, then it + will not be applied as a filter. The filter will be applied if the option is expected to be applied as a filter (if it has + self.search_engine_field_name set) and if it is set in the request or the self.value_to_apply is different from the default + value.""" + if self.search_engine_field_name is not None: + if self.set_in_request or (self.value_to_apply != self.default_value): + return f'{self.search_engine_field_name}:{self.format_value(self.value_to_apply)}' + + def as_URL_params(self): + """Returns a dictionary with the URL parameters to be used when rendering the search option in a URL. Most search options + will be rendered as a single URL parameter, but some subclasses might override this method to implement more complex logic + using multiple parameters or other types of calculations. If the option should not be rendered in the URL, return None. Note + that unlike the value we use to send to the search engine, here we want to use the value as it is set in the original request, + without including any additional post-processing from self.value_to_apply. This is because we want the URL parameters + generated by a search option to be equivalent to the parameters of that option passed in the original request. 
+ """ + if self.query_param_name is not None: + return {self.query_param_name: self.format_value(self.value)} + + @property + def value_formatted(self): + """Returns the value of the option formatted to be used in search engine parameters or as a URL parameter. This method is + convenient in the search frontend templates to set the request value of the option in the search form.""" + return self.format_value(self.value) + + def __str__(self): + return f"{self.label}={self.value}, apply: {self.value_to_apply} ({'in request' if self.set_in_request else 'not in request'}, {'disabled' if self.disabled else 'enabled'})" + + def __copy2__(self): + newone = type(self)() + newone.__dict__.update(self.__dict__) + return newone + + +class SearchOptionBool(SearchOption): + value_default = False + + def get_value_from_request(self): + if self.query_param_name is not None: + if self.query_param_name in self.request.GET: + return self.request.GET.get(self.query_param_name) == '1' or self.request.GET.get(self.query_param_name) == 'on' + + def format_value(self, value): + return '1' if value else '0' + + def as_filter(self): + """Boolean search options are only added to the filter if they are set to True. In this way, when set to False, we return the + whole set of results without filtering by this option, but when the option is set, we filter results by this option. It + might happen that boolean search options added in the future require a different logic and then we should consider + updating this class to support a different behavior.""" + if self.value_to_apply == True: + return super().as_filter() + + +class SearchOptionInt(SearchOption): + """SearchOption class to represent integer options. 
+ """ + value_default = -1 + + def get_value_from_request(self): + if self.query_param_name is not None: + if self.query_param_name in self.request.GET: + return int(self.request.GET.get(self.query_param_name)) + + def format_value(self, value): + return str(value) + + +class SearchOptionStr(SearchOption): + """SearchOption class to represent string options. + """ + value_default = '' + + def get_value_from_request(self): + if self.query_param_name is not None: + if self.query_param_name in self.request.GET: + return self.request.GET.get(self.query_param_name) + + def format_value(self, value): + return str(value) + + +class SearchOptionChoice(SearchOptionStr): + """SearchOption class to represent choice options in which one string option is selected + from a list of available choices. Choices must have the format [(value, label), ...], typical from + Django forms. + """ + def __init__(self, choices=[], **kwargs): + """Args: + choices (list): List of available choices in the format [(value, label), ...]. + """ + self.choices =choices + super().__init__(**kwargs) + + def get_choices_annotated_with_selection(self): + """Returns the list of available choices annotated with a boolean indicating whether the choice + is selected or not. This is useful in search templates. + """ + choices_annotated = [] + for value, label in self.choices: + choices_annotated.append((value, label, value == self.value)) + return choices_annotated + + +class SearchOptionMultipleChoice(SearchOption): + """SearchOption class to represent choice options in which multiple string options are selected + from a list of available choices. Choices must have the format [(value, label), ...], typical from + Django forms. Multiple choices are expected to be passed in the request as multiple URL parameters + with a common prefix such as "&{prefix}_{value}=1". 
+ """ + value_default = [] + + def __init__(self, choices=[], query_param_name_prefix='', **kwargs): + """Args: + choices (list): List of available choices in the format [(value, label), ...]. + query_param_name_prefix (str): Prefix to be used in the URL parameters to represent the multiple choices. + """ + self.choices = choices + self.query_param_name_prefix = query_param_name_prefix + super().__init__(**kwargs) + + def get_query_param_name(self, value): + return f'{self.query_param_name_prefix}_{value}' + + def get_value_from_request(self): + selected_values = [] + for value, _ in self.choices: + query_param_name = self.get_query_param_name(value) + if query_param_name in self.request.GET: + if self.request.GET.get(query_param_name) == '1' or self.request.GET.get(query_param_name) == 'on': + selected_values.append(value) + return selected_values + + def format_value(self, value): + return "[" + " OR ".join([str(v) for v in value]) + "]" + + def get_choices_annotated_with_selection(self): + """Returns the list of available choices annotated with a boolean indicating whether the choice + is selected or not. This is useful in search templates. + """ + choices_annotated = [] + for value, label in self.choices: + choices_annotated.append((self.get_query_param_name(value), label, value in self.value)) + return choices_annotated + + def as_URL_params(self): + params = {self.get_query_param_name(value): '1' for value, _ in self.choices if value in self.value} + return params + + +class SearchOptionRange(SearchOption): + value_default = ['*', '*'] + query_param_min = None + query_param_max = None + + def __init__(self, query_param_min=None, query_param_max=None, **kwargs): + """Args: + query_param_min (str, optional): Name of the URL parameter to represent the minimum value of the range. + query_param_max (str, optional): Name of the URL parameter to represent the maximum value of the range. 
+ """ + self.query_param_min = query_param_min + self.query_param_max = query_param_max + super().__init__(**kwargs) + + + def get_value_from_request(self): + if self.query_param_min is not None and self.query_param_max is not None: + if self.query_param_min in self.request.GET or self.query_param_max in self.request.GET: + value = self.value_default.copy() + if self.query_param_min in self.request.GET: + value_from_param = str(self.request.GET[self.query_param_min]) + if value_from_param: + value[0] = value_from_param + if self.query_param_max in self.request.GET: + value_from_param = str(self.request.GET[self.query_param_max]) + if value_from_param: + value[1] = value_from_param + return value + + def format_value(self, value): + return f'[{value[0]} TO {value[1]}]' + + def as_URL_params(self): + return {self.query_param_min: self.value[0], + self.query_param_max: self.value[1]} + + def as_filter(self): + """SearchOptionRange search options are only added to the filter if the specified range is not covering all possible + field values. The defined self.default_value for a range option is expected to include all results, therefore if the + value is the same as the default value, we don't need to include this option as a filter. It might happen that range + search options added in the future require a different logic and then we should consider updating this class to support + a different behavior.""" + if not self.is_default_value: + return super().as_filter() + + +class SearchOptionBoolElementInPath(SearchOptionBool): + """This is a special type of search option which is not passed as a URL parameter but is determined based on the URL path. + The "element_in_path" is compared with the request path and the value of the option is set to True if the element is present. + """ + + def __init__(self, element_in_path='', **kwargs): + """Args: + element_in_path (str): Element to be checked in the request path. 
+ """ + self.element_in_path = element_in_path + super().__init__(**kwargs) + + def get_value_from_request(self): + return self.element_in_path in self.request.path + + +class SearchOptionFieldWeights(SearchOptionStr): + """This is a search option for the "field weights" parameter in the search engine. This parameter must be parsed in a particular + way which requires further customisation of SearchOption object and therefore can't be implemented with another generic + SearchOptionX class. + """ + value_default = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS + + def get_value_from_request(self): + """param weights can be used to specify custom field weights with this format + w=field_name1:integer_weight1,field_name2:integer_weight2, eg: w=name:4,tags:1 + ideally, field names should be any of those specified in settings.SEARCH_SOUNDS_FIELD_* + so the search engine can implement ways to translate the "web names" to "search engine" + names if needed. + """ + weights_param = self.request.GET.get(self.query_param_name, None) + parsed_field_weights = {} + if weights_param: + for part in weights_param.split(','): + if ':' in part: + try: + field_name = part.split(':')[0] + weight = int(part.split(':')[1]) + parsed_field_weights[field_name] = weight + except Exception: + # If format is wrong, ignore parameter + pass + if len(parsed_field_weights): + return parsed_field_weights + else: + return None + + def as_URL_params(self): + value_for_url = '' + for field, weight in self.value.items(): + value_for_url += f'{field}:{weight},' + if value_for_url.endswith(','): + value_for_url = value_for_url[:-1] + return {self.query_param_name : value_for_url} \ No newline at end of file diff --git a/utils/search/search_sounds.py b/utils/search/search_sounds.py index 81f084a00..6e673f521 100644 --- a/utils/search/search_sounds.py +++ b/utils/search/search_sounds.py @@ -22,237 +22,22 @@ from django.conf import settings from django.db.models.query import RawQuerySet -from urllib.parse 
import quote_plus -from pyparsing import ParseException -import clustering from utils.search import SearchEngineException, get_search_engine, SearchResultsPaginator -from utils.search.lucene_parser import parse_query_filter_string +import utils.search + search_logger = logging.getLogger("search") console_logger = logging.getLogger("console") -def should_use_compact_mode(request): - use_compact_mode_enabled_in_form = request.GET.get('cm') - if not request.user.is_authenticated: - return use_compact_mode_enabled_in_form == '1' - else: - if use_compact_mode_enabled_in_form is None: - # Use user default - return request.user.profile.use_compact_mode - elif use_compact_mode_enabled_in_form == '1': - # Use compact mode, but update user preferences if these differ from form value - if use_compact_mode_enabled_in_form and not request.user.profile.use_compact_mode: - request.user.profile.use_compact_mode = True - request.user.profile.save() - return True - else: - # Do not use compact mode, but update user preferences if these differ from form value - if use_compact_mode_enabled_in_form and request.user.profile.use_compact_mode: - request.user.profile.use_compact_mode = False - request.user.profile.save() - return False - -def contains_active_advanced_search_filters(request, query_params, extra_vars): - duration_filter_is_default = True - if 'duration:' in query_params['query_filter']: - if 'duration:[0 TO *]' not in query_params['query_filter']: - duration_filter_is_default = False - using_advanced_search_weights = request.GET.get("a_tag", False) \ - or request.GET.get("a_filename", False) \ - or request.GET.get("a_description", False) \ - or request.GET.get("a_packname", False) \ - or request.GET.get("a_soundid", False) \ - or request.GET.get("a_username", False) - return using_advanced_search_weights \ - or 'is_geotagged:' in query_params['query_filter'] \ - or 'in_remix_group:' in query_params['query_filter'] \ - or not duration_filter_is_default - - -def 
search_prepare_parameters(request): - """Parses and pre-process search input parameters from the search view request object and returns them as a dict. - - From the request object, it constructs a dict with query parameters which will be compatible with - utils.search.SearchEngine.search_sounds(...) parameters. Additionally, other variables are returned which - are used for logging purpose and for building the search view context variables. - - Args: - request (HttpRequest): request associated with the search query submitted by the user. - - Returns: - Tuple(dict, dict, dict): 3-element tuple containing the query parameters compatible with the search_sounds, - method from SearchEngine, the search params used for logging, and some extra parameters needed in - the search view. - """ - search_query = request.GET.get("q", "") - filter_query = request.GET.get("f", "").strip().lstrip() - cluster_id = request.GET.get('cluster_id', "") - - try: - current_page = int(request.GET.get("page", 1)) - except ValueError: - current_page = 1 - sort = request.GET.get("s", None) - - if search_query == "" and sort is None: - # When making empty queries and no sorting is specified, automatically set sort to "created desc" as - # relevance score based sorting makes no sense - sort = settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST - - # If the query is filtered by pack, do not collapse sounds of the same pack (makes no sense) - # If the query is through AJAX (for sources remix editing), do not collapse by pack - group_by_pack = request.GET.get("g", "1") == "1" # Group by default - if "pack" in filter_query or request.GET.get("ajax", "") == "1": - group_by_pack = False - - # If the query is filtered by pack, do not add the "only sounds with pack" filter (makes no sense) - only_sounds_with_pack = request.GET.get("only_p", "0") == "1" # By default, do not limit to sounds with pack - if "pack" in filter_query: - only_sounds_with_pack = False - - # If the query is displaying only sounds with 
pack, also enable group by pack as this is needed to display - # results as packs - if only_sounds_with_pack: - group_by_pack = True - - # Set default values for field weights - id_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_ID] - tag_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_TAGS] - description_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_DESCRIPTION] - username_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_USER_NAME] - pack_tokenized_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_PACK_NAME] - original_filename_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_NAME] - - # Parse advanced search options - advanced = request.GET.get("advanced", "") - advanced_search_params_dict = {} - - if advanced == "1": - a_tag = request.GET.get("a_tag", "") - a_filename = request.GET.get("a_filename", "") - a_description = request.GET.get("a_description", "") - a_packname = request.GET.get("a_packname", "") - a_soundid = request.GET.get("a_soundid", "") - a_username = request.GET.get("a_username", "") - - # These are stored in a dict to facilitate logging and passing to template - advanced_search_params_dict.update({ - 'a_tag': a_tag, - 'a_filename': a_filename, - 'a_description': a_description, - 'a_packname': a_packname, - 'a_soundid': a_soundid, - 'a_username': a_username, - }) - - # If none is selected use all (so other filter can be applied) - if a_tag != "" or a_filename != "" or a_description != "" or a_packname != "" or a_soundid != "" \ - or a_username != "": - - # Initialize all weights to 0 - id_weight = 0 - tag_weight = 0 - description_weight = 0 - username_weight = 0 - pack_tokenized_weight = 0 - original_filename_weight = 0 - - # Set the weights of selected checkboxes - if a_soundid != "": - id_weight = 
settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_ID] - if a_tag != "": - tag_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_TAGS] - if a_description != "": - description_weight = \ - settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_DESCRIPTION] - if a_username != "": - username_weight = settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_USER_NAME] - if a_packname != "": - pack_tokenized_weight = \ - settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_PACK_NAME] - if a_filename != "": - original_filename_weight = \ - settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_NAME] - - field_weights = { - settings.SEARCH_SOUNDS_FIELD_ID: id_weight, - settings.SEARCH_SOUNDS_FIELD_TAGS: tag_weight, - settings.SEARCH_SOUNDS_FIELD_DESCRIPTION: description_weight, - settings.SEARCH_SOUNDS_FIELD_USER_NAME: username_weight, - settings.SEARCH_SOUNDS_FIELD_PACK_NAME: pack_tokenized_weight, - settings.SEARCH_SOUNDS_FIELD_NAME: original_filename_weight - } - - # if query param 'w' is present, override field weights - weights_parameter = request.GET.get("w", "") - custom_field_weights = parse_weights_parameter(weights_parameter) - if custom_field_weights is not None: - field_weights = custom_field_weights - - # parse query filter string and remove empty value fields - parsing_error = False - try: - parsed_filters = parse_query_filter_string(filter_query) - except ParseException: - parsed_filters = [] - parsing_error = True - - filter_query = ' '.join([''.join(filter_str) for filter_str in parsed_filters]) - filter_query_non_facets, has_facet_filter = remove_facet_filters(parsed_filters) - - if only_sounds_with_pack: - # When displaying results as packs, always return the same number regardless of the compact mode setting - # This because returning a large number of packs makes the search page very slow - num_sounds = 
settings.SOUNDS_PER_PAGE - else: - num_sounds = settings.SOUNDS_PER_PAGE if not should_use_compact_mode(request) else settings.SOUNDS_PER_PAGE_COMPACT_MODE - - if settings.ENABLE_SEARCH_RESULTS_CLUSTERING: - cluster_id = request.GET.get('cluster_id') - if cluster_id: - in_ids = clustering.interface.get_ids_in_cluster(request, cluster_id) - query_params.update({'only_sounds_within_ids': in_ids}) - - query_params = { - 'textual_query': search_query, - 'query_filter': filter_query, - 'sort': sort, - 'current_page': current_page, - 'num_sounds': num_sounds, - 'query_fields': field_weights, - 'group_by_pack': group_by_pack, - 'only_sounds_with_pack': only_sounds_with_pack, - 'only_sounds_within_ids': [], - 'similar_to': request.GET.get('similar_to', None), - 'facets': settings.SEARCH_SOUNDS_DEFAULT_FACETS.copy(), - } - - # These variables are not used for querying the sound collection - # We keep them separated in order to facilitate the distinction between variables used for performing - # the Solr query and these extra ones needed for rendering the search template page - filter_query_link_more_when_grouping_packs = filter_query.replace(' ', '+') - extra_vars = { - 'filter_query_link_more_when_grouping_packs': filter_query_link_more_when_grouping_packs, - 'advanced': advanced, - 'cluster_id': cluster_id, - 'filter_query_non_facets': filter_query_non_facets, - 'has_facet_filter': has_facet_filter, - 'parsed_filters': parsed_filters, - 'parsing_error': parsing_error, - 'raw_weights_parameter': weights_parameter, - } - - return query_params, advanced_search_params_dict, extra_vars - - def parse_weights_parameter(weights_param): """param weights can be used to specify custom field weights with this format w=field_name1:integer_weight1,field_name2:integrer_weight2, eg: w=name:4,tags:1 ideally, field names should any of those specified in settings.SEARCH_SOUNDS_FIELD_* so the search engine can implement ways to translate the "web names" to "search engine" names if needed. 
+ NOTE: this function is only used in the API """ parsed_field_weights = {} if weights_param: @@ -271,103 +56,6 @@ def parse_weights_parameter(weights_param): return None -def split_filter_query(filter_query, parsed_filters, cluster_id): - """Pre-process parsed search filter parameters and returns the filters' information. - - This function is used in the search template to display the filter and the link when removing them. - The cluster ID is provided separated from the parsed filters in order to keep clustering explicitly - separated from the rest of the filters. - - Args: - filter_query (str): query filter string. - parsed_filters (List[List[str]]): parsed query filter. - cluster_id (str): cluster filter string. - - Returns: - List[dict]: list of dictionaries containing the filter name and the url when removing the filter. - """ - # Generate array with information of filters - filter_query_split = [] - if parsed_filters: - for filter_list_str in parsed_filters: - # filter_list_str is a list of str ['', ':', '"', '', '"'] - filter_name = filter_list_str[0] - if filter_name != "duration" and filter_name != "is_geotagged" and filter_name != "in_remix_group": - valid_filter = True - filter_str = ''.join(filter_list_str) - filter_display = ''.join(filter_list_str) - if filter_name == "grouping_pack": - filter_value = filter_list_str[-1].rstrip('"') - # If pack does not contain "_" then it's not a valid pack filter - if "_" in filter_value: - filter_display = "pack:"+ ''.join(filter_value.split("_")[1:]) - else: - valid_filter = False - - if valid_filter: - filter = { - 'name': filter_display, - 'remove_url': quote_plus(filter_query.replace(filter_str, '')), - 'cluster_id': cluster_id, - } - filter_query_split.append(filter) - - # add cluster filter information - if settings.ENABLE_SEARCH_RESULTS_CLUSTERING: - if cluster_id and cluster_id.isdigit(): - filter_query_split.append({ - 'name': "Cluster #" + cluster_id, - 'remove_url': quote_plus(filter_query), - 
'cluster_id': '', - }) - - return filter_query_split - - -def remove_facet_filters(parsed_filters): - """Process query filter string to keep only non facet filters - - Fact filters correspond to the filters that can be applied using one of the displayed facet in - the search interface. This method is useful for being able to combine classic facet filters and clustering - because clustering has to be done on the results of a search without applying facet filters (we want - to have the clustering facet behaving as a traditional facet, meaning that the clustering should not - be re-triggered when applying new facet filters on the results). - Additionally, it returns a boolean that indicates if a facet filter was present in the query. - - Args: - parsed_filters (List[List[str]]): parsed query filter. - - Returns: - filter_query (str): query filter string with only non facet filters. - has_facet_filter (bool): boolean indicating if there exist facet filters in the processed string. - """ - facet_filter_strings = ( - "samplerate", - "grouping_pack", - "username", - "tag", - "bitrate", - "bitdepth", - "type", - "channels", - "license", - ) - has_facet_filter = False - filter_query = "" - - if parsed_filters: - filter_query_parts = [] - for parsed_filter in parsed_filters: - if parsed_filter[0] in facet_filter_strings: - has_facet_filter = True - else: - filter_query_parts.append(''.join(parsed_filter)) - - filter_query = ' '.join(filter_query_parts) - - return filter_query, has_facet_filter - - def perform_search_engine_query(query_params): """Perform a query in the search engine given some query parameters and get the paginated results @@ -468,3 +156,72 @@ def get_random_sound_id_from_search_engine(): except SearchEngineException as e: search_logger.info(f"Could not retrieve a random sound ID from search engine: {str(e)}") return 0 + +def get_sound_similarity_from_search_engine_query(query_params, analyzer_name=settings.SEARCH_ENGINE_DEFAULT_SIMILARITY_ANALYZER, 
current_page=None, num_sounds=None): + '''Gets the similarity vectors for the first "num_sounds" sounds for the given query. + + Args: + query_params (dict): query parameters dictionary with parameters following the specification of search_sounds + function from utils.search.SearchEngine. + analyzer_name (str): name of the similarity analyzer from which to get the vector + current_page (int): page number of the results to retrieve similarity vectors for. If None, the current page + from query_params will be used. + num_sounds (int): number of sounds to retrieve similarity vectors for. If None, the number of sounds + in the query_params will be used. + + Returns: + dict: dictionary with sound IDs as keys and similarity vectors as values + ''' + + # Update query params to get similarity vectors of the requested page of sounds + config_options = settings.SEARCH_ENGINE_SIMILARITY_ANALYZERS[analyzer_name] + vector_field_name = utils.search.backends.solr555pysolr.SOLR_VECTOR_FIELDS_DIMENSIONS_MAP.get(config_options['vector_size']) + query_params.update({ + 'facets': None, + 'current_page': current_page if current_page is not None else query_params['current_page'], + 'num_sounds': num_sounds if num_sounds is not None else query_params['num_sounds'], + 'field_list': ['id', 'score', 'similarity_vectors', 'sim_vector100', f'[child childFilter="content_type:v AND analyzer:{analyzer_name}" limit=1]'] + }) + results, _ = perform_search_engine_query(query_params) + + # Collect sound IDs and similarity vectors from query results + similarity_vectors_map = {} + for d in results.docs: + if 'group_docs' in d: + d0 = d['group_docs'][0] + else: + d0 = d + if len(d0.get("similarity_vectors", [])) > 0: + similarity_vectors_map[d0['id']] = d0["similarity_vectors"][0][vector_field_name] + + return similarity_vectors_map + +def get_sound_ids_from_search_engine_query(query_params, current_page=None, num_sounds=None): + """Performs Solr query and returns results as a list of sound ids. 
+ + Args: + query_params (dict): contains the query parameters to replicate the user query. + current_page (int): page number of the results to retrieve IDs for. If None, the current page + from query_params will be used. + num_sounds (int): number of sounds to retrieve IDs for. If None, the number of sounds + in the query_params will be used. + + Returns: + List[int]: list containing the ids of the retrieved sounds (for the current_page or num_sounds). + """ + # We set facets to None in order to reduce the amount of data that the search engine will return. + query_params.update({ + 'facets': None, + 'current_page': current_page if current_page is not None else query_params['current_page'], + 'num_sounds': num_sounds if num_sounds is not None else query_params['num_sounds'], + }) + results, _ = perform_search_engine_query(query_params) + resultids = [d.get("id") for d in results.docs] + return resultids + + +def allow_beta_search_features(request): + if not request.user.is_authenticated: + return False + if request.user.has_perm('profile.show_beta_search_options'): + return True diff --git a/utils/search/search_summary.txt b/utils/search/search_summary.txt deleted file mode 100644 index e01a961a4..000000000 --- a/utils/search/search_summary.txt +++ /dev/null @@ -1,56 +0,0 @@ -fields: - regular field searches: - fieldname:hello AND fieldname:"long sentence" - - search default field for hello: - hello - - search a field for two terms: - fielname:(bass drum) - - -term modifiers: - wildcart searches: - te?t (matches text and test) - test* (matches test, testing and tests) - te*t (...) 
- - fuzzy: - roam~ (matches foam and roams) - roam~0.1 (1 > matches only roam, 0 > matches very fuzzy) - - range searches: - mod_date:[20020101 TO 20030101] - mod_date:{20020101 TO 20030101} (excluding boundaries) - mod_date:[20020101 TO *] - pubdate:[NOW-1YEAR/DAY TO NOW/DAY+1DAY] - title:[aida TO zoroaster] - title:{aida TO zoroaster} - - boosting: - jakarta^4 apache (jakarta more relevant) - jakarta^0.1 apache (jakarta less relevant) - - -boolean operators: - AND: - "jakarta apache" AND "Apache Lucene" - "jakarta apache" && "Apache Lucene" - - OR: - "jakarta apache" OR "Apache Lucene" - "jakarta apache" || "Apache Lucene" - - + (include): - +jakarta OR lucene (must have jakarta, lucene optional) - - NOT: - NOT "jakarta" - - - (exclude): - jakarta AND -apache (contain jakarta, but not lucene) - -inStock:false (all field values where inStock is not false) - - -grouping: - title:(+return +"pink panther") (contains both the word "return" and the phrase "pink panther") \ No newline at end of file diff --git a/utils/tests/test_search_general.py b/utils/tests/test_search_general.py deleted file mode 100644 index fc6bc589c..000000000 --- a/utils/tests/test_search_general.py +++ /dev/null @@ -1,317 +0,0 @@ -# -# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA -# -# Freesound is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# Freesound is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . -# -# Authors: -# See AUTHORS file. 
-# -from django.contrib.sessions.middleware import SessionMiddleware -from django.contrib.auth.middleware import AuthenticationMiddleware -from django.test import TestCase, override_settings -from django.test.client import RequestFactory -from django.urls import reverse -from django.conf import settings -from urllib.parse import quote_plus -from utils.search.search_sounds import search_prepare_parameters, split_filter_query, remove_facet_filters -from utils.search.lucene_parser import parse_query_filter_string - - -class SearchUtilsTest(TestCase): - - def setUp(self): - self.factory = RequestFactory() - - def test_search_prepare_parameters_without_query_params(self): - request = self.factory.get(reverse('sounds-search')) - SessionMiddleware().process_request(request) - AuthenticationMiddleware().process_request(request) - request.session.save() - query_params, advanced_search_params_dict, extra_vars = search_prepare_parameters(request) - - expected_default_query_params = { - 'query_fields': settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS, - 'sort': settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST, - 'num_sounds': settings.SOUNDS_PER_PAGE, - 'current_page': 1, - 'group_by_pack': True, - 'query_filter': '', - 'textual_query': '', - 'similar_to': None, - 'only_sounds_with_pack': False, - 'only_sounds_within_ids': [], - 'facets': settings.SEARCH_SOUNDS_DEFAULT_FACETS - } - - expected_extra_vars = { - 'advanced': '', - 'filter_query_link_more_when_grouping_packs': '', - 'cluster_id': '', - 'filter_query_non_facets': '', - 'has_facet_filter': False, - 'parsed_filters': [], - 'parsing_error': False, - 'raw_weights_parameter': '', - } - - self.assertDictEqual(query_params, expected_default_query_params) - self.assertDictEqual(advanced_search_params_dict, {}) - self.assertDictEqual(extra_vars, expected_extra_vars) - - def test_search_prepare_parameters_with_query_params(self): - # "dog" query, search only in tags and descriptions, duration from 1-10 sec, only geotag, sort by 
duration, no group by pack - url_query_str = '?q=dog&f=duration:[1+TO+10]+is_geotagged:1&s=Duration+(longest+first)&advanced=1&a_tag=1&a_description=1&g=' - request = self.factory.get(reverse('sounds-search')+url_query_str) - SessionMiddleware().process_request(request) - AuthenticationMiddleware().process_request(request) - request.session.save() - query_params, advanced_search_params_dict, extra_vars = search_prepare_parameters(request) - expected_default_query_params = { - 'query_fields': { - settings.SEARCH_SOUNDS_FIELD_ID: 0, - settings.SEARCH_SOUNDS_FIELD_TAGS: - settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_TAGS], - settings.SEARCH_SOUNDS_FIELD_DESCRIPTION: - settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_DESCRIPTION], - settings.SEARCH_SOUNDS_FIELD_USER_NAME: 0, - settings.SEARCH_SOUNDS_FIELD_PACK_NAME: 0, - settings.SEARCH_SOUNDS_FIELD_NAME: 0 - }, - 'sort': settings.SEARCH_SOUNDS_SORT_OPTION_DURATION_LONG_FIRST, - 'num_sounds': settings.SOUNDS_PER_PAGE, - 'current_page': 1, - 'group_by_pack': False, - 'query_filter': 'duration:[1 TO 10] is_geotagged:1', - 'textual_query': 'dog', - 'similar_to': None, - 'only_sounds_with_pack': False, - 'only_sounds_within_ids': [], - 'facets': settings.SEARCH_SOUNDS_DEFAULT_FACETS - } - - expected_extra_vars = { - 'advanced': '1', - 'filter_query_link_more_when_grouping_packs': 'duration:[1+TO+10]+is_geotagged:1', - 'cluster_id': '', - 'filter_query_non_facets': 'duration:[1 TO 10] is_geotagged:1', - 'has_facet_filter': False, - 'parsed_filters': [['duration', ':', '[', '1', ' TO ', '10', ']'], ['is_geotagged', ':', '1']], - 'parsing_error': False, - 'raw_weights_parameter': '', - } - - expected_advanced_search_params_dict = { - 'a_tag': '1', - 'a_username': '', - 'a_description': '1', - 'a_packname': '', - 'a_filename': '', - 'a_soundid': '', - } - - self.assertDictEqual(query_params, expected_default_query_params) - self.assertDictEqual(advanced_search_params_dict, 
expected_advanced_search_params_dict) - self.assertDictEqual(extra_vars, expected_extra_vars) - - def test_remove_facet_filters(self): - query_filter_str = 'is_geotagged:1 tag:"dog"' - parsed_filters = parse_query_filter_string(query_filter_str) - filter_without_facet, has_facet_filter = remove_facet_filters(parsed_filters) - self.assertTrue(has_facet_filter) - self.assertEqual(filter_without_facet, 'is_geotagged:1') - - def test_remove_facet_filters_no_facet(self): - query_filter_str = 'duration:[1 TO 10] is_geotagged:1' - parsed_filters = parse_query_filter_string(query_filter_str) - filter_without_facet, has_facet_filter = remove_facet_filters(parsed_filters) - self.assertFalse(has_facet_filter) - self.assertEqual(filter_without_facet, query_filter_str) - - def test_remove_facet_filters_special_char(self): - query_filter_str = 'grouping_pack:"1_:)" tag:"dog"' - parsed_filters = parse_query_filter_string(query_filter_str) - filter_without_facet, has_facet_filter = remove_facet_filters(parsed_filters) - self.assertTrue(has_facet_filter) - self.assertEqual(filter_without_facet, '') - - def test_remove_facet_filters_special_char2(self): - query_filter_str = 'grouping_pack:"19265_Impacts, Hits, Friction & Tools" tag:"tools" samplerate:"44100" \ - bitrate:"1379" duration:[0 TO 10]' - parsed_filters = parse_query_filter_string(query_filter_str) - filter_without_facet, has_facet_filter = remove_facet_filters(parsed_filters) - self.assertTrue(has_facet_filter) - self.assertEqual(filter_without_facet, 'duration:[0 TO 10]') - - def test_remove_facet_filters_special_char3(self): - query_filter_str = 'grouping_pack:"..." 
tag:"da@," duration:[0 TO 1.1]' - parsed_filters = parse_query_filter_string(query_filter_str) - filter_without_facet, has_facet_filter = remove_facet_filters(parsed_filters) - self.assertTrue(has_facet_filter) - self.assertEqual(filter_without_facet, 'duration:[0 TO 1.1]') - - def test_search_prepare_parameters_non_ascii_query(self): - # Simple test to check if some non ascii characters are correctly handled by search_prepare_parameters() - request = self.factory.get(reverse('sounds-search')+'?q=Æ æ ¿ É') - SessionMiddleware().process_request(request) - AuthenticationMiddleware().process_request(request) - request.session.save() - query_params, advanced_search_params_dict, extra_vars = search_prepare_parameters(request) - self.assertEqual(query_params['textual_query'], '\xc6 \xe6 \xbf \xc9') - - def test_split_filter_query_duration_and_facet(self): - # We check that the combination of a duration filter and a facet filter (CC Attribution) works correctly. - filter_query_string = 'duration:[0 TO 10] license:"attribution" username:"XavierFav" grouping_pack:"1_best-pack-ever"' - parsed_filters = parse_query_filter_string(filter_query_string) - filter_query_split = split_filter_query(filter_query_string, parsed_filters, '') - - # duraton filter is not a facet, but should stay present when removing a facet. - expected_filter_query_split = [ - {'remove_url': 'duration:[0 TO 10]', 'name': 'license:"attribution"'}, - ] - expected_filter_query_split = [ - {'remove_url': quote_plus('duration:[0 TO 10] username:"XavierFav" grouping_pack:"1_best-pack-ever"'), 'name': 'license:"attribution"'}, - {'remove_url': quote_plus('duration:[0 TO 10] license:"attribution" grouping_pack:"1_best-pack-ever"'), 'name': 'username:"XavierFav"'}, - {'remove_url': quote_plus('duration:[0 TO 10] license:"attribution" username:"XavierFav"'), 'name': 'pack:best-pack-ever'}, - ] - - # the order does not matter for the list of facet dicts. - # we get the index of the correspondings facets dicts. 
- filter_query_names = [filter_query_dict['name'] for filter_query_dict in filter_query_split] - cc_attribution_facet_dict_idx = filter_query_names.index('license:"attribution"') - username_facer_dict_idx = filter_query_names.index('username:"XavierFav"') - grouping_pack_facet_dict_idx = filter_query_names.index('pack:best-pack-ever') - - # we use assertIn because the unicode strings that split_filter_query generates can incorporate - # additional spaces at the end of the string, which is not a problem. - # Additonally, some additional spaces have been observed in the middle of the remove_url string. We replace double - # spaces with single ones in this test. However, we should probably identify where does this additional spaces - # come from. - # 1-Attribution - self.assertIn(expected_filter_query_split[0]['name'], - filter_query_split[cc_attribution_facet_dict_idx]['name']) - self.assertIn(expected_filter_query_split[0]['remove_url'], - filter_query_split[cc_attribution_facet_dict_idx]['remove_url'].replace('++', '+')) - - # 2-Username - self.assertIn(expected_filter_query_split[1]['name'], - filter_query_split[username_facer_dict_idx]['name']) - self.assertIn(expected_filter_query_split[1]['remove_url'], - filter_query_split[username_facer_dict_idx]['remove_url'].replace('++', '+')) - - # 3-Pack - self.assertIn(expected_filter_query_split[2]['name'], - filter_query_split[grouping_pack_facet_dict_idx]['name']) - self.assertIn(expected_filter_query_split[2]['remove_url'], - filter_query_split[grouping_pack_facet_dict_idx]['remove_url'].replace('++', '+')) - - def test_split_filter_query_special_chars(self): - filter_query_string = 'license:"sampling+" grouping_pack:"1_example pack + @ #()*"' - parsed_filters = parse_query_filter_string(filter_query_string) - filter_query_split = split_filter_query(filter_query_string, parsed_filters, '') - filter_query_names = [filter_query_dict['name'] for filter_query_dict in filter_query_split] - - expected_filter_query_split = 
[ - {'remove_url': quote_plus('grouping_pack:"1_example pack + @ #()*"'), 'name': 'license:"sampling+"'}, - {'remove_url': quote_plus('license:"sampling+"'), 'name': 'pack:example pack + @ #()*'}, - ] - cc_samplingplus_facet_dict_idx = filter_query_names.index('license:"sampling+"') - grouping_pack_facet_dict_idx = filter_query_names.index('pack:example pack + @ #()*') - - self.assertIn(expected_filter_query_split[0]['name'], - filter_query_split[cc_samplingplus_facet_dict_idx]['name']) - self.assertIn(expected_filter_query_split[0]['remove_url'], - filter_query_split[cc_samplingplus_facet_dict_idx]['remove_url']) - - self.assertIn(expected_filter_query_split[1]['name'], - filter_query_split[grouping_pack_facet_dict_idx]['name']) - self.assertIn(expected_filter_query_split[1]['remove_url'], - filter_query_split[grouping_pack_facet_dict_idx]['remove_url']) - - # most of these tests just ensure that no exception is returned when trying to parse filter strings - # that gave problems while developping the filter string parser function - # utils.search.lucene_parser.parse_query_filter_string() - def test_parse_filter_query_special_created(self): - filter_query_string = 'created:[NOW-7DAY TO NOW] license:"Creative Commons 0"' - filter_query_split = parse_query_filter_string(filter_query_string) - self.assertEqual(filter_query_split, [ - ['created', ':', '[', 'NOW-7DAY', ' TO ', 'NOW', ']'], - ['license', ':', '"Creative Commons 0"'], - ]) - - def test_parse_filter_query_special_char(self): - filter_query_string = 'grouping_pack:"32119_Conch Blowing (शङ्ख)"' - filter_query_split = parse_query_filter_string(filter_query_string) - self.assertEqual(filter_query_split, [ - ['grouping_pack', ':', '"32119_Conch Blowing (शङ्ख)"'], - ]) - - def test_parse_filter_query_special_char2(self): - filter_query_string = 'grouping_pack:"2806_Hurt & Pain sounds"' - filter_query_split = parse_query_filter_string(filter_query_string) - self.assertEqual(filter_query_split, [ - 
['grouping_pack', ':', '"2806_Hurt & Pain sounds"'], - ]) - - def test_parse_filter_query_geofilter(self): - filter_query_string = 'tag:"cool" \'{!geofilt sfield=geotag pt=39.7750014,-94.2735586 d=50}\'' - filter_query_split = parse_query_filter_string(filter_query_string) - self.assertEqual(filter_query_split, [ - ['tag', ':', '"cool"'], - ["'{!", 'geofilt sfield=geotag pt=39.7750014,-94.2735586 d=50', "}'"] - ]) - - def test_parse_filter_composed_with_OR(self): - filter_query_string = 'tag:"cool" license:("Attribution" OR "Creative Commons 0")' - parsed_filters = parse_query_filter_string(filter_query_string) - self.assertEqual(parsed_filters, [ - ['tag', ':', '"cool"'], - ['license', ':', '(', '"Attribution"', "OR", '"Creative Commons 0"', ')'] - ]) - - def test_parse_filter_nested_composed_with_OR(self): - filter_query_string = '("Attribution" OR ("Attribution" OR "Creative Commons 0"))' - parsed_filters = parse_query_filter_string(filter_query_string) - - @override_settings(ENABLE_SEARCH_RESULTS_CLUSTERING=True) - def test_split_filter_query_cluster_facet(self): - # We check that the combination of a duration filter, a facet filter (CC Attribution) and a cluster filter - # works correctly. - filter_query_string = 'duration:[0 TO 10] license:"attribution"' - # the cluster filter is set in the second argument of split_filter_query() - parsed_filters = parse_query_filter_string(filter_query_string) - filter_query_split = split_filter_query(filter_query_string, parsed_filters, '1') - - expected_filter_query_split = [ - {'remove_url': quote_plus('duration:[0 TO 10]'), 'name': 'license:"attribution"'}, - {'remove_url': quote_plus('duration:[0 TO 10] license:"attribution"'), 'name': 'Cluster #1'} - ] - - # check that the cluster facet exists - filter_query_names = [filter_query_dict['name'] for filter_query_dict in filter_query_split] - self.assertIn('Cluster #1', filter_query_names) - - # the order does not matter for the list of facet dicts. 
- # we get the index of the correspondings facets dicts. - cc_attribution_facet_dict_idx = filter_query_names.index('license:"attribution"') - cluster_facet_dict_idx = filter_query_names.index('Cluster #1') - - self.assertIn(expected_filter_query_split[0]['name'], - filter_query_split[cc_attribution_facet_dict_idx]['name']) - self.assertIn(expected_filter_query_split[0]['remove_url'], - filter_query_split[cc_attribution_facet_dict_idx]['remove_url']) - - self.assertIn(expected_filter_query_split[1]['name'], - filter_query_split[cluster_facet_dict_idx]['name']) - self.assertIn(expected_filter_query_split[1]['remove_url'], - filter_query_split[cluster_facet_dict_idx]['remove_url']) diff --git a/utils/tests/test_search_query_processor.py b/utils/tests/test_search_query_processor.py new file mode 100644 index 000000000..c9b2c0a53 --- /dev/null +++ b/utils/tests/test_search_query_processor.py @@ -0,0 +1,395 @@ +# +# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA +# +# Freesound is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# Freesound is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +# +# Authors: +# See AUTHORS file. 
#

import copy
from unittest import mock

from django.conf import settings
from django.contrib.auth.models import User
from django.test import TestCase, RequestFactory
from django.urls import reverse

from utils.search import search_query_processor
from utils.url import ComparableUrl


class SearchQueryProcessorTests(TestCase):
    """Tests for ``utils.search.search_query_processor.SearchQueryProcessor``.

    Each test builds a fake GET request, wraps it in a ``SearchQueryProcessor`` and checks the
    query params, URLs, disabled flags or filter information it derives from the request.
    """

    # Query params expected from ``as_query_params()`` for a request with no parameters at all.
    # Individual checks only list the keys that differ from these defaults.
    default_expected_params = {
        'current_page': 1,
        'facets': settings.SEARCH_SOUNDS_DEFAULT_FACETS,
        'field_list': ['id', 'score'],
        'group_by_pack': True,
        'num_sounds': settings.SOUNDS_PER_PAGE,
        'num_sounds_per_pack_group': 1,
        'only_sounds_with_pack': False,
        'only_sounds_within_ids': [],
        'query_fields': settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS,
        'query_filter': '',
        'similar_to': None,
        'sort': settings.SEARCH_SOUNDS_SORT_OPTION_DATE_NEW_FIRST,  # Empty query should sort by date added, so use this as expected default
        'textual_query': ''}

    def setUp(self):
        """Create the request factory and a superuser that is attached to fake requests by default."""
        self.factory = RequestFactory()
        self.maxDiff = None
        self.user = User.objects.create_user("testuser", password="testpass", email='email@freesound.org')
        self.user.is_superuser = True
        self.user.save()

    def assertExpectedParams(self, returned_query_params, specific_expected_params=None):
        """Assert that ``returned_query_params`` equals the defaults updated with ``specific_expected_params``.

        Args:
            returned_query_params: dict returned by ``SearchQueryProcessor.as_query_params()``.
            specific_expected_params: optional dict with the keys expected to differ from
                ``default_expected_params``. ``None`` means "no differences".
        """
        dict_to_compare = self.default_expected_params.copy()
        dict_to_compare.update(specific_expected_params or {})
        self.assertDictEqual(returned_query_params, dict_to_compare)

    def assertGetUrlAsExpected(self, sqp, expected_url):
        """Assert that ``sqp.get_url()`` is equivalent to ``expected_url`` (compared with ``ComparableUrl``)."""
        sqp_url = sqp.get_url()
        self.assertEqual(ComparableUrl(sqp_url), ComparableUrl(expected_url))

    def run_fake_search_query_processor(self, base_url=None, url=None, params=None, user=None):
        """Build a fake GET request and return ``(SearchQueryProcessor(request), request.get_full_path())``.

        Args:
            base_url: path to request when ``url`` is not given. Defaults to ``reverse('sounds-search')``,
                resolved at call time so that the URL configuration is not needed while the class is defined.
            url: full URL to request. When given, ``base_url`` and ``params`` are ignored.
            params: optional dict of GET parameters used together with ``base_url``.
            user: user to attach to the request. Defaults to the user created in ``setUp``.
        """
        if url is None:
            if base_url is None:
                base_url = reverse('sounds-search')
            request = self.factory.get(base_url, params if params is not None else {})
        else:
            request = self.factory.get(url)
        request.user = user if user is not None else self.user
        return search_query_processor.SearchQueryProcessor(request), request.get_full_path()

    @mock.patch('utils.search.search_query_processor.get_ids_in_cluster')
    def test_search_query_processor_as_query_params_and_make_url(self, fake_get_ids_in_cluster):
        # This will test that the SearchQueryProcessor correctly processes the request parameters and generates the
        # expected query_params object to be sent to a SearchEngine object. Also it tests that once SearchQueryProcessor
        # has loaded parameters from the request, it is able to generate URLs which are equivalent to the original request.

        # Query with no params, all should be default behaviour (sorting by date added)
        sqp, url = self.run_fake_search_query_processor()
        self.assertExpectedParams(sqp.as_query_params())
        self.assertGetUrlAsExpected(sqp, url)

        # Empty query with no sorting specified, will sort by date added just like query with no params at all
        sqp, url = self.run_fake_search_query_processor(params={'q': ''})
        self.assertExpectedParams(sqp.as_query_params())
        self.assertGetUrlAsExpected(sqp, url)

        # Empty query with sorting specified, will sort as indicated
        sqp, url = self.run_fake_search_query_processor(params={'s': settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC})
        self.assertExpectedParams(sqp.as_query_params(), {'sort': settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC})
        self.assertGetUrlAsExpected(sqp, url)

        # Basic query with only text, results should be sorted by score
        sqp, url = self.run_fake_search_query_processor(params={'q': 'test'})
        self.assertExpectedParams(sqp.as_query_params(), {'textual_query': 'test',
                                                          'sort': settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC})
        self.assertGetUrlAsExpected(sqp, url)

        # With page number specified
        sqp, url = self.run_fake_search_query_processor(params={'page': '3'})
        self.assertExpectedParams(sqp.as_query_params(), {'current_page': 3})
        self.assertGetUrlAsExpected(sqp, url)

        # With "search in" options specified
        sqp, url = self.run_fake_search_query_processor(params={'si_tags': '1', 'si_description': '1', 'si_sound_id': '0'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_fields': {
            settings.SEARCH_SOUNDS_FIELD_DESCRIPTION: settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_DESCRIPTION],
            settings.SEARCH_SOUNDS_FIELD_TAGS: settings.SEARCH_SOUNDS_DEFAULT_FIELD_WEIGHTS[settings.SEARCH_SOUNDS_FIELD_TAGS]
        }})
        # Here we remove si_sound_id from the expected URL because sqp.get_url() will exclude it as value is not '1'
        self.assertGetUrlAsExpected(sqp, url.replace('si_sound_id=0', ''))

        # With custom field weights specified
        sqp, url = self.run_fake_search_query_processor(params={'w': f'{settings.SEARCH_SOUNDS_FIELD_DESCRIPTION}:2,{settings.SEARCH_SOUNDS_FIELD_ID}:1'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_fields': {
            settings.SEARCH_SOUNDS_FIELD_DESCRIPTION: 2,
            settings.SEARCH_SOUNDS_FIELD_ID: 1
        }})
        self.assertGetUrlAsExpected(sqp, url)

        # With custom field weights specified AND search in
        sqp, url = self.run_fake_search_query_processor(params={'si_sound_id': '1', 'w': f'{settings.SEARCH_SOUNDS_FIELD_DESCRIPTION}:2,{settings.SEARCH_SOUNDS_FIELD_ID}:1'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_fields': {
            settings.SEARCH_SOUNDS_FIELD_ID: 1
        }})
        self.assertGetUrlAsExpected(sqp, url)

        # With duration filter
        sqp, url = self.run_fake_search_query_processor(params={'d0': '0.25', 'd1': '2.05'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': 'duration:[0.25 TO 2.05]'})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'d0': '0.25'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': 'duration:[0.25 TO *]'})
        self.assertGetUrlAsExpected(sqp, url + '&d1=*')  # Add d1 to the expected url as sqp will add it
        sqp, url = self.run_fake_search_query_processor(params={'d1': '0.25'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': 'duration:[0 TO 0.25]'})
        self.assertGetUrlAsExpected(sqp, url + '&d0=0')  # Add d0 to the expected url as sqp will add it

        # With geotag filter
        sqp, url = self.run_fake_search_query_processor(params={'ig': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': 'is_geotagged:1'})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'ig': '0'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': ''})  # If geotagged option is 0, no filter should be added
        self.assertGetUrlAsExpected(sqp, '/search/')  # URL should not include ig=0 as this is the default value

        # With remix filter
        sqp, url = self.run_fake_search_query_processor(params={'r': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': 'in_remix_group:1'})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'r': '0'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': ''})  # If remix option is 0, no filter should be added
        self.assertGetUrlAsExpected(sqp, '/search/')  # URL should not include r=0 as this is the default value

        # With group by pack option (defaults to True)
        sqp, url = self.run_fake_search_query_processor(params={'g': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': True})
        self.assertGetUrlAsExpected(sqp, '/search/')  # URL should not include g=1 as this is the default value
        sqp, url = self.run_fake_search_query_processor(params={'g': '0'})
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': False})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor()
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': True})
        self.assertGetUrlAsExpected(sqp, url)

        # With display results as packs option
        sqp, url = self.run_fake_search_query_processor(params={'dp': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': True, 'only_sounds_with_pack': True, 'num_sounds_per_pack_group': 3})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'dp': '1', 'g': '0'})  # When display packs is enabled, always group by pack
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': True, 'only_sounds_with_pack': True, 'num_sounds_per_pack_group': 3})
        self.assertGetUrlAsExpected(sqp, url)

        # With compact mode option
        sqp, url = self.run_fake_search_query_processor(params={'cm': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'num_sounds': settings.SOUNDS_PER_PAGE_COMPACT_MODE})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'cm': '1', 'dp': '1'})  # In display pack mode, number of sounds stays the same
        self.assertExpectedParams(sqp.as_query_params(), {'num_sounds': settings.SOUNDS_PER_PAGE,
                                                          'only_sounds_with_pack': True,
                                                          'num_sounds_per_pack_group': 3})
        self.assertGetUrlAsExpected(sqp, url)

        # With map mode option
        sqp, url = self.run_fake_search_query_processor(params={'mm': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': False,
                                                          'num_sounds': settings.MAX_SEARCH_RESULTS_IN_MAP_DISPLAY,
                                                          'query_filter': 'is_geotagged:1',
                                                          'field_list': ['id', 'score', 'geotag']})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'mm': '1', 'page': '3'})  # Page number in map mode is always 1
        self.assertExpectedParams(sqp.as_query_params(), {'group_by_pack': False,
                                                          'num_sounds': settings.MAX_SEARCH_RESULTS_IN_MAP_DISPLAY,
                                                          'query_filter': 'is_geotagged:1',
                                                          'field_list': ['id', 'score', 'geotag']})
        self.assertGetUrlAsExpected(sqp, url)

        # With tags mode
        sqp, url = self.run_fake_search_query_processor(base_url=reverse('tags'))
        # Deep copy so that changing the nested 'tags' dict does not alter the settings object itself
        # (a shallow copy would share the nested dicts and make this check vacuous).
        expected_facets = copy.deepcopy(settings.SEARCH_SOUNDS_DEFAULT_FACETS)
        expected_facets['tags']['limit'] = 50
        self.assertExpectedParams(sqp.as_query_params(), {'facets': expected_facets})
        self.assertGetUrlAsExpected(sqp, url)

        # With cluster id option
        fake_get_ids_in_cluster.return_value = [1, 2, 3, 4]  # Mock the response of get_ids_in_cluster
        sqp, url = self.run_fake_search_query_processor(params={'cid': '31', 'cc': '1'})
        self.assertExpectedParams(sqp.as_query_params(), {'only_sounds_within_ids': [1, 2, 3, 4]})
        self.assertGetUrlAsExpected(sqp, url)

        # With similar to option
        sqp, url = self.run_fake_search_query_processor(params={'st': '1234'})  # Passing similarity target as sound ID
        self.assertExpectedParams(sqp.as_query_params(), {'similar_to': 1234})
        self.assertGetUrlAsExpected(sqp, url)
        sqp, url = self.run_fake_search_query_processor(params={'st': '[1.34,3.56,5.78]'})  # Passing similarity target as a vector
        self.assertExpectedParams(sqp.as_query_params(), {'similar_to': [1.34, 3.56, 5.78]})
        self.assertGetUrlAsExpected(sqp, url)

        # Using a pack filter, sounds should not be grouped by pack
        sqp, url = self.run_fake_search_query_processor(params={'f': 'grouping_pack:"19894_Clutter"'})
        self.assertExpectedParams(sqp.as_query_params(), {'query_filter': 'grouping_pack:"19894_Clutter"', 'group_by_pack': False})
        self.assertGetUrlAsExpected(sqp, url)

    def test_search_query_processor_disabled_options(self):
        # Test that some search options are marked as disabled depending on the state of some other options
        # NOTE: disabled state is used when displaying the options in the UI, but has no other effects

        # query if similarity on
        sqp, _ = self.run_fake_search_query_processor(params={'st': '1234'})
        self.assertTrue(sqp.options['query'].disabled)

        # sort if similarity on
        sqp, _ = self.run_fake_search_query_processor(params={'st': '1234'})
        self.assertTrue(sqp.options['sort_by'].disabled)

        # group_by_pack if display_as_packs or map_mode
        sqp, _ = self.run_fake_search_query_processor(params={'dp': '1'})
        self.assertTrue(sqp.options['group_by_pack'].disabled)
        sqp, _ = self.run_fake_search_query_processor(params={'mm': '1'})
        self.assertTrue(sqp.options['group_by_pack'].disabled)

        # display as packs if map_mode
        sqp, _ = self.run_fake_search_query_processor(params={'mm': '1'})
        self.assertTrue(sqp.options['display_as_packs'].disabled)

        # grid_mode if map_mode
        sqp, _ = self.run_fake_search_query_processor(params={'mm': '1'})
        self.assertTrue(sqp.options['grid_mode'].disabled)

        # is_geotagged if map_mode
        sqp, _ = self.run_fake_search_query_processor(params={'mm': '1'})
        self.assertTrue(sqp.options['is_geotagged'].disabled)

        # search_in if tags_mode or similar_to_mode
        sqp, _ = self.run_fake_search_query_processor(params={'st': '1'})
        self.assertTrue(sqp.options['search_in'].disabled)
        sqp, _ = self.run_fake_search_query_processor(base_url=reverse('tags'))
        self.assertTrue(sqp.options['search_in'].disabled)

        # group_by_pack and display_as_packs if filter contains a pack
        sqp, _ = self.run_fake_search_query_processor(params={'f': 'grouping_pack:"19894_Clutter"'})
        self.assertTrue(sqp.options['group_by_pack'].disabled)
        self.assertTrue(sqp.options['display_as_packs'].disabled)

    def test_search_query_processor_tags_in_filter(self):
        # Only the standalone tag filters (tag1, tag2) are expected; the tags inside the
        # parenthesised OR group (tag3, tag4) are not expected to be returned
        sqp, _ = self.run_fake_search_query_processor(params={
            'f': 'duration:[0.25 TO 20] tag:"tag1" is_geotagged:1 (id:1 OR id:2 OR id:3) tag:"tag2" (tag:"tag3" OR tag:"tag4")',
        })
        self.assertEqual(sorted(sqp.get_tags_in_filters()), sorted(['tag1', 'tag2']))

        # No tag filters at all
        sqp, _ = self.run_fake_search_query_processor(params={
            'f': 'duration:[0.25 TO 20] is_geotagged:1 (id:1 OR id:2 OR id:3)',
        })
        self.assertEqual(sqp.get_tags_in_filters(), [])

    def test_search_query_processor_make_url_add_remove_filters(self):
        # Test add_filters adds them to the URL
        sqp, _ = self.run_fake_search_query_processor()
        self.assertEqual(sqp.get_url(add_filters=['tag:"tag1"']), '/search/?f=tag%3A%22tag1%22')

        # Test remove_filters removes them from the URL
        sqp, _ = self.run_fake_search_query_processor(params={'f': 'filter1:"aaa" filter2:123'})
        self.assertEqual(sqp.get_url(remove_filters=['filter1:"aaa"', 'filter2:123']), '/search/')

    def test_search_query_processor_contains_active_advanced_search_options(self):
        # Query with no params
        sqp, _ = self.run_fake_search_query_processor()
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)

        # Empty query
        sqp, _ = self.run_fake_search_query_processor(params={'q': ''})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)

        # Empty query with sorting specified
        sqp, _ = self.run_fake_search_query_processor(params={'s': settings.SEARCH_SOUNDS_SORT_OPTION_AUTOMATIC})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)

        # Basic query with only text
        sqp, _ = self.run_fake_search_query_processor(params={'q': 'test'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)

        # With page number specified
        sqp, _ = self.run_fake_search_query_processor(params={'page': '3'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)

        # With "search in" options specified
        sqp, _ = self.run_fake_search_query_processor(params={'si_tags': '1', 'si_description': '1', 'si_sound_id': '0'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)

        # With custom field weights specified
        sqp, _ = self.run_fake_search_query_processor(params={'w': f'{settings.SEARCH_SOUNDS_FIELD_DESCRIPTION}:2,{settings.SEARCH_SOUNDS_FIELD_ID}:1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)

        # With custom field weights specified AND search in
        sqp, _ = self.run_fake_search_query_processor(params={'si_sound_id': '1', 'w': f'{settings.SEARCH_SOUNDS_FIELD_DESCRIPTION}:2,{settings.SEARCH_SOUNDS_FIELD_ID}:1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)

        # With duration filter
        sqp, _ = self.run_fake_search_query_processor(params={'d0': '0.25', 'd1': '2.05'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)
        sqp, _ = self.run_fake_search_query_processor(params={'d0': '0', 'd1': '*'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # False if parameters are default

        # With geotag filter
        sqp, _ = self.run_fake_search_query_processor(params={'ig': '1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)
        sqp, _ = self.run_fake_search_query_processor(params={'ig': '0'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # False if parameters are default

        # With remix filter
        sqp, _ = self.run_fake_search_query_processor(params={'r': '1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)
        sqp, _ = self.run_fake_search_query_processor(params={'r': '0'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # False if parameters are default

        # With group by pack option (defaults to True)
        sqp, _ = self.run_fake_search_query_processor(params={'g': '0'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)
        sqp, _ = self.run_fake_search_query_processor(params={'g': '1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # False if parameters are default

        # With display results as packs option
        sqp, _ = self.run_fake_search_query_processor(params={'dp': '1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # Not considered an active filter

        # With compact mode option
        sqp, _ = self.run_fake_search_query_processor(params={'cm': '1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # Not considered an active filter

        # With map mode option
        sqp, _ = self.run_fake_search_query_processor(params={'mm': '1'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # Not considered an active filter

        # With tags mode
        sqp, _ = self.run_fake_search_query_processor(base_url=reverse('tags'))
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # Not considered an active filter

        # With cluster id
        sqp, _ = self.run_fake_search_query_processor(params={'cid': '31'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), False)  # Clustering not an advanced search option

        # With similar to option
        sqp, _ = self.run_fake_search_query_processor(params={'st': '1234'})
        self.assertEqual(sqp.contains_active_advanced_search_options(), True)

    def test_search_query_processor_as_query_params_exclude_facet_filters(self):
        # Filters on facet fields must be dropped from 'query_filter' when exclude_facet_filters=True,
        # and every filter must be kept when exclude_facet_filters=False
        for filter_name, is_facet in [
            ('samplerate', True),
            ('grouping_pack', True),
            ('username', True),
            ('tag', True),
            ('bitrate', True),
            ('bitdepth', True),
            ('type', True),
            ('channels', True),
            ('license', True),
            ('non_facet_filter', False),
        ]:
            # subTest reports which filter name failed instead of stopping at the first failure
            with self.subTest(filter_name=filter_name):
                sqp, _ = self.run_fake_search_query_processor(params={'f': f'{filter_name}:123'})
                self.assertEqual(f'{filter_name}:123' in sqp.as_query_params(exclude_facet_filters=True)['query_filter'], not is_facet)
                self.assertEqual(f'{filter_name}:123' in sqp.as_query_params(exclude_facet_filters=False)['query_filter'], True)

    def test_search_query_processor_as_query_params_special_chars(self):
        # Special chars in query
        query = 'Æ æ ¿ É'
        sqp, _ = self.run_fake_search_query_processor(params={'q': query})
        self.assertEqual(sqp.as_query_params()['textual_query'], query)

        # Special chars in filter
        flt = 'grouping_pack:"32119_Conch Blowing (शङ्ख)"'
        sqp, _ = self.run_fake_search_query_processor(params={'f': flt})
        self.assertEqual(sqp.as_query_params()['query_filter'], flt)

        flt = 'license:"smapling+"'
        sqp, _ = self.run_fake_search_query_processor(params={'f': flt})
        self.assertEqual(sqp.as_query_params()['query_filter'], flt)
from urllib.parse import urlparse, parse_qsl, unquote_plus


class ComparableUrl:
    """A URL object that can be compared with other ``ComparableUrl`` objects
    without regard to the vagaries of encoding, escaping, and ordering
    of parameters in query strings.

    The query string is parsed with ``parse_qsl`` and stored as a ``frozenset`` of
    ``(name, value)`` pairs, so parameter order and repeated identical pairs do not
    matter. The path is stored unquoted (``unquote_plus``), so escaped and unescaped
    spellings of the same path compare equal.
    """
    # NOTE: from https://stackoverflow.com/questions/5371992/comparing-two-urls-in-python

    def __init__(self, url):
        """Parse ``url`` and keep its normalised parts in ``self.parts``.

        Args:
            url: the URL (or path with query string) to wrap.
        """
        self.url = url  # Original string, kept only so that __repr__ is informative
        parts = urlparse(url)
        _query = frozenset(parse_qsl(parts.query))
        _path = unquote_plus(parts.path)
        parts = parts._replace(query=_query, path=_path)
        self.parts = parts

    def __eq__(self, other):
        """Return True when both URLs have the same normalised parts.

        Returns ``NotImplemented`` for objects that are not ``ComparableUrl`` so that
        comparing with an unrelated type yields ``False`` instead of raising ``AttributeError``.
        """
        if not isinstance(other, ComparableUrl):
            return NotImplemented
        return self.parts == other.parts

    def __hash__(self):
        """Hash consistent with ``__eq__`` (based on the normalised parts)."""
        return hash(self.parts)

    def __repr__(self):
        """Show the wrapped URL, which makes failed equality assertions readable."""
        return f'{type(self).__name__}({self.url!r})'