Merge pull request #1756 from MTG/search-refactor2
Search "frontend" refactor
ffont authored Apr 10, 2024
2 parents 2214bf7 + 83e6fbc commit 0fd1dbc
Showing 46 changed files with 2,429 additions and 2,401 deletions.
29 changes: 28 additions & 1 deletion DEVELOPERS.md
@@ -71,6 +71,7 @@ Currently, we only use the following custom permissions:
* `tickets.can_moderate` (in `Ticket` model, used to allow sound moderation)
* `forum.can_moderate_forum` (in `Post` model, used to allow forum moderation)
* `sounds.can_describe_in_bulk` (in `BulkUploadProgress` model, used to allow bulk upload for users who don't meet the other common requirements)
* `profile.show_beta_search_options` (in `Profile` model, used to allow using beta search features)


### URLs that include a username
@@ -131,6 +132,33 @@ creating `DeletedSound` objects in the `sounds-models.on_delete_sound` function
signal of the `Sound` model.


### Adding new search options in the search page

The available options for searching and filtering sounds in the search page are managed using a `SearchQueryProcessor`
object (implemented in `/utils/search/search_query_processor.py`). The `SearchQueryProcessor` class is used to parse and
process search query information from a Django `request` object, and compute a number of useful items for displaying search
information in templates, constructing search URLs, and preparing search options to be passed to the backend search engine.
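
For illustration, this is roughly how a search view could drive that flow (a minimal sketch: the import location of `perform_search_engine_query` and the render call are assumptions about the surrounding glue code, not the exact Freesound API):

```python
from django.shortcuts import render

from search.views import perform_search_engine_query  # assumed import location
from utils.search.search_query_processor import SearchQueryProcessor


def search(request):
    sqp = SearchQueryProcessor(request)   # parse search options from the request
    query_params = sqp.as_query_params()  # options formatted for the search engine
    results, _ = perform_search_engine_query(query_params)
    # sqp is also passed to the template so the state of the search form can be rendered
    return render(request, 'search/search.html', {'sqp': sqp, 'results': results})
```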

To add a new option to the search page, a new member of the appropriate `SearchOption` class should be added to the `SearchQueryProcessor`
class (see the `SearchQueryProcessor` definition for examples). There are a number of existing types of `SearchOption`, which you can
see by looking at the search options already implemented in `SearchQueryProcessor`. If the newly added search option requires some
calculations to determine the `query_params` to be sent to the `search_sounds` function of the search engine backend, these should be
done in the `SearchQueryProcessor.as_query_params` method, as sketched below.
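
As a sketch, registering a hypothetical boolean option could look roughly like this (`SearchOptionBool`, its arguments, and the `query_params` keys are illustrative assumptions; check the real `SearchOption` classes for the actual API):

```python
# Illustrative sketch only: the option class name and parameters are assumed.
class SearchQueryProcessor:
    # ... existing search options ...
    my_new_option = SearchOptionBool(query_param_name='mno')  # hypothetical new option

    def as_query_params(self):
        query_params = {}  # in reality, computed from all the registered options
        # Calculations specific to the new option go here
        if self.my_new_option.value:
            query_params['some_engine_param'] = True
        return query_params
```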

Adding a new search option to `SearchQueryProcessor` will make the option work with the search engine backend and with search URLs,
but it will NOT automatically add the option to the form on the search page. This needs to be done manually by adding the search
option in the desired place in `templates/search/search.html` (see how other search options are implemented for inspiration; the
`display_search_option` templatetag will facilitate things in most cases).

All this will add the search option to the user interface and send the corresponding information to the search backend. For example,
if the new search option should apply a filter on some `new_property` in the search backend, this will be handled by the
`SearchQueryProcessor`. However, it is expected that this `new_property` has been added to the search engine schema and indexed
properly, otherwise there will be errors when running the queries.
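
For instance, the filter contributed for `new_property` would use the same `field:value` syntax that appears in the search URLs in `accounts/models.py` further down in this diff (the `query_filter` key below is an assumption):

```python
# Hypothetical query_params fragment produced for the new option:
query_params = {'query_filter': 'new_property:"some_value"'}
```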

Please have a look at the documentation of `SearchQueryProcessor` and the various `SearchOption` classes to get a better
understanding of how all this works.


### Search Engine Backends

The way in which Freesound communicates with a search engine to search for sounds and forum posts is abstracted through
@@ -149,7 +177,6 @@ the implementation of a search backend. You can run it like:
Please read the documentation of the management command carefully to better understand how it works and how it performs
the testing.
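
As a rough outline, implementing a new backend amounts to subclassing the common search engine interface and filling in its query methods (a sketch under assumed names: `SearchEngineBase` and the exact method set shown here are illustrative):

```python
# Illustrative outline only; the real base class and method signatures
# live in the utils/search module of the Freesound codebase.
class MySearchEngine(SearchEngineBase):  # base class name is assumed

    def search_sounds(self, **query_params):
        # Translate query_params into a query against the underlying
        # engine and return results in the structure Freesound expects.
        raise NotImplementedError

    def search_forum_posts(self, **query_params):
        # Same idea for forum post searches.
        raise NotImplementedError
```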


### Freesound analysis pipeline

In February 2022 we released a refactoring of the analysis pipeline that allows us to more easily incorporate new audio
17 changes: 17 additions & 0 deletions accounts/migrations/0041_alter_profile_options.py
@@ -0,0 +1,17 @@
# Generated by Django 3.2.23 on 2024-02-23 22:08

from django.db import migrations


class Migration(migrations.Migration):

dependencies = [
('accounts', '0040_auto_20230328_1205'),
]

operations = [
migrations.AlterModelOptions(
name='profile',
options={'ordering': ('-user__date_joined',), 'permissions': (('can_beta_test', 'Show beta features to that user.'),)},
),
]
5 changes: 4 additions & 1 deletion accounts/models.py
@@ -226,7 +226,7 @@ def get_user_sounds_in_search_url(self):
return f'{reverse("sounds-search")}?f=username:"{ self.user.username }"&s=Date+added+(newest+first)&g=0'

def get_user_packs_in_search_url(self):
return f'{reverse("sounds-search")}?f=username:"{ self.user.username }"&s=Date+added+(newest+first)&g=1&only_p=1'
return f'{reverse("sounds-search")}?f=username:"{ self.user.username }"&s=Date+added+(newest+first)&g=1&dp=1'

def get_latest_packs_for_profile_page(self):
latest_pack_ids = Pack.objects.select_related().filter(user=self.user, num_sounds__gt=0).exclude(is_deleted=True) \
@@ -649,6 +649,9 @@ def get_stats_for_profile_page(self):

class Meta:
ordering = ('-user__date_joined', )
permissions = (
("can_beta_test", "Show beta features to that user."),
)


class GdprAcceptance(models.Model):
4 changes: 2 additions & 2 deletions accounts/tests/test_views.py
@@ -262,14 +262,14 @@ def test_sounds_response(self):
reverse('pack-downloaders', kwargs={'username': user.username, "pack_id": self.pack.id}) + '?ajax=1')
self.assertEqual(resp.status_code, 200)

@mock.patch('search.views.perform_search_engine_query')
@mock.patch('tags.views.perform_search_engine_query')
def test_tags_response(self, perform_search_engine_query):
perform_search_engine_query.return_value = (create_fake_perform_search_engine_query_results_tags_mode(), None)

# 200 response on tags page access
resp = self.client.get(reverse('tags'))
self.assertEqual(resp.status_code, 200)
self.assertEqual(resp.context['tags_mode'], True)
self.assertEqual(resp.context['sqp'].tags_mode_active(), True)

def test_packs_response(self):
# 302 response (note that since BW, there will be a redirect to the search page in between)
2 changes: 1 addition & 1 deletion accounts/urls.py
@@ -27,7 +27,7 @@
import bookmarks.views as bookmarks
import follow.views as follow
import apiv2.views as api
from utils.urlpatterns import redirect_inline
from utils.url import redirect_inline



23 changes: 0 additions & 23 deletions clustering/__init__.py
@@ -1,23 +0,0 @@
#
# Freesound is (c) MUSIC TECHNOLOGY GROUP, UNIVERSITAT POMPEU FABRA
#
# Freesound is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# Freesound is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Authors:
# See AUTHORS file.
#

# strings used for communicating the state of the clustering process
CLUSTERING_RESULT_STATUS_PENDING = "pending"
CLUSTERING_RESULT_STATUS_FAILED = "failed"
150 changes: 32 additions & 118 deletions clustering/clustering.py
@@ -33,21 +33,15 @@
import six
from time import time

from . import clustering_settings as clust_settings

# The following packages are only needed if the running process is configured to be a Celery worker.
# We avoid importing them in appservers to avoid having to install unneeded dependencies.
if settings.IS_CELERY_WORKER:
import community as com
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
from networkx.algorithms.community import k_clique_communities, greedy_modularity_communities
from sklearn import metrics
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import kneighbors_graph

from .features_store import FeaturesStore
import community as com
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
from networkx.algorithms.community import k_clique_communities, greedy_modularity_communities
from sklearn import metrics
from sklearn.feature_selection import mutual_info_classif
from sklearn.neighbors import kneighbors_graph


logger = logging.getLogger('clustering')

@@ -65,8 +59,6 @@ class ClusteringEngine(object):
method. Moreover, a few unused alternative methods for performing some intermediate steps are left
here for development and research purposes.
"""
def __init__(self):
self.feature_store = FeaturesStore()

def _prepare_clustering_result_and_reference_features_for_evaluation(self, partition):
"""Formats the clustering classes and some reference features in order to then estimate how good is the
@@ -157,6 +149,9 @@ def _evaluation_metrics(self, partition):
"""
# we compute the evaluation metrics only if some reference features are available for evaluation
# we return None when they are not available not to break the following part of the code
'''
# NOTE: the following code is commented out because the reference features are not available in the current version of the code.
# If in the future we want to perform further evaluation, we should re-implement some of these functions.
if clust_settings.REFERENCE_FEATURES in clust_settings.AVAILABLE_FEATURES:
reference_features, clusters = self._prepare_clustering_result_and_reference_features_for_evaluation(partition)
ami = np.average(mutual_info_classif(reference_features, clusters, discrete_features=True))
@@ -165,6 +160,8 @@
return ami, ss, ci
else:
return None, None, None
'''
return None, None, None

def _ratio_intra_community_edges(self, graph, communities):
"""Computes the ratio of the number of intra-community (cluster) edges to the total number of edges in the cluster.
@@ -212,55 +209,13 @@ def _point_centralities(self, graph, communities):
node_community_centralities = {k: old_div(v,max(d.values())) for d in communities_centralities for k, v in d.items()}

return node_community_centralities

def _save_results_to_file(self, query_params, features, graph_json, sound_ids, modularity,
num_communities, ratio_intra_community_edges, ami, ss, ci, communities):
"""Saves a json file to disk containing the clustering results information listed below.

This is used when developing the clustering method. The results and the evaluation metrics are made accessible
for post-analysis.
Args:
query_params (str): string representing the query parameters submited by the user to the search engine.
features (str): name of the features used for clustering.
graph_json: (dict) NetworkX graph representation of sounds data in node-link format that is suitable for JSON
serialization.
sound_ids (List[Int]): list of the sound ids.
modularity (float): modularity of the graph partition.
num_communities (Int): number of communities (clusters).
ratio_intra_community_edges (List[Float]): intra-community edges ratio.
ami (Numpy.float): Average Mutual Information score.
ss (Numpy.float): Silhouette Coefficient score.
ci (Numpy.float): Calinski and Harabaz Index score.
communities (List[List[Int]]): List storing Lists containing the Sound ids that are in each community (cluster).
"""
if clust_settings.SAVE_RESULTS_FOLDER:
result = {
'query_params' : query_params,
'sound_ids': sound_ids,
'num_clusters': num_communities,
'graph': graph_json,
'features': features,
'modularity': modularity,
'ratio_intra_community_edges': ratio_intra_community_edges,
'average_mutual_information': ami,
'silouhette_coeff': ss,
'calinski_harabaz_score': ci,
'communities': communities
}
with open(os.path.join(
clust_settings.SAVE_RESULTS_FOLDER,
f'{query_params}.json'
), 'w') as f:
json.dump(result, f)

def create_knn_graph(self, sound_ids_list, features=clust_settings.DEFAULT_FEATURES):
def create_knn_graph(self, sound_ids_list, similarity_vectors_map):
"""Creates a K-Nearest Neighbors Graph representation of the given sounds.
Args:
sound_ids_list (List[str]): list of sound ids.
features (str): name of the features to be used for nearest neighbors computation.
Available features are listed in the clustering settings file.
similarity_vectors_map (Dict{int:List[float]}): dictionary with the similarity feature vectors for each sound.
Returns:
(nx.Graph): NetworkX graph representation of sounds.
@@ -272,58 +227,21 @@ def create_knn_graph(self, sound_ids_list, features=clust_settings.DEFAULT_FEATURES):
# neighbors for small collections, while limiting it for larger collections, which ensures low computational complexity.
k = int(np.ceil(np.log2(len(sound_ids_list))))
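# e.g. 16 sounds -> k = 4; 1024 sounds -> k = 10 (illustrative values for the formula above)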

sound_features, sound_ids_out = self.feature_store.return_features(sound_ids_list)
features = []
sound_ids_out = []
for sound_id, feature_vector in similarity_vectors_map.items():
features.append(feature_vector)
sound_ids_out.append(sound_id)
sound_features = np.array(features).astype('float32')

A = kneighbors_graph(sound_features, k)
for idx_from, (idx_to, distance) in enumerate(zip(A.indices, A.data)):
idx_from = int(idx_from / k)
if distance < clust_settings.MAX_NEIGHBORS_DISTANCE:
if distance < settings.CLUSTERING_MAX_NEIGHBORS_DISTANCE:
graph.add_edge(sound_ids_out[idx_from], sound_ids_out[idx_to])

# Remove isolated nodes
graph.remove_nodes_from(list(nx.isolates(graph)))

return graph

def create_common_nn_graph(self, sound_ids_list, features=clust_settings.DEFAULT_FEATURES):
"""Creates a Common Nearest Neighbors Graph representation of the given sounds.
Args:
sound_ids_list (List[str]): list of sound ids.
features (str): name of the features to be used for nearest neighbors computation.
Available features are listed in the clustering settings file.
Returns:
(nx.Graph): NetworkX graph representation of sounds.
"""
# first create a knn graph
knn_graph = self.create_knn_graph(sound_ids_list, features=features)

# create the common nn graph
graph = nx.Graph()
graph.add_nodes_from(knn_graph.nodes)

for i, node_i in enumerate(knn_graph.nodes):
for j, node_j in enumerate(knn_graph.nodes):
if j > i:
num_common_neighbors = len(set(knn_graph.neighbors(node_i)).intersection(knn_graph.neighbors(node_j)))
if num_common_neighbors > 0:
graph.add_edge(node_i, node_j, weight=num_common_neighbors)

# keep only k most weighted edges
k = int(np.ceil(np.log2(len(graph.nodes))))
# we iterate through the node ids and get all its corresponding edges using graph[node]
# there seem to be no way to get node_id & edges in the for loop.
for node in graph.nodes:
ordered_neighbors = sorted(list(six.iteritems(graph[node])), key=lambda x: x[1]['weight'], reverse=True)
try:
neighbors_to_remove = [neighbor_distance[0] for neighbor_distance in ordered_neighbors[k:]]
graph.remove_edges_from([(node, neighbor) for neighbor in neighbors_to_remove])
except IndexError:
pass

# Remove isolated nodes
graph.remove_nodes_from(list(nx.isolates(graph)))

return graph

def cluster_graph(self, graph):
@@ -349,7 +267,7 @@ def cluster_graph(self, graph):
modularity = com.modularity(partition , graph)

return partition, num_communities, communities, modularity

def cluster_graph_overlap(self, graph, k=5):
"""Applies overlapping community detection in the given graph.
@@ -371,7 +289,7 @@
partition = {sound_id: cluster_id for cluster_id, cluster in enumerate(communities) for sound_id in cluster}

return partition, num_communities, communities, None

def remove_lowest_quality_cluster(self, graph, partition, communities, ratio_intra_community_edges):
"""Removes the lowest quality cluster in the given graph.
@@ -404,13 +322,13 @@ def remove_lowest_quality_cluster(self, graph, partition, communities, ratio_intra_community_edges):
partition[snd] -= 1
return graph, partition, communities, ratio_intra_community_edges

def cluster_points(self, query_params, features, sound_ids):
def cluster_points(self, query_params, sound_ids, similarity_vectors_map):
"""Applies clustering on the requested sounds using the given features name.
Args:
query_params (str): string representing the query parameters submitted by the user to the search engine.
features (str): name of the features used for clustering the sounds.
sound_ids (List[int]): list containing the ids of the sound to cluster.
similarity_vectors_map (Dict{int:List[float]}): dictionary with the similarity feature vectors for each sound.
Returns:
Dict: contains the resulting clustering classes and the graph in node-link format suitable for JSON serialization.
@@ -420,17 +338,17 @@ def cluster_points(self, query_params, features, sound_ids):
logger.info('Request clustering of {} points: {} ... from the query "{}"'
.format(len(sound_ids), ', '.join(sound_ids[:20]), json.dumps(query_params)))

graph = self.create_knn_graph(sound_ids, features=features)
graph = self.create_knn_graph(sound_ids, similarity_vectors_map=similarity_vectors_map)

if len(graph.nodes) == 0: # the graph does not contain any node
return {'error': False, 'result': None, 'graph': None}
return {'clusters': None, 'graph': None}

partition, num_communities, communities, modularity = self.cluster_graph(graph)

ratio_intra_community_edges = self._ratio_intra_community_edges(graph, communities)

# Discard low quality cluster if there are more than NUM_MAX_CLUSTERS clusters
num_exceeding_clusters = num_communities - clust_settings.NUM_MAX_CLUSTERS
num_exceeding_clusters = num_communities - settings.CLUSTERING_NUM_MAX_CLUSTERS
if num_exceeding_clusters > 0:
for _ in range(num_exceeding_clusters):
graph, partition, communities, ratio_intra_community_edges = self.remove_lowest_quality_cluster(
@@ -459,8 +377,4 @@ def cluster_points(self, query_params, features, sound_ids):
# Export graph as json
graph_json = json_graph.node_link_data(graph)

# Save results to file if SAVE_RESULTS_FOLDER is configured in clustering settings
self._save_results_to_file(query_params, features, graph_json, sound_ids, modularity,
num_communities, ratio_intra_community_edges, ami, ss, ci, communities)

return {'error': False, 'result': communities, 'graph': graph_json}
return {'clusters': communities, 'graph': graph_json}
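
To summarize the refactored interface: callers of `cluster_points` now pass the similarity vectors directly, instead of a features name that the engine used to resolve through the removed `FeaturesStore`. A hypothetical call (all values made up for illustration):

```python
engine = ClusteringEngine()
result = engine.cluster_points(
    query_params='query=dog',  # the query that produced these sounds
    sound_ids=['10', '11', '12', '13'],
    similarity_vectors_map={
        '10': [0.1, 0.2], '11': [0.1, 0.3], '12': [0.9, 0.8], '13': [0.8, 0.7],
    },
)
clusters, graph_json = result['clusters'], result['graph']
```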