From 9de140408154a0ffffef0bf1858599035e599128 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 17:14:31 +0000
Subject: [PATCH 01/15] changed config file

---
 cso_classifier/config.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cso_classifier/config.ini b/cso_classifier/config.ini
index 5ddae79..93d389e 100644
--- a/cso_classifier/config.ini
+++ b/cso_classifier/config.ini
@@ -1,5 +1,5 @@
 [classifier]
-classifier_version = 3.1
+classifier_version = 3.2
 package_name = cso-classifier
 
 [ontology]

From 8d3ab4d97ce67408af76e15e4c92a1072bf0c1bd Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 18:51:30 +0000
Subject: [PATCH 02/15] changed to accommodate two new variables (statistics)

---
 cso_classifier/result.py | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/cso_classifier/result.py b/cso_classifier/result.py
index 98c2bb0..053c17a 100644
--- a/cso_classifier/result.py
+++ b/cso_classifier/result.py
@@ -1,19 +1,23 @@
 class Result:
     """ A simple abstraction layer for retrieving the results """
 
-    def __init__(self, explanation = False):
+    def __init__(self, explanation = False, statistics=False):
         """ Initialising the ontology class
         """
         self.syntactic = list()
         self.semantic = list()
         self.union = list()
         self.enhanced = list()
+        self.result_attr = ('syntactic', 'semantic', 'union', 'enhanced')
+
         self.explanation_flag = explanation
         if self.explanation_flag:
             self.explanation = dict()
-            self.result_attr = ('syntactic', 'semantic', 'union', 'enhanced', 'explanation')
-        else:
-            self.result_attr = ('syntactic', 'semantic', 'union', 'enhanced')
+            self.result_attr += ('explanation',)
+
+        self.statistics = statistics
+        if self.statistics:
+            self.result_attr += ('syntactic_statistics','semantic_statistics',)
 
 
     def get_dict(self):
@@ -77,6 +81,30 @@ def __merge(self):
         """ Function that fills the union object
         """
         self.union = list(set(self.syntactic + self.semantic))
+
+
+    def set_syntactic_statistic(self, syntactic_statistic):
+        """ Sets the syntactic_statistic variable
+        """
+        self.syntactic_statistic = syntactic_statistic
+
+
+    def get_syntactic_statistic(self):
+        """ Gets the syntactic_statistic variable
+        """
+        return self.syntactic_statistic
+
+
+    def set_semantic(self, semantic_statistic):
+        """ Sets the semantic_statistic variable
+        """
+        self.semantic_statistic = semantic_statistic
+
+
+    def get_semantic_statistic(self):
+        """ Gets the semantic_statistic variable
+        """
+        return self.semantic_statistic
 
 
     def dump_temporary_explanation(self, temporary_explanation):

From 887a8670246d266348c9adc65cb01b1b63688fb9 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 19:12:44 +0000
Subject: [PATCH 03/15] fixed typo

---
 cso_classifier/result.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/cso_classifier/result.py b/cso_classifier/result.py
index 053c17a..fcd4f2b 100644
--- a/cso_classifier/result.py
+++ b/cso_classifier/result.py
@@ -83,28 +83,28 @@ def __merge(self):
         self.union = list(set(self.syntactic + self.semantic))
 
 
-    def set_syntactic_statistic(self, syntactic_statistic):
-        """ Sets the syntactic_statistic variable
+    def set_syntactic_statistics(self, syntactic_statistics):
+        """ Sets the syntactic_statistics variable
         """
-        self.syntactic_statistic = syntactic_statistic
+        self.syntactic_statistics = syntactic_statistics
 
 
-    def get_syntactic_statistic(self):
-        """ Gets the syntactic_statistic variable
+    def 
get_syntactic_statistics(self):
+        """ Gets the syntactic_statistics variable
         """
-        return self.syntactic_statistic
+        return self.syntactic_statistics
 
 
-    def set_semantic(self, semantic_statistic):
-        """ Sets the semantic_statistic variable
+    def set_semantic(self, semantic_statistics):
+        """ Sets the semantic_statistics variable
         """
-        self.semantic_statistic = semantic_statistic
+        self.semantic_statistics = semantic_statistics
 
 
-    def get_semantic_statistic(self):
-        """ Gets the semantic_statistic variable
+    def get_semantic_statistics(self):
+        """ Gets the semantic_statistics variable
         """
-        return self.semantic_statistic
+        return self.semantic_statistics
 
 
     def dump_temporary_explanation(self, temporary_explanation):

From 7fd76eb7e52c31d34ced0e5ce73c8b10bd3a46d7 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 19:12:58 +0000
Subject: [PATCH 04/15] now returns statistics

---
 cso_classifier/syntacticmodule.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/cso_classifier/syntacticmodule.py b/cso_classifier/syntacticmodule.py
index 897f752..3f9360c 100644
--- a/cso_classifier/syntacticmodule.py
+++ b/cso_classifier/syntacticmodule.py
@@ -19,6 +19,8 @@ def __init__(self, cso = None, paper = None):
         self.min_similarity = 0.90 # Value of minimum similarity
         self.paper = paper # the paper object
         self.explanation = dict() # the explanation dictionary
+        self.extracted_topics = dict() # dictionary with the extracted topics (including similarity measures)
+
 
 
     def set_paper(self, paper):
@@ -68,15 +70,27 @@ def classify_syntactic(self):
 
         Returns:
-            found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
+            final_topics (list): containing the list of final topics.
         """
 
         final_topics = list()
         # analysing similarity with terms in the ontology
-        extracted_topics = self.__statistic_similarity()
+        self.extracted_topics = self.__statistic_similarity()
         # stripping explanation
-        final_topics = self.__strip_service_fields(extracted_topics)
+        final_topics = self.__strip_service_fields(self.extracted_topics)
         return final_topics
+
+
+    def get_syntactic_statistics(self):
+        """Function that returns the full set of topics with the similarity measure
+
+        Args:
+
+
+        Returns:
+            extracted_topics (dictionary): containing the found topics with their similarity and the n-gram analysed. 
+        """
+        return self.extracted_topics
 
 
     def __statistic_similarity(self):

From b8c652473990acec297f63c539df1c48233b6a34 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 19:20:27 +0000
Subject: [PATCH 05/15] fixed name of the semantic_statistics

---
 cso_classifier/result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cso_classifier/result.py b/cso_classifier/result.py
index fcd4f2b..3046f2c 100644
--- a/cso_classifier/result.py
+++ b/cso_classifier/result.py
@@ -95,7 +95,7 @@ def get_syntactic_statistics(self):
         return self.syntactic_statistics
 
 
-    def set_semantic(self, semantic_statistics):
+    def set_semantic_statistics(self, semantic_statistics):
         """ Sets the semantic_statistics variable
         """
         self.semantic_statistics = semantic_statistics

From 82f31625aafe0b0f2991017d988c26ad0d230763 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 19:42:18 +0000
Subject: [PATCH 06/15] added missing variables in result

---
 cso_classifier/result.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cso_classifier/result.py b/cso_classifier/result.py
index 3046f2c..df4e545 100644
--- a/cso_classifier/result.py
+++ b/cso_classifier/result.py
@@ -17,6 +17,8 @@ def __init__(self, explanation = False, statistics=False):
 
         self.statistics = statistics
         if self.statistics:
+            self.syntactic_statistics = dict()
+            self.semantic_statistics = dict()
             self.result_attr += ('syntactic_statistics','semantic_statistics',)

From a7cf841b1f7bab6c3702ca5bc2248c5a95a45c0b Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 19:43:27 +0000
Subject: [PATCH 07/15] now returns statistics

---
 cso_classifier/semanticmodule.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/cso_classifier/semanticmodule.py b/cso_classifier/semanticmodule.py
index bf10561..da2f14b 100644
--- a/cso_classifier/semanticmodule.py
+++ b/cso_classifier/semanticmodule.py
@@ -20,6 +20,7 @@ def __init__(self, model = None, cso = None, fast_classification = True, paper =
         self.min_similarity = 0.90 #Initialises the min_similarity
         self.fast_classification = fast_classification # if will use the full model or not
         self.explanation = dict()
+        self.extracted_topics = dict() # dictionary with the extracted topics (including similarity measures)
 
 
     def set_paper(self, paper):
@@ -74,9 +75,22 @@ def classify_semantic(self):
         found_topics, explanation = self.__find_topics(self.paper.get_semantic_chunks())
 
         ##################### Ranking
-        final_topics = self.__rank_topics(found_topics, explanation)
+        self.extracted_topics = self.__rank_topics(found_topics, explanation)
+
+        final_topics = list(self.extracted_topics.keys())
         return final_topics
+
+
+    def get_semantic_statistics(self):
+        """Function that returns the full set of topics with the similarity measure
+
+        Args:
+
+
+        Returns:
+            extracted_topics (dictionary): containing the found topics with their metric. 
+        """
+        return self.extracted_topics
 
 
     def __find_topics(self, concepts):
@@ -259,7 +273,7 @@ def __rank_topics(self, found_topics, explanation):
             explanation (dictionary): contains information about the explanation of topics
 
         Returns:
-            final_topics (list): list of final topics
+            final_topics (dictionary): dictionary of final topics
         """
         max_value = 0
         scores = []
@@ -335,8 +349,8 @@ def __rank_topics(self, found_topics, explanation):
         except IndexError:
             knee = len(sort_t)
 
-        final_topics = []
-        final_topics = [self.cso.get_topic_wu(sort_t[i][0]) for i in range(0,knee)]
+
+        final_topics = {self.cso.get_topic_wu(sort_t[i][0]):(sort_t[i][1]/max_value) for i in range(0,knee)}
         self.reset_explanation()
         self.explanation = {self.cso.topics_wu[sort_t[i][0]]: explanation[sort_t[i][0]] for i in range(0,knee)}

From a244255750efc7aee9f51f86df0924e2d5b14092 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Fri, 13 Dec 2024 19:45:18 +0000
Subject: [PATCH 08/15] temporarily adapted to the similarity metrics

---
 cso_classifier/classifier.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cso_classifier/classifier.py b/cso_classifier/classifier.py
index 444f596..4a8fe54 100644
--- a/cso_classifier/classifier.py
+++ b/cso_classifier/classifier.py
@@ -45,6 +45,7 @@ def __init__(self, **parameters):
         self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True
         self.fast_classification = parameters["fast_classification"] if "fast_classification" in parameters else True
         self.silent = parameters["silent"] if "silent" in parameters else False
+        self.statistics = parameters["statistics"] if "statistics" in parameters else False
 
         self.__check_parameters(parameters)
 
@@ -81,7 +82,7 @@ def run(self, paper):
             self.models_loaded = True
 
         t_paper = Paper(paper, self.modules)
-        result = Result(self.explanation)
+        result = Result(self.explanation, self.statistics)
 
 
         # Passing parameters to the two classes (synt and sema) and actioning classifiers
 
         if self.modules in ('syntactic','both'):
             synt_module = synt(self.cso, t_paper)
             result.set_syntactic(synt_module.classify_syntactic())
+            if self.statistics:
+                result.set_syntactic_statistics(synt_module.get_syntactic_statistics())
             if self.explanation:
                 result.dump_temporary_explanation(synt_module.get_explanation())
+
         if self.modules in ('semantic','both'):
             sema_module = sema(self.model, self.cso, self.fast_classification, t_paper)
             result.set_semantic(sema_module.classify_semantic())
+            if self.statistics:
+                result.set_semantic_statistics(sema_module.get_semantic_statistics())
             if self.explanation:
                 result.dump_temporary_explanation(sema_module.get_explanation())

From d662ca74b8f26d1333a231b33c9e91051af1a9d2 Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Mon, 16 Dec 2024 16:58:29 +0000
Subject: [PATCH 09/15] adapted the syntactic module to report values like the semantic module

---
 cso_classifier/syntacticmodule.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/cso_classifier/syntacticmodule.py b/cso_classifier/syntacticmodule.py
index 3f9360c..de1a4fe 100644
--- a/cso_classifier/syntacticmodule.py
+++ b/cso_classifier/syntacticmodule.py
@@ -88,9 +88,16 @@ def get_syntactic_statistics(self):
 
         Returns:
-            extracted_topics (dictionary): containing the found topics with their similarity and the n-gram analysed.
+            statistics (dictionary): containing the found topics with their similarity and the n-gram analysed. 
""" - return self.extracted_topics + statistics = dict() + for topic, sim_values in self.extracted_topics.items(): + if len(sim_values) == 1: + statistics[topic] = sim_values[0]["similarity"] + else: + statistics[topic] = max([sim_value["similarity"] for sim_value in sim_values]) + + return statistics def __statistic_similarity(self): From 3340d47fd461cc8f399e0fe8c5edd1884232601e Mon Sep 17 00:00:00 2001 From: Angelo Antonio Salatino Date: Mon, 16 Dec 2024 16:58:49 +0000 Subject: [PATCH 10/15] propagated the changes to the run_batch --- cso_classifier/classifier.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cso_classifier/classifier.py b/cso_classifier/classifier.py index 4a8fe54..02334cf 100644 --- a/cso_classifier/classifier.py +++ b/cso_classifier/classifier.py @@ -184,17 +184,21 @@ def _batch_run_single_worker(self, papers): print("Processing:", paper_id) paper.set_paper(paper_value) - result = Result(self.explanation) + result = Result(self.explanation, self.statistics) # Passing paper and actioning the classifier if self.modules in ('syntactic','both'): synt_module.set_paper(paper) result.set_syntactic(synt_module.classify_syntactic()) + if self.statistics: + result.set_syntactic_statistics(synt_module.get_syntactic_statistics()) if self.explanation: result.dump_temporary_explanation(synt_module.get_explanation()) if self.modules in ('semantic','both'): sema_module.set_paper(paper) result.set_semantic(sema_module.classify_semantic()) + if self.statistics: + result.set_semantic_statistics(sema_module.get_semantic_statistics()) if self.explanation: result.dump_temporary_explanation(sema_module.get_explanation()) postprocess.set_result(result) From 50dbf7d80b0a2e9a5e2d31f2c28301e6abeed90c Mon Sep 17 00:00:00 2001 From: Angelo Antonio Salatino Date: Mon, 16 Dec 2024 18:31:33 +0000 Subject: [PATCH 11/15] added filter also for the statistics --- cso_classifier/postprocmodule.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cso_classifier/postprocmodule.py b/cso_classifier/postprocmodule.py index f675d61..d284212 100644 --- a/cso_classifier/postprocmodule.py +++ b/cso_classifier/postprocmodule.py @@ -25,6 +25,7 @@ def __init__(self, model = None, cso = None, **parameters): self.list_of_topics = list() self.enhancement = parameters["enhancement"] if "enhancement" in parameters else "first" #defines the type of enhancement self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True + self.statistics = parameters["statistics"] if "statistics" in parameters else False if "result" in parameters: self.result = parameters["result"] # the result object @@ -226,13 +227,17 @@ def filtering_outliers(self): # Modulating the result. 
- selected_topics_set = set(selected_topics+syntactic_to_keep).union(topics_to_spare) + selected_topics_set = set()#set(selected_topics+syntactic_to_keep).union(topics_to_spare) selected_topics = list(selected_topics_set) self.result.set_syntactic(list(set(self.result.get_syntactic()).intersection(selected_topics_set))) self.result.set_semantic(list(set(self.result.get_semantic()).intersection(selected_topics_set))) self.result.set_union(selected_topics) self.result.set_enhanced(self.cso.climb_ontology(selected_topics, self.enhancement)) + if self.statistics: + self.result.set_syntactic_statistics({topic:val for topic, val in self.result.get_syntactic_statistics().items() if topic in selected_topics_set}) + self.result.set_semantic_statistics({topic:val for topic, val in self.result.get_semantic_statistics().items() if topic in selected_topics_set}) + else: self.result.set_enhanced(self.cso.climb_ontology(self.result.get_union(), self.enhancement)) From d7f39ebddc2e22355e71a2534cf3030160afada2 Mon Sep 17 00:00:00 2001 From: Angelo Antonio Salatino Date: Mon, 16 Dec 2024 18:32:03 +0000 Subject: [PATCH 12/15] cleaned code (trailing space) --- cso_classifier/classifier.py | 7 ++++--- cso_classifier/postprocmodule.py | 2 +- cso_classifier/semanticmodule.py | 6 +++--- cso_classifier/syntacticmodule.py | 8 ++++---- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/cso_classifier/classifier.py b/cso_classifier/classifier.py index 02334cf..727f47b 100644 --- a/cso_classifier/classifier.py +++ b/cso_classifier/classifier.py @@ -94,7 +94,7 @@ def run(self, paper): result.set_syntactic_statistics(synt_module.get_syntactic_statistics()) if self.explanation: result.dump_temporary_explanation(synt_module.get_explanation()) - + if self.modules in ('semantic','both'): sema_module = sema(self.model, self.cso, self.fast_classification, t_paper) result.set_semantic(sema_module.classify_semantic()) @@ -104,7 +104,7 @@ def run(self, paper): result.dump_temporary_explanation(sema_module.get_explanation()) - postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers) + postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers, statistics=self.statistics) result = postprocess.filtering_outliers() return result.get_dict() @@ -173,7 +173,7 @@ def _batch_run_single_worker(self, papers): # Passing parameters to the two classes (synt and sema) synt_module = synt(cso) sema_module = sema(model, cso, self.fast_classification) - postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers) + postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers, statistics=self.statistics) # initializing variable that will contain output @@ -201,6 +201,7 @@ def _batch_run_single_worker(self, papers): result.set_semantic_statistics(sema_module.get_semantic_statistics()) if self.explanation: result.dump_temporary_explanation(sema_module.get_explanation()) + postprocess.set_result(result) result = postprocess.filtering_outliers() diff --git a/cso_classifier/postprocmodule.py b/cso_classifier/postprocmodule.py index d284212..d77cc19 100644 --- a/cso_classifier/postprocmodule.py +++ b/cso_classifier/postprocmodule.py @@ -237,7 +237,7 @@ def filtering_outliers(self): if self.statistics: self.result.set_syntactic_statistics({topic:val for topic, val in self.result.get_syntactic_statistics().items() if topic in 
selected_topics_set})
             self.result.set_semantic_statistics({topic:val for topic, val in self.result.get_semantic_statistics().items() if topic in selected_topics_set})
-    
+
         else:
             self.result.set_enhanced(self.cso.climb_ontology(self.result.get_union(), self.enhancement))
diff --git a/cso_classifier/semanticmodule.py b/cso_classifier/semanticmodule.py
index da2f14b..da5406b 100644
--- a/cso_classifier/semanticmodule.py
+++ b/cso_classifier/semanticmodule.py
@@ -76,11 +76,11 @@ def classify_semantic(self):
 
         ##################### Ranking
         self.extracted_topics = self.__rank_topics(found_topics, explanation)
-    
+
         final_topics = list(self.extracted_topics.keys())
         return final_topics
-    
+
     def get_semantic_statistics(self):
         """Function that returns the full set of topics with the similarity measure
@@ -349,7 +349,7 @@ def __rank_topics(self, found_topics, explanation):
         except IndexError:
             knee = len(sort_t)
 
-    
+
         final_topics = {self.cso.get_topic_wu(sort_t[i][0]):(sort_t[i][1]/max_value) for i in range(0,knee)}
         self.reset_explanation()
         self.explanation = {self.cso.topics_wu[sort_t[i][0]]: explanation[sort_t[i][0]] for i in range(0,knee)}
diff --git a/cso_classifier/syntacticmodule.py b/cso_classifier/syntacticmodule.py
index de1a4fe..80fbe38 100644
--- a/cso_classifier/syntacticmodule.py
+++ b/cso_classifier/syntacticmodule.py
@@ -20,7 +20,7 @@ def __init__(self, cso = None, paper = None):
         self.paper = paper # the paper object
         self.explanation = dict() # the explanation dictionary
         self.extracted_topics = dict() # dictionary with the extracted topics (including similarity measures)
-    
+
 
     def set_paper(self, paper):
@@ -79,8 +79,8 @@ def classify_syntactic(self):
         # stripping explanation
         final_topics = self.__strip_service_fields(self.extracted_topics)
         return final_topics
-    
-    
+
+
     def get_syntactic_statistics(self):
         """Function that returns the full set of topics with the similarity measure
@@ -96,7 +96,7 @@ def get_syntactic_statistics(self):
                 statistics[topic] = sim_values[0]["similarity"]
             else:
                 statistics[topic] = max([sim_value["similarity"] for sim_value in sim_values])
-    
+
         return statistics

From 3eb7171da807c214447b0a5cef9e1a4e142b76ff Mon Sep 17 00:00:00 2001
From: Angelo Antonio Salatino
Date: Mon, 16 Dec 2024 19:13:50 +0000
Subject: [PATCH 13/15] changed statistics to weights (which makes more sense)

---
 cso_classifier/classifier.py      | 28 +++++++++++++-----------
 cso_classifier/postprocmodule.py  | 10 ++++-----
 cso_classifier/result.py          | 36 +++++++++++++++----------------
 cso_classifier/semanticmodule.py  |  4 ++--
 cso_classifier/syntacticmodule.py | 14 ++++++------
 5 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/cso_classifier/classifier.py b/cso_classifier/classifier.py
index 727f47b..66843da 100644
--- a/cso_classifier/classifier.py
+++ b/cso_classifier/classifier.py
@@ -35,6 +35,8 @@ def __init__(self, **parameters):
             - delete_outliers (boolean): if True it runs the outlier detection approach in the postprocessing
             - fast_classification (boolean): if True it runs the fast version of the classifier (cached model).
                 If False the classifier uses the word2vec model which has higher computational complexity
+            - get_weights (boolean): determines whether to return the weights associated with the syntactic and semantic topics.
+                True to return weights. Default value is False
             - silent (boolean): determines whether to print the progress. If true goes in silent mode.
                 Instead, if false does not print anything in standard output. 
@@ -45,7 +47,7 @@ def __init__(self, **parameters): self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True self.fast_classification = parameters["fast_classification"] if "fast_classification" in parameters else True self.silent = parameters["silent"] if "silent" in parameters else False - self.statistics = parameters["statistics"] if "statistics" in parameters else False + self.get_weights = parameters["get_weights"] if "get_weights" in parameters else False self.__check_parameters(parameters) @@ -82,7 +84,7 @@ def run(self, paper): self.models_loaded = True t_paper = Paper(paper, self.modules) - result = Result(self.explanation, self.statistics) + result = Result(self.explanation, self.get_weights) # Passing parameters to the two classes (synt and sema) and actioning classifiers @@ -90,21 +92,21 @@ def run(self, paper): if self.modules in ('syntactic','both'): synt_module = synt(self.cso, t_paper) result.set_syntactic(synt_module.classify_syntactic()) - if self.statistics: - result.set_syntactic_statistics(synt_module.get_syntactic_statistics()) + if self.get_weights: + result.set_syntactic_topics_weights(synt_module.get_syntactic_topics_weights()) if self.explanation: result.dump_temporary_explanation(synt_module.get_explanation()) if self.modules in ('semantic','both'): sema_module = sema(self.model, self.cso, self.fast_classification, t_paper) result.set_semantic(sema_module.classify_semantic()) - if self.statistics: - result.set_semantic_statistics(sema_module.get_semantic_statistics()) + if self.get_weights: + result.set_semantic_topics_weights(sema_module.get_semantic_topics_weights()) if self.explanation: result.dump_temporary_explanation(sema_module.get_explanation()) - postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers, statistics=self.statistics) + postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers, get_weights=self.get_weights) result = postprocess.filtering_outliers() return result.get_dict() @@ -173,7 +175,7 @@ def _batch_run_single_worker(self, papers): # Passing parameters to the two classes (synt and sema) synt_module = synt(cso) sema_module = sema(model, cso, self.fast_classification) - postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers, statistics=self.statistics) + postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers, get_weights=self.get_weights) # initializing variable that will contain output @@ -184,21 +186,21 @@ def _batch_run_single_worker(self, papers): print("Processing:", paper_id) paper.set_paper(paper_value) - result = Result(self.explanation, self.statistics) + result = Result(self.explanation, self.get_weights) # Passing paper and actioning the classifier if self.modules in ('syntactic','both'): synt_module.set_paper(paper) result.set_syntactic(synt_module.classify_syntactic()) - if self.statistics: - result.set_syntactic_statistics(synt_module.get_syntactic_statistics()) + if self.get_weights: + result.set_syntactic_topics_weights(synt_module.get_syntactic_topics_weights()) if self.explanation: result.dump_temporary_explanation(synt_module.get_explanation()) if self.modules in ('semantic','both'): sema_module.set_paper(paper) result.set_semantic(sema_module.classify_semantic()) - if self.statistics: - result.set_semantic_statistics(sema_module.get_semantic_statistics()) + if 
self.get_weights: + result.set_semantic_topics_weights(sema_module.get_semantic_topics_weights()) if self.explanation: result.dump_temporary_explanation(sema_module.get_explanation()) diff --git a/cso_classifier/postprocmodule.py b/cso_classifier/postprocmodule.py index d77cc19..283c5f3 100644 --- a/cso_classifier/postprocmodule.py +++ b/cso_classifier/postprocmodule.py @@ -25,7 +25,7 @@ def __init__(self, model = None, cso = None, **parameters): self.list_of_topics = list() self.enhancement = parameters["enhancement"] if "enhancement" in parameters else "first" #defines the type of enhancement self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True - self.statistics = parameters["statistics"] if "statistics" in parameters else False + self.get_weights = parameters["get_weights"] if "get_weights" in parameters else False if "result" in parameters: self.result = parameters["result"] # the result object @@ -227,16 +227,16 @@ def filtering_outliers(self): # Modulating the result. - selected_topics_set = set()#set(selected_topics+syntactic_to_keep).union(topics_to_spare) + selected_topics_set = set(selected_topics+syntactic_to_keep).union(topics_to_spare) selected_topics = list(selected_topics_set) self.result.set_syntactic(list(set(self.result.get_syntactic()).intersection(selected_topics_set))) self.result.set_semantic(list(set(self.result.get_semantic()).intersection(selected_topics_set))) self.result.set_union(selected_topics) self.result.set_enhanced(self.cso.climb_ontology(selected_topics, self.enhancement)) - if self.statistics: - self.result.set_syntactic_statistics({topic:val for topic, val in self.result.get_syntactic_statistics().items() if topic in selected_topics_set}) - self.result.set_semantic_statistics({topic:val for topic, val in self.result.get_semantic_statistics().items() if topic in selected_topics_set}) + if self.get_weights: + self.result.set_syntactic_topics_weights({topic:val for topic, val in self.result.get_syntactic_topics_weights().items() if topic in selected_topics_set}) + self.result.set_semantic_topics_weights({topic:val for topic, val in self.result.get_semantic_topics_weights().items() if topic in selected_topics_set}) else: diff --git a/cso_classifier/result.py b/cso_classifier/result.py index df4e545..b63c463 100644 --- a/cso_classifier/result.py +++ b/cso_classifier/result.py @@ -1,7 +1,7 @@ class Result: """ A simple abstraction layer for retrieving the results """ - def __init__(self, explanation = False, statistics=False): + def __init__(self, explanation = False, get_weights=False): """ Initialising the ontology class """ self.syntactic = list() @@ -15,11 +15,11 @@ def __init__(self, explanation = False, statistics=False): self.explanation = dict() self.result_attr += ('explanation',) - self.statistics = statistics - if self.statistics: - self.syntactic_statistics = dict() - self.semantic_statistics = dict() - self.result_attr += ('syntactic_statistics','semantic_statistics',) + self.get_weights = get_weights + if self.get_weights: + self.syntactic_weights = dict() + self.semantic_weights = dict() + self.result_attr += ('syntactic_weights','semantic_weights',) def get_dict(self): @@ -85,28 +85,28 @@ def __merge(self): self.union = list(set(self.syntactic + self.semantic)) - def set_syntactic_statistics(self, syntactic_statistics): - """ Sets the syntactic_statistics variable + def set_syntactic_topics_weights(self, syntactic_weights): + """ Sets the syntactic_weights variable """ - self.syntactic_statistics = 
syntactic_statistics + self.syntactic_weights = syntactic_weights - def get_syntactic_statistics(self): - """ Gets the syntactic_statistics variable + def get_syntactic_topics_weights(self): + """ Gets the syntactic_weights variable """ - return self.syntactic_statistics + return self.syntactic_weights - def set_semantic_statistics(self, semantic_statistics): - """ Sets the semantic_statistics variable + def set_semantic_topics_weights(self, semantic_weights): + """ Sets the semantic_weights variable """ - self.semantic_statistics = semantic_statistics + self.semantic_weights = semantic_weights - def get_semantic_statistics(self): - """ Gets the semantic_statistics variable + def get_semantic_topics_weights(self): + """ Gets the semantic_weights variable """ - return self.semantic_statistics + return self.semantic_weights def dump_temporary_explanation(self, temporary_explanation): diff --git a/cso_classifier/semanticmodule.py b/cso_classifier/semanticmodule.py index da5406b..33c1dbc 100644 --- a/cso_classifier/semanticmodule.py +++ b/cso_classifier/semanticmodule.py @@ -81,7 +81,7 @@ def classify_semantic(self): return final_topics - def get_semantic_statistics(self): + def get_semantic_topics_weights(self): """Function that returns the full set of topics with the similarity measure Args: @@ -90,7 +90,7 @@ def get_semantic_statistics(self): Returns: extracted_topics (dictionary): containing the found topics with their metric. """ - return self.extracted_topics + return self.extracted_topics #they are already in the correct format. def __find_topics(self, concepts): diff --git a/cso_classifier/syntacticmodule.py b/cso_classifier/syntacticmodule.py index 80fbe38..e0ac9a4 100644 --- a/cso_classifier/syntacticmodule.py +++ b/cso_classifier/syntacticmodule.py @@ -81,23 +81,23 @@ def classify_syntactic(self): return final_topics - def get_syntactic_statistics(self): - """Function that returns the full set of topics with the similarity measure + def get_syntactic_topics_weights(self): + """Function that returns the full set of topics with the similarity measure (weights) Args: Returns: - statistics (dictionary): containing the found topics with their similarity and the n-gram analysed. + weights (dictionary): containing the found topics with their similarity and the n-gram analysed. """ - statistics = dict() + weights = dict() for topic, sim_values in self.extracted_topics.items(): if len(sim_values) == 1: - statistics[topic] = sim_values[0]["similarity"] + weights[topic] = sim_values[0]["similarity"] else: - statistics[topic] = max([sim_value["similarity"] for sim_value in sim_values]) + weights[topic] = max([sim_value["similarity"] for sim_value in sim_values]) - return statistics + return weights def __statistic_similarity(self): From 8308f0d6ec3dc1486d1aa92d5ae367e52a06f53e Mon Sep 17 00:00:00 2001 From: Angelo Antonio Salatino Date: Mon, 16 Dec 2024 19:42:28 +0000 Subject: [PATCH 14/15] updated with the new version --- README.md | 228 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 137 insertions(+), 91 deletions(-) diff --git a/README.md b/README.md index 0c1a4ee..6ba215c 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Read more: [https://skm.kmi.open.ac.uk/cso-classifier/](https://skm.kmi.open.ac. 
- [Sample Output (BM)](#sample-output-bm)
 - [Parameters](#parameters)
 - [Releases](#releases)
+ - [v3.2](#v32)
 - [v3.1](#v31)
 - [v3.0](#v30)
 - [v2.3.2](#v232)
@@ -217,99 +218,130 @@ Even if you are running multiple classifications, the current implementation of
 #### Sample Output (SP)
 
-As output, the classifier returns a dictionary with five components: (i) syntactic, (ii) semantic, (iii) union, (iv) enhanced, and (v) explanation. The latter field is available only if the **explanation** flag is set to True.
+As output, the classifier returns a dictionary with seven components: (i) syntactic, (ii) semantic, (iii) union, (iv) enhanced, (v) explanation, (vi) syntactic_weights and (vii) semantic_weights. The explanation field is available only if the **explanation** flag is set to True. The last two fields are available only if the **get_weights** flag is set to True.
 
-Below you can find an example. The keys syntactic and semantic respectively contain the topics returned by the syntactic and semantic module. Union contains the unique topics found by the previous two modules. In enhanced you can find the relevant super-areas. *Please be aware that the results may change according to the version of Computer Science Ontology.*
+Below you can find an example. The keys syntactic and semantic respectively contain the topics returned by the syntactic and semantic module. Union contains the unique topics found by the previous two modules. In enhanced you can find the relevant super-areas. For the sake of clarity, we run the example with all the flags on, and hence it contains the explanation field and both syntactic_weights and semantic_weights.
+
+*Please be aware that the results may change according to the version of Computer Science Ontology.*
 
 ```json
 {
   "syntactic":[
     "network topology",
    "online social networks",
    "real-world networks",
    "anonymization",
    "privacy",
    "social networks",
    "data privacy",
    "graph theory",
    "data mining",
    "sensitive informations",
    "anonymity",
    "micro-blog",
    "twitter"
   ],
   "semantic":[
    "network topology",
    "online social networks",
    "topology",
    "data privacy",
    "social networks",
    "privacy",
    "anonymization",
    "graph theory",
    "data mining",
    "anonymity",
    "micro-blog",
    "twitter"
   ],
   "union":[
    "network topology",
    "online social networks",
    "topology",
    "real-world networks",
    "anonymization",
    "privacy",
    "social networks",
    "data privacy",
    "graph theory",
    "data mining",
    "sensitive informations",
    "anonymity",
    "micro-blog",
    "twitter"
   ],
   "enhanced":[
    "computer networks",
    "online systems",
    "complex networks",
    "privacy preserving",
    "computer security",
    "world wide web",
    "theoretical computer science",
    "computer science",
    "access control",
    "network security",
    "authentication",
    "social media"
   ],
   "explanation":{
    "social networks": ["social network", "online social networks", "microblogging service", "real-world networks", "social networks", "microblogging", "social networking", "twitter graph", "anonymous twitter", "twitter"],
    "online social networks": ["online social networks", "social network", "social networks"],
    "sensitive informations": ["sensitive information"],
    "privacy": ["sensitive information", "anonymity", "anonymous", "data privacy", "privacy"],
    "anonymization": ["anonymization"],
    "anonymity": ["anonymity", "anonymous"],
    "real-world networks": ["real-world networks"],
    "twitter": ["twitter graph", "twitter", "microblogging service", "anonymous twitter", 
"microblogging"], - "micro-blog": ["twitter graph", "twitter", "microblogging service", "anonymous twitter", "microblogging"], - "network topology": ["topology", "network topology"], - "data mining": ["data mining", "mining"], - "data privacy": ["data privacy", "privacy"], - "graph theory": ["graph theory"], - "topology": ["topology", "network topology"], - "computer networks": ["topology", "network topology"], - "online systems": ["online social networks", "social network", "social networks"], - "complex networks": ["real-world networks"], - "privacy preserving": ["anonymization"], - "computer security": ["anonymity", "data privacy", "privacy"], - "world wide web": ["social network", "online social networks", "microblogging service", "real-world networks", "social networks", "microblogging", "social networking", "twitter graph", "anonymous twitter", "twitter"], - "theoretical computer science": ["graph theory"], - "computer science": ["data mining", "mining"], - "access control": ["sensitive information"], - "network security": ["anonymity", "sensitive information", "anonymous"], - "authentication": ["anonymity", "anonymous"], - "social media": ["microblogging service", "microblogging", "twitter graph", "anonymous twitter", "twitter"] - } + "syntactic": [ + "graph theory", + "anonymization", + "anonymity", + "online social networks", + "real-world networks", + "data privacy", + "privacy", + "twitter", + "sensitive informations", + "network topology", + "social networks", + "data mining", + "micro-blog" + ], + "semantic": [ + "graph theory", + "anonymization", + "anonymity", + "online social networks", + "data privacy", + "topology", + "data mining", + "privacy", + "twitter", + "social networks", + "network topology", + "micro-blog" + ], + "union": [ + "graph theory", + "anonymization", + "anonymity", + "online social networks", + "real-world networks", + "data privacy", + "topology", + "privacy", + "twitter", + "sensitive informations", + "network topology", + "social networks", + "data mining", + "micro-blog" + ], + "enhanced": [ + "theoretical computer science", + "privacy preserving", + "authentication", + "network security", + "online systems", + "complex networks", + "computer security", + "social media", + "access control", + "computer networks", + "world wide web", + "computer science" + ], + "explanation": { + "social networks": ["online social networks","microblogging","social-network","social network","real-world networks","social networking","twitter","social networks"], + "online social networks": ["social networks","social network","online social networks"], + "sensitive informations": ["sensitive information"], + "data mining": ["data mining","mining","data-mining"], + "privacy": ["sensitive information","privacy","anonymity","anonymous","data privacy"], + "anonymization": ["anonymization"], + "anonymity": ["anonymity","anonymous"], + "real-world networks": ["real-world networks"], + "twitter": ["twitter","twitter graph","microblogging","anonymous twitter","microblogging service"], + "micro-blog": ["twitter graph","twitter","microblogging","anonymous twitter","microblogging service"], + "network topology": ["network topology","topology"], + "data privacy": ["privacy","data privacy"], + "graph theory": ["graph theory"], + "topology": ["network topology","topology"], + "theoretical computer science": ["graph theory"], + "privacy preserving": ["anonymization"], + "authentication": ["anonymity","anonymous"], + "network security": ["sensitive 
information","anonymity","anonymous"], + "online systems": ["social networks","social network","online social networks"], + "complex networks": ["real-world networks"], + "computer security": ["sensitive information","privacy","anonymity","anonymous","data privacy"], + "social media": ["twitter","microblogging"], + "access control": ["sensitive information"], + "computer networks": ["network topology","topology"], + "world wide web": ["online social networks","microblogging","social-network","social network","real-world networks","social networking","twitter","social networks"], + "computer science": ["data mining","mining","data-mining"] + }, + "syntactic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "sensitive informations": 0.9545454545454546, + "data mining": 1.0, + "privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "real-world networks": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "network topology": 1.0, + "data privacy": 1.0, + "graph theory": 1.0 + }, + "semantic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "data mining": 1.0, + "privacy": 1.0, + "data privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "topology": 1.0, + "network topology": 1.0, + "graph theory": 1.0 + } } ``` @@ -414,7 +446,10 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit (vi) The parameter *fast_classification* can be either *True* or *False*. This parameter determines whether the semantic module should use the full model or the cached one. Using the full model provides slightly better results than the cached one. However, using the cached model is more than 15x faster. Read [here](#word2vec-model-and-token-to-cso-combined-file-generation) for more details about these two models. The default value for *fast_classification* is *True*. -(vii) The parameter *silent* can be either *True* or *False*. This determines whether the classifier prints its progress in the console. If set to True, the classifier will be silent and will not print any output while classifying. The default value for *silent* is *False*. +(vii) The parameter *get_weights* can be either *True* or *False*. This determines whether the classifier returns the weights associated to the identified topics. For the syntactic topics these represent the value of string similarity (Levenshtein) of topics compared the chunks of text identified in the input text. Whereas, the weights for the semantic topics correspond to the normalised values from the topic distribution obtained from running the semantic module. + +(viii) The parameter *silent* can be either *True* or *False*. This determines whether the classifier prints its progress in the console. If set to True, the classifier will be silent and will not print any output while classifying. The default value for *silent* is *False*. 
+
 
 |# | Parameter | Single Paper | Batch Mode |
 |-|-|-|-|
 |i | papers | :white_check_mark:| :white_check_mark: |
 |ii | workers | :x: | :white_check_mark: |
 |iii | modules | :white_check_mark: | :white_check_mark: |
 |iv | explanation | :white_check_mark: | :white_check_mark: |
 |v |delete_outliers| :white_check_mark: | :white_check_mark: |
 |vi | fast_classification| :white_check_mark: | :white_check_mark: |
-|vii| silent | :white_check_mark: | :white_check_mark: |
+|vii| get_weights | :white_check_mark: | :white_check_mark: |
+|viii| silent | :white_check_mark: | :white_check_mark: |
+
 
 **Table 1**: Parameters availability when using CSO Classifier
 
@@ -434,6 +471,11 @@
 
 Here we list the available releases for the CSO Classifier. These releases are available for download both from [Github](https://github.com/angelosalatino/cso-classifier/releases) and [Zenodo](10.5281/zenodo.2660819).
 
+### v3.2
+
+This release extends version 3.1 by supporting users in exporting the weights associated with the identified topics. If enabled, within the result of the classification, the classifier includes two new keys ```syntactic_weights``` and ```semantic_weights``` which respectively contain the identified syntactic and semantic topics as keys, and their weights as values.
+This feature is disabled by default and can be enabled by setting ```get_weights = True``` when calling the CSO Classifier (see [Parameters](#parameters)).
+
 ### v3.1
 
 This release brings in two main changes. The first change is related to the library (and the code) to perform the Levenshtein similarity. Before we relied on ```python-Levenshtein``` which required ```python3-devel```. This new version uses ```rapidfuzz``` which is as fast as the previous library and much easier to install on the various systems.
@@ -458,6 +500,10 @@ Please, be aware that having substantially restructured the code into classes, t
 
 We would like to thank James Dunham @jamesdunham from CSET (Georgetown University) for suggesting to us how to improve the code.
 
+More details about this version of the classifier can be found within:
+> Salatino, A., Osborne, F., & Motta, E. (2022). CSO Classifier 3.0: a Scalable Unsupervised Method for Classifying Documents in terms of Research Topics. International Journal on Digital Libraries, 1-20. 
[Read more](https://doi.org/10.1007/s00799-021-00305-y) + + Download from: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5095422.svg)](https://doi.org/10.5281/zenodo.5095422) From c3727df4f3775154219e97c822bea26184e7136b Mon Sep 17 00:00:00 2001 From: Angelo Antonio Salatino Date: Thu, 19 Dec 2024 17:15:37 +0000 Subject: [PATCH 15/15] changed interaction and result --- CSO-Classifier.ipynb | 2 +- CSO-Classifier.py | 2 +- output.json | 200 ++++++++++++++++++++++++------------------- 3 files changed, 116 insertions(+), 88 deletions(-) diff --git a/CSO-Classifier.ipynb b/CSO-Classifier.ipynb index e176a81..70ce273 100644 --- a/CSO-Classifier.ipynb +++ b/CSO-Classifier.ipynb @@ -70,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "cc = CSOClassifier(explanation=True)\n", + "cc = CSOClassifier(explanation=True, get_weights=True)\n", "\n", "result = cc.run(paper)" ] diff --git a/CSO-Classifier.py b/CSO-Classifier.py index aeb82c8..823eff0 100644 --- a/CSO-Classifier.py +++ b/CSO-Classifier.py @@ -46,7 +46,7 @@ # In[Run Classifier] -cc = CSOClassifier(explanation=True) +cc = CSOClassifier(explanation=True, get_weights=True) result = cc.run(paper) diff --git a/output.json b/output.json index 16fa94e..410bb8c 100644 --- a/output.json +++ b/output.json @@ -1,76 +1,73 @@ { "syntactic": [ - "data mining", - "micro-blog", - "social networks", - "data privacy", - "sensitive informations", - "anonymity", + "real-world networks", "anonymization", "network topology", + "data privacy", + "social networks", + "privacy", "twitter", - "real-world networks", "graph theory", "online social networks", - "privacy" - ], - "semantic": [ + "anonymity", "data mining", "micro-blog", - "privacy", - "social networks", - "data privacy", - "anonymity", + "sensitive informations" + ], + "semantic": [ "anonymization", "network topology", + "topology", + "data privacy", + "social networks", + "privacy", "twitter", "graph theory", "online social networks", - "topology" + "anonymity", + "data mining", + "micro-blog" ], "union": [ - "data mining", - "micro-blog", - "privacy", - "social networks", - "data privacy", - "sensitive informations", - "anonymity", + "real-world networks", "anonymization", "network topology", + "topology", + "data privacy", + "social networks", + "privacy", "twitter", - "real-world networks", "graph theory", "online social networks", - "topology" + "anonymity", + "data mining", + "micro-blog", + "sensitive informations" ], "enhanced": [ - "computer science", - "computer security", - "world wide web", - "access control", - "network security", - "authentication", + "complex networks", "privacy preserving", "computer networks", + "world wide web", + "computer security", "social media", - "complex networks", "theoretical computer science", - "online systems" + "online systems", + "authentication", + "network security", + "computer science", + "access control" ], "explanation": { "social networks": [ - "social-network", + "real-world networks", "social networks", - "social networking", - "microblogging service", "twitter", - "real-world networks", + "social-network", "online social networks", - "microblogging", "social network", - "twitter graph", - "anonymous twitter" + "microblogging", + "social networking" ], "online social networks": [ "online social networks", @@ -87,95 +84,80 @@ ], "privacy": [ "anonymous", - "data privacy", "anonymity", - "privacy", - "sensitive information" + "sensitive information", + "data privacy", + "privacy" ], "anonymization": [ "anonymization" ], "anonymity": [ - 
"anonymity", - "anonymous" + "anonymous", + "anonymity" ], "real-world networks": [ "real-world networks" ], "twitter": [ - "microblogging service", - "microblogging", "twitter graph", "anonymous twitter", + "microblogging", + "microblogging service", "twitter" ], "micro-blog": [ - "microblogging service", - "microblogging", "twitter graph", "anonymous twitter", + "microblogging", + "microblogging service", "twitter" ], "network topology": [ - "topology", - "network topology" + "network topology", + "topology" ], "data privacy": [ - "privacy", - "data privacy" + "data privacy", + "privacy" ], "graph theory": [ "graph theory" ], "topology": [ - "topology", - "network topology" + "network topology", + "topology" ], - "computer science": [ - "data mining", - "mining", - "data-mining" + "complex networks": [ + "real-world networks" ], - "computer security": [ - "anonymity", - "privacy", - "data privacy" + "privacy preserving": [ + "anonymization" + ], + "computer networks": [ + "network topology", + "topology" ], "world wide web": [ + "real-world networks", + "social networks", + "twitter", "social-network", "online social networks", - "social networks", "social network", + "microblogging", "social networking" ], - "access control": [ - "sensitive information" - ], - "network security": [ + "computer security": [ + "anonymous", "anonymity", "sensitive information", - "anonymous" - ], - "authentication": [ - "anonymity", - "anonymous" - ], - "privacy preserving": [ - "anonymization" - ], - "computer networks": [ - "topology", - "network topology" + "data privacy", + "privacy" ], "social media": [ - "twitter", - "microblogging service", "microblogging", - "twitter graph", - "anonymous twitter" - ], - "complex networks": [ - "real-world networks" + "twitter" ], "theoretical computer science": [ "graph theory" @@ -184,6 +166,52 @@ "online social networks", "social networks", "social network" + ], + "authentication": [ + "anonymous", + "anonymity" + ], + "network security": [ + "anonymous", + "anonymity", + "sensitive information" + ], + "computer science": [ + "data mining", + "mining", + "data-mining" + ], + "access control": [ + "sensitive information" ] + }, + "syntactic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "sensitive informations": 0.9545454545454546, + "data mining": 1.0, + "privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "real-world networks": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "network topology": 1.0, + "data privacy": 1.0, + "graph theory": 1.0 + }, + "semantic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "data mining": 1.0, + "privacy": 1.0, + "data privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "topology": 1.0, + "network topology": 1.0, + "graph theory": 1.0 } } \ No newline at end of file