diff --git a/CSO-Classifier.ipynb b/CSO-Classifier.ipynb index e176a81..70ce273 100644 --- a/CSO-Classifier.ipynb +++ b/CSO-Classifier.ipynb @@ -70,7 +70,7 @@ "metadata": {}, "outputs": [], "source": [ - "cc = CSOClassifier(explanation=True)\n", + "cc = CSOClassifier(explanation=True, get_weights=True)\n", "\n", "result = cc.run(paper)" ] diff --git a/CSO-Classifier.py b/CSO-Classifier.py index aeb82c8..823eff0 100644 --- a/CSO-Classifier.py +++ b/CSO-Classifier.py @@ -46,7 +46,7 @@ # In[Run Classifier] -cc = CSOClassifier(explanation=True) +cc = CSOClassifier(explanation=True, get_weights=True) result = cc.run(paper) diff --git a/README.md b/README.md index 0c1a4ee..6ba215c 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ Read more: [https://skm.kmi.open.ac.uk/cso-classifier/](https://skm.kmi.open.ac. - [Sample Output (BM)](#sample-output-bm) - [Parameters](#parameters) - [Releases](#releases) + - [v3.2](#v32) - [v3.1](#v31) - [v3.0](#v30) - [v2.3.2](#v232) @@ -217,99 +218,130 @@ Even if you are running multiple classifications, the current implementation of #### Sample Output (SP) -As output, the classifier returns a dictionary with five components: (i) syntactic, (ii) semantic, (iii) union, (iv) enhanced, and (v) explanation. The latter field is available only if the **explanation** flag is set to True. +As output, the classifier returns a dictionary with seven components: (i) syntactic, (ii) semantic, (iii) union, (iv) enhanced, (v) explanation, (vi) syntactic_weights and (vii) semantic_weights. The explanation field is available only if the **explanation** flag is set to True. The last two fields are available only if the **get_weights** is set to True. -Below you can find an example. The keys syntactic and semantic respectively contain the topics returned by the syntactic and semantic module. Union contains the unique topics found by the previous two modules. In enhanced you can find the relevant super-areas. 
*Please be aware that the results may change according to the version of Computer Science Ontology.* +Below you can find an example. The keys syntactic and semantic respectively contain the topics returned by the syntactic and semantic module. Union contains the unique topics found by the previous two modules. In enhanced you can find the relevant super-areas. For the sake of clarity, we run the example with all the flag on, and hence it contains the enhanced field and both syntactic_weights and semantic_weights. + +*Please be aware that the results may change according to the version of Computer Science Ontology.* ```json { - "syntactic":[ - "network topology", - "online social networks", - "real-world networks", - "anonymization", - "privacy", - "social networks", - "data privacy", - "graph theory", - "data mining", - "sensitive informations", - "anonymity", - "micro-blog", - "twitter" - ], - "semantic":[ - "network topology", - "online social networks", - "topology", - "data privacy", - "social networks", - "privacy", - "anonymization", - "graph theory", - "data mining", - "anonymity", - "micro-blog", - "twitter" - ], - "union":[ - "network topology", - "online social networks", - "topology", - "real-world networks", - "anonymization", - "privacy", - "social networks", - "data privacy", - "graph theory", - "data mining", - "sensitive informations", - "anonymity", - "micro-blog", - "twitter" - ], - "enhanced":[ - "computer networks", - "online systems", - "complex networks", - "privacy preserving", - "computer security", - "world wide web", - "theoretical computer science", - "computer science", - "access control", - "network security", - "authentication", - "social media" - ], - "explanation":{ - "social networks": ["social network", "online social networks", "microblogging service", "real-world networks", "social networks", "microblogging", "social networking", "twitter graph", "anonymous twitter", "twitter"], - "online social networks": ["online social 
networks", "social network", "social networks"], - "sensitive informations": ["sensitive information"], - "privacy": ["sensitive information", "anonymity", "anonymous", "data privacy", "privacy"], - "anonymization": ["anonymization"], - "anonymity": ["anonymity", "anonymous"], - "real-world networks": ["real-world networks"], - "twitter": ["twitter graph", "twitter", "microblogging service", "anonymous twitter", "microblogging"], - "micro-blog": ["twitter graph", "twitter", "microblogging service", "anonymous twitter", "microblogging"], - "network topology": ["topology", "network topology"], - "data mining": ["data mining", "mining"], - "data privacy": ["data privacy", "privacy"], - "graph theory": ["graph theory"], - "topology": ["topology", "network topology"], - "computer networks": ["topology", "network topology"], - "online systems": ["online social networks", "social network", "social networks"], - "complex networks": ["real-world networks"], - "privacy preserving": ["anonymization"], - "computer security": ["anonymity", "data privacy", "privacy"], - "world wide web": ["social network", "online social networks", "microblogging service", "real-world networks", "social networks", "microblogging", "social networking", "twitter graph", "anonymous twitter", "twitter"], - "theoretical computer science": ["graph theory"], - "computer science": ["data mining", "mining"], - "access control": ["sensitive information"], - "network security": ["anonymity", "sensitive information", "anonymous"], - "authentication": ["anonymity", "anonymous"], - "social media": ["microblogging service", "microblogging", "twitter graph", "anonymous twitter", "twitter"] - } + "syntactic": [ + "graph theory", + "anonymization", + "anonymity", + "online social networks", + "real-world networks", + "data privacy", + "privacy", + "twitter", + "sensitive informations", + "network topology", + "social networks", + "data mining", + "micro-blog" + ], + "semantic": [ + "graph theory", + 
"anonymization", + "anonymity", + "online social networks", + "data privacy", + "topology", + "data mining", + "privacy", + "twitter", + "social networks", + "network topology", + "micro-blog" + ], + "union": [ + "graph theory", + "anonymization", + "anonymity", + "online social networks", + "real-world networks", + "data privacy", + "topology", + "privacy", + "twitter", + "sensitive informations", + "network topology", + "social networks", + "data mining", + "micro-blog" + ], + "enhanced": [ + "theoretical computer science", + "privacy preserving", + "authentication", + "network security", + "online systems", + "complex networks", + "computer security", + "social media", + "access control", + "computer networks", + "world wide web", + "computer science" + ], + "explanation": { + "social networks": ["online social networks","microblogging","social-network","social network","real-world networks","social networking","twitter","social networks"], + "online social networks": ["social networks","social network","online social networks"], + "sensitive informations": ["sensitive information"], + "data mining": ["data mining","mining","data-mining"], + "privacy": ["sensitive information","privacy","anonymity","anonymous","data privacy"], + "anonymization": ["anonymization"], + "anonymity": ["anonymity","anonymous"], + "real-world networks": ["real-world networks"], + "twitter": ["twitter","twitter graph","microblogging","anonymous twitter","microblogging service"], + "micro-blog": ["twitter graph","twitter","microblogging","anonymous twitter","microblogging service"], + "network topology": ["network topology","topology"], + "data privacy": ["privacy","data privacy"], + "graph theory": ["graph theory"], + "topology": ["network topology","topology"], + "theoretical computer science": ["graph theory"], + "privacy preserving": ["anonymization"], + "authentication": ["anonymity","anonymous"], + "network security": ["sensitive information","anonymity","anonymous"], + "online 
systems": ["social networks","social network","online social networks"], + "complex networks": ["real-world networks"], + "computer security": ["sensitive information","privacy","anonymity","anonymous","data privacy"], + "social media": ["twitter","microblogging"], + "access control": ["sensitive information"], + "computer networks": ["network topology","topology"], + "world wide web": ["online social networks","microblogging","social-network","social network","real-world networks","social networking","twitter","social networks"], + "computer science": ["data mining","mining","data-mining"] + }, + "syntactic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "sensitive informations": 0.9545454545454546, + "data mining": 1.0, + "privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "real-world networks": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "network topology": 1.0, + "data privacy": 1.0, + "graph theory": 1.0 + }, + "semantic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "data mining": 1.0, + "privacy": 1.0, + "data privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "topology": 1.0, + "network topology": 1.0, + "graph theory": 1.0 + } } ``` @@ -414,7 +446,10 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit (vi) The parameter *fast_classification* can be either *True* or *False*. This parameter determines whether the semantic module should use the full model or the cached one. Using the full model provides slightly better results than the cached one. However, using the cached model is more than 15x faster. Read [here](#word2vec-model-and-token-to-cso-combined-file-generation) for more details about these two models. The default value for *fast_classification* is *True*. -(vii) The parameter *silent* can be either *True* or *False*. This determines whether the classifier prints its progress in the console. 
If set to True, the classifier will be silent and will not print any output while classifying. The default value for *silent* is *False*. +(vii) The parameter *get_weights* can be either *True* or *False*. This determines whether the classifier returns the weights associated to the identified topics. For the syntactic topics these represent the value of string similarity (Levenshtein) of topics compared to the chunks of text identified in the input text. Whereas, the weights for the semantic topics correspond to the normalised values from the topic distribution obtained from running the semantic module. + +(viii) The parameter *silent* can be either *True* or *False*. This determines whether the classifier prints its progress in the console. If set to True, the classifier will be silent and will not print any output while classifying. The default value for *silent* is *False*. + + |# | Parameter | Single Paper | Batch Mode | @@ -425,7 +460,9 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit |iv | explanation | :white_check_mark: | :white_check_mark: | |v |delete_outliers| :white_check_mark: | :white_check_mark: | |vi | fast_classification| :white_check_mark: | :white_check_mark: | -|vii| silent | :white_check_mark: | :white_check_mark: | +|vii| get_weights | :white_check_mark: | :white_check_mark: | +|viii| silent | :white_check_mark: | :white_check_mark: | + **Table 1**: Parameters availability when using CSO Classifier @@ -434,6 +471,11 @@ Beside the paper(s), the function running the CSO Classifier accepts seven addit Here we list the available releases for the CSO Classifier. These releases are available for download both from [Github](https://github.com/angelosalatino/cso-classifier/releases) and [Zenodo](10.5281/zenodo.2660819). +### v3.2 + +This release extends version 3.1 by supporting users in exporting the weights associated to the identified topics. 
If enabled, within the result of the classification, the classifier includes two new keys ```syntactic_weights``` and ```semantic_weights``` which respectively contain the identified syntactic and semantic topics as keys, and their weights as values. +This component is disabled by default and can be enabled by setting ```get_weights = True``` when calling the CSO Classifier (see [Parameters](#parameters)). + ### v3.1 This release brings in two main changes. The first change is related to the library (and the code) to perform the Levenshtein similarity. Before we relied on ```python-Levenshtein``` which required ```python3-devel```. This new version uses ```rapidfuzz``` which as fast as the previous library and it is much easier to install on the various systems. @@ -458,6 +500,10 @@ Please, be aware that having substantially restructured the code into classes, t We would like to thank James Dunham @jamesdunham from CSET (Georgetown University) for suggesting to us how to improve the code. +More details about this version of the classifier can be found within: +> Salatino, A., Osborne, F., & Motta, E. (2022). CSO Classifier 3.0: a Scalable Unsupervised Method for Classifying Documents in terms of Research Topics. International Journal on Digital Libraries, 1-20. [Read more](https://doi.org/10.1007/s00799-021-00305-y) + + Download from: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5095422.svg)](https://doi.org/10.5281/zenodo.5095422) diff --git a/cso_classifier/classifier.py b/cso_classifier/classifier.py index 444f596..66843da 100644 --- a/cso_classifier/classifier.py +++ b/cso_classifier/classifier.py @@ -35,6 +35,8 @@ def __init__(self, **parameters): - delete_outliers (boolean): if True it runs the outlier detection approach in the postprocessing - fast_classification (boolen): if True it runs the fast version of the classifier (cached model). 
If False the classifier uses the word2vec model which has higher computational complexity + - get_weights (boolean): determines whether to return the weights associated to the syntactic and semantic topics. + True to return weights. Default value is False - silent (boolean): determines whether to print the progress. If true goes in silent mode. Instead, if false does not print anything in standard output. @@ -45,6 +47,7 @@ def __init__(self, **parameters): self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True self.fast_classification = parameters["fast_classification"] if "fast_classification" in parameters else True self.silent = parameters["silent"] if "silent" in parameters else False + self.get_weights = parameters["get_weights"] if "get_weights" in parameters else False self.__check_parameters(parameters) @@ -81,7 +84,7 @@ def run(self, paper): self.models_loaded = True t_paper = Paper(paper, self.modules) - result = Result(self.explanation) + result = Result(self.explanation, self.get_weights) # Passing parameters to the two classes (synt and sema) and actioning classifiers @@ -89,16 +92,21 @@ def run(self, paper): if self.modules in ('syntactic','both'): synt_module = synt(self.cso, t_paper) result.set_syntactic(synt_module.classify_syntactic()) + if self.get_weights: + result.set_syntactic_topics_weights(synt_module.get_syntactic_topics_weights()) if self.explanation: result.dump_temporary_explanation(synt_module.get_explanation()) + if self.modules in ('semantic','both'): sema_module = sema(self.model, self.cso, self.fast_classification, t_paper) result.set_semantic(sema_module.classify_semantic()) + if self.get_weights: + result.set_semantic_topics_weights(sema_module.get_semantic_topics_weights()) if self.explanation: result.dump_temporary_explanation(sema_module.get_explanation()) - postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers) + 
postprocess = post(self.model, self.cso, enhancement=self.enhancement, result=result, delete_outliers=self.delete_outliers, get_weights=self.get_weights) result = postprocess.filtering_outliers() return result.get_dict() @@ -167,7 +175,7 @@ def _batch_run_single_worker(self, papers): # Passing parameters to the two classes (synt and sema) synt_module = synt(cso) sema_module = sema(model, cso, self.fast_classification) - postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers) + postprocess = post(model, cso, enhancement=self.enhancement, delete_outliers=self.delete_outliers, get_weights=self.get_weights) # initializing variable that will contain output @@ -178,19 +186,24 @@ def _batch_run_single_worker(self, papers): print("Processing:", paper_id) paper.set_paper(paper_value) - result = Result(self.explanation) + result = Result(self.explanation, self.get_weights) # Passing paper and actioning the classifier if self.modules in ('syntactic','both'): synt_module.set_paper(paper) result.set_syntactic(synt_module.classify_syntactic()) + if self.get_weights: + result.set_syntactic_topics_weights(synt_module.get_syntactic_topics_weights()) if self.explanation: result.dump_temporary_explanation(synt_module.get_explanation()) if self.modules in ('semantic','both'): sema_module.set_paper(paper) result.set_semantic(sema_module.classify_semantic()) + if self.get_weights: + result.set_semantic_topics_weights(sema_module.get_semantic_topics_weights()) if self.explanation: result.dump_temporary_explanation(sema_module.get_explanation()) + postprocess.set_result(result) result = postprocess.filtering_outliers() diff --git a/cso_classifier/config.ini b/cso_classifier/config.ini index 5ddae79..93d389e 100644 --- a/cso_classifier/config.ini +++ b/cso_classifier/config.ini @@ -1,5 +1,5 @@ [classifier] -classifier_version = 3.1 +classifier_version = 3.2 package_name = cso-classifier [ontology] diff --git a/cso_classifier/postprocmodule.py 
b/cso_classifier/postprocmodule.py index f675d61..283c5f3 100644 --- a/cso_classifier/postprocmodule.py +++ b/cso_classifier/postprocmodule.py @@ -25,6 +25,7 @@ def __init__(self, model = None, cso = None, **parameters): self.list_of_topics = list() self.enhancement = parameters["enhancement"] if "enhancement" in parameters else "first" #defines the type of enhancement self.delete_outliers = parameters["delete_outliers"] if "delete_outliers" in parameters else True + self.get_weights = parameters["get_weights"] if "get_weights" in parameters else False if "result" in parameters: self.result = parameters["result"] # the result object @@ -233,6 +234,10 @@ def filtering_outliers(self): self.result.set_semantic(list(set(self.result.get_semantic()).intersection(selected_topics_set))) self.result.set_union(selected_topics) self.result.set_enhanced(self.cso.climb_ontology(selected_topics, self.enhancement)) + if self.get_weights: + self.result.set_syntactic_topics_weights({topic:val for topic, val in self.result.get_syntactic_topics_weights().items() if topic in selected_topics_set}) + self.result.set_semantic_topics_weights({topic:val for topic, val in self.result.get_semantic_topics_weights().items() if topic in selected_topics_set}) + else: self.result.set_enhanced(self.cso.climb_ontology(self.result.get_union(), self.enhancement)) diff --git a/cso_classifier/result.py b/cso_classifier/result.py index 98c2bb0..b63c463 100644 --- a/cso_classifier/result.py +++ b/cso_classifier/result.py @@ -1,19 +1,25 @@ class Result: """ A simple abstraction layer for retrieving the results """ - def __init__(self, explanation = False): + def __init__(self, explanation = False, get_weights=False): """ Initialising the ontology class """ self.syntactic = list() self.semantic = list() self.union = list() self.enhanced = list() + self.result_attr = ('syntactic', 'semantic', 'union', 'enhanced') + self.explanation_flag = explanation if self.explanation_flag: self.explanation = dict() - 
self.result_attr = ('syntactic', 'semantic', 'union', 'enhanced', 'explanation') - else: - self.result_attr = ('syntactic', 'semantic', 'union', 'enhanced') + self.result_attr += ('explanation',) + + self.get_weights = get_weights + if self.get_weights: + self.syntactic_weights = dict() + self.semantic_weights = dict() + self.result_attr += ('syntactic_weights','semantic_weights',) def get_dict(self): @@ -77,6 +83,30 @@ def __merge(self): """ Function that fills the union object """ self.union = list(set(self.syntactic + self.semantic)) + + + def set_syntactic_topics_weights(self, syntactic_weights): + """ Sets the syntactic_weights variable + """ + self.syntactic_weights = syntactic_weights + + + def get_syntactic_topics_weights(self): + """ Gets the syntactic_weights variable + """ + return self.syntactic_weights + + + def set_semantic_topics_weights(self, semantic_weights): + """ Sets the semantic_weights variable + """ + self.semantic_weights = semantic_weights + + + def get_semantic_topics_weights(self): + """ Gets the semantic_weights variable + """ + return self.semantic_weights def dump_temporary_explanation(self, temporary_explanation): diff --git a/cso_classifier/semanticmodule.py b/cso_classifier/semanticmodule.py index bf10561..33c1dbc 100644 --- a/cso_classifier/semanticmodule.py +++ b/cso_classifier/semanticmodule.py @@ -20,6 +20,7 @@ def __init__(self, model = None, cso = None, fast_classification = True, paper = self.min_similarity = 0.90 #Initialises the min_similarity self.fast_classification = fast_classification # if will use the full model or not self.explanation = dict() + self.extracted_topics = dict() # dictionary with the extract topics (including similarity measures) def set_paper(self, paper): @@ -74,10 +75,23 @@ def classify_semantic(self): found_topics, explanation = self.__find_topics(self.paper.get_semantic_chunks()) ##################### Ranking - final_topics = self.__rank_topics(found_topics, explanation) + self.extracted_topics = 
self.__rank_topics(found_topics, explanation) + + final_topics = list(self.extracted_topics.keys()) return final_topics + def get_semantic_topics_weights(self): + """Function that returns the full set of topics with the similarity measure + + Args: + + + Returns: + extracted_topics (dictionary): containing the found topics with their metric. + """ + return self.extracted_topics #they are already in the correct format. + def __find_topics(self, concepts): """Function that identifies topics starting from the ngram forund in the paper @@ -259,7 +273,7 @@ def __rank_topics(self, found_topics, explanation): explanation (dictionary): contains information about the explanation of topics Returns: - final_topics (list): list of final topics + final_topics (dictionary): dictionary of final topics """ max_value = 0 scores = [] @@ -335,8 +349,8 @@ def __rank_topics(self, found_topics, explanation): except IndexError: knee = len(sort_t) - final_topics = [] - final_topics = [self.cso.get_topic_wu(sort_t[i][0]) for i in range(0,knee)] + + final_topics = {self.cso.get_topic_wu(sort_t[i][0]):(sort_t[i][1]/max_value) for i in range(0,knee)} self.reset_explanation() self.explanation = {self.cso.topics_wu[sort_t[i][0]]: explanation[sort_t[i][0]] for i in range(0,knee)} diff --git a/cso_classifier/syntacticmodule.py b/cso_classifier/syntacticmodule.py index 897f752..e0ac9a4 100644 --- a/cso_classifier/syntacticmodule.py +++ b/cso_classifier/syntacticmodule.py @@ -19,6 +19,8 @@ def __init__(self, cso = None, paper = None): self.min_similarity = 0.90 # Value of minimum similarity self.paper = paper # the paper object self.explanation = dict() # the explanation dictionary + self.extracted_topics = dict() # dictionary with the extract topics (including similarity measures) + def set_paper(self, paper): @@ -68,17 +70,36 @@ def classify_syntactic(self): Returns: - found_topics (dictionary): containing the found topics with their similarity and the n-gram analysed. 
+ final_topics (list): containing the list of final topics. """ final_topics = list() # analysing similarity with terms in the ontology - extracted_topics = self.__statistic_similarity() + self.extracted_topics = self.__statistic_similarity() # stripping explanation - final_topics = self.__strip_service_fields(extracted_topics) + final_topics = self.__strip_service_fields(self.extracted_topics) return final_topics + def get_syntactic_topics_weights(self): + """Function that returns the full set of topics with the similarity measure (weights) + + Args: + + + Returns: + weights (dictionary): containing the found topics with their similarity and the n-gram analysed. + """ + weights = dict() + for topic, sim_values in self.extracted_topics.items(): + if len(sim_values) == 1: + weights[topic] = sim_values[0]["similarity"] + else: + weights[topic] = max([sim_value["similarity"] for sim_value in sim_values]) + + return weights + + def __statistic_similarity(self): """Function that finds the similarity between the previously extracted concepts and topics in the ontology diff --git a/output.json b/output.json index 16fa94e..410bb8c 100644 --- a/output.json +++ b/output.json @@ -1,76 +1,73 @@ { "syntactic": [ - "data mining", - "micro-blog", - "social networks", - "data privacy", - "sensitive informations", - "anonymity", + "real-world networks", "anonymization", "network topology", + "data privacy", + "social networks", + "privacy", "twitter", - "real-world networks", "graph theory", "online social networks", - "privacy" - ], - "semantic": [ + "anonymity", "data mining", "micro-blog", - "privacy", - "social networks", - "data privacy", - "anonymity", + "sensitive informations" + ], + "semantic": [ "anonymization", "network topology", + "topology", + "data privacy", + "social networks", + "privacy", "twitter", "graph theory", "online social networks", - "topology" + "anonymity", + "data mining", + "micro-blog" ], "union": [ - "data mining", - "micro-blog", - "privacy", - 
"social networks", - "data privacy", - "sensitive informations", - "anonymity", + "real-world networks", "anonymization", "network topology", + "topology", + "data privacy", + "social networks", + "privacy", "twitter", - "real-world networks", "graph theory", "online social networks", - "topology" + "anonymity", + "data mining", + "micro-blog", + "sensitive informations" ], "enhanced": [ - "computer science", - "computer security", - "world wide web", - "access control", - "network security", - "authentication", + "complex networks", "privacy preserving", "computer networks", + "world wide web", + "computer security", "social media", - "complex networks", "theoretical computer science", - "online systems" + "online systems", + "authentication", + "network security", + "computer science", + "access control" ], "explanation": { "social networks": [ - "social-network", + "real-world networks", "social networks", - "social networking", - "microblogging service", "twitter", - "real-world networks", + "social-network", "online social networks", - "microblogging", "social network", - "twitter graph", - "anonymous twitter" + "microblogging", + "social networking" ], "online social networks": [ "online social networks", @@ -87,95 +84,80 @@ ], "privacy": [ "anonymous", - "data privacy", "anonymity", - "privacy", - "sensitive information" + "sensitive information", + "data privacy", + "privacy" ], "anonymization": [ "anonymization" ], "anonymity": [ - "anonymity", - "anonymous" + "anonymous", + "anonymity" ], "real-world networks": [ "real-world networks" ], "twitter": [ - "microblogging service", - "microblogging", "twitter graph", "anonymous twitter", + "microblogging", + "microblogging service", "twitter" ], "micro-blog": [ - "microblogging service", - "microblogging", "twitter graph", "anonymous twitter", + "microblogging", + "microblogging service", "twitter" ], "network topology": [ - "topology", - "network topology" + "network topology", + "topology" ], "data privacy": 
[ - "privacy", - "data privacy" + "data privacy", + "privacy" ], "graph theory": [ "graph theory" ], "topology": [ - "topology", - "network topology" + "network topology", + "topology" ], - "computer science": [ - "data mining", - "mining", - "data-mining" + "complex networks": [ + "real-world networks" ], - "computer security": [ - "anonymity", - "privacy", - "data privacy" + "privacy preserving": [ + "anonymization" + ], + "computer networks": [ + "network topology", + "topology" ], "world wide web": [ + "real-world networks", + "social networks", + "twitter", "social-network", "online social networks", - "social networks", "social network", + "microblogging", "social networking" ], - "access control": [ - "sensitive information" - ], - "network security": [ + "computer security": [ + "anonymous", "anonymity", "sensitive information", - "anonymous" - ], - "authentication": [ - "anonymity", - "anonymous" - ], - "privacy preserving": [ - "anonymization" - ], - "computer networks": [ - "topology", - "network topology" + "data privacy", + "privacy" ], "social media": [ - "twitter", - "microblogging service", "microblogging", - "twitter graph", - "anonymous twitter" - ], - "complex networks": [ - "real-world networks" + "twitter" ], "theoretical computer science": [ "graph theory" @@ -184,6 +166,52 @@ "online social networks", "social networks", "social network" + ], + "authentication": [ + "anonymous", + "anonymity" + ], + "network security": [ + "anonymous", + "anonymity", + "sensitive information" + ], + "computer science": [ + "data mining", + "mining", + "data-mining" + ], + "access control": [ + "sensitive information" ] + }, + "syntactic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "sensitive informations": 0.9545454545454546, + "data mining": 1.0, + "privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "real-world networks": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "network topology": 1.0, + "data privacy": 1.0, + 
"graph theory": 1.0 + }, + "semantic_weights": { + "social networks": 1.0, + "online social networks": 1.0, + "data mining": 1.0, + "privacy": 1.0, + "data privacy": 1.0, + "anonymization": 1.0, + "anonymity": 1.0, + "twitter": 1.0, + "micro-blog": 1.0, + "topology": 1.0, + "network topology": 1.0, + "graph theory": 1.0 } } \ No newline at end of file