def run_cso_classifier_batch_mode(papers, workers = 1, modules = "both", enhancement = "first"):
    """Function that runs the CSO Classifier in *BATCH MODE*, optionally in multiprocessing.

    It takes as input a set of papers, which include abstract, title, and keywords,
    and for each one of them returns a list of relevant concepts from CSO.
    The corpus is split into at most ``workers`` chunks; each chunk is handed to
    ``run_cso_classifier_batch_model_single_worker`` and the per-chunk results are
    merged into a single dictionary.

    Args:
        papers (dictionary): contains the metadata of the papers, e.g., for each paper,
            there is title, abstract and keywords
            {"id1":{"title": "","abstract": "","keywords": ""},"id2":{"title": "","abstract": "","keywords": ""}}.
        workers (integer): number of worker processes. With 1 the papers are classified
            in the calling process; with more than 1 a multiprocessing pool is used.
            Default = 1.
        modules (string): either "syntactic", "semantic" or "both" to determine which
            modules to use when classifying. "syntactic" enables only the syntactic module.
            "semantic" enables only the semantic module. Finally, with "both" the classifier
            takes advantage of both the syntactic and semantic modules. Default = "both".
        enhancement (string): either "first", "all" or "no". With "first" the CSO classifier
            returns only the topics one level above. With "all" it returns all topics above
            the resulting topics. With "no" the CSO Classifier does not provide any
            enhancement. Default = "first".

    Returns:
        class_res (dictionary): the union of the dictionaries returned by the single-worker
            function for every chunk (presumably keyed by paper id -- confirm against
            run_cso_classifier_batch_model_single_worker). Empty if ``papers`` is empty.

    Raises:
        ValueError: if ``modules`` or ``enhancement`` is not one of the accepted values,
            or if ``workers`` is not an integer greater than or equal to 1.
    """

    if modules not in ("syntactic", "semantic", "both"):
        raise ValueError("Error: Field modules must be 'syntactic', 'semantic' or 'both'")

    if enhancement not in ("first", "all", "no"):
        raise ValueError("Error: Field enhances must be 'first', 'all' or 'no'")

    # Check the type BEFORE comparing: otherwise a non-numeric value (e.g. "2")
    # fails with a TypeError on `<` instead of the intended ValueError.
    # bool is a subclass of int, so it is rejected explicitly.
    if isinstance(workers, bool) or not isinstance(workers, int):
        raise ValueError("Error: Number of workers must be integer")

    if workers < 1:
        raise ValueError("Error: Number of workers must be equal or greater than 1")

    # Nothing to classify: avoid computing a chunk size of 0 and starting a pool.
    if not papers:
        return {}

    # Single worker: classify in the calling process, no pool needed.
    if workers == 1:
        return run_cso_classifier_batch_model_single_worker(papers, modules = modules, enhancement = enhancement)

    # Split the corpus so that there are at most `workers` chunks.
    # NOTE(review): misc.chunks is assumed to yield sub-dictionaries of at most
    # chunk_size papers each -- confirm against classifier/misc.
    chunk_size = math.ceil(len(papers) / workers)
    papers_list = list(misc.chunks(papers, chunk_size))
    annotate = partial(run_cso_classifier_batch_model_single_worker, modules = modules, enhancement = enhancement)

    with Pool(workers) as p:
        result = p.map(annotate, papers_list)

    # Merge the per-chunk dictionaries into one result.
    return {k: v for d in result for k, v in d.items()}