
Commit

added functionality to work in multithread. Tested on 1212 papers
angelosalatino committed Jun 7, 2019
1 parent b3952ca commit 048a4f5
Showing 1 changed file with 55 additions and 1 deletion.
56 changes: 55 additions & 1 deletion classifier/classifier.py
@@ -15,6 +15,10 @@
from classifier.syntacticmodule import CSOClassifierSyntactic as synt
from classifier.semanticmodule import CSOClassifierSemantic as sema

#other dependencies
import math
from multiprocessing.pool import Pool
from functools import partial

def run_cso_classifier(paper, modules = "both", enhancement = "first"):
"""Function that runs the CSO Classifier. It takes as input the text from abstract, title, and keywords of a research paper
@@ -78,8 +82,58 @@ def run_cso_classifier(paper, modules = "both", enhancement = "first"):
    return class_res


def run_cso_classifier_batch_mode(papers, workers = 1, modules = "both", enhancement = "first"):
"""Function that runs the CSO Classifier in *BATCH MODE* and in multiprocessing.
It takes as input a set of papers, which include abstract, title, and keywords
and for each one of them returns a list of relevant concepts from CSO.
This function requires a dictionary of papers, with each id corresponding to the metadata of a paper, and few flags:
(i) modules, determines whether to run only the syntatcic module, or the semantic module, or both;
(ii) enhancement, controls whether the classifier should infer super-topics, i.e., their first direct super-topics or the whole set of topics up until root.
Args:
papers (dictionary): contains the metadata of the papers, e.g., for each paper, there is title, abstract and keywords {"id1":{"title": "","abstract": "","keywords": ""},"id2":{"title": "","abstract": "","keywords": ""}}.
workers (integer): number of workers. If 1 is in single thread, otherwise multithread
modules (string): either "syntactic", "semantic" or "both" to determine which modules to use when classifying. "syntactic" enables only the syntactis module. "semantic" enables only the semantic module. Finally, with "both" the classifier takes advantage of both the syntactic and semantic modules. Default = "both".
enhances (string): either "first", "all" or "no". With "first" the CSO classifier returns only the topics one level above. With "all" it returns all topics above the resulting topics. With "no" the CSO Classifier does not provide any enhancement.
Returns:
fcso (dictionary): contains the CSO Ontology.
fmodel (dictionary): contains a cache of the model, i.e., each token is linked to the corresponding CSO topic.
"""

    if modules not in ["syntactic", "semantic", "both"]:
        raise ValueError("Error: Field modules must be 'syntactic', 'semantic' or 'both'")

    if enhancement not in ["first", "all", "no"]:
        raise ValueError("Error: Field enhancement must be 'first', 'all' or 'no'")

    # Check the type before comparing, so a non-integer value fails with a clear message.
    if not isinstance(workers, int):
        raise ValueError("Error: Number of workers must be an integer")

    if workers < 1:
        raise ValueError("Error: Number of workers must be equal to or greater than 1")

    # Split the input dictionary into one chunk per worker and classify the chunks in parallel.
    size_of_corpus = len(papers)
    chunk_size = math.ceil(size_of_corpus / workers)
    papers_list = list(misc.chunks(papers, chunk_size))
    annotate = partial(run_cso_classifier_batch_model_single_worker, modules = modules, enhancement = enhancement)

    with Pool(workers) as p:
        result = p.map(annotate, papers_list)

    # Merge the per-chunk results back into a single dictionary keyed by paper id.
    class_res = {k: v for d in result for k, v in d.items()}

    return class_res




def run_cso_classifier_batch_mode(papers, modules = "both", enhancement = "first"):
def run_cso_classifier_batch_model_single_worker(papers, modules = "both", enhancement = "first"):
"""Function that runs the CSO Classifier in *BATCH MODE*.
It takes as input a set of papers, which include abstract, title, and keywords
and for each one of them returns a list of relevant concepts from CSO.
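For readers who want to try the new entry point, here is a minimal usage sketch. It assumes the import path classifier.classifier (the file changed in this commit) and uses made-up paper metadata; the output shape follows the docstring above, i.e., a dictionary keyed by paper id.

from classifier.classifier import run_cso_classifier_batch_mode  # assumed import path matching the changed file

# Illustrative input: two papers keyed by id, each with title, abstract and keywords.
papers = {
    "id1": {"title": "Deep learning for text classification",
            "abstract": "We study neural network architectures for classifying research papers...",
            "keywords": "deep learning, text classification"},
    "id2": {"title": "Ontology-based annotation of scholarly articles",
            "abstract": "We propose a method to annotate papers with concepts from a domain ontology...",
            "keywords": "ontologies, semantic web"},
}

# Classify the batch with two worker processes; the result maps each paper id
# to the CSO topics produced by the syntactic and semantic modules.
results = run_cso_classifier_batch_mode(papers, workers=2, modules="both", enhancement="first")

for paper_id, topics in results.items():
    print(paper_id, topics)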

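Note that the batch splitter above relies on a misc.chunks helper that is not part of this diff. A minimal sketch of what such a helper might look like is given below, assuming it yields successive sub-dictionaries of at most chunk_size entries; the actual implementation elsewhere in the repository may differ.

from itertools import islice

def chunks(dictionary, chunk_size):
    """Yield successive sub-dictionaries with at most chunk_size items each.

    Hypothetical sketch only: the real misc.chunks helper is defined elsewhere
    in the repository and is not shown in this commit.
    """
    iterator = iter(dictionary.items())
    while True:
        block = dict(islice(iterator, chunk_size))
        if not block:
            return
        yield block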