-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
129 lines (96 loc) · 4.38 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import click
import os
from cube.api import Cube
import text_rank
def extract_sentences(sentences, length, length_pct, clean):
    """Rebuild plain-text sentences and pass them to ``text_rank.extract_sentences``.

    Every item of ``sentences`` is iterated as a sequence of entries exposing a
    ``word`` attribute; the words of one sentence are joined with single spaces.

    Args:
        sentences: Iterable of sentences, each an iterable of entries with a
            ``word`` attribute.
        length: Forwarded as ``summary_length``.
        length_pct: Forwarded as ``summary_length_pct``.
        clean: Forwarded as ``clean_sentences``.

    Returns:
        Whatever ``text_rank.extract_sentences`` returns for the joined
        sentences.
    """
    joined = [' '.join(token.word for token in sentence) for sentence in sentences]
    return text_rank.extract_sentences(
        joined,
        summary_length=length,
        summary_length_pct=length_pct,
        clean_sentences=clean,
    )
def extract_key_phrases(sentences, length, length_pct):
    """Flatten the parsed sentences and pass them to ``text_rank.extract_key_phrases``.

    All entries of all sentences are collected, in order, into one flat list of
    ``(lemma, upos, word)`` tuples read from the attributes of each entry.

    Args:
        sentences: Iterable of sentences, each an iterable of entries with
            ``lemma``, ``upos`` and ``word`` attributes.
        length: Forwarded as ``keywords_length``.
        length_pct: Forwarded as ``keywords_length_pct``.

    Returns:
        Whatever ``text_rank.extract_key_phrases`` returns for the flat list.
    """
    triples = [
        (token.lemma, token.upos, token.word)
        for sentence in sentences
        for token in sentence
    ]
    return text_rank.extract_key_phrases(
        triples,
        keywords_length=length,
        keywords_length_pct=length_pct,
    )
@click.group()
@click.option('--language', default='ro')
@click.pass_context
def cli(ctx, language):
    # Root command group. Documented with comments rather than a docstring
    # because click would display a docstring as the group's help text.
    #
    # Builds a Cube instance, loads the model selected by --language and
    # stores it in the click context object under the 'CUBE' key.
    engine = Cube(verbose=True)
    engine.load(language)

    # Make sure ctx.obj is a dict before writing the shared instance into it.
    ctx.ensure_object(dict)
    ctx.obj['CUBE'] = engine
@cli.command()
@click.argument('filename')
@click.option('--summary_length', default=100, help='Summary word length')
@click.option('--summary_length_pct', default=0.2, help='Summary word length in percentage of total')
@click.option('--clean_sentences', default=True, help='Clean sentences')
@click.pass_context
def summary(ctx, filename, summary_length, summary_length_pct, clean_sentences):
    """Print summary text to stdout."""
    # The docstring above doubles as the click help text, so implementation
    # notes are kept in comments.
    #
    # Reads FILENAME, parses its text with the Cube instance stored in the
    # context under 'CUBE', and prints the result of extract_sentences().
    cube = ctx.obj['CUBE']

    # Decode explicitly as UTF-8, matching extract_all; without an explicit
    # encoding open() uses the locale-dependent default, which can mis-decode
    # non-ASCII text on some platforms.
    with open(filename, encoding='utf-8') as f:
        text = f.read()

    sentences = cube(text)
    summary_text = extract_sentences(sentences, summary_length, summary_length_pct, clean_sentences)
    print(summary_text)
@cli.command()
@click.argument('filename')
@click.option('--keyword_length', default=10, help='Keyword word length')
@click.option('--keyword_length_pct', default=0.1, help='Keyword word length percentage of total')
@click.pass_context
def keywords(ctx, filename, keyword_length, keyword_length_pct):
    """Print key-phrases to stdout."""
    # The docstring above doubles as the click help text, so implementation
    # notes are kept in comments.
    #
    # Reads FILENAME, parses its text with the Cube instance stored in the
    # context under 'CUBE', and prints the result of extract_key_phrases().
    cube = ctx.obj['CUBE']

    # Decode explicitly as UTF-8, matching extract_all; without an explicit
    # encoding open() uses the locale-dependent default, which can mis-decode
    # non-ASCII text on some platforms.
    with open(filename, encoding='utf-8') as f:
        text = f.read()

    sentences = cube(text)
    phrases = extract_key_phrases(sentences, keyword_length, keyword_length_pct)
    print(phrases)
@cli.command()
@click.argument('source')
@click.argument('destination')
@click.option('--summary_length', default=100, help='Summary word length')
@click.option('--summary_length_pct', default=0.2, help='Summary word length in percentage of total')
@click.option('--clean_sentences', default=True, help='Clean sentences')
@click.option('--keyword_length', default=10, help='Keyword word length')
@click.option('--keyword_length_pct', default=0.1, help='Keyword word length percentage of total')
@click.pass_context
def extract_all(ctx, source, destination, summary_length, summary_length_pct, clean_sentences, keyword_length,
                keyword_length_pct):
    """Summarizes and extracts keywords for all files in folder"""
    # The docstring above doubles as the click help text, so implementation
    # notes are kept in comments.
    #
    # For every regular file directly inside SOURCE this writes two files named
    # after the article: DESTINATION/summaries/<article> with the summary text
    # and DESTINATION/keywords/<article> with one key phrase per line.
    cube = ctx.obj['CUBE']

    # List the inputs before creating any output directory so that, when
    # DESTINATION is SOURCE itself, directories created by this run are not
    # picked up as articles.
    articles = os.listdir(source)

    # The output directories do not depend on the article, so create them once.
    # exist_ok=True avoids the check-then-create race of os.path.exists().
    summary_directory = os.path.join(destination, 'summaries')
    keywords_directory = os.path.join(destination, 'keywords')
    os.makedirs(summary_directory, exist_ok=True)
    os.makedirs(keywords_directory, exist_ok=True)

    for article in articles:
        article_path = os.path.join(source, article)
        if not os.path.isfile(article_path):
            # os.listdir also returns sub-directories (e.g. output folders left
            # by a previous run into the same folder); open() would fail on them.
            continue

        print(f'Reading article: {article}')
        with open(article_path, encoding='utf-8') as f:
            text = f.read()
        sentences = cube(text)

        # Extract summary. The local is not called `summary` to avoid shadowing
        # the `summary` command defined in this module.
        summary_text = extract_sentences(sentences, summary_length, summary_length_pct, clean_sentences)
        print(f'Generating output to summaries/{article}')
        with open(os.path.join(summary_directory, article), 'w', encoding='utf-8') as summary_file:
            summary_file.write(summary_text)

        # Extract keywords, one key phrase per output line.
        phrases = extract_key_phrases(sentences, keyword_length, keyword_length_pct)
        print(f'Generating output to keywords/{article}')
        with open(os.path.join(keywords_directory, article), 'w', encoding='utf-8') as keywords_file:
            keywords_file.writelines(f'{key_phrase}\n' for key_phrase in phrases)

    print('Summarization complete')
# Script entry point: invoke the click command group only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    cli()