test_file.py
from document import Document, Tweet
from requete import Requete, RequeteArxiv, RequeteTwitter, get_documents_sample
import math
from corpus import Corpus
from networkx.algorithms import community
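# Quick manual test script: it builds a vocabulary and a word graph from sample
# documents, detects communities of words, and prints the candidate expressions
# returned by the corpus (summary inferred from the code below).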
'''
# retrieve Twitter data
req = RequeteTwitter("machine-learning")
documents = req.get_documents()
for i, doc in enumerate(documents):
    print(doc.text)
    print('\n--------------\n')
    if i >= 4:
        break

# retrieve arXiv data
req = RequeteArxiv("machine+learning")
documents = req.get_documents()
for i, doc in enumerate(documents):
    print(doc.text)
    print('\n--------------\n')
    if i >= 4:
        break
'''
# generic documents used for testing
docs = get_documents_sample()
# build the vocabulary
corpus = Corpus(docs)
corpus.build_vocab()
vocab = corpus.vocab
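# Note: judging from the loop below, each vocab entry appears to map a word to a
# structure whose second element holds the ids of the documents containing it
# (assumption based on usage here, not checked against corpus.py).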
# display the vocabulary words
for k, v in vocab.items():
print("mot : " + k)
print("ids des documents d'apparition: ")
print(v[1])
print('---------')
id_to_vocab = corpus.id_to_vocab
bi_grams_cooc = corpus.bi_grams_cooc
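# id_to_vocab seems to map word ids back to word strings, and bi_grams_cooc
# appears to map (word_id, word_id) pairs to a co-occurrence count
# (inferred from how the values are used in the loop below).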
for (k1, k2), cooc in bi_grams_cooc.items():
    w1 = id_to_vocab[k1]
    w2 = id_to_vocab[k2]
    print(w1 + " - " + w2 + " : " + str(cooc))
# build the graph
corpus.build_graph()
g = corpus.graph
print()
print(g)
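# Detect communities of words with NetworkX's greedy modularity maximisation
# (Clauset-Newman-Moore); edge weights are taken into account via weight='weight'.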
communities = list(community.greedy_modularity_communities(g, weight='weight'))
num_communities = len(communities)
print()
print("nombre de communautes : " + str(num_communities))
print("ID des mots de chaque communaute : ")
for colorIndex, comm in enumerate(communities):
    c = list(comm)
    print(c)
    print()
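# find_expressions() presumably extracts the collocations/expressions detected in
# the corpus; its exact criteria live in corpus.py, which is not shown here.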
expressions = corpus.find_expressions()
for exp in expressions:
    print(exp)
    print('------------')