-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathextract.py
28 lines (25 loc) · 1006 Bytes
/
extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Download random Wikipedia articles using the Python `wikipedia` library
# and perform some basic preprocessing.
# Requires a reliable internet connection and may take 30 minutes.
import wikipedia
import urllib2
# Language codes handed to wikipedia.set_lang(); one corpus file is written
# per code under testcorpus/.
languages = ['el', 'eo', 'en', 'zh', 'vi', 'ca', 'it', 'cs', 'ar', 'fi', 'eu', 'et', 'gl', 'id', 'es', 'ru', 'pt', 'no', 'tr', 'lt', 'vo', 'th', 'ro', 'pl', 'fr', 'bg', 'uk', 'sl', 'hr', 'de', 'ko', 'hu', 'fa', 'hi', 'nl', 'da', 'ja', 'he', 'ka', 'nn', 'sv', 'mk', 'sk', 'ms', 'sr']

for code in languages:
    # Switch the wikipedia client to this language edition before sampling.
    wikipedia.set_lang(code)
    # NOTE(review): the library / MediaWiki API may cap how many random
    # titles a single call returns -- confirm that 1000 is actually honoured.
    titles = wikipedia.random(1000)

    content = []
    for title in titles:
        try:
            body = wikipedia.page(title).content
            # Flatten the article onto one line: drop the "==" section
            # heading markers and replace newlines with spaces.
            body = body.replace("==", " ").replace("\n", " ")
            # Store the article before echoing it, so that a console encoding
            # error while printing cannot drop it from the corpus.
            content.append(body)
            print(body)
        except Exception:
            # Best-effort download: skip any page that fails to load or
            # process. Catching Exception (rather than a bare except) keeps
            # Ctrl-C / SystemExit able to stop this long-running script.
            continue

    # Written as UTF-8 bytes in binary mode so the same code runs on Python 2
    # and 3. The testcorpus/ directory is not created here and must already
    # exist. Articles are separated (and the file terminated) by one space.
    with open("testcorpus/" + code, 'wb') as corpus:
        for article in content:
            corpus.write(article.encode('utf8') + b" ")

    # Echo the first stored article for this language as a quick sanity check.
    if content:
        print(content[0])