-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwhatsapp.py
99 lines (77 loc) · 3.19 KB
/
whatsapp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os, re, codecs, sys
from pynlpl.formats import folia
from datetime import datetime
# This is Python 3, because that also solves the Unicode problems right away.
def process(l):
    """Split one WhatsApp line into its parts.

    Args:
        l: A line of text whose first three tab-separated fields are the
            date, the author and the message. Trailing newline characters
            are removed before splitting.

    Returns:
        A ``(date, author, message)`` tuple of strings.

    Raises:
        IndexError: If the line has fewer than three tab-separated fields.
    """
    # Split the argument itself rather than a module-level variable, so the
    # function works for any caller. Only newlines are stripped, so a final
    # line without a line terminator does not lose its last character.
    fields = l.rstrip("\n").split("\t")
    date = fields[0]
    author = fields[1]
    message = fields[2]
    return (date, author, message)
# Read in the metadata and build the nickname lookup table.
metafile = "/vol/bigdata/corpora/CMC/Whatsapp2013/Files_gestandaardiseerd/metadata.txt"
# Dictionary (hash): maps the fourth tab-separated column of each metadata
# row to the first column of that row. The conversion code below looks
# message authors up in it to obtain their anonymous version.
users = {}
array = []
# The context manager closes the metadata file as soon as it has been read.
with open(metafile) as meta:
    for line in meta:
        # Strip only the newline, so a final row without a line terminator
        # keeps the last character of its last field.
        array = line.rstrip("\n").split("\t")
        if len(array) < 4:
            # Blank or truncated row: there is no fourth column to use as key.
            continue
        users[array[3]] = array[0]
print(users)
mydir = "/vol/bigdata/corpora/CMC/Whatsapp2013/Files_gestandaardiseerd/"

# A line containing a clock time (digits, a colon, two digits) preceded by at
# least one character is treated as the start of a new message. Raw string so
# that "\d" is a regex escape and not an invalid string escape; compiled once
# because it is applied to every line of every file.
message_start = re.compile(r'.+\d+:\d\d')


def _append_message(doc, text, eventid, actor, begindate, message):
    """Append one chat message to *text* as a FoLiA event.

    The author is looked up in ``users`` and only the anonymous version is
    written to the document. When the author has no (non-empty) entry, the
    message is reported on stdout and skipped, so a real nickname never ends
    up in the output.

    Args:
        doc: The FoLiA document the event belongs to.
        text: The FoLiA text element that receives the event.
        eventid: Identifier for the new event.
        actor: The author's nickname as it appears in the chat file.
        begindate: The date field of the message.
        message: The message text, including any continuation lines.

    Returns:
        True if the event was appended, False if it was skipped.
    """
    anonymous = users.get(actor)
    if not anonymous:
        print("what is wrong", actor)
        return False
    print(anonymous)
    chatevent = folia.Event(doc, id=eventid, actor=anonymous, cls="message", begindatetime=begindate, text=message)
    text.append(chatevent)
    return True


# Convert every standardised WhatsApp file in the directory to a FoLiA document.
for filename in os.listdir(mydir):
    print(filename)
    if not filename.endswith("gestandaardiseerd.txt"):
        continue
    print(filename)
    # Open the WhatsApp file; the context manager closes it after reading.
    with open(os.path.join(mydir, filename)) as f:
        lines = f.readlines()
    docstr = filename
    # Create a FoLiA document that uses the file name as its id.
    doc = folia.Document(id=docstr)
    doc.declare(folia.Event, "hdl:1839/00-SCHM-0000-0000-000A-B")
    # First create a FoLiA text object; the message events are added to it.
    text = doc.append(folia.Text)
    mybegindate = myactor = mymessage = ' '
    messagecounter = 0
    # Read the file line by line.
    for line in lines:
        # Remove byte-order marks left over from decoding.
        line = line.replace('\ufeff', '')
        print(line)
        if message_start.search(line):
            # A new message starts here: write out the previous message
            # (if there is one) before parsing the current line.
            if messagecounter > 0:
                eventid = "text.%s.event.%s" % (docstr, messagecounter)
                _append_message(doc, text, eventid, myactor, mybegindate, mymessage)
            messagecounter += 1
            (mybegindate, myactor, mymessage) = process(line)
        else:
            # No date on this line: it continues the previous message.
            mymessage += "\n"
            mymessage += line
    # Do not forget the last message. It goes through the same anonymising
    # helper as all the others, and is skipped when the file held no
    # messages at all (otherwise a blank placeholder event would be written).
    if messagecounter > 0:
        eventid = "text.%s.event.%s" % (docstr, messagecounter)
        _append_message(doc, text, eventid, myactor, mybegindate, mymessage)
    outfile = docstr + ".folia.xml"
    doc.save(outfile)