frequency.py
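
# Word-frequency report for a Toaq corpus: counts how many corpus lines each word
# appears in, cross-references the official dictionary, and prints three reports
# (official words by frequency, unofficial words used ≥4 times, and unofficial
# words used by ≥3 speakers).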
import csv
import datetime
import re
import sys
import unicodedata
import json
from collections import Counter, defaultdict
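

# Normalize corpus text: drop known noise strings and spoiler-tagged (||…||) text,
# split words at tone-marked syllable boundaries, strip tone marks, lowercase, and
# fold letter variants (ı → i, ȷ → j, w/v/vy/y → ꝡ).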
def cleanup(corpus):
    corpus = corpus.replace("Mảı: ", "")
    corpus = corpus.replace("adefinite Magı", "")
    corpus = corpus.replace("Hỏaqgīo (targets 2584/4360)", "")
    corpus = unicodedata.normalize("NFD", corpus)
    corpus = re.sub(r"([aeiıou](?:q|m(?![aeiıou]))?)([bcdfghjklmnprstꝡz]+[aeiıou])([\u0300\u0301\u0302\u0308])", lambda m: m[1] + "- " + m[2], corpus)
    corpus = re.sub(r"\u0323", "- ", corpus)
    corpus = re.sub(r"[\u0300-\u036f]", "", corpus).lower().replace("ı", "i").replace("ȷ", "j")
    corpus = re.sub(r"\|\|[^|]+\|\|", "", corpus)
    corpus = re.sub("w|vy?|y", "ꝡ", corpus)
    corpus = corpus.replace("ke huoicia mi inari", "")
    corpus = corpus.replace("lu rara puefuq ke pohoa shiaq", "")
    return corpus
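

# Start from the text-file corpus; recent chat lines from the CSV are appended below.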
with open("toaq-corpus.txt") as f:
corpus = [f.read()]
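
# For each word form, the set of speakers who have used it (for the "≥3 speakers" report).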
who_said = defaultdict(set)
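
# One Toaq word: one or more syllables (optional consonant or ch/sh/nh onset, vowels,
# optional q/m coda), optionally ending at a "-" inserted by cleanup().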
TOAQ_WORD = r"\b(?:(?:[bcdfghjklmnprstꝡz]?|ch|sh|nh)[aeiou]+(?:q|m(?![aeiıou]))?)+(?:-|\b)"
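
# Rows of toaq-only.csv are unpacked as (id, speaker, date, text, ...); only messages
# dated 2022-12-06 or later are added to the corpus.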
with open("toaq-only.csv") as f:
for row in csv.reader(f):
if row[2] == "Date":
continue
_id, speaker, date, line, *_ = row
date = datetime.datetime.strptime(date, "%m/%d/%Y %I:%M %p")
if (date.year, date.month, date.day) >= (2022, 12, 6):
corpus.append(line)
for word in re.findall(TOAQ_WORD, line):
who_said[word].add(speaker)
# corpus = cleanup(open("toaq-corpus.txt").read())
corpus = cleanup("\n".join(corpus))
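
# Count, for each word, the number of corpus lines it occurs in (at most once per line).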
ctr = Counter()
for line in corpus.split("\n"):
    wds = re.findall(TOAQ_WORD, line)
    for w in set(wds):
        ctr[w] += 1

freq = list(ctr.most_common())
freq.sort(key=lambda x: (-x[1], x[0]))
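
# Map each single-word dictionary headword (after cleanup) to its gloss.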
gloss = {}
dic = json.load(open("dictionary/dictionary.json"))
for y in dic:
    lemma = cleanup(y['toaq'])
    if ' ' in lemma:
        continue
    if lemma in gloss and lemma != y['toaq']:
        continue
    gloss[lemma] = y['gloss']
official = set(gloss.keys())
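
# Report 1: every official word by corpus frequency, including words never seen (count 0).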
print("== Official words by frequency in the corpus ===")
for word, n in freq:
    if word in official:
        print(n, word, '('+gloss[word]+')')
for word in sorted(set(official) - set(ctr)):
    print(0, word, '('+gloss[word]+')')
print()
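
# Report 2 skips a small hand-written list of excluded word forms.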
print("== Unofficial words used ≥4 times in the corpus ===")
for word, n in freq:
    if word not in official and n >= 4 and word not in "liqjiao chichie ziamia geojiao loeshuao doidote".split():
        print(n, word)
print()
print("== Unofficial words used by ≥3 speakers ===")
for k, v in who_said.items():
    if k not in official and len(v) >= 3:
        print(k, len(v))