cldfbench_wacl.py
import pathlib
import subprocess
import collections

from pycldf import Sources
from cldfbench import Dataset as BaseDataset, CLDFSpec
from clldutils.misc import slug
from clldutils.text import split_text


class Dataset(BaseDataset):
    dir = pathlib.Path(__file__).parent
    id = "wacl"

    def cldf_specs(self):  # A dataset must declare all CLDF sets it creates.
        return CLDFSpec(dir=self.cldf_dir, module='StructureDataset')

    def cmd_download(self, args):
        pass

    def cmd_readme(self, args):
        subprocess.check_call([
            'cldfbench',
            'cldfviz.map',
            str(self.cldf_specs().metadata_path),
            '--parameters', 'CLF',
            '--output', str(self.dir / 'map.jpg'),
            '--width', '20',
            '--height', '10',
            '--format', 'jpg',
            '--pacific-centered'])
        desc = [
            '\n{}'.format(self.cldf_reader().properties['dc:description']),
            '\n![Distribution of classifier languages](map.jpg)\n'
        ]
        pre, head, post = super().cmd_readme(args).partition('## CLDF ')
        return pre + '\n'.join(desc) + head + post
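
    # `cmd_readme` above shells out to `cldfbench cldfviz.map` and reads the built
    # CLDF data via `self.cldf_reader()`, so regenerating the README requires cldfviz
    # to be installed and `cmd_makecldf` below to have been run first. `cmd_makecldf`
    # reads the raw survey table `WACL_v1.csv` and the bibliography `sources.bib` from
    # the dataset's raw/ directory and writes a CLDF StructureDataset with a single
    # binary parameter, `CLF`, recording the presence or absence of numeral
    # classifiers for each language.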
    def cmd_makecldf(self, args):
        values = list(self.raw_dir.read_csv('WACL_v1.csv', delimiter=',', dicts=True))
        args.writer.cldf.properties['dc:description'] = \
            "The database, named World Atlas of Classifier Languages (WACL), has been " \
            "systematically constructed over the last ten years via a manual survey of relevant " \
            "literature and also an automatic scan of digitized grammars followed by manual " \
            "checking. WACL presents a precise definition of numeral classifiers, steps to " \
            "identify a numeral classifier language, and a database of {} languages, of which " \
            "{} languages have been identified as having a numeral classifier system. " \
            "The open-access release of WACL is thus a significant contribution to linguistic " \
            "research in providing (i) a precise definition and examples of how to identify " \
            "numeral classifiers in language data and (ii) the largest dataset of numeral " \
            "classifier languages in the world. As such it offers researchers a rich and stable " \
            "data source for conducting typological, quantitative, and phylogenetic analyses on " \
            "numeral classifiers. The database will also be expanded with additional features " \
            "relating to numeral classifiers in the future in order to allow more fine-grained " \
            "analyses.".format(len(values), sum(1 for r in values if r['CLF'] == 'TRUE'))
        args.writer.cldf.add_component('ParameterTable')
        args.writer.cldf.add_component(
            'LanguageTable',
            'Continent',  # we add more language metadata
        )
        args.writer.cldf['LanguageTable', 'Latitude'].null = ['', 'NA']
        args.writer.cldf['LanguageTable', 'Longitude'].null = ['', 'NA']
        args.writer.cldf['LanguageTable', 'ISO639P3code'].null = ['', 'NA']
        args.writer.cldf.add_component('CodeTable')
        args.writer.objects['ParameterTable'] = [
            {
                'ID': 'CLF',
                'Name': 'Presence/absence of numeral classifiers',
                'Description':
                    'Does the language have sortal classifiers, regardless of '
                    'whether they are optional or obligatory?'
            }]
        args.writer.objects['CodeTable'] = [
            {'ID': 'CLF-1', 'Parameter_ID': 'CLF', 'Name': 'TRUE'},
            {'ID': 'CLF-0', 'Parameter_ID': 'CLF', 'Name': 'FALSE'},
        ]
        codes = {r['Name']: r['ID'] for r in args.writer.objects['CodeTable']}
        # Map glottocodes to the IDs of the sources documenting them, taken from the
        # `glottocode` field of the BibTeX entries in raw/sources.bib.
        l2s = collections.defaultdict(list)
        sources = []
        for src in sorted(
                Sources.from_file(self.raw_dir / 'sources.bib').items(), key=lambda i: i.id):
            for k in ['title', 'author', 'editor']:
                if k in src:
                    src[k] = src[k].replace('{', '').replace('}', '')
            if src.get('glottocode'):
                for code in split_text(src['glottocode'], ';', strip=True):
                    l2s[code].append(src.id)
            sources += [src]
        args.writer.cldf.add_sources(*sources)
        for row in self.raw_dir.read_csv('WACL_v1.csv', delimiter=',', dicts=True):
            lidx = slug(row['glottocode'], lowercase=False)
            args.writer.objects['LanguageTable'].append({
                'ID': lidx,
                'Glottocode': row['glottocode'],
                'Name': row['language_name'],
                'Latitude': row['latitude'],
                'Longitude': row['longitude'],
                'ISO639P3code': row['iso_code'],
                'Continent': row['continent']
            })
            for param in ['CLF']:
                args.writer.objects['ValueTable'].append({
                    "ID": '{}-CLF'.format(lidx),
                    "Value": row[param],
                    "Language_ID": lidx,
                    "Parameter_ID": 'CLF',
                    "Code_ID": codes[row[param]],
                    "Source": l2s.get(row['glottocode'], [])
                })
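

# A minimal usage sketch (an assumption about the surrounding cldfbench setup rather
# than anything this module defines): with cldfbench, pycldf and cldfviz installed,
# the dataset would typically be rebuilt and checked from the command line roughly as
# follows; exact options, such as reference-catalog paths, depend on the local
# configuration.
#
#   cldfbench makecldf cldfbench_wacl.py
#   cldf validate cldf/StructureDataset-metadata.json
#   cldfbench readme cldfbench_wacl.py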