-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreader.py
58 lines (44 loc) · 1.99 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
import csv
import gzip
import json
import marisa_trie
from utils import file_path
class Reader(object):
    """Join two gzipped, tab-separated dumps and persist the result as a trie.

    The ``fi`` file supplies one record per ``id`` (zefix number, uid and
    address parts); the ``fb`` file supplies names that point at those
    records through the same ``id`` column.  ``run`` stores one trie entry
    per matching name: the lower-cased name is the key and the JSON-encoded
    record (including the name in its original casing) is the value.
    """

    def __init__(self, fb_file, fi_file):
        """Remember both input paths and the column layout of each dump.

        :param fb_file: path of the gzipped file parsed with ``fb_fieldnames``.
        :param fi_file: path of the gzipped file parsed with ``fi_fieldnames``.
        """
        self.fb_file = fb_file
        self.fi_file = fi_file
        # Column names are passed to csv.DictReader explicitly, so the first
        # line of each file is treated as data rather than as a header.
        self.fb_fieldnames = [
            'id', 'i', 'j', 'lang', 'v', 'name', 'start', 'end'
        ]
        self.fi_fieldnames = [
            'id', 'zefix', 'rechts_form_id', 'firma', 'regd_office_nr',
            'regd_office_name', 'commercial_regd_no', 'capital', 'currency_id',
            'status_id', 'close_date', 'open_date', 'shab_nr', 'shab_site', 'mutation_id',
            'mutation_date', 'shab_seq', 'address', 'care_of', 'strasse', 'house_no',
            'addr_additional', 'post_box', 'zip', 'place_name', 'purpose', 'uid'
        ]

    def run(self):
        """Read both dumps, build the trie and save it to ``file_path('zefix.ds')``."""
        records = self._load_records()
        entries = self._collect_entries(records)
        marisa_trie.BytesTrie(entries).save(file_path('zefix.ds'))

    def _load_records(self):
        """Return a dict mapping each ``id`` of the fi file to its record."""
        records = {}
        with gzip.open(self.fi_file, 'r') as handle:
            # NOTE(review): csv keeps its default quotechar ('"') here, so a
            # field that starts with a double quote is parsed as quoted --
            # confirm the dumps never contain such fields.
            rows = csv.DictReader(handle, fieldnames=self.fi_fieldnames, delimiter='\t')
            for row in rows:
                # A later row with the same id replaces the earlier one.
                records[row['id']] = {
                    'zefix': row['zefix'].strip(),
                    'uid': row['uid'].strip(),
                    'house_no': format_unicode(row['house_no']),
                    'street': format_unicode(row['strasse']),
                    'zip': row['zip'],
                    'place': format_unicode(row['place_name']),
                }
        return records

    def _collect_entries(self, records):
        """Return ``(lower-cased name, JSON record)`` pairs for the fb file.

        Rows whose ``id`` is not present in ``records`` are skipped.
        """
        entries = []
        with gzip.open(self.fb_file, 'r') as handle:
            rows = csv.DictReader(handle, fieldnames=self.fb_fieldnames, delimiter='\t')
            for row in rows:
                record = records.get(row['id'])
                if record is not None:
                    name = format_unicode(row['name'])
                    # The shared record is updated in place and serialised
                    # right away, so every name of the same id gets its own
                    # JSON snapshot carrying that particular name.
                    record['name'] = name
                    entries.append((name.lower(), json.dumps(record)))
        return entries
def format_unicode(string):
    """Strip surrounding whitespace and return the value as decoded text.

    Byte strings are decoded as ISO-8859-1.  Text that is already decoded is
    returned stripped instead of raising, and the ``unicode`` builtin is
    avoided because it only exists on Python 2.

    :param string: byte string (ISO-8859-1 encoded) or text to clean up.
    :returns: the stripped value as a text (unicode) string.
    :raises AttributeError: if ``string`` has no ``strip`` method (e.g. None).
    """
    stripped = string.strip()
    # On Python 2 ``bytes`` is ``str``, so this branch handles the same
    # inputs the former ``unicode(..., 'ISO-8859-1')`` call decoded.
    if isinstance(stripped, (bytes, bytearray)):
        return stripped.decode('ISO-8859-1')
    return stripped