-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathutil.py
83 lines (65 loc) · 2.55 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import unicodedata
from bs4 import NavigableString
from typing import List
def normalize_str(text: str):
    """Strip diacritics from a string and reduce it to plain ASCII.

    The input is stripped and passed through ``clean_string``, decomposed
    with Unicode NFKD normalization, and finally encoded to ASCII while
    silently dropping every character that has no ASCII representation
    (e.g. the combining accents produced by the decomposition).

    Args:
        text (str): String to be normalized

    Returns:
        bytes: ASCII-encoded, normalized version of the string. Note that
            this is a ``bytes`` object (the result of ``str.encode``),
            not a ``str``.
    """
    cleaned = clean_string(text.strip())
    # NFKD splits accented characters into base letter + combining mark,
    # so the 'ignore' error handler below only discards the marks.
    decomposed = unicodedata.normalize('NFKD', cleaned)
    return decomposed.encode('ASCII', 'ignore')
def clean_string(text: str) -> str:
    """Remove MS Office special characters, dots and redundant whitespace.

    The following transformations are applied:

    * every ``.`` is removed,
    * soft hyphens (``\\xad``) become a regular ``-``,
    * any run of whitespace (spaces, tabs, ``\\r``, ``\\n``, non-breaking
      spaces ``\\xa0``, ...) is collapsed into a single space,
    * leading and trailing whitespace is dropped.

    Args:
        text (str): String to be cleaned

    Returns:
        str: Cleaned string
    """
    # Substitute characters first: removing a dot can leave two spaces next
    # to each other ("a . b" -> "a  b"), so whitespace has to be collapsed
    # afterwards to really guarantee single spacing.
    substituted = text.replace('.', '').replace(u'\xad', u'-')
    # str.split() without arguments splits on every Unicode whitespace
    # character (including '\r', '\n' and '\xa0') and discards empty
    # pieces, which both collapses runs and trims the ends.
    return ' '.join(substituted.split())
# Raw strings that are looked up by exact match (leading spaces included)
# to decide whether a scraped string has to be discarded.
banned_set = {
    # Unclear entry: appears in the dataset, yet this person passed away
    # before the dataset was made. Probably somebody else with the same
    # name, but no information about them could be found. (see #10)
    ' Ramaekers Jef',

    # Recorded as having voted, but was in the senate; no records show him
    # being elected to the House of Representatives.
    ' Collignon Christophe',
    'Collignon Christophe',
    'Christophe Collignon',

    # Was not a member of the House of Representatives.
    ' Annane Jihane',
    'Annane Jihane',
    'Jihane Annane',

    # Fragments caused by formatting issues in
    # https://www.dekamer.be/doc/PCRI/html/52/ip078x.html
    # TODO: solve this in a better way (by using a RegEx).
    '(Ingevolge een technisch mankement werd de stemming van mevrouw Inge Vervotte',
    ' afwezig',
    ' opgenomen)',
    '(A la suite d’une erreur technique',
    ' le vote de Mme Inge Vervotte',
    ' absente',
    '(Om technische redenen is er geen stemming nr 2 / Pour raison technique',
    " il n'y a pas de vote n° 2)",
    '(De heer Guido De Padt heeft gestemd vanop de bank van de heer Ludo Van Campenhout',
    ' afwezig)',
    ' a été enregistré)',

    # Bogus comments left behind in the markup.
    '<![if !supportEmptyParas]> <![endif]>',
}
def is_string_banned(string: str):
    """Tell whether a string is one of the entries of ``banned_set``.

    The comparison is an exact membership test: no stripping or other
    normalization is applied to ``string`` beforehand.

    Args:
        string (str): String to look up

    Returns:
        bool: True if the string is in ``banned_set``, False otherwise
    """
    banned = string in banned_set
    return banned
def is_string_banned_or_empty(string: str):
    """Tell whether a string should be skipped because it is falsy or banned.

    Args:
        string (str): String to check

    Returns:
        bool: True if the string is falsy (e.g. empty or None) or if
            ``is_string_banned`` reports it as banned, False otherwise
    """
    # Falsy values are rejected up front, so the banned lookup only ever
    # runs for non-empty values.
    if not string:
        return True
    return is_string_banned(string)
def clean_list(list: List[str]):
    """Clean every usable string of a list and drop the unusable ones.

    An item is dropped when it is falsy or banned (checked on the raw item
    with ``is_string_banned_or_empty``), or when nothing is left of it after
    it went through ``clean_string`` (e.g. whitespace-only items). The order
    of the remaining items is preserved.

    Args:
        list (List[str]): Raw strings to clean. The parameter name shadows
            the builtin, but it is kept so that existing keyword calls keep
            working.

    Returns:
        List[str]: The cleaned, non-empty strings
    """
    cleaned_items = []
    for item in list:
        # The banned check has to see the raw item: entries of the banned
        # set are stored unprocessed.
        if is_string_banned_or_empty(item):
            continue
        cleaned = clean_string(item)
        # Cleaning can reduce an item to an empty string; such an item
        # carries no information and is dropped as well.
        if cleaned:
            cleaned_items.append(cleaned)
    return cleaned_items
def go_to_p(tag: NavigableString):
    """Walk up the parent chain until an element named "p" is reached.

    The given element itself is checked first, so it is returned unchanged
    when its own ``name`` already equals ``"p"``.

    Args:
        tag (NavigableString): Element to start from

    Returns:
        The first element on the way up (starting at ``tag``) whose
        ``name`` is ``"p"``.

    NOTE(review): there is no stop condition for the case where no ancestor
    is a "p" element; presumably ``parent`` ends up being None at the top of
    the tree and the ``name`` lookup then raises AttributeError - confirm.
    """
    node = tag
    while True:
        if node.name == "p":
            return node
        node = node.parent