#!/usr/bin/env python
# -*- coding: utf-8 -*-
# publicsuffix.py -- forked from jpwatts/publicsuffix
# Copyright 2010 Watts Lab, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# URL of the raw effective-TLD names data file in Mozilla's source tree.
# It is not referenced by any code visible in this file.
# NOTE(review): the mxr.mozilla.org host appears to have been retired --
# confirm this URL still resolves before relying on it.
EFFECTIVE_TLD_NAMES = 'http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1'
def _normalize(s):
from exceptions import UnicodeEncodeError
s = s.strip().strip('.').split()[0].lower()
try:
return s.decode('utf-8')
except UnicodeEncodeError, ex:
return s
class PrefixNode(object):
    """One label in the reversed-label tree used to find public suffixes.

    Rules are stored TLD first, so each node holds a single domain label
    (e.g. ``'com'``) and its children are the labels that may appear directly
    to the left of it.  The naming is a little confusing: the data structure
    is a prefix tree, but because the labels are reversed the nodes end up
    storing the suffixes of domain names.

    Attributes set by ``__init__``:
        prefix: the node's label without a leading ``'!'`` (``None`` when the
            node was created without a label).
        is_exception: ``True`` when the label was given with a leading ``'!'``.
        children: dict mapping a label to its child ``PrefixNode``.
    """

    def __init__(self, prefix, rule=None):
        """Create the node for *prefix* and optionally a whole branch.

        Args:
            prefix: the label for this node, optionally starting with ``'!'``
                to mark an exception rule, or ``None`` for a label-less root.
            rule: remaining labels of a rule (outermost first).  Passing it
                lets you build an entire branch in one call.
        """
        # Always define the flag, so label-less roots expose it as well.
        self.is_exception = prefix is not None and prefix.startswith('!')
        if self.is_exception:
            prefix = prefix[1:]
        self.prefix = prefix
        self.children = dict()
        if rule is not None:
            self.add(rule)

    def __repr__(self):
        return 'PrefixNode(\'%s\')' % (self.prefix,)

    def add(self, rule):
        """Insert *rule* (a sequence of labels, outermost first) below this node.

        An exception label (``'!name'``) is stored under the sibling wildcard
        node when one already exists, because it is an exception to that
        wildcard.  Children are always keyed by the label without the ``'!'``.
        """
        if not rule:
            return
        label = rule[0]
        is_exception = label.startswith('!')
        if is_exception and '*' in self.children:
            # An exception to a wildcard lives as a child of that wildcard.
            self.children['*'].add(rule)
            return
        key = label[1:] if is_exception else label
        child = self.children.get(key)
        if child is None:
            # Hand the unstripped label to the constructor so the new node
            # records whether it came from an exception rule.
            self.children[key] = PrefixNode(label, rule=rule[1:])
        else:
            # Merge into the existing branch instead of replacing it.
            child.add(rule[1:])

    def match(self, rule):
        """Split *rule* into the matched suffix labels and the domain label.

        *rule* is the host's labels, outermost (TLD) first.  The result is a
        pair ``(labels, name)`` where ``labels`` lists the matched suffix
        labels, each with a leading dot and innermost first, and ``name`` is
        the label found directly before the suffix, e.g.::

            match(('com', 'example', 'www'))  -> (['.com'], 'example')
            match(('com', 'eff-tld', 'site')) -> (['.eff-tld', '.com'], 'site')

        ``name`` is an empty list when the labels run out, i.e. when the host
        is itself a suffix; that sentinel is kept as-is for existing callers.
        """
        if not rule:
            # The host name stops here: a TLD that is also a host name.
            return ([], [])
        if self.prefix == '*':
            return self._match_as_wildcard(rule)
        label = rule[0]
        child = self.children.get(label)
        if child is None:
            if '*' in self.children:
                # No explicit child; let the wildcard decide.
                return self.children['*'].match(rule)
            return ([], label)
        matched, name = child.match(rule[1:])
        # Deeper labels were appended first, so the list reads innermost first.
        matched.append('.' + label)
        return (matched, name)

    def _match_as_wildcard(self, rule):
        """Match *rule* against this wildcard node.

        Any label is part of the suffix unless it is listed among this node's
        children, which hold the exceptions to the wildcard.  For an
        exception the label itself is returned as the domain name.  When the
        label is absorbed and nothing follows it, the name is ``None``.
        """
        label = rule[0]
        if label in self.children:
            # Listed exception: the label is the domain, not part of the suffix.
            return ([], label)
        name = rule[1] if len(rule) > 1 else None
        return (['.' + label], name)
class PrefixTree(PrefixNode):
    """Root of the suffix tree, with a string-based interface on top of it."""

    def __init__(self, seq):
        """Build the tree from *seq*.

        Args:
            seq: an iterable of rules, each a tuple of labels ordered from
                the TLD inwards, e.g. ``('com',)`` or ``('uk', 'co')``.
        """
        # Called explicitly on the base class rather than through super().
        PrefixNode.__init__(self, None)
        for rule in seq:
            self.add(rule)

    def __repr__(self):
        return '<PrefixTree (%s children)>' % (len(self.children),)

    def match(self, s):
        """Match the dotted host name *s* against the tree.

        Returns:
            ``None`` when no suffix label matched, otherwise a pair
            ``(suffix, name)`` where ``suffix`` is the matched suffix with a
            leading dot (e.g. ``'.co.uk'``) and ``name`` is whatever the
            node-level match reported as the label before the suffix.
        """
        rule = tuple(reversed(s.strip().split('.')))
        labels, name = PrefixNode.match(self, rule)
        if not labels:
            return None
        return (''.join(labels), name)

    def domain(self, s):
        """Return the label before the matched suffix joined with that suffix.

        Returns ``None`` when *s* is ``None``, when nothing in the tree
        matches, or when the match carries no string label to put in front
        of the suffix.
        """
        if s is None:
            return None
        match = self.match(_normalize(s))
        if match is None:
            return None
        suffix, name = match
        try:
            return name + suffix
        except TypeError:
            # name is not a string here (the node-level match can report
            # None or an empty list), so there is nothing to join.
            return None
def _tokenize(lines):
rules = []
for s in lines:
if s and not s.isspace() and not s.startswith('//'):
rule = tuple(reversed(_normalize(s).split('.')))
rules.append(rule)
return rules
def _is_ip(address):
parts = address.split(".")
if len(parts) != 4:
return False
for item in parts:
if not item.isdigit() or not 0 <= int(item) <= 255:
return False
return True
# Module-wide suffix tree; it stays None until init_suffix_tree() has run.
suffixtree = None


def init_suffix_tree(tld_file):
    """Call this first to initialize the suffix tree.

    Reads the suffix rules from the file at path *tld_file*, decoding it as
    UTF-8, and stores the resulting ``PrefixTree`` in the module-level
    ``suffixtree``.

    Args:
        tld_file: path of the suffix list file to load.
    """
    global suffixtree
    # io.open takes an explicit encoding on both Python 2 and Python 3, so
    # decoding does not depend on the platform's default encoding.
    import io
    # The with-block closes the file even when tokenizing raises.
    with io.open(tld_file, encoding='utf-8') as fp:
        suffix_rules = _tokenize(fp.readlines())
    suffixtree = PrefixTree(suffix_rules)
def get_root_domain(domain):
    """Return the root domain of the host name *domain*.

    A false value (``None`` or an empty string) gives ``None``.  A host that
    looks like an IPv4 address is returned in normalized form.  Anything else
    is looked up in the module-level ``suffixtree``, which must have been set
    up beforehand, and that lookup's result is returned.
    """
    if domain:
        host = _normalize(domain)
        return host if _is_ip(host) else suffixtree.domain(host)
    return None
if __name__ == '__main__':
    # When the file is executed as a script, run whatever doctests are
    # present in this module's docstrings.
    import doctest
    doctest.testmod()