-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstongs.py
184 lines (159 loc) · 6.06 KB
/
stongs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""
deli->cheese->blue cheeses (class 12, product group 35, product id 138)
deli->crackers->Gone Crackers (class 12, product group 61, product id 522)
dairy->cheese->baking cheese (class 11, product group id 34, product id 1548)
Category structure is:
Top-level category (class id)
-> Sub-category (product group id)
-> Brand category (product id)
-> product (id?)
"""
import logging
log = logging.getLogger("stongs")
import requests
import pickle
import os
import re
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3.x (Python 2 era API)
# Cookie jar path.  NOTE(review): COOKIEFILE appears unused in this file -- confirm before removing.
COOKIEFILE = '/tmp/stongcookies.lwp'
# Store front page; it embeds the category arrays as inline JavaScript.
INDEX_URL = "http://www.stongs.com/index.cfm?fuseaction=content&page_id=1"
# Order page template, filled with (class id, product group id, product id).
PRODUCTS_URL = "http://www.stongs.com/index.cfm?fuseaction=order&class=%s&product_group_id=%s&product_id=%s"
# On-disk pickle of the in-memory cache (see load_cache / _persist_cache).
CACHEFILE = '/tmp/stongs.cache'
# In-memory cache; key "cached_pages" maps URL -> fetched page body.
cache = {}
class ParseError(Exception): pass
class Category(object):
    """A node in the store's category tree.

    Levels are: top-level class -> product group -> brand category.
    Each node records its parent so slug paths can be rebuilt by
    walking upward.
    """

    def __init__(self, name, id, parent=None):
        self.name = name        # display name scraped from the site
        self.id = id            # site-side id (kept as the scraped string)
        self.children = []      # child Category nodes, in document order
        self.parent = parent    # enclosing Category, or None at the top level

    def __str__(self):
        return self.name

    def __repr__(self):
        return str(self)

    def slugified(self):
        """Return the name lowercased with spaces and slashes as underscores."""
        slug = self.name.lower()
        for ch in (" ", "/"):
            slug = slug.replace(ch, "_")
        return slug

    def as_url(self):
        """Return the slash-joined slug path from the root down to this node."""
        return "/".join(node.slugified() for node in self.get_parents())

    def find_child(self, child_name):
        """Return the first child whose slug equals *child_name*, else None."""
        matches = [c for c in self.children if c.slugified() == child_name]
        return matches[0] if matches else None

    def get_parents(self):
        """Return the chain of Categories from the root down to (and including) self."""
        chain = []
        node = self
        while node:
            chain.append(node)
            node = node.parent
        chain.reverse()
        return chain
def _get_page(url):
    """Return the body of *url*, serving from the in-memory cache when possible.

    Fetched bodies are stored in cache["cached_pages"] keyed by URL, so repeat
    lookups within a run (and across runs, via the pickled cache) skip the
    network.
    """
    cached_pages = cache.setdefault("cached_pages", {})
    # TODO: Add expiration times
    if url in cached_pages:
        return cached_pages[url]
    log.info("Opening %s", url)  # lazy %-args: formatted only if INFO is enabled
    r = requests.get(url)
    # setdefault already installed cached_pages inside `cache`, so mutating it
    # is enough -- the old `cache["cached_pages"] = cached_pages` reassignment
    # was redundant and has been dropped.
    cached_pages[url] = r.content
    return r.content
def _persist_cache():
    """Pickle the module-level cache dict to CACHEFILE on disk."""
    log.info("Saving cache")
    # Pickle data is binary: "wb" (not "w") avoids newline translation
    # corrupting the stream on some platforms, and the context manager
    # guarantees the file is closed even if dump() raises.
    with open(CACHEFILE, "wb") as f:
        pickle.dump(cache, f)
def load_cache():
    """Replace the module-level cache with the pickled copy, if one exists.

    No-op when CACHEFILE is absent (e.g. first run).
    """
    global cache
    if os.path.exists(CACHEFILE):
        log.info("Loading cache")
        # Pickle data is binary: "rb" (not "r") matches the writer and is
        # required for correctness; `with` guarantees the handle is closed.
        with open(CACHEFILE, "rb") as f:
            cache = pickle.load(f)
def get_category_list():
category_list = []
page = _get_page(INDEX_URL)
category_id_match = re.search("classes = Array\((.*?)\)\;", page)
category_match = re.search("classes_names = Array\((.*?)\)\;", page)
if not category_match:
raise ParseError("Could not find top-level category list")
cat_data = category_match.groups(0)[0]
cat_id_data = category_id_match.groups(0)[0]
categories = [c.replace("'", "").replace("\\", "") for c in cat_data.split("','")]
category_ids = cat_id_data.split(",")
for name, id in zip(categories, category_ids):
category = Category(name, id)
category_list.append(category)
group_list = []
groups_id_match = re.search("groups_class = Array\((.*?)\)\;", page)
groups_match = re.search("groups_names = Array\((.*?)\)\;", page)
if not groups_match:
raise ParseError("Could not find groups list")
groups_data = groups_match.groups(0)[0]
groups_id_data = groups_id_match.groups(0)[0]
groups = [c.replace("'", "").replace("\\", "") for c in groups_data.split("','")]
group_ids = groups_id_data.split("','")
for idx, (name_list, id_list) in enumerate(zip(groups, group_ids)):
parent_cat = category_list[idx]
for name, id in zip(name_list.split("|"), id_list.split(",")):
category = Category(name, id, parent=parent_cat)
parent_cat.children.append(category)
group_list.append(category)
products_id_match = re.search("products_groups = Array\((.*?)\)\;", page)
products_match = re.search("products_groups_names = Array\((.*?)\)\;", page)
if not products_match:
raise ParseError("Could not find products list")
products_data = products_match.groups(0)[0]
products_id_data = products_id_match.groups(0)[0]
products = [c.replace("'", "").replace("\\", "") for c in products_data.split("','")]
products_ids = products_id_data.split("','")
for idx, (name_list, id_list) in enumerate(zip(products, products_ids)):
parent_cat = group_list[idx]
for name, id in zip(name_list.split("|"), id_list.split(",")):
category = Category(name, id, parent=parent_cat)
parent_cat.children.append(category)
return category_list
def get_products(category):
    """Fetch and parse the product listing page for a brand-level *category*.

    Walks two parents up to recover the class id and product-group id that
    the order URL requires.  Returns a list of
    (title, price, product_id, url) tuples; price is the string "NA" when
    the page shows no text in the price cell.
    """
    class_id = category.parent.parent.id
    group_id = category.parent.id
    url = PRODUCTS_URL % (class_id, group_id, category.id)

    soup = BeautifulSoup(_get_page(url))
    order_form = soup.find("form", {"id": "orderForm"})
    listing = order_form.parent  # the table wrapping the order form

    products = []
    for row in listing.findAll("tr", {"valign": "top"}):
        cells = row.findAll("td")
        title = cells[1].b.string
        raw_price = cells[2].string
        price = raw_price.strip() if raw_price else "NA"
        # The quantity input's id encodes the product id as "sub_<id>".
        product_id = int(cells[3].input['id'].replace("sub_", ""))
        products.append((title, price, product_id, url))
    return products
if __name__ == "__main__":
    # Demo driver: configure logging, warm the page cache, then walk one
    # branch of the category tree (deli -> cheese -> first brand).
    logging.basicConfig()
    # Build the Formatter once and apply it to every root handler.  (The
    # original also created a second, unused `formatter` -- removed.)
    fmt = logging.Formatter("%(asctime)s %(name)-8s %(levelname)-7s - %(message)s")
    for handler in logging.getLogger().handlers:
        handler.setFormatter(fmt)
    logging.getLogger().setLevel(logging.INFO)

    load_cache()
    cat_list = get_category_list()

    deli = cat_list[12]
    print(deli)
    print(deli.children)
    cheese = deli.children[0]
    print(cheese)
    print(cheese.children)
    asiago = cheese.children[0]
    print(get_products(asiago))

    _persist_cache()