-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathurlrss
executable file
·377 lines (316 loc) · 13.1 KB
/
urlrss
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
#!/usr/bin/env python3
# Make a local RSS file based on a list of URLs,
# so we can run feedme to collect those URLs.
# Can be run locally as a script, or can be installed on a web server
# as a CGI (see feedfetcher for details on how to set that up).
#
# The URLs may be passed in as a POST query, if running as a CGI;
# or if there's no query string (we're running it locally as a script),
# we'll try to mount the Android device and collect URLs from a file
# there and from another one locally.
# Testing CGI mode: surf to https://servername/feedme/urlrss.cgi?xtraurls=STR
# STR is a concatenation of extra URLS wanted,
# * URL encoded and connected by the string '%0a'.
# * For instance,
# * /feedme/urlrss.cgi?xtraurls=http%3A%2F%2Fblog.arduino.cc%2F2013%2F07%2F10%2Fsend-in-the-clones%2F%0ahttp%3A%2F%2Fread.bi%2F10Lbfh9%0ahttp%3A%2F%2Fwww.popsci.com%2Ftechnology%2Farticle%2F2013-07%2Fdrones-civil-war%0ahttp%3A%2F%2Fwww.thisamericanlife.org%2Fblog%2F2015%2F05%2Fcanvassers-study-in-episode-555-has-been-retracted
import os, sys
import time, datetime
import subprocess
import urllib.request, urllib.error, urllib.parse
import xml.sax.saxutils
import cgi
# Work out whether we were invoked as a CGI script: we treat the run as
# CGI mode exactly when the request carries an 'xtraurls' field.
# NOTE: the cgi module is deprecated (PEP 594) and gone in Python 3.13+.
form = cgi.FieldStorage()
cgi_mode = 'xtraurls' in form
##############################
# CONFIGURATION
##############################

# Where the reading device gets mounted, if there is one.
# This only helps for devices that can be mounted as usb-storage;
# Android could do that once, but not since KitKat. It stays here
# in case some other device can still make use of it.
# The first command-line argument, if given, overrides the default.
devicepath = sys.argv[1] if len(sys.argv) > 1 else "/droidsd"

if not cgi_mode:
    # Running locally as a script: work out of the user's home directory.
    serverhome = os.path.expanduser('~')
    feeds_dir = os.path.expanduser('~/feeds')
    feedme_dir = os.path.expanduser('~')

    # feedme is assumed to be somewhere on the user's PATH.
    feedme_exec = os.path.expanduser('feedme')
else:
    # CGI mode: serverhome is the web server's home directory.
    # Two directories live under it, feedme and feeds:
    #  - feedme holds .config, .cache and a link to the feedme executable;
    #  - feeds is where the daily feeds are written. Anything under feeds
    #    may be copied to the viewing device, so keep logs and URL lists
    #    out of it.
    serverhome = '/var/www/'
    feeds_dir = os.path.join(serverhome, 'feeds')
    feedme_dir = os.path.join(serverhome, 'feedme')

    # Path to feedme on the web server, as the www-data user sees it.
    feedme_exec = os.path.join(feedme_dir, 'feedme.py')

# The remaining paths all follow from the definitions above.

# feedme's cache directory; the xtraurls RSS file, its HTML archive and
# the debug log are all kept in here.
cachedir = os.path.join(feedme_dir, '.cache', 'feedme')

# Files from which saved URLs are read (more may be appended at run time).
urlfiles = [os.path.join(cachedir, 'localurls')]

# The RSS file that gets generated from the URLs.
rssfile = os.path.join(cachedir, 'xtraurls.rss')

# Cumulative HTML-format archive of the URLs; new entries are appended.
urlarchive = os.path.join(cachedir, 'xtraurls.html')

# Optional file to which debug output is logged.
# Set to None to send debug messages to stderr instead.
# A timestamp is inserted before the extension when the log is opened,
# e.g. urlrss-2024-08-28_16:41.log
debuglogfile = os.path.join(cachedir, 'urlrss.log')

##############################
# END CONFIGURATION
# (You shouldn't need to change anything below this)
##############################
# urls: the extra URLs gathered so far (starts out empty).
# debuglog: file object for debug messages; None until logging is set up.
urls, debuglog = [], None
def find_process(progname, uid):
    '''Find the lowest numbered process running a program named progname
       with user ID uid, which can be either integer or string.

       Processes are found by scanning /proc, so this needs a system
       with a Linux-style /proc filesystem. Only the basename of progname
       is compared. If a process's command is a python interpreter,
       its first argument is taken to be the program name instead.

       Returns the matching process's argument list (bytes objects, as
       split from /proc/PID/cmdline) with the last element replaced by
       the PID as a string; or None if no process matched.

       This is just a quickie function, which probably has all
       sorts of edge cases where it fails.
    '''
    bname = os.path.basename(progname)
    wanted_uid = str(uid)
    print("Trying to find a process named", bname, "with uid", uid,
          file=sys.stderr)

    # os.listdir() makes no promise about order, so sort numerically
    # to be sure the lowest PID is the first one that can match.
    pids = sorted((pid for pid in os.listdir('/proc') if pid.isdigit()),
                  key=int)

    for pid in pids:
        try:
            with open(os.path.join('/proc', pid, 'cmdline'), 'rb') as procfd:
                args = procfd.read().split(b'\x00')

            # The script name, feedme, probably isn't actually the first
            # argument; the first is probably python, then feedme.
            cname = os.path.basename(args[0])
            if cname.startswith(b"python") and len(args) > 1:
                cname = os.path.basename(args[1])

            # A command line is arbitrary bytes: don't let one that isn't
            # valid UTF-8 abort the whole search.
            if cname.decode("utf-8", errors="replace") != bname:
                continue

            # The process name matches. Does the UID?
            procuid = None
            with open(os.path.join('/proc', pid, 'status'), 'r') as procfd:
                for line in procfd:
                    if line.startswith("Uid:"):
                        # First number on the Uid: line
                        procuid = line.split()[1]
                        break

            if wanted_uid == procuid:
                # Callers read the PID from the end of the returned list.
                args[-1] = pid
                return args

        except OSError:    # proc has already terminated (or is unreadable)
            continue

    # Didn't find a process.
    return None
# Text accumulated in output is what later gets printed to stdout.
output = ''

if debuglogfile:
    # Insert a timestamp before the extension,
    # e.g. urlrss.log -> urlrss-2024-08-28_16:41.log
    # Only the timestamp goes through strftime, so a '%' anywhere in the
    # configured path can't be mistaken for a strftime directive.
    name, ext = os.path.splitext(debuglogfile)
    stamp = datetime.datetime.now().strftime('%Y-%m-%d_%H:%M')
    debuglogfile = f'{name}-{stamp}{ext}'

    try:
        debuglog = open(debuglogfile, 'w')
    except OSError as e:
        # Logging is best-effort: if the log can't be created
        # (e.g. missing directory, no permission), carry on with stderr
        # rather than dying before any work has been done.
        print("Couldn't open debuglog %s: %s" % (debuglogfile, e),
              file=sys.stderr)
        print("No debug log file, using stderr", file=sys.stderr)
        debuglog = sys.stderr
    else:
        print("Opened debuglog", debuglogfile, file=sys.stderr)
        # and redirect stderr to there:
        sys.stderr = debuglog

else:
    print("No debug log file, using stderr", file=sys.stderr)
    debuglog = sys.stderr

# XXX Need to delete old log files here.
if cgi_mode:
    # Force HOME to be one above the config dir,
    # so the feedme subprocess will find .cache and .config in the right place.
    os.environ['HOME'] = feedme_dir

    # getfirst() still works if the xtraurls field was sent more than once
    # (indexing the form would then give a list with no .value).
    formurls = form.getfirst('xtraurls', '').strip().split('\n')
    for url in formurls:
        # Strip each line so a CRLF-separated list doesn't leave a
        # trailing '\r' on the URLs, and ignore blank lines.
        url = url.strip()
        if not url:
            continue
        # We have to pass xtraurls=none in the case of no URLs,
        # otherwise 'xtraurls' in form tests False and we have
        # no way of knowing we're being run from a CGI.
        if url != 'none':
            print("URL", url, "from CGI args", file=debuglog)
            urls.append(url)

    print("CGI mode", file=debuglog)
    print("urlfiles:", urlfiles, file=debuglog)
    print("urls passed in:", urls, file=debuglog)
    print("rssfile:", rssfile, file=debuglog)
    print("urlarchive:", urlarchive, file=debuglog)
    print("feedme exec:", feedme_exec, file=debuglog)
    # This is debug information too: it must not go to stdout, where it
    # would be emitted ahead of the CGI headers.
    print("HOME:", os.environ['HOME'], file=debuglog)

    # CGI response headers. The header block has to be terminated by
    # a blank line, or the body text that follows would be read by the
    # web server as part of the headers.
    output += ('Content-type: text/plain\n'
               'Content-Disposition: inline; filename=urlrss-out.txt\n'
               '\n')

    # If we're run as a CGI, then we'll be expected to start
    # a new feedme process. But if there's already one running,
    # that could cause problems, so check first.
    # A good first check is whether there's a feedme_exec process
    # running as the current user.
    feedmeproc = find_process(feedme_exec, os.getuid())
    if feedmeproc:
        # find_process puts the PID in the last element of its result.
        print("feedme is already running, pid", feedmeproc[-1],
              file=debuglog)
        # output already ends with the blank line that closes the headers.
        print(output, end='')
        print('Feedme is already running! PID %s. Quitting' % feedmeproc[-1])
        sys.exit(0)

    print("feedme isn't running yet -- whew!", file=debuglog)

    output += 'Extra URLs\n'

else:
    # Not a CGI, run locally.
    debuglog = sys.stderr

    # Mount the device's SD card if it isn't already mounted:
    if devicepath and os.path.exists(devicepath):
        if not os.path.ismount(devicepath):
            subprocess.call(["mount", devicepath], shell=False)
            # Ignore return value: it will be nonzero if already mounted.
            # So check to see if it's mounted now:
            if not os.path.ismount(devicepath):
                print("Can't mount %s!" % devicepath)
                sys.exit(1)
        else:
            print("Device is already mounted.")

        # Also collect any URLs that were saved on the device.
        urlfiles.append(os.path.join(devicepath, "feeds", "saved-urls"))
# Loop over url files to get a list of extra urls.
# Each file holds one URL per line; blank lines are skipped.
for urlfile in urlfiles:
    try:
        ifp = open(urlfile)
    except OSError:
        # A URL file that is missing or unreadable simply contributes
        # nothing; that's the normal case when no URLs were saved.
        continue

    print("Reading URLs from", urlfile, file=debuglog)
    # The with block closes the file even if reading it fails part way.
    with ifp:
        for line in ifp:
            line = line.strip()
            if not line:
                continue
            print("URL", line, "from", urlfile, file=debuglog)
            urls.append(line)
if not urls:
    print("No saved URLs", file=debuglog)
    # With nothing to fetch, remove any RSS file left from an earlier run.
    if os.path.exists(rssfile):
        os.unlink(rssfile)

else:
    #
    # We have URLs to save. So open the output files.
    #
    print("Full list of Xtra urls:", urls, file=debuglog)

    # One timezone-aware timestamp, shared by the RSS file and the archive.
    now = datetime.datetime.now().astimezone()

    try:
        ofp = open(rssfile, "w")
    except OSError as e:
        output += "\nCouldn't write to RSS file %s\n" % rssfile
        output += str(e)
        print(output)
        print(output, file=debuglog)
        sys.exit(1)

    archivefp = None
    try:
        print('<rss version="0.91">\n'
              '<channel>\n'
              '<title>FeedMe Xtra URLs</title>\n'
              '<description>Saved URLs %s</description>\n'
              '<language>en</language>' % now.ctime(), file=ofp)

        # The HTML archive is optional: carry on without it if it
        # can't be opened.
        try:
            archivefp = open(urlarchive, "a")
        except OSError:
            print("Couldn't open archive file", urlarchive, file=sys.stderr)
            archivefp = None
        else:
            print("\n<h4>Feedme URLs %s</h4>" % now.ctime(), file=archivefp)

        print("", file=debuglog)

        for url in urls:
            # Follow it and see if it's a redirect -- we want the real,
            # final url. This is best-effort: if the URL can't be fetched
            # now, keep it as given and let feedme deal with it.
            output += url + " ..."
            try:
                with urllib.request.urlopen(url, timeout=60) as urlobj:
                    newurl = urlobj.geturl()
                if newurl != url:
                    output += " -- redirected to %s" % newurl
                    url = newurl
            except Exception as e:
                print("Couldn't check", url, "for redirects:", e,
                      file=debuglog)
            # Always finish the line, so the next URL starts a new one.
            output += "\n"

            # The XML parser we use in feedme will sometimes barf on
            # ampersands:
            # "xml.sax._exceptions.SAXException: Read failed (no details available)"
            # and not even a stack trace to give a line number.
            # Try to avoid that by escaping everything (and hope that
            # doesn't break link following):
            escaped_url = xml.sax.saxutils.escape(url)

            # Save it in the RSS file:
            print('\n'
                  '<item>\n'
                  '<title>%s</title>\n'
                  '<description>%s</description>\n'
                  '<link>%s</link>\n'
                  '<pubDate>%s</pubDate>\n'
                  '</item>' % (escaped_url, escaped_url, escaped_url,
                               now.strftime('%a, %d %b %Y %H:%M:%S %z')),
                  file=ofp)
            print("URL saved to RSS file:", url, file=debuglog)

            # Save an HTML copy to the archive file.
            # The URL may have come from a web request, so escape it for
            # HTML, including the quote characters that could otherwise
            # end the href attribute early.
            if archivefp:
                html_url = xml.sax.saxutils.escape(
                    url, {"'": "&#39;", '"': "&quot;"})
                print("<a href='%s'>%s</a><br>" % (html_url, html_url),
                      file=archivefp)

        print('</channel>\n'
              '</rss>', file=ofp)

    finally:
        # Close both files even if something above went wrong.
        ofp.close()
        if archivefp:
            archivefp.close()

    print("Saved %d urls to %s and %s" % (len(urls), rssfile,
                                          urlarchive), file=debuglog)
# Done making the Xtra RSS file (if any).

if cgi_mode:
    # If we're being run as a CGI, it's because someone wants us
    # to kick off a feedme process. So do so:

    # Don't need to pass env= in the next call,
    # because changing os.environ earlier already changed the environment
    # that will be passed through to subprocesses.

    # Normally feedme runs in the background, so we can return immediately,
    # and we want very little output, so the CGI finishes.
    # But if there's a problem with urlrss, for debugging,
    # you may want to change this to True so you can
    # see feedme's output in the browser.
    capture_feedme_output = False

    try:
        if not capture_feedme_output:
            # We need to throw away feedme's output (but it should be
            # captured in feedme's log file anyway) otherwise the web
            # server won't deliver any page content until feedme is
            # completely done (or it times out).
            print("Trying to run %s ..." % feedme_exec, file=sys.stderr)
            subprocess.Popen([feedme_exec], shell=False,
                             stdout=subprocess.DEVNULL,
                             stderr=subprocess.DEVNULL)

        else:
            pp = subprocess.Popen([feedme_exec], shell=False,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
            print("Feedme should be running now. Will try to capture output.",
                  file=sys.stderr)
            # Goes to stderr: nothing may be written to stdout
            # before the headers held in output.
            print("Running %s" % feedme_exec, file=sys.stderr)
            # communicate() waits for feedme to finish and
            # returns (stdout, stderr) as bytes.
            feedme_out, feedme_err = pp.communicate()
            print("\n==== stdout:", file=sys.stderr)
            print(feedme_out.decode(), file=sys.stderr)
            print("\n==== stderr:", file=sys.stderr)
            print(feedme_err.decode(), file=sys.stderr)
            output += "\n\n==== stdout:\n" + feedme_out.decode()
            output += "\n\n==== stderr:\n" + feedme_err.decode()

    except OSError as e:
        # feedme couldn't be started (missing, not executable, ...).
        # Report that in the page, and exit before anything further
        # happens, since feedme never ran.
        output += "\nCouldn't run %s: %s\n" % (feedme_exec, e)
        print(output)
        print(output, file=sys.stderr)
        sys.exit(1)

    # Now send the accumulated headers and text to the web server.
    print(output)

    print("CGI OUTPUT:", file=sys.stderr)
    print(output, file=sys.stderr)
# Once feedme has started, but not before, it's okay to remove urlfiles
for urlfile in urlfiles:
    try:
        os.unlink(urlfile)
    except FileNotFoundError:
        # Nothing to remove: this URL file never existed.
        pass
    except OSError as e:
        # E.g. no permission: say so, but still try the other files.
        print("Couldn't remove %s: %s" % (urlfile, e), file=sys.stderr)

# ofp and archivefp have already been closed, but debuglog hasn't.
# sys.stderr may have been pointed at debuglog, so it can't be used to
# tell whether debuglog is a real log file; sys.__stderr__, the stream
# the interpreter started with, can. Put stderr back before closing,
# so nothing tries to write to a closed file afterward.
if debuglog is not None and debuglog is not sys.__stderr__:
    sys.stderr = sys.__stderr__
    debuglog.close()