Implement postbox fetching and downloads

andsor · Sep 17, 2014 · 499fbe1 · 499fbe1
1 parent c98a2ac
commit 499fbe1
Showing 1 changed file with 193 additions and 0 deletions.
diff --git a/vbscraper.py b/vbscraper.py
@@ -8,6 +8,25 @@
 import lxml.html
 import getpass
 import bs4
+import collections
+import datetime
+import tempfile
+import shutil
+import os
+import os.path
+
+
+PostboxDocument = collections.namedtuple(
+    'PostboxDocument',
+    [
+        'msg_date',
+        'msg_type',
+        'is_new',
+        'url',
+        'subject',
+        'postbox_page',
+    ]
+)
 
 
 class VBSession(object):
@@ -79,3 +98,177 @@ def logout(self):
         self.s.close()
 
         return ret
+
+    def postbox_items(self):
+        """
+        Iterate through the postbox items
+        """
+
+        if self.verbose:
+            print("Access postbox")
+
+        # Get Postbox page
+        r = self.s.get(self.base_url + self.postbox_url)
+
+        # Parse Postbox page
+        soup = bs4.BeautifulSoup(r.text)
+
+        # Get number of pages
+        li = soup.find('li', attrs={'class': 'gad-paginationActivePageNumber'})
+
+        if not li.text.strip() == '1':
+            raise RuntimeError('Postbox does not start with page 1')
+
+        # loop through postbox pages
+        ret = []
+
+        while True:
+
+            # Get current page number
+            li = soup.find(
+                'li', attrs={'class': 'gad-paginationActivePageNumber'}
+            )
+
+            page_number = li.text.strip()
+
+            if self.verbose:
+                print("Postbox page {}".format(page_number))
+
+            # loop through documents on the current page
+            for tr in soup.table.tbody.findChildren('tr'):
+
+                is_new = 'gad-tableEntryHighlighted' in tr['class']
+
+                msg_date = datetime.datetime.strptime(
+                    tr.find(
+                        'td', attrs={'class': 'gad-dateColumn'}
+                    ).text.strip(),
+                    "%d.%m.%Y %H:%M"
+                )
+
+                a = tr.find(
+                    'td', attrs={'class': 'gad-textColumn'}
+                ).a
+
+                url = a['href']
+
+                subject = ' '.join([
+                    str2 for str2 in [
+                        str.strip() for str in a.text.split('\n')
+                    ] if str2
+                ])
+
+                msg_type = tr.find(
+                    'td', attrs={'class': 'gad-textColumn'}
+                ).a.span.text
+
+                ret.append(PostboxDocument(
+                    msg_date=msg_date,
+                    msg_type=msg_type,
+                    url=url,
+                    is_new=is_new,
+                    subject=subject,
+                    postbox_page=page_number,
+                ))
+
+            # terminate loop if this was the last page of the postbox
+            a_next_page = soup.find('a', title='nächste Seite')
+
+            if a_next_page.get('disabled'):
+                break
+
+            # get next page
+            r = self.s.get(self.base_url + a_next_page['href'])
+
+            # Parse Postbox page
+            soup = bs4.BeautifulSoup(r.text)
+
+        return ret
+
+    def download_document(self, document, destinations):
+        """
+        Download a document
+        """
+
+        if self.verbose:
+            print("Access postbox")
+
+        # Get Postbox page
+        r = self.s.get(self.base_url + self.postbox_url)
+
+        # Parse Postbox page
+        soup = bs4.BeautifulSoup(r.text)
+
+        # Get current page number
+        li = soup.find(
+            'li', attrs={'class': 'gad-paginationActivePageNumber'}
+        )
+        page_number = li.text.strip()
+
+        if not page_number == '1':
+            raise RuntimeError('Postbox does not start with page 1')
+
+        if not document.postbox_page == page_number:
+            # get postbox page
+            if self.verbose:
+                print("Access postbox page {}".format(document.postbox_page))
+
+            while not li.text.strip() == document.postbox_page:
+                li = li.find_next_sibling()
+
+            page_url = li.a['href']
+            r = self.s.get(self.base_url + page_url)
+            assert r.status_code == 200
+
+        # get message page
+        if self.verbose:
+            print("Download document {}".format(document.subject))
+
+        r = self.s.get(self.base_url + document.url)
+
+        # parse message page
+        soup = bs4.BeautifulSoup(r.text)
+
+        #subject = msg_soup.find(
+        #    'label', attrs={'for': 'messageSenderSubject'}
+        #).parent.find_next_sibling().span.text
+
+        attachment_a = soup.find(
+            'a', title='Anhang öffnen'
+        )
+
+        attachment_url = attachment_a['href']
+
+        filename = attachment_a.text
+
+        # download document
+        r = self.s.get(self.base_url + attachment_url, stream=True)
+
+        if not r.status_code == 200:
+            raise RuntimeError('Download failed.')
+
+        r.raw.decode_content = True
+
+        # copy http data to temporary file
+        with tempfile.NamedTemporaryFile(delete=False) as fp:
+            shutil.copyfileobj(r.raw, fp)
+            tmp_filename = fp.name
+
+        # copy file to destinations
+        for dest in destinations:
+
+            dest_filename = os.path.join(dest, filename)
+
+            if self.verbose:
+                print("Copy to {}".format(dest_filename))
+
+            if os.path.exists(dest_filename):
+                print('"{}" already exists'.format(dest_filename))
+                continue
+
+            shutil.copyfile(tmp_filename, dest_filename)
+
+        # delete temporary file
+        os.unlink(tmp_filename)
+
+        return True