Commit
adds scraper deployment
laurenz-k committed May 13, 2024
1 parent 9c1cb16 commit ac6da87
Showing 7 changed files with 135 additions and 1,672 deletions.
1,541 changes: 0 additions & 1,541 deletions scrapers/Wohnungen.json

This file was deleted.

3 changes: 3 additions & 0 deletions scrapers/function_app.py
@@ -4,4 +4,7 @@

# NOTE (Laurenz): Register the created scrapers in the function app
from impl import scraper_bwsg
from impl import scraper_demo

app.register_functions(scraper_bwsg.bp)
app.register_functions(scraper_demo.bp)
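
For context (a minimal sketch, not part of this diff): the hunk starts at line 4, so the FunctionApp instance is assumed to be created above it. A complete function_app.py in the Python v2 programming model would then look roughly like this:

import azure.functions as func

# Assumed to sit above the diff hunk: the v2-programming-model app instance.
app = func.FunctionApp()

# NOTE (Laurenz): Register the created scrapers in the function app
from impl import scraper_bwsg
from impl import scraper_demo

app.register_functions(scraper_bwsg.bp)
app.register_functions(scraper_demo.bp)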
117 changes: 78 additions & 39 deletions scrapers/impl/scraper_bwsg.py
@@ -1,51 +1,90 @@
import azure.functions as func
from bs4 import BeautifulSoup
import requests
import logging
import json
import os

URL = "https://immobilien.bwsg.at"
PARAMS = {
    'f[all][marketing_type]': 'rent',   # Miete (rent)
    'f[all][realty_type][0]': '2',      # Wohnung (apartment)
    'f[all][realty_type][1]': '3',      # Haus (house)
    'f[all][city]': 'Wien',
    'from': '1117350'
}

bp = func.Blueprint()


'''
NOTE Laurenz:
- create a new module for each website inside scrapers/impl
- function name has to be unique: <genossenschaft_name>_scraper
- for debugging you can trigger the function via HTTP. Refer to https://learn.microsoft.com/en-us/azure/azure-functions/functions-manually-run-non-http?tabs=azure-portal
'''
@bp.timer_trigger(schedule="0 */5 * * * *", arg_name="timerObj", run_on_startup=False)
@bp.queue_output(arg_name="q", queue_name=os.getenv('QUEUE_NAME'), connection="AzureWebJobsStorage")
def bwsg_scraper(timerObj: func.TimerRequest, q: func.Out[str]) -> None:
    logging.info('BWSG scraper triggered.')

    req = requests.request(method='GET', url=URL, params=PARAMS)
    soup = BeautifulSoup(req.text, 'html.parser')
    pages = soup.find(class_='pagination')
    cur = pages.find(class_='active')
    pages = pages.find_all('li')

    links = []

    while True:
        panel_wrapper = soup.find_all(class_='panel-wrapper')

        for panel in panel_wrapper:
            panel_footer = panel.find(class_='panel-footer')
            links.append(panel.find('a').get('href'))

        if cur == pages[-1]:
            break

        cur_index = pages.index(cur)
        next_index = cur_index + 1
        next_link = pages[next_index].find('a').get('href')

        req = requests.request(method='GET', url=URL + next_link)
        soup = BeautifulSoup(req.text, 'html.parser')
        pages = soup.find(class_='pagination')
        cur = pages.find(class_='active')
        pages = pages.find_all('li')

    Wohnungen = list()

    count = 0

    # Scrape the detail page of every listing found above.
    for link in links:
        req = requests.request(method='GET', url=URL + link)
        soup = BeautifulSoup(req.text, 'html.parser')
        info = soup.find(class_='container-wrapper')
        detail_infos = info.find(class_='realty-detail-info').find_all('li')

        wohnung = dict()
        for detail in detail_infos:
            desc = detail.find(class_='list-item-desc').get_text().strip()
            value = detail.find(class_='list-item-value').get_text().strip()
            wohnung[desc] = value

        detail_preis = info.find(class_='rent-price-table w-100').find_all('tr')

        for row in detail_preis:
            cols = row.find_all('td')
            desc = cols[0].get_text().strip()
            value = cols[1].get_text().strip()
            wohnung[desc] = value
        extra_info = info.find(class_='list-unstyled').find_all('li')
        for detail in extra_info:
            desc = detail.find(class_='list-item-desc').get_text().strip()
            value = detail.find(class_='list-item-value').get_text().strip()
            wohnung[desc] = value

        explanation = info.find(class_='costs-explanation').get_text().strip()
        wohnung['cost-explanation'] = explanation

        wohnung['link'] = URL + link
        wohnung['Unternehmen'] = 'BWSG'
        count += 1
        Wohnungen.append(wohnung)

    q.set(json.dumps(Wohnungen).encode(encoding='UTF-8'))
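
The NOTE in this module points at manually running non-HTTP functions. As a local debugging aid (a sketch based on the linked docs; port 7071 is the local Functions host default, and an x-functions-key master-key header is only needed against a deployed app), the timer-triggered scraper can be invoked through the admin endpoint:

import requests

# Run the timer-triggered scraper on demand via the Functions admin API.
# Locally no key is needed; for a deployed app send the master key as x-functions-key.
resp = requests.post(
    "http://localhost:7071/admin/functions/bwsg_scraper",
    json={"input": ""},
)
print(resp.status_code)  # 202 Accepted means the run was queued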
51 changes: 51 additions & 0 deletions scrapers/impl/scraper_demo.py
@@ -0,0 +1,51 @@
import azure.functions as func
import logging
import json
import os


bp = func.Blueprint()


'''
NOTE Laurenz:
- create a new module for each website inside scrapers/impl
- function name has to be unique: <genossenschaft_name>_scraper
- for debugging you can trigger the function via HTTP. Refer to https://learn.microsoft.com/en-us/azure/azure-functions/functions-manually-run-non-http?tabs=azure-portal
'''
@bp.timer_trigger(schedule="0 */5 * * * *", arg_name="timerObj", run_on_startup=False)
@bp.queue_output(arg_name="q", queue_name=os.getenv('QUEUE_NAME'), connection="AzureWebJobsStorage")
def demo_scraper(timerObj: func.TimerRequest, q: func.Out[str]) -> None:
    logging.info('Scraper Demo A triggered.')

    payload = {
        "scraperId": "viennaHousingScraper002",
        "timestamp": "2024-04-06T15:30:00Z",
        "listings": [
            {
                "title": "Modern 3-Bedroom Apartment in Central Vienna",
                "housingCooperative": "FutureLiving Genossenschaft",
                "projectId": "FLG2024",
                "listingId": "12345ABC",
                "country": "Austria",
                "city": "Vienna",
                "postalCode": "1010",
                "address": "Beispielgasse 42",
                "roomCount": 3,
                "squareMeters": 95,
                "availabilityDate": "2024-09-01",
                "yearBuilt": 2019,
                "hwgEnergyClass": "A",
                "fgeeEnergyClass": "A+",
                "listingType": "both",
                "rentPricePerMonth": 1200,
                "cooperativeShare": 5000,
                "salePrice": 350000,
                "additionalFees": 6500,
                "detailsUrl": "https://www.futurelivinggenossenschaft.at/listings/12345ABC",
                "previewImageUrl": "https://www.futurelivinggenossenschaft.at/listings/12345ABC/preview.jpg"
            }
        ]
    }

    q.set(json.dumps(payload).encode(encoding='UTF-8'))
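
Both scrapers only enqueue their JSON payload; nothing in this commit consumes it. Purely as an illustration of the other side of the binding (function and variable names are hypothetical, not from this repository), a queue-trigger consumer could be sketched like this:

import azure.functions as func
import logging
import json
import os

bp = func.Blueprint()

# Hypothetical consumer: reads the JSON messages the scrapers enqueue.
@bp.queue_trigger(arg_name="msg", queue_name=os.getenv('QUEUE_NAME'), connection="AzureWebJobsStorage")
def listings_consumer(msg: func.QueueMessage) -> None:
    payload = json.loads(msg.get_body().decode('utf-8'))
    # The BWSG scraper sends a plain list, the demo scraper a dict with "listings".
    listings = payload["listings"] if isinstance(payload, dict) else payload
    logging.info('Received %d listing(s).', len(listings))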
44 changes: 0 additions & 44 deletions scrapers/requeriments.txt

This file was deleted.

4 changes: 3 additions & 1 deletion scrapers/requirements.txt
@@ -3,4 +3,6 @@
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
pytest
requests
beautifulsoup4
47 changes: 0 additions & 47 deletions scrapers/webscrap.yml

This file was deleted.
