Commit
adds scraper deployment
laurenz-k committed May 13, 2024
1 parent 9c1cb16 commit ac6da87
Showing 7 changed files with 135 additions and 1,672 deletions.
1,541 changes: 0 additions & 1,541 deletions scrapers/Wohnungen.json

This file was deleted.

3 changes: 3 additions & 0 deletions scrapers/function_app.py
@@ -4,4 +4,7 @@

# NOTE (Laurenz): Register the created scrapers in the function app
from impl import scraper_bwsg
from impl import scraper_demo

app.register_functions(scraper_bwsg.bp)
app.register_functions(scraper_demo.bp)
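
For context (a minimal sketch, not part of this diff): the hunk starts at line 4, so the FunctionApp instance is assumed to be created above it. A complete function_app.py in the Python v2 programming model would then look roughly like this:

import azure.functions as func

# Assumed to sit above the diff hunk: the v2-programming-model app instance.
app = func.FunctionApp()

# NOTE (Laurenz): Register the created scrapers in the function app
from impl import scraper_bwsg
from impl import scraper_demo

app.register_functions(scraper_bwsg.bp)
app.register_functions(scraper_demo.bp)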
117 changes: 78 additions & 39 deletions scrapers/impl/scraper_bwsg.py
@@ -1,51 +1,90 @@
import azure.functions as func
from bs4 import BeautifulSoup
import requests
import logging
import json
import os

URL = "https://immobilien.bwsg.at"
PARAMS = {
    'f[all][marketing_type]': 'rent',   # Miete (rent)
    'f[all][realty_type][0]': '2',      # Wohnung (apartment)
    'f[all][realty_type][1]': '3',      # Haus (house)
    'f[all][city]': 'Wien',
    'from': '1117350'
}

bp = func.Blueprint()


'''
NOTE Laurenz:
- create a new module for each website inside scrapers/impl
- function name has to be unique: <genossenschaft_name>_scraper
- for debugging you can trigger the function via HTTP. Refer to https://learn.microsoft.com/en-us/azure/azure-functions/functions-manually-run-non-http?tabs=azure-portal
'''
@bp.timer_trigger(schedule="0 */5 * * * *", arg_name="timerObj", run_on_startup=False)
@bp.queue_output(arg_name="q", queue_name=os.getenv('QUEUE_NAME'), connection="AzureWebJobsStorage")
def bwsg_scraper(timerObj: func.TimerRequest, q: func.Out[str]) -> None:
    logging.info('BWSG scraper triggered.')

    req = requests.request(method='GET', url=URL, params=PARAMS)
    soup = BeautifulSoup(req.text, 'html.parser')
    pages = soup.find(class_='pagination')
    cur = pages.find(class_='active')
    pages = pages.find_all('li')

    links = []

    while True:
        panel_wrapper = soup.find_all(class_='panel-wrapper')

        for panel in panel_wrapper:
            panel_footer = panel.find(class_='panel-footer')
            links.append(panel.find('a').get('href'))

        if cur == pages[-1]:
            break

        cur_index = pages.index(cur)
        next_index = cur_index + 1
        next_link = pages[next_index].find('a').get('href')

        req = requests.request(method='GET', url=URL + next_link)
        soup = BeautifulSoup(req.text, 'html.parser')
        pages = soup.find(class_='pagination')
        cur = pages.find(class_='active')
        pages = pages.find_all('li')

    Wohnungen = list()

    count = 0

    # Scrape the detail page of every listing found above.
    for link in links:
        req = requests.request(method='GET', url=URL + link)
        soup = BeautifulSoup(req.text, 'html.parser')
        info = soup.find(class_='container-wrapper')
        detail_infos = info.find(class_='realty-detail-info').find_all('li')

        wohnung = dict()
        for detail in detail_infos:
            desc = detail.find(class_='list-item-desc').get_text().strip()
            value = detail.find(class_='list-item-value').get_text().strip()
            wohnung[desc] = value

        detail_preis = info.find(class_='rent-price-table w-100').find_all('tr')

        for row in detail_preis:
            cols = row.find_all('td')
            desc = cols[0].get_text().strip()
            value = cols[1].get_text().strip()
            wohnung[desc] = value
        extra_info = info.find(class_='list-unstyled').find_all('li')
        for detail in extra_info:
            desc = detail.find(class_='list-item-desc').get_text().strip()
            value = detail.find(class_='list-item-value').get_text().strip()
            wohnung[desc] = value

        explanation = info.find(class_='costs-explanation').get_text().strip()
        wohnung['cost-explanation'] = explanation

        wohnung['link'] = URL + link
        wohnung['Unternehmen'] = 'BWSG'
        count += 1
        Wohnungen.append(wohnung)

    q.set(json.dumps(Wohnungen).encode(encoding='UTF-8'))
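
The NOTE in this module points at manually running non-HTTP functions. As a local debugging aid (a sketch based on the linked docs; port 7071 is the local Functions host default, and an x-functions-key master-key header is only needed against a deployed app), the timer-triggered scraper can be invoked through the admin endpoint:

import requests

# Run the timer-triggered scraper on demand via the Functions admin API.
# Locally no key is needed; for a deployed app send the master key as x-functions-key.
resp = requests.post(
    "http://localhost:7071/admin/functions/bwsg_scraper",
    json={"input": ""},
)
print(resp.status_code)  # 202 Accepted means the run was queued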
51 changes: 51 additions & 0 deletions scrapers/impl/scraper_demo.py
@@ -0,0 +1,51 @@
import azure.functions as func
import logging
import json
import os


bp = func.Blueprint()


'''
NOTE Laurenz:
- create a new module for each website inside scrapers/impl
- function name has to be unique: <genossenschaft_name>_scraper
- for debugging you can trigger the function via HTTP. Refer to https://learn.microsoft.com/en-us/azure/azure-functions/functions-manually-run-non-http?tabs=azure-portal
'''
@bp.timer_trigger(schedule="0 */5 * * * *", arg_name="timerObj", run_on_startup=False)
@bp.queue_output(arg_name="q", queue_name=os.getenv('QUEUE_NAME'), connection="AzureWebJobsStorage")
def demo_scraper(timerObj: func.TimerRequest, q: func.Out[str]) -> None:
    logging.info('Scraper Demo A triggered.')

    payload = {
        "scraperId": "viennaHousingScraper002",
        "timestamp": "2024-04-06T15:30:00Z",
        "listings": [
            {
                "title": "Modern 3-Bedroom Apartment in Central Vienna",
                "housingCooperative": "FutureLiving Genossenschaft",
                "projectId": "FLG2024",
                "listingId": "12345ABC",
                "country": "Austria",
                "city": "Vienna",
                "postalCode": "1010",
                "address": "Beispielgasse 42",
                "roomCount": 3,
                "squareMeters": 95,
                "availabilityDate": "2024-09-01",
                "yearBuilt": 2019,
                "hwgEnergyClass": "A",
                "fgeeEnergyClass": "A+",
                "listingType": "both",
                "rentPricePerMonth": 1200,
                "cooperativeShare": 5000,
                "salePrice": 350000,
                "additionalFees": 6500,
                "detailsUrl": "https://www.futurelivinggenossenschaft.at/listings/12345ABC",
                "previewImageUrl": "https://www.futurelivinggenossenschaft.at/listings/12345ABC/preview.jpg"
            }
        ]
    }

    q.set(json.dumps(payload).encode(encoding='UTF-8'))
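
Both scrapers only enqueue their JSON payload; nothing in this commit consumes it. Purely as an illustration of the other side of the binding (function and variable names are hypothetical, not from this repository), a queue-trigger consumer could be sketched like this:

import azure.functions as func
import logging
import json
import os

bp = func.Blueprint()

# Hypothetical consumer: reads the JSON messages the scrapers enqueue.
@bp.queue_trigger(arg_name="msg", queue_name=os.getenv('QUEUE_NAME'), connection="AzureWebJobsStorage")
def listings_consumer(msg: func.QueueMessage) -> None:
    payload = json.loads(msg.get_body().decode('utf-8'))
    # The BWSG scraper sends a plain list, the demo scraper a dict with "listings".
    listings = payload["listings"] if isinstance(payload, dict) else payload
    logging.info('Received %d listing(s).', len(listings))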
44 changes: 0 additions & 44 deletions scrapers/requeriments.txt

This file was deleted.

4 changes: 3 additions & 1 deletion scrapers/requirements.txt
@@ -3,4 +3,6 @@
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
pytest
requests
beautifulsoup4
47 changes: 0 additions & 47 deletions scrapers/webscrap.yml

This file was deleted.
