diff --git a/exploring.ipynb b/exploring.ipynb new file mode 100644 index 0000000..bc51c52 --- /dev/null +++ b/exploring.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Park-Vorhersage" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Website to extract info from" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "url = r'https://www.parken-osnabrueck.de/'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Selenium\n", + "\n", + "Use Selenium to drive a headless Firefox (other browsers can be configured, too)." + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "from selenium.webdriver.firefox.options import Options" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "options = Options()\n", + "options.add_argument('-headless')  # version-independent; the Options.headless setter is deprecated in Selenium 4\n", + "driver = webdriver.Firefox(options=options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## robots.txt\n", + "\n", + "Check whether the website operator disallows crawling. The parser has to be pointed at the site's `robots.txt` file, not at the page itself." + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "from urllib import robotparser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "parser = robotparser.RobotFileParser(url=url.rstrip('/') + '/robots.txt')  # RobotFileParser expects the robots.txt URL itself\n", + "parser.read()\n", + "'Website can be parsed: {}'.format(parser.can_fetch('*', url))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read page source and extract information" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "driver.get(url)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "Use BeautifulSoup to parse the page source" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "soup = BeautifulSoup(driver.page_source, 'html.parser')  # explicit parser: same parse tree on every machine, no GuessedAtParserWarning\n", + "\n", + "for item in soup.find_all('span', 'parking-ramp-utilization'):\n", + "    print(item.text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/exploring_beautifulsoup.ipynb b/exploring_beautifulsoup.ipynb deleted file mode 100644 index 4fcb91d..0000000 --- a/exploring_beautifulsoup.ipynb +++ /dev/null @@ -1,286 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring BeautifulSoup" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import urllib\n", - "from selenium import webdriver" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "url = ''\n", - "executable_path_geckodriver = r''" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from selenium import webdriver\n", - "from selenium.webdriver.firefox.options import Options\n", - "\n", - "options = Options()\n", - "options.headless = True\n", - "driver = webdriver.Firefox(options=options, executable_path=executable_path_geckodriver)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - 
"metadata": {}, - "outputs": [], - "source": [ - "driver.get(url)\n", - "items = driver.find_element_by_class_name('parking-ramp-utilization')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine 
Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n", - "\n", - " keine Info\n", - " \n" - ] - } - ], - "source": [ - "#soup = BeautifulSoup(text, 'html.parser')\n", - "\n", - "for item in soup.find_all('span', 'parking-ramp-utilization'):\n", - " print(item.text)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.2" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}