This repository has been archived by the owner on Mar 14, 2023. It is now read-only.

Add Category model and introduce it to the Spider class #1

Draft · wants to merge 4 commits into base: master
3 changes: 2 additions & 1 deletion .gitignore
100644 → 100755
@@ -1,4 +1,5 @@
venv
simple.db
__pycache__
.DS_Store
.DS_Store
.vscode
Empty file modified Procfile
100644 → 100755
Empty file.
27 changes: 25 additions & 2 deletions README.md
100644 → 100755
@@ -2,7 +2,30 @@ This project is deployed to:

http://bezmetki.org

Running the flask app locally. Activate virtualenv and run:
Requirements:
- python 3

Prerequisites for OS X:
```
brew install postgresql openssl
```

### Setting up the virtual environment
`python3 -m venv venv`

### Activating the virtual environment
`source venv/bin/activate`


### Preparing the database
```
flask db init
flask db upgrade
```


### Running the Flask app locally
Activate the virtual environment and run:
```
export FLASK_APP=vintourage
export FLASK_ENV=development
@@ -11,5 +34,5 @@ flask run

Running the spiders:
```
python vintourage/crawler/executor.py
python -m crawler.executor
```
Empty file modified config.py
100644 → 100755
Empty file.
File renamed without changes.
Binary file added crawler/__init__.pyc
Binary file not shown.
43 changes: 43 additions & 0 deletions crawler/base.py
@@ -0,0 +1,43 @@
import scrapy

from vintourage.models import Category


class CategorySpider(scrapy.Spider):
    category_mapping = None
    start_urls = []

    def __init__(self, category=None, *args, **kwargs):
        # Initialize the base Spider first, so the early return below
        # still leaves a fully constructed spider.
        super().__init__(*args, **kwargs)

        if not category:
            return

        if not self.category_mapping:
            raise Exception('Spider must define a category_mapping')

        self.start_urls = self.category_mapping.get(category)
        if not self.start_urls:
            raise ValueError(f"{type(self).__name__} must have a proper category_mapping")

        # Bind the query result to a separate name, so the error message
        # can still report the requested path when no row matches.
        db_category = Category.query.filter_by(path=category.value).first()
        if not db_category:
            raise Exception(f'There is no category matching the path {category.value}')
        self.category = db_category

    def parse(self, response):
        returned_item = super().parse(response)

        if isinstance(returned_item, dict):
            returned_item.update({
                "category_id": self.category.id
            })

        yield returned_item
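
The `Category` model that `CategorySpider` queries is not part of this diff. A minimal sketch of what it plausibly looks like, assuming the project's Flask-SQLAlchemy setup and that `path` stores values such as `kobiety/sukienki` (the column names and sizes here are assumptions):

```python
# Hypothetical sketch of the Category model referenced above; the real
# definition lives in vintourage/models.py and is not shown in this diff.
from vintourage import db


class Category(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    # Matches Categories enum values, e.g. "kobiety/sukienki".
    path = db.Column(db.String(128), unique=True, nullable=False)
```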
34 changes: 34 additions & 0 deletions crawler/constants.py
@@ -0,0 +1,34 @@
from enum import Enum

class Categories(Enum):
    kobiety = "kobiety"
    bluzki_damskie = f"{kobiety}/bluzki"
    marynarki_damskie = f"{kobiety}/marynarki"
    swetry_damskie = f"{kobiety}/swetry"
    spodnice = f"{kobiety}/spodnice"
    spodnie_damskie = f"{kobiety}/spodnie"
    sukienki = f"{kobiety}/sukienki"  # dresses and jumpsuits
    okrycia_damskie = f"{kobiety}/okrycia"
    bielizna_damska = f"{kobiety}/bielizna"
    obuwie_damskie = f"{kobiety}/obuwie"

    mezczyzni = "mezczyzni"
    koszule_meskie = f"{mezczyzni}/koszule"
    marynarki_meskie = f"{mezczyzni}/marynarki"
    swetry_meskie = f"{mezczyzni}/swetry"  # sweaters and sweatshirts
    spodnie_meskie = f"{mezczyzni}/spodnie"
    okrycia_meskie = f"{mezczyzni}/okrycia"
    bielizna_meska = f"{mezczyzni}/bielizna"
    obuwie = f"{mezczyzni}/obuwie"

    dzieci = "dzieci"
    bluzki_dzieciece = f"{dzieci}/bluzki"
    swetry_dzieciece = f"{dzieci}/swetry"  # sweaters and sweatshirts
    spodnice_dzieciece = f"{dzieci}/spodnice"
    spodnie_dzieciece = f"{dzieci}/spodnie"  # trousers and jumpsuits
    sukienki_dzieciece = f"{dzieci}/sukienki"
    okrycia_dzieciece = f"{dzieci}/okrycia"
    bielizna_dziecieca = f"{dzieci}/bielizna"
    obuwie_dzieciece = f"{dzieci}/obuwie"

    akcesoria = "akcesoria"
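
Because the f-strings are evaluated while the class body runs, before the `Enum` machinery wraps the names, each member composes its path from the plain strings defined above it. A quick sanity check of the resulting values:

```python
from crawler.constants import Categories

# The section names are members too, with their own values.
assert Categories.kobiety.value == "kobiety"
# Composed members pick up the section prefix.
assert Categories.sukienki.value == "kobiety/sukienki"
assert Categories.obuwie_dzieciece.value == "dzieci/obuwie"
```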
36 changes: 36 additions & 0 deletions crawler/executor.py
@@ -0,0 +1,36 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from .spiders.dewitched import DewitchedSpider
from .spiders.inspired import InspiredSpider
from .spiders.klunken import KlunkenSpider
from .spiders.ragsandsilks import RagsandsilksSpider
from .spiders.somavintage import SomavintageSpider
from .spiders.vintageladies import VintageladiesSpider

from vintourage import app, db


crawlers = [
    DewitchedSpider,
    #InspiredSpider, KlunkenSpider, RagsandsilksSpider,
    #SomavintageSpider, VintageladiesSpider
]

process = CrawlerProcess(get_project_settings())

for crawler_class in crawlers:
    print(f'Processing crawler {crawler_class.name}')

    for category in crawler_class.category_mapping:
        print(f'Attempting to crawl {category.name} ({category.value})')
        # CrawlerProcess.crawl takes the spider class plus constructor
        # kwargs; it does not accept a pre-built spider instance.
        process.crawl(crawler_class, category=category)

process.start()  # the script will block here until all crawling jobs are finished
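
One caveat: `CategorySpider.__init__` runs `Category.query`, and Flask-SQLAlchemy queries need an active application context, which nothing here pushes explicitly. If that turns out to bite, a sketch of one way to wrap the crawl (assuming the `app` imported above is the Flask application object):

```python
# Sketch only: push an application context around the whole crawl so that
# Category.query inside each spider's __init__ can reach the database.
with app.app_context():
    for crawler_class in crawlers:
        for category in crawler_class.category_mapping:
            process.crawl(crawler_class, category=category)
    process.start()
```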
3 changes: 2 additions & 1 deletion vintourage/crawler/extensions.py → crawler/extensions.py
100644 → 100755
@@ -6,7 +6,8 @@ class SentryLogging(object):

    @classmethod
    def from_crawler(cls, crawler):
        sentry_dsn = crawler.settings.get('SENTRY_DSN', None)
        sentry_dsn = None
        #sentry_dsn = crawler.settings.get('SENTRY_DSN', None)
        if sentry_dsn is None:
            raise NotConfigured

5 changes: 2 additions & 3 deletions vintourage/crawler/pipelines.py → crawler/pipelines.py
100644 → 100755
@@ -1,13 +1,12 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging

from .. import db
from ..models import Product
from vintourage import db
from vintourage.models import Product

logger = logging.getLogger(__name__)

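
The body of `DatabasePipeline` is collapsed in this view. A rough sketch of what a pipeline persisting these items might look like, assuming `Product` accepts the scraped fields plus the new `category_id`; the upsert-by-link logic is an assumption, not the diff's actual code:

```python
# Hypothetical pipeline body; the real implementation is not shown here.
class DatabasePipeline:
    def process_item(self, item, spider):
        # Reuse an existing Product row for the same link, if any.
        product = Product.query.filter_by(link=item.get('link')).first()
        if product is None:
            db.session.add(Product(**item))
        db.session.commit()
        return item
```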
9 changes: 4 additions & 5 deletions vintourage/crawler/settings.py → crawler/settings.py
100644 → 100755
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scraping project
#
@@ -11,8 +10,8 @@

BOT_NAME = 'vintourage'

SPIDER_MODULES = ['vintourage.crawler.spiders']
NEWSPIDER_MODULE = 'vintourage.crawler.spiders'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
@@ -59,15 +58,15 @@
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    'vintourage.crawler.extensions.SentryLogging': -1,  # Load SentryLogging extension before others
    'crawler.extensions.SentryLogging': -1,  # Load SentryLogging extension before others
}

SENTRY_DSN = "https://[email protected]/1472720"

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'vintourage.crawler.pipelines.DatabasePipeline': 1,
    'crawler.pipelines.DatabasePipeline': 1,
}

# Enable and configure the AutoThrottle extension (disabled by default)
File renamed without changes.
10 changes: 6 additions & 4 deletions vintourage/crawler/spiders/dewitched.py → crawler/spiders/dewitched.py
100644 → 100755
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class DewitchedSpider(scrapy.Spider):
class DewitchedSpider(CategorySpider):
    name = 'dewitched'
    allowed_domains = ['www.dewitched.pl']
    start_urls = ['https://www.dewitched.pl/dla-pan-cat-5']
    category_mapping = {
        Categories.sukienki: ['https://www.dewitched.pl/dla-pan-cat-5']
    }

    def parse(self, response):
        main_list = response.css('div.products-list')[0]
11 changes: 7 additions & 4 deletions vintourage/crawler/spiders/inspired.py → crawler/spiders/inspired.py
100644 → 100755
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
import scrapy
import tinycss2

from ..base import CategorySpider
from ..constants import Categories

class InspiredSpider(scrapy.Spider):

class InspiredSpider(CategorySpider):
    name = 'inspired'
    allowed_domains = ['inspired.sklep.pl']
    start_urls = ['https://inspired.sklep.pl/kategoria-produktu/sukienki/']
    category_mapping = {
        Categories.sukienki: ['https://inspired.sklep.pl/kategoria-produktu/sukienki/']
    }

    def parse(self, response):
        for product in response.css('li.htheme_single_wc_item'):
10 changes: 6 additions & 4 deletions vintourage/crawler/spiders/klunken.py → crawler/spiders/klunken.py
100644 → 100755
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class KlunkenSpider(scrapy.Spider):
class KlunkenSpider(CategorySpider):
    name = 'klunken'
    allowed_domains = ['klunken.pl']
    start_urls = ['http://klunken.pl/kategoria-produktu/kobieta/sukienki/']
    category_mapping = {
        Categories.sukienki: ['http://klunken.pl/kategoria-produktu/kobieta/sukienki/']
    }

    def parse(self, response):
        for product in response.css('li.product'):
10 changes: 6 additions & 4 deletions vintourage/crawler/spiders/ragsandsilks.py → crawler/spiders/ragsandsilks.py
100644 → 100755
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class RagsandsilksSpider(scrapy.Spider):
class RagsandsilksSpider(CategorySpider):
    name = 'ragsandsilks'
    allowed_domains = ['ragsandsilks.pl']
    start_urls = ['https://ragsandsilks.pl/pl/c/SUKIENKI/20']
    category_mapping = {
        Categories.sukienki: ['https://ragsandsilks.pl/pl/c/SUKIENKI/20']
    }

    def parse(self, response):
        for product in response.css('div.product'):
11 changes: 7 additions & 4 deletions vintourage/crawler/spiders/somavintage.py → crawler/spiders/somavintage.py
100644 → 100755
@@ -1,16 +1,19 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


def clean_whitespaces(value):
    if not value:
        return
    return value.replace('\n', '').replace('\t', '').replace(' ', '')


class SomavintageSpider(scrapy.Spider):
class SomavintageSpider(CategorySpider):
    name = 'somavintage'
    allowed_domains = ['somavintagestore.com']
    start_urls = ['http://somavintagestore.com/ubrania/sukienki']
    category_mapping = {
        Categories.sukienki: ['http://somavintagestore.com/ubrania/sukienki']
    }

    def get_price_for_product(self, product):
        """Get the price of the product.
32 changes: 23 additions & 9 deletions vintourage/crawler/spiders/vintageladies.py → crawler/spiders/vintageladies.py
100644 → 100755
@@ -1,15 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class VintageladiesSpider(scrapy.Spider):
class VintageladiesSpider(CategorySpider):
    name = 'vintageladies'
    allowed_domains = ['vintageladies.pl']
    start_urls = [
        'http://vintageladies.pl/index.php?cPath=31_32',  # casual dresses
        'http://vintageladies.pl/index.php?cPath=31_33',  # cocktail dresses
        'http://vintageladies.pl/index.php?cPath=31_34',  # ball gowns
    ]
    category_mapping = {
        Categories.bluzki_damskie: [
            'http://vintageladies.pl/index.php?cPath=21_22',
            'http://vintageladies.pl/index.php?cPath=21_23',
        ],
        Categories.sukienki: [
            'http://vintageladies.pl/index.php?cPath=31_33',
            'http://vintageladies.pl/index.php?cPath=31_34',
        ],
        Categories.swetry_damskie: [
            'http://vintageladies.pl/index.php?cPath=24',
        ],
        Categories.spodnice: [
            'http://vintageladies.pl/index.php?cPath=25_26',
            'http://vintageladies.pl/index.php?cPath=25_27',
            'http://vintageladies.pl/index.php?cPath=25_28',
            'http://vintageladies.pl/index.php?cPath=25_29',
        ]
    }

    def parse(self, response):
        for product in response.xpath('//td[@valign = "top" and @width = "169"]'):
@@ -27,7 +41,7 @@ def parse(self, response):
                'image': response.urljoin(image_uri),
                'price': product.xpath('.//td[@class = "fe1"]/span/text()').get(),
                'link': link,
                'active': active
                'active': active,
            }

        selector = '//a[@class = "pageResults" and contains(@title, "Następna")]/@href'
Empty file modified migrations/README
100644 → 100755
Empty file.
Empty file modified migrations/alembic.ini
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion migrations/env.py
100644 → 100755
@@ -46,7 +46,7 @@ def run_migrations_offline():
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=target_metadata, literal_binds=True
url=url, target_metadata=target_metadata, literal_binds=True,
)

with context.begin_transaction():
@@ -82,6 +82,7 @@ def process_revision_directives(context, revision, directives):
            connection=connection,
            target_metadata=target_metadata,
            process_revision_directives=process_revision_directives,
            render_as_batch=True,  # use batch mode so ALTER TABLE works on SQLite
            **current_app.extensions['migrate'].configure_args
        )

Empty file modified migrations/script.py.mako
100644 → 100755
Empty file.
Empty file modified migrations/versions/511aee2d7b55_.py
100644 → 100755
Empty file.