This repository has been archived by the owner on Mar 14, 2023. It is now read-only.

Add Category model and introduce it to the Spider class #1

Draft · wants to merge 4 commits into base: master
3 changes: 2 additions & 1 deletion .gitignore
100644 → 100755
@@ -1,4 +1,5 @@
venv
simple.db
__pycache__
.DS_Store
.DS_Store
.vscode
Empty file modified Procfile
100644 → 100755
Empty file.
27 changes: 25 additions & 2 deletions README.md
100644 → 100755
@@ -2,7 +2,30 @@ This project is deployed to:

http://bezmetki.org

Running the flask app locally. Activate virtualenv and run:
Requirements:
- python 3

Prerequisites for OS X:
```
brew install postgresql openssl
```

### Setting up the virtual environment
`python3 -m venv venv`

### Activating the virtual environment
`source venv/bin/activate`


### Preparing the database
```
flask db init
flask db upgrade
```


### Running the Flask app locally
Activate the virtual environment and run:
```
export FLASK_APP=vintourage
export FLASK_ENV=development
@@ -11,5 +34,5 @@ flask run

Running the spiders:
```
python vintourage/crawler/executor.py
python -m crawler.executor
```
Empty file modified config.py
100644 → 100755
Empty file.
File renamed without changes.
Binary file added crawler/__init__.pyc
Binary file not shown.
43 changes: 43 additions & 0 deletions crawler/base.py
@@ -0,0 +1,43 @@
import scrapy

from vintourage.models import Category


class CategorySpider(scrapy.Spider):
    category_mapping = None
    start_urls = []

    def __init__(self, category=None, *args, **kwargs):
        # Initialize the base Spider first, so the early return below
        # still leaves a fully constructed spider.
        super().__init__(*args, **kwargs)

        if not category:
            return

        if not self.category_mapping:
            raise Exception('Spider must define a category_mapping')

        self.start_urls = self.category_mapping.get(category)
        if not self.start_urls:
            raise ValueError(f"{type(self).__name__} must have a proper category_mapping")

        # Bind the query result to a separate name, so the error message
        # can still report the requested path when no row matches.
        db_category = Category.query.filter_by(path=category.value).first()
        if not db_category:
            raise Exception(f'There is no category matching the path {category.value}')
        self.category = db_category

    def parse(self, response):
        returned_item = super().parse(response)

        if isinstance(returned_item, dict):
            returned_item.update({
                "category_id": self.category.id
            })

        yield returned_item
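
The `Category` model that `CategorySpider` queries is not part of this diff. A minimal sketch of what it plausibly looks like, assuming the project's Flask-SQLAlchemy setup and that `path` stores values such as `kobiety/sukienki` (the column names and sizes here are assumptions):

```python
# Hypothetical sketch of the Category model referenced above; the real
# definition lives in vintourage/models.py and is not shown in this diff.
from vintourage import db


class Category(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    # Matches Categories enum values, e.g. "kobiety/sukienki".
    path = db.Column(db.String(128), unique=True, nullable=False)
```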
34 changes: 34 additions & 0 deletions crawler/constants.py
@@ -0,0 +1,34 @@
from enum import Enum

class Categories(Enum):
    kobiety = "kobiety"
    bluzki_damskie = f"{kobiety}/bluzki"
    marynarki_damskie = f"{kobiety}/marynarki"
    swetry_damskie = f"{kobiety}/swetry"
    spodnice = f"{kobiety}/spodnice"
    spodnie_damskie = f"{kobiety}/spodnie"
    sukienki = f"{kobiety}/sukienki"  # dresses and jumpsuits
    okrycia_damskie = f"{kobiety}/okrycia"
    bielizna_damska = f"{kobiety}/bielizna"
    obuwie_damskie = f"{kobiety}/obuwie"

    mezczyzni = "mezczyzni"
    koszule_meskie = f"{mezczyzni}/koszule"
    marynarki_meskie = f"{mezczyzni}/marynarki"
    swetry_meskie = f"{mezczyzni}/swetry"  # sweaters and sweatshirts
    spodnie_meskie = f"{mezczyzni}/spodnie"
    okrycia_meskie = f"{mezczyzni}/okrycia"
    bielizna_meska = f"{mezczyzni}/bielizna"
    obuwie = f"{mezczyzni}/obuwie"

    dzieci = "dzieci"
    bluzki_dzieciece = f"{dzieci}/bluzki"
    swetry_dzieciece = f"{dzieci}/swetry"  # sweaters and sweatshirts
    spodnice_dzieciece = f"{dzieci}/spodnice"
    spodnie_dzieciece = f"{dzieci}/spodnie"  # trousers and jumpsuits
    sukienki_dzieciece = f"{dzieci}/sukienki"
    okrycia_dzieciece = f"{dzieci}/okrycia"
    bielizna_dziecieca = f"{dzieci}/bielizna"
    obuwie_dzieciece = f"{dzieci}/obuwie"

    akcesoria = "akcesoria"
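
Because the f-strings are evaluated while the class body runs, before the `Enum` machinery wraps the names, each member composes its path from the plain strings defined above it. A quick sanity check of the resulting values:

```python
from crawler.constants import Categories

# The section names are members too, with their own values.
assert Categories.kobiety.value == "kobiety"
# Composed members pick up the section prefix.
assert Categories.sukienki.value == "kobiety/sukienki"
assert Categories.obuwie_dzieciece.value == "dzieci/obuwie"
```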
36 changes: 36 additions & 0 deletions crawler/executor.py
@@ -0,0 +1,36 @@
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from .spiders.dewitched import DewitchedSpider
from .spiders.inspired import InspiredSpider
from .spiders.klunken import KlunkenSpider
from .spiders.ragsandsilks import RagsandsilksSpider
from .spiders.somavintage import SomavintageSpider
from .spiders.vintageladies import VintageladiesSpider

from vintourage import app, db


crawlers = [
    DewitchedSpider,
    #InspiredSpider, KlunkenSpider, RagsandsilksSpider,
    #SomavintageSpider, VintageladiesSpider
]

process = CrawlerProcess(get_project_settings())

for crawler_class in crawlers:
    print(f'Processing crawler {crawler_class.name}')

    for category in crawler_class.category_mapping:
        print(f'Attempting to crawl {category.name} ({category.value})')
        # CrawlerProcess.crawl takes the spider class plus constructor
        # kwargs; it does not accept a pre-built spider instance.
        process.crawl(crawler_class, category=category)

process.start()  # the script will block here until all crawling jobs are finished
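
One caveat: `CategorySpider.__init__` runs `Category.query`, and Flask-SQLAlchemy queries need an active application context, which nothing here pushes explicitly. If that turns out to bite, a sketch of one way to wrap the crawl (assuming the `app` imported above is the Flask application object):

```python
# Sketch only: push an application context around the whole crawl so that
# Category.query inside each spider's __init__ can reach the database.
with app.app_context():
    for crawler_class in crawlers:
        for category in crawler_class.category_mapping:
            process.crawl(crawler_class, category=category)
    process.start()
```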
3 changes: 2 additions & 1 deletion vintourage/crawler/extensions.py → crawler/extensions.py
100644 → 100755
@@ -6,7 +6,8 @@ class SentryLogging(object):

    @classmethod
    def from_crawler(cls, crawler):
        sentry_dsn = crawler.settings.get('SENTRY_DSN', None)
        sentry_dsn = None
        #sentry_dsn = crawler.settings.get('SENTRY_DSN', None)
        if sentry_dsn is None:
            raise NotConfigured

5 changes: 2 additions & 3 deletions vintourage/crawler/pipelines.py → crawler/pipelines.py
100644 → 100755
@@ -1,13 +1,12 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging

from .. import db
from ..models import Product
from vintourage import db
from vintourage.models import Product

logger = logging.getLogger(__name__)

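
The body of `DatabasePipeline` is collapsed in this view. A rough sketch of what a pipeline persisting these items might look like, assuming `Product` accepts the scraped fields plus the new `category_id`; the upsert-by-link logic is an assumption, not the diff's actual code:

```python
# Hypothetical pipeline body; the real implementation is not shown here.
class DatabasePipeline:
    def process_item(self, item, spider):
        # Reuse an existing Product row for the same link, if any.
        product = Product.query.filter_by(link=item.get('link')).first()
        if product is None:
            db.session.add(Product(**item))
        db.session.commit()
        return item
```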
9 changes: 4 additions & 5 deletions vintourage/crawler/settings.py → crawler/settings.py
100644 → 100755
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-

# Scrapy settings for scraping project
#
@@ -11,8 +10,8 @@

BOT_NAME = 'vintourage'

SPIDER_MODULES = ['vintourage.crawler.spiders']
NEWSPIDER_MODULE = 'vintourage.crawler.spiders'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
@@ -59,15 +58,15 @@
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
EXTENSIONS = {
    'vintourage.crawler.extensions.SentryLogging': -1,  # Load SentryLogging extension before others
    'crawler.extensions.SentryLogging': -1,  # Load SentryLogging extension before others
}

SENTRY_DSN = "https://[email protected]/1472720"

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'vintourage.crawler.pipelines.DatabasePipeline': 1,
    'crawler.pipelines.DatabasePipeline': 1,
}

# Enable and configure the AutoThrottle extension (disabled by default)
File renamed without changes.
10 changes: 6 additions & 4 deletions vintourage/crawler/spiders/dewitched.py → crawler/spiders/dewitched.py
100644 → 100755
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class DewitchedSpider(scrapy.Spider):
class DewitchedSpider(CategorySpider):
    name = 'dewitched'
    allowed_domains = ['www.dewitched.pl']
    start_urls = ['https://www.dewitched.pl/dla-pan-cat-5']
    category_mapping = {
        Categories.sukienki: ['https://www.dewitched.pl/dla-pan-cat-5']
    }

    def parse(self, response):
        main_list = response.css('div.products-list')[0]
11 changes: 7 additions & 4 deletions vintourage/crawler/spiders/inspired.py → crawler/spiders/inspired.py
100644 → 100755
@@ -1,12 +1,15 @@
# -*- coding: utf-8 -*-
import scrapy
import tinycss2

from ..base import CategorySpider
from ..constants import Categories

class InspiredSpider(scrapy.Spider):

class InspiredSpider(CategorySpider):
    name = 'inspired'
    allowed_domains = ['inspired.sklep.pl']
    start_urls = ['https://inspired.sklep.pl/kategoria-produktu/sukienki/']
    category_mapping = {
        Categories.sukienki: ['https://inspired.sklep.pl/kategoria-produktu/sukienki/']
    }

    def parse(self, response):
        for product in response.css('li.htheme_single_wc_item'):
10 changes: 6 additions & 4 deletions vintourage/crawler/spiders/klunken.py → crawler/spiders/klunken.py
100644 → 100755
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class KlunkenSpider(scrapy.Spider):
class KlunkenSpider(CategorySpider):
    name = 'klunken'
    allowed_domains = ['klunken.pl']
    start_urls = ['http://klunken.pl/kategoria-produktu/kobieta/sukienki/']
    category_mapping = {
        Categories.sukienki: ['http://klunken.pl/kategoria-produktu/kobieta/sukienki/']
    }

    def parse(self, response):
        for product in response.css('li.product'):
10 changes: 6 additions & 4 deletions vintourage/crawler/spiders/ragsandsilks.py → crawler/spiders/ragsandsilks.py
100644 → 100755
@@ -1,11 +1,13 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class RagsandsilksSpider(scrapy.Spider):
class RagsandsilksSpider(CategorySpider):
    name = 'ragsandsilks'
    allowed_domains = ['ragsandsilks.pl']
    start_urls = ['https://ragsandsilks.pl/pl/c/SUKIENKI/20']
    category_mapping = {
        Categories.sukienki: ['https://ragsandsilks.pl/pl/c/SUKIENKI/20']
    }

    def parse(self, response):
        for product in response.css('div.product'):
11 changes: 7 additions & 4 deletions vintourage/crawler/spiders/somavintage.py → crawler/spiders/somavintage.py
100644 → 100755
@@ -1,16 +1,19 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


def clean_whitespaces(value):
    if not value:
        return
    return value.replace('\n', '').replace('\t', '').replace(' ', '')


class SomavintageSpider(scrapy.Spider):
class SomavintageSpider(CategorySpider):
    name = 'somavintage'
    allowed_domains = ['somavintagestore.com']
    start_urls = ['http://somavintagestore.com/ubrania/sukienki']
    category_mapping = {
        Categories.sukienki: ['http://somavintagestore.com/ubrania/sukienki']
    }

    def get_price_for_product(self, product):
        """Get the price of the product.
32 changes: 23 additions & 9 deletions vintourage/crawler/spiders/vintageladies.py → crawler/spiders/vintageladies.py
100644 → 100755
@@ -1,15 +1,29 @@
# -*- coding: utf-8 -*-
import scrapy
from ..base import CategorySpider
from ..constants import Categories


class VintageladiesSpider(scrapy.Spider):
class VintageladiesSpider(CategorySpider):
    name = 'vintageladies'
    allowed_domains = ['vintageladies.pl']
    start_urls = [
        'http://vintageladies.pl/index.php?cPath=31_32',  # casual dresses
        'http://vintageladies.pl/index.php?cPath=31_33',  # cocktail dresses
        'http://vintageladies.pl/index.php?cPath=31_34',  # ball gowns
    ]
    category_mapping = {
        Categories.bluzki_damskie: [
            'http://vintageladies.pl/index.php?cPath=21_22',
            'http://vintageladies.pl/index.php?cPath=21_23',
        ],
        Categories.sukienki: [
            'http://vintageladies.pl/index.php?cPath=31_33',
            'http://vintageladies.pl/index.php?cPath=31_34',
        ],
        Categories.swetry_damskie: [
            'http://vintageladies.pl/index.php?cPath=24',
        ],
        Categories.spodnice: [
            'http://vintageladies.pl/index.php?cPath=25_26',
            'http://vintageladies.pl/index.php?cPath=25_27',
            'http://vintageladies.pl/index.php?cPath=25_28',
            'http://vintageladies.pl/index.php?cPath=25_29',
        ]
    }

    def parse(self, response):
        for product in response.xpath('//td[@valign = "top" and @width = "169"]'):
@@ -27,7 +41,7 @@ def parse(self, response):
                'image': response.urljoin(image_uri),
                'price': product.xpath('.//td[@class = "fe1"]/span/text()').get(),
                'link': link,
                'active': active
                'active': active,
            }

        selector = '//a[@class = "pageResults" and contains(@title, "Następna")]/@href'
Empty file modified migrations/README
100644 → 100755
Empty file.
Empty file modified migrations/alembic.ini
100644 → 100755
Empty file.
3 changes: 2 additions & 1 deletion migrations/env.py
100644 → 100755
@@ -46,7 +46,7 @@ def run_migrations_offline():
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url, target_metadata=target_metadata, literal_binds=True
url=url, target_metadata=target_metadata, literal_binds=True,
)

with context.begin_transaction():
@@ -82,6 +82,7 @@ def process_revision_directives(context, revision, directives):
            connection=connection,
            target_metadata=target_metadata,
            process_revision_directives=process_revision_directives,
            render_as_batch=True,  # use batch mode so ALTER TABLE works on SQLite
            **current_app.extensions['migrate'].configure_args
        )

Empty file modified migrations/script.py.mako
100644 → 100755
Empty file.
Empty file modified migrations/versions/511aee2d7b55_.py
100644 → 100755
Empty file.