diff --git a/Dockerfile b/Dockerfile index 63dfedf..c2d163e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # Scrapyd web service (with authentication) -FROM ubuntu:18.04 +FROM python:3.7 # install Ubuntu packages ENV DEBIAN_FRONTEND noninteractive @@ -34,6 +34,16 @@ ADD nginx.conf /etc/nginx/sites-enabled/default ADD scrapyd.conf /etc/scrapyd/scrapyd.conf # expose +ADD .. /code +RUN cd ./code && \ + python3 setup.py bdist_egg && \ + rm -rf /build /default.egg-info + +# expose +EXPOSE 6800 6801 +ENTRYPOINT ["/usr/local/bin/chaperone"] + + VOLUME /scrapyd EXPOSE 6800 diff --git a/README.md b/README.md index 38464d3..e00e2bd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Scrapyd (with authentication) +# Scrapyd (with authentication & support of Asyncio Reactor based Spiders) Scrapyd is an application for deploying and running Scrapy spiders. It enables you to deploy (upload) your projects and control their spiders using a JSON API. @@ -6,6 +6,24 @@ Scrapyd doesn't include any provision for password protecting itself. This conta For more about Scrapyd, see the [Scrapyd documentation](http://scrapyd.readthedocs.org/en/latest/). + +# Using with docker-compose + +Run the following commands: + +``` +$ docker-compose build +$ docker-compose up +``` + +To run the `async example spider`, run the following command on another terminal or on Postman: +```shell +curl --location --request POST 'http://localhost:6801/schedule.json' --header 'Authorization: Basic ZGVidWc6ZGVidWc=' --header 'Content-Type: application/x-www-form-urlencoded' --data-urlencode 'project=default' --data-urlencode 'spider=example' +``` + +Observe the new job scheduled at `http://localhost:6801/jobs`. 
+ + # How to use this image ## Start a Scrapyd server diff --git a/chaperone.conf b/chaperone.conf index b4622b8..fc936f2 100644 --- a/chaperone.conf +++ b/chaperone.conf @@ -1,19 +1,24 @@ -dependencies.service: - command: /bin/bash -c "[ -z \"$PACKAGES\" ] || pip3 install ${PACKAGES//,/ }" - type: oneshot - password.service: command: htpasswd -b -c /etc/nginx/htpasswd $(USERNAME:?You need to supply a USERNAME environment variable) $(PASSWORD:?You need to supply a PASSWORD environment variable) type: oneshot + +scrapyd.service: + command: scrapyd + restart: true + directory: "/code" + after: "password.service" + startup_pause: 10 + +upload.service: + command: "curl http://localhost:6800/addversion.json --location --request POST --header 'Authorization: Basic ZGVidWc6ZGVidWc=' -F project=default -F version=1.0 -F egg='@default-1.0-py3.7.egg'" + after: "scrapyd.service" + type: oneshot + directory: "/code/dist" nginx.service: command: service nginx start restart: true - after: "scrapyd.service" - -scrapyd.service: - command: scrapyd - after: "dependencies.service,password.service" + after: "custom.service" console.logging: stdout: true diff --git a/custom_runner/__init__.py b/custom_runner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/custom_runner/async_runner.py b/custom_runner/async_runner.py new file mode 100644 index 0000000..df551be --- /dev/null +++ b/custom_runner/async_runner.py @@ -0,0 +1,8 @@ +# sample runner fix from https://github.com/scrapy/scrapyd/issues/377 +from scrapy.utils.reactor import install_reactor + +install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") +from scrapyd.runner import main # noqa: E402 needs after install reactor + +if __name__ == "__main__": + main() diff --git a/default/__init__.py b/default/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/default/items.py b/default/items.py new file mode 100644 index 0000000..75e5cb3 --- /dev/null +++ b/default/items.py @@ -0,0 
+1,11 @@ +# Define here the models for your scraped items +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class DefaultItem(scrapy.Item): + # define the fields for your item here like: + url = scrapy.Field() diff --git a/default/middlewares.py b/default/middlewares.py new file mode 100644 index 0000000..3b95e01 --- /dev/null +++ b/default/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class DefaultSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. 
+ pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class DefaultDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/default/pipelines.py b/default/pipelines.py new file mode 100644 index 0000000..0ec7ac3 --- /dev/null +++ b/default/pipelines.py @@ -0,0 +1,21 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +import asyncio + + +class DefaultPipeline: + async def process_item(self, item, spider): + spider.logger.info("Running Asyncio Pipeline: Testing Sleep for 2 seconds") + # testing with running asycio sleep + await self.test_sleep(2, spider) + spider.logger.info(f"Ending sleep with item: {item}") + return item + + async def test_sleep(self, seconds, spider): + await asyncio.sleep(seconds) + spider.logger.info('hello from test_sleep') \ No newline at end of file diff --git a/default/settings.py b/default/settings.py new file mode 100644 index 0000000..4932d32 --- /dev/null +++ b/default/settings.py @@ -0,0 +1,91 @@ +# Scrapy settings for default project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'default' + +SPIDER_MODULES = ['default.spiders'] +NEWSPIDER_MODULE = 'default.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'default (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'default.middlewares.DefaultSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'default.middlewares.DefaultDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See 
https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'default.pipelines.DefaultPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +# setting as default project settings does not work +# TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" \ No newline at end of file diff --git a/default/spiders/__init__.py b/default/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/default/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/default/spiders/example.py b/default/spiders/example.py new file mode 100644 index 0000000..9fe4012 --- /dev/null +++ b/default/spiders/example.py @@ -0,0 +1,23 @@ +import scrapy +from scrapy.loader import ItemLoader + +from ..items import DefaultItem + + +class ExampleSpider(scrapy.Spider): + name = 'example' + allowed_domains = ['example.com'] + start_urls = ['https://example.com/'] + custom_settings = { + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "ITEM_PIPELINES": { + "default.pipelines.DefaultPipeline": 300 + } + } + + def parse(self, response, **kwargs): + self.logger.info(f'{response.url}') + # testing with yielding url + loader = ItemLoader(DefaultItem()) + loader.add_value('url', response.url) + yield loader.load_item() diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..582d7da --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '3' + +services: + scrapyd: + build: + context: . + dockerfile: ./Dockerfile + + environment: + USERNAME: "debug" + PASSWORD: "debug" + + ports: + - "6801:6801" diff --git a/requirements.txt b/requirements.txt index 782ae8d..b21b55f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -scrapy==1.8.0 +asyncio==3.4.3 +Scrapy==2.5.0 scrapyd==1.2.1 redis requests diff --git a/scrapyd.conf b/scrapyd.conf index 6532fa5..84f8d14 100644 --- a/scrapyd.conf +++ b/scrapyd.conf @@ -1,7 +1,29 @@ [scrapyd] -bind_address= 127.0.0.1 -http_port = 6801 -eggs_dir = /scrapyd/eggs -logs_dir = /scrapyd/logs -items_dir = /scrapyd/items -dbs_dir = /scrapyd/dbs +eggs_dir = dist +logs_dir = logs +items_dir = items +jobs_to_keep = 5 +dbs_dir = dbs +max_proc = 0 +max_proc_per_cpu = 4 +finished_to_keep = 100 +poll_interval = 5.0 +bind_address = 0.0.0.0 +http_port = 6801 +debug = off +runner = custom_runner.async_runner +application = scrapyd.app.application +launcher = scrapyd.launcher.Launcher +webroot = scrapyd.website.Root + +[services] 
+schedule.json = scrapyd.webservice.Schedule +cancel.json = scrapyd.webservice.Cancel +addversion.json = scrapyd.webservice.AddVersion +listprojects.json = scrapyd.webservice.ListProjects +listversions.json = scrapyd.webservice.ListVersions +listspiders.json = scrapyd.webservice.ListSpiders +delproject.json = scrapyd.webservice.DeleteProject +delversion.json = scrapyd.webservice.DeleteVersion +listjobs.json = scrapyd.webservice.ListJobs +daemonstatus.json = scrapyd.webservice.DaemonStatus diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8a0a666 --- /dev/null +++ b/setup.py @@ -0,0 +1,8 @@ +from setuptools import find_packages, setup +# running the egg https://stackoverflow.com/a/37800297 +setup( + name="default", + version="1.0", + packages=find_packages(), + entry_points={"scrapy": ["settings = default.settings"]}, +)