From ea03f245cbb853bcc58bef258aa3d6f6353e3680 Mon Sep 17 00:00:00 2001 From: namiwa Date: Wed, 22 Sep 2021 18:27:24 -0700 Subject: [PATCH 1/5] init async spider & scrapyd async_runner --- custom_runner/__init__.py | 0 custom_runner/async_runner.py | 8 +++ default/__init__.py | 0 default/items.py | 11 ++++ default/middlewares.py | 103 ++++++++++++++++++++++++++++++++++ default/pipelines.py | 21 +++++++ default/settings.py | 91 ++++++++++++++++++++++++++++++ default/spiders/__init__.py | 4 ++ default/spiders/example.py | 23 ++++++++ docker-compose.yml | 14 +++++ requirements.txt | 2 + scrapyd.conf | 1 + 12 files changed, 278 insertions(+) create mode 100644 custom_runner/__init__.py create mode 100644 custom_runner/async_runner.py create mode 100644 default/__init__.py create mode 100644 default/items.py create mode 100644 default/middlewares.py create mode 100644 default/pipelines.py create mode 100644 default/settings.py create mode 100644 default/spiders/__init__.py create mode 100644 default/spiders/example.py create mode 100644 docker-compose.yml diff --git a/custom_runner/__init__.py b/custom_runner/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/custom_runner/async_runner.py b/custom_runner/async_runner.py new file mode 100644 index 0000000..df551be --- /dev/null +++ b/custom_runner/async_runner.py @@ -0,0 +1,8 @@ +# sample runner fix from https://github.com/scrapy/scrapyd/issues/377 +from scrapy.utils.reactor import install_reactor + +install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor") +from scrapyd.runner import main # noqa: E402 needs after install reactor + +if __name__ == "__main__": + main() diff --git a/default/__init__.py b/default/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/default/items.py b/default/items.py new file mode 100644 index 0000000..75e5cb3 --- /dev/null +++ b/default/items.py @@ -0,0 +1,11 @@ +# Define here the models for your scraped items +# +# See documentation in: +# 
https://docs.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class DefaultItem(scrapy.Item): + # define the fields for your item here like: + url = scrapy.Field() diff --git a/default/middlewares.py b/default/middlewares.py new file mode 100644 index 0000000..3b95e01 --- /dev/null +++ b/default/middlewares.py @@ -0,0 +1,103 @@ +# Define here the models for your spider middleware +# +# See documentation in: +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + +# useful for handling different item types with a single interface +from itemadapter import is_item, ItemAdapter + + +class DefaultSpiderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, or item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Request or item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. 
+ + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class DefaultDownloaderMiddleware: + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/default/pipelines.py b/default/pipelines.py new file mode 100644 index 0000000..adc1523 --- /dev/null +++ b/default/pipelines.py @@ -0,0 +1,21 @@ +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html + + +# useful for handling different item types with a single interface +import asyncio + + +class DefaultPipeline: + async def process_item(self, item, spider): + spider.logger.info("Running Asyncio Pipeline: Testing Sleep for 2 seconds") + # testing with running asycio sleep + asyncio.run(self.test_sleep(2, spider)) + spider.logger.info(f"Ending sleep with item: {item}") + return item + + async def test_sleep(self, seconds, spider): + await asyncio.sleep(seconds) + spider.logger.info('hello from test_sleep') \ No newline at end of file diff --git a/default/settings.py b/default/settings.py new file mode 100644 index 0000000..4932d32 --- /dev/null +++ b/default/settings.py @@ -0,0 +1,91 @@ +# Scrapy settings for default project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://docs.scrapy.org/en/latest/topics/settings.html +# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +# https://docs.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'default' + +SPIDER_MODULES = ['default.spiders'] +NEWSPIDER_MODULE = 'default.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'default (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'default.middlewares.DefaultSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'default.middlewares.DefaultDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://docs.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See 
https://docs.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'default.pipelines.DefaultPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +# setting as default project settings does not work +# TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" \ No newline at end of file diff --git a/default/spiders/__init__.py b/default/spiders/__init__.py new file mode 100644 index 0000000..ebd689a --- /dev/null +++ b/default/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/default/spiders/example.py b/default/spiders/example.py new file mode 100644 index 0000000..9fe4012 --- /dev/null +++ b/default/spiders/example.py @@ -0,0 +1,23 @@ +import scrapy +from scrapy.loader import ItemLoader + +from ..items import DefaultItem + + +class ExampleSpider(scrapy.Spider): + name = 'example' + allowed_domains = ['example.com'] + start_urls = ['https://example.com/'] + custom_settings = { + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "ITEM_PIPELINES": { + "default.pipelines.DefaultPipeline": 300 + } + } + + def parse(self, response, **kwargs): + self.logger.info(f'{response.url}') + # testing with yielding url + loader = ItemLoader(DefaultItem()) + loader.add_value('url', response.url) + yield loader.load_item() diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..582d7da --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,14 @@ +version: '3' + +services: + scrapyd: + build: + context: . + dockerfile: ./Dockerfile + + environment: + USERNAME: "debug" + PASSWORD: "debug" + + ports: + - "6801:6801" diff --git a/requirements.txt b/requirements.txt index 782ae8d..9f018ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +asyncio==3.4.3 +Scrapy==2.5.0 scrapy==1.8.0 scrapyd==1.2.1 redis diff --git a/scrapyd.conf b/scrapyd.conf index 6532fa5..70de4ab 100644 --- a/scrapyd.conf +++ b/scrapyd.conf @@ -5,3 +5,4 @@ eggs_dir = /scrapyd/eggs logs_dir = /scrapyd/logs items_dir = /scrapyd/items dbs_dir = /scrapyd/dbs +runner = custom_runner.async_runner \ No newline at end of file From f31af8e59f3946a8fabec6d73111ab582bd4022d Mon Sep 17 00:00:00 2001 From: namiwa Date: Wed, 22 Sep 2021 18:57:09 -0700 Subject: [PATCH 2/5] dockerfile: switch to python 3.7 --- Dockerfile | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 63dfedf..c2d163e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ # Scrapyd web 
service (with authentication) -FROM ubuntu:18.04 +FROM python:3.7 # install Ubuntu packages ENV DEBIAN_FRONTEND noninteractive @@ -34,6 +34,16 @@ ADD nginx.conf /etc/nginx/sites-enabled/default ADD scrapyd.conf /etc/scrapyd/scrapyd.conf # expose +ADD . /code +RUN cd ./code && \ + python3 setup.py bdist_egg && \ + rm -rf /build /default.egg-info + +# expose +EXPOSE 6800 6801 +ENTRYPOINT ["/usr/local/bin/chaperone"] + + VOLUME /scrapyd EXPOSE 6800 From ce0e2b7cbc459122f43cdcda5c2bdf8f75338010 Mon Sep 17 00:00:00 2001 From: namiwa Date: Thu, 23 Sep 2021 10:12:55 -0700 Subject: [PATCH 3/5] add async spider support for scrapyd --- README.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++- chaperone.conf | 23 +++++++++------ requirements.txt | 1 - scrapyd.conf | 35 +++++++++++++++++----- setup.py | 8 +++++ 5 files changed, 126 insertions(+), 18 deletions(-) create mode 100644 setup.py diff --git a/README.md b/README.md index 38464d3..fbc16cf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Scrapyd (with authentication) +# Scrapyd (with authentication & support of Asyncio Reactor based Spiders) Scrapyd is an application for deploying and running Scrapy spiders. It enables you to deploy (upload) your projects and control their spiders using a JSON API. @@ -6,6 +6,81 @@ Scrapyd doesn't include any provision for password protecting itself. This conta For more about Scrapyd, see the [Scrapyd documentation](http://scrapyd.readthedocs.org/en/latest/). + +# Using with docker-compose + +Run the following commands: + +``` +$ docker-compose build +$ docker-compose up +``` + +The following printout to console should be seen after `docker-compose up`: +```shell +docker-compose up +Creating network "scrapyd-authenticated_default" with the default driver +Creating scrapyd-authenticated_scrapyd_1 ... 
done +Attaching to scrapyd-authenticated_scrapyd_1 +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: Switching all chaperone logging to /dev/log +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: chaperone version 0.3.9, ready. +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service nginx.service enabled, queueing start request +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service attempting start '/usr/sbin/service nginx start'... +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service password.service enabled, queueing start request +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service attempting start '/usr/bin/htpasswd -b -c /etc/nginx/htpasswd debug debug'... +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service scrapyd.service enabled, queueing start request +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service password.service enabled, queueing start request +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service upload.service enabled, queueing start request +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service scrapyd.service enabled, queueing start request +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d password[11]: Adding password for user debug +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=11,status=0 +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service exit status for pid=11 is '' +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service successfully started +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service notified waiters upon completion +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service scrapyd.service prerequisites satisfied +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: scrapyd.service attempting start '/usr/local/bin/scrapyd'... 
+scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d nginx[8]: Starting nginx: nginx. +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=8,status=0 +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service exit status for pid=8 is '' +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service successfully started +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service notified waiters upon completion +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service exit status for pid=8 is '' +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Loading /usr/local/lib/python3.7/site-packages/scrapyd/txapp.py... +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Scrapyd web console available at http://0.0.0.0:6801/ +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Loaded. +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [twisted.scripts._twistd_unix.UnixAppLogger#info] twistd 21.7.0 (/usr/local/bin/python 3.7.12) starting up. +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [twisted.scripts._twistd_unix.UnixAppLogger#info] reactor class: twisted.internet.epollreactor.EPollReactor. 
+scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Site starting on 6801 +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [twisted.web.server.Site#info] Starting factory +scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [Launcher] Scrapyd 1.2.1 started: max_proc=64, runner='custom_runner.async_runner' +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: scrapyd.service successfully started +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: scrapyd.service notified waiters upon completion +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: service upload.service prerequisites satisfied +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: upload.service attempting start '/usr/bin/curl http://localhost:6800/addversion.json --location --request POST --header Authorization: Basic ZGVidWc6ZGVidWc= -F project=default + -F version=1.0 -F egg=@default-1.0-py3.7.egg'... +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d upload[39]: % Total % Received % Xferd Average Speed Time Time Time Current +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d upload[39]: Dload Upload Total Spent Left Speed +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d scrapyd[17]: 2021-09-23T17:03:05+0000 [twisted.python.log#info] "127.0.0.1" - - [23/Sep/2021:17:03:04 +0000] "POST /addversion.json HTTP/1.0" 200 100 "-" "curl/7.74.0" +100 10828 100 100 100 10728 217 23321 --:--:-- --:--:-- --:--:-- 23539 +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d upload[39]: {"node_name": "2a6b86ee639d", "status": "ok", "project": "default", "version": "1.0", "spiders": 1} +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: REAP pid=39,status=0 +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: upload.service exit status for pid=39 is '' +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: upload.service successfully started +scrapyd_1 | Sep 23 17:03:05 
2a6b86ee639d chaperone[1]: upload.service notified waiters upon completion +scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 +``` + +To run the `async example spider`, run the following command on another terminal or on Postman: +```shell +curl --location --request POST 'http://localhost:6801/schedule.json' --header 'Authorization: Basic ZGVidWc6ZGVidWc=' --header 'Content-Type: application/x-www-form-urlencoded' --data-urlencode 'project=default' --data-urlencode 'spider=example' +``` + +Observe the new job scheduled at `http://localhost:6801/jobs`. + + # How to use this image ## Start a Scrapyd server diff --git a/chaperone.conf b/chaperone.conf index b4622b8..fc936f2 100644 --- a/chaperone.conf +++ b/chaperone.conf @@ -1,19 +1,24 @@ -dependencies.service: - command: /bin/bash -c "[ -z \"$PACKAGES\" ] || pip3 install ${PACKAGES//,/ }" - type: oneshot - password.service: command: htpasswd -b -c /etc/nginx/htpasswd $(USERNAME:?You need to supply a USERNAME environment variable) $(PASSWORD:?You need to supply a PASSWORD environment variable) type: oneshot + +scrapyd.service: + command: scrapyd + restart: true + directory: "/code" + after: "password.service" + startup_pause: 10 + +upload.service: + command: "curl http://localhost:6800/addversion.json --location --request POST --header 'Authorization: Basic ZGVidWc6ZGVidWc=' -F project=default -F version=1.0 -F egg='@default-1.0-py3.7.egg'" + after: "scrapyd.service" + type: oneshot + directory: "/code/dist" nginx.service: command: service nginx start restart: true - after: "scrapyd.service" - -scrapyd.service: - command: scrapyd - after: "dependencies.service,password.service" + after: "scrapyd.service" console.logging: stdout: true diff --git a/requirements.txt b/requirements.txt index 9f018ec..b21b55f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ asyncio==3.4.3 Scrapy==2.5.0 -scrapy==1.8.0 scrapyd==1.2.1 redis requests diff --git a/scrapyd.conf b/scrapyd.conf index 
70de4ab..84f8d14 100644 --- a/scrapyd.conf +++ b/scrapyd.conf @@ -1,8 +1,29 @@ [scrapyd] -bind_address= 127.0.0.1 -http_port = 6801 -eggs_dir = /scrapyd/eggs -logs_dir = /scrapyd/logs -items_dir = /scrapyd/items -dbs_dir = /scrapyd/dbs -runner = custom_runner.async_runner \ No newline at end of file +eggs_dir = dist +logs_dir = logs +items_dir = items +jobs_to_keep = 5 +dbs_dir = dbs +max_proc = 0 +max_proc_per_cpu = 4 +finished_to_keep = 100 +poll_interval = 5.0 +bind_address = 0.0.0.0 +http_port = 6801 +debug = off +runner = custom_runner.async_runner +application = scrapyd.app.application +launcher = scrapyd.launcher.Launcher +webroot = scrapyd.website.Root + +[services] +schedule.json = scrapyd.webservice.Schedule +cancel.json = scrapyd.webservice.Cancel +addversion.json = scrapyd.webservice.AddVersion +listprojects.json = scrapyd.webservice.ListProjects +listversions.json = scrapyd.webservice.ListVersions +listspiders.json = scrapyd.webservice.ListSpiders +delproject.json = scrapyd.webservice.DeleteProject +delversion.json = scrapyd.webservice.DeleteVersion +listjobs.json = scrapyd.webservice.ListJobs +daemonstatus.json = scrapyd.webservice.DaemonStatus diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8a0a666 --- /dev/null +++ b/setup.py @@ -0,0 +1,8 @@ +from setuptools import find_packages, setup +# running the egg https://stackoverflow.com/a/37800297 +setup( + name="default", + version="1.0", + packages=find_packages(), + entry_points={"scrapy": ["settings = default.settings"]}, +) From 730f9bb1802ee0894730ab5f2fbfe83b48f1959d Mon Sep 17 00:00:00 2001 From: namiwa Date: Thu, 23 Sep 2021 10:15:41 -0700 Subject: [PATCH 4/5] readme: simplify --- README.md | 57 ------------------------------------------------------- 1 file changed, 57 deletions(-) diff --git a/README.md b/README.md index fbc16cf..e00e2bd 100644 --- a/README.md +++ b/README.md @@ -16,63 +16,6 @@ $ docker-compose build $ docker-compose up ``` -The following printout to console 
should be seen after `docker-compose up`: -```shell -docker-compose up -Creating network "scrapyd-authenticated_default" with the default driver -Creating scrapyd-authenticated_scrapyd_1 ... done -Attaching to scrapyd-authenticated_scrapyd_1 -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: Switching all chaperone logging to /dev/log -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: chaperone version 0.3.9, ready. -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service nginx.service enabled, queueing start request -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service attempting start '/usr/sbin/service nginx start'... -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service password.service enabled, queueing start request -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service attempting start '/usr/bin/htpasswd -b -c /etc/nginx/htpasswd debug debug'... -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service scrapyd.service enabled, queueing start request -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service password.service enabled, queueing start request -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service upload.service enabled, queueing start request -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service scrapyd.service enabled, queueing start request -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d password[11]: Adding password for user debug -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=11,status=0 -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service exit status for pid=11 is '' -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service successfully started -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: password.service notified waiters upon completion -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: service scrapyd.service 
prerequisites satisfied -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: scrapyd.service attempting start '/usr/local/bin/scrapyd'... -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d nginx[8]: Starting nginx: nginx. -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=8,status=0 -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service exit status for pid=8 is '' -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service successfully started -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service notified waiters upon completion -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d chaperone[1]: nginx.service exit status for pid=8 is '' -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Loading /usr/local/lib/python3.7/site-packages/scrapyd/txapp.py... -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Scrapyd web console available at http://0.0.0.0:6801/ -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Loaded. -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [twisted.scripts._twistd_unix.UnixAppLogger#info] twistd 21.7.0 (/usr/local/bin/python 3.7.12) starting up. -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [twisted.scripts._twistd_unix.UnixAppLogger#info] reactor class: twisted.internet.epollreactor.EPollReactor. 
-scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [-] Site starting on 6801 -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [twisted.web.server.Site#info] Starting factory -scrapyd_1 | Sep 23 17:02:55 2a6b86ee639d scrapyd[17]: 2021-09-23T17:02:55+0000 [Launcher] Scrapyd 1.2.1 started: max_proc=64, runner='custom_runner.async_runner' -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: scrapyd.service successfully started -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: scrapyd.service notified waiters upon completion -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: service upload.service prerequisites satisfied -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: upload.service attempting start '/usr/bin/curl http://localhost:6800/addversion.json --location --request POST --header Authorization: Basic ZGVidWc6ZGVidWc= -F project=default - -F version=1.0 -F egg=@default-1.0-py3.7.egg'... -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d upload[39]: % Total % Received % Xferd Average Speed Time Time Time Current -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d upload[39]: Dload Upload Total Spent Left Speed -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d scrapyd[17]: 2021-09-23T17:03:05+0000 [twisted.python.log#info] "127.0.0.1" - - [23/Sep/2021:17:03:04 +0000] "POST /addversion.json HTTP/1.0" 200 100 "-" "curl/7.74.0" -100 10828 100 100 100 10728 217 23321 --:--:-- --:--:-- --:--:-- 23539 -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d upload[39]: {"node_name": "2a6b86ee639d", "status": "ok", "project": "default", "version": "1.0", "spiders": 1} -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: REAP pid=39,status=0 -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: upload.service exit status for pid=39 is '' -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: upload.service successfully started -scrapyd_1 | Sep 23 17:03:05 
2a6b86ee639d chaperone[1]: upload.service notified waiters upon completion -scrapyd_1 | Sep 23 17:03:05 2a6b86ee639d chaperone[1]: REAP pid=0,status=0 -``` - To run the `async example spider`, run the following command on another terminal or on Postman: ```shell curl --location --request POST 'http://localhost:6801/schedule.json' --header 'Authorization: Basic ZGVidWc6ZGVidWc=' --header 'Content-Type: application/x-www-form-urlencoded' --data-urlencode 'project=default' --data-urlencode 'spider=example' From 0b21334bf3e2f5b17af163741054b8d129f05853 Mon Sep 17 00:00:00 2001 From: namiwa Date: Thu, 23 Sep 2021 10:20:09 -0700 Subject: [PATCH 5/5] example_spider: removed asyncio_run call --- default/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/default/pipelines.py b/default/pipelines.py index adc1523..0ec7ac3 100644 --- a/default/pipelines.py +++ b/default/pipelines.py @@ -12,7 +12,7 @@ class DefaultPipeline: async def process_item(self, item, spider): spider.logger.info("Running Asyncio Pipeline: Testing Sleep for 2 seconds") # testing with running asycio sleep - asyncio.run(self.test_sleep(2, spider)) + await self.test_sleep(2, spider) spider.logger.info(f"Ending sleep with item: {item}") return item