From 7ba7ddc01e3957192933659d71c9476247ddcfda Mon Sep 17 00:00:00 2001
From: Nathan Malcolm
Date: Mon, 28 Aug 2017 12:48:02 +0100
Subject: [PATCH] Enable the AutoThrottle extension, increase crawl depth, and
 reduce HTTP cache to 24 hours.

---
 README.md                   |  2 +-
 inventus_spider/settings.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 48cf4e4..df684df 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ $ scrapy crawl inventus -a domain=facebook.com -t csv -o Facebook.csv
 
 # Configuration
 
-Configurations can be made to how Inventus behaves. For example, by default Inventus will ignore robots.txt, has a 30 second timeout, caches crawl data for a week, and has a 0.25 second delay between requests. These and more can all be changed by editing the `inventus_spider/settings.py` file. Scrapy's settings are [well documented](https://doc.scrapy.org/en/latest/topics/settings.html#aws-access-key-id) too.
+How Inventus behaves can be configured. By default Inventus ignores robots.txt, uses a 30 second timeout, caches crawl data for 24 hours, limits crawl depth to 5, and throttles requests with Scrapy's AutoThrottle extension. These and more can all be changed by editing the `inventus_spider/settings.py` file. Scrapy's settings are [well documented](https://doc.scrapy.org/en/latest/topics/settings.html#aws-access-key-id) too.
 
 # Bugs / Suggestions / Feedback
 
diff --git a/inventus_spider/settings.py b/inventus_spider/settings.py
index 49aed5a..eecfd8a 100644
--- a/inventus_spider/settings.py
+++ b/inventus_spider/settings.py
@@ -70,32 +70,32 @@
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 0.25
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 10
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 5.0
 # Enable showing throttling stats for every response received:
 #AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 HTTPCACHE_ENABLED = True
-HTTPCACHE_EXPIRATION_SECS = 604800 # 1 week
+HTTPCACHE_EXPIRATION_SECS = 86400 # 1 day
 HTTPCACHE_DIR = 'httpcache'
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
 # The maximum depth that will be allowed to crawl for any site.
 # If zero, no limit will be imposed.
-DEPTH_LIMIT = 3
+DEPTH_LIMIT = 5
 
 # The amount of time (in secs) that the downloader should wait before downloading consecutive pages
 # from the same website.
-DOWNLOAD_DELAY = 0.25
+DOWNLOAD_DELAY = 0.2
 
 # The amount of time (in secs) that the downloader will wait before timing out.
 DOWNLOAD_TIMEOUT = 30
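
Note, outside the diff: each value changed above can also be overridden for a single run with Scrapy's -s option, so there is no need to edit inventus_spider/settings.py just to experiment with different defaults. A quick sketch, reusing the domain argument from the README's crawl example (the specific values here are illustrative, not part of this patch):

    $ scrapy crawl inventus -a domain=facebook.com -s DEPTH_LIMIT=3 -s AUTOTHROTTLE_DEBUG=True

DEPTH_LIMIT and AUTOTHROTTLE_DEBUG are standard Scrapy settings, and the same -s pattern works for the AutoThrottle, HTTP cache, and download delay settings touched by this patch.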