From 7ba7ddc01e3957192933659d71c9476247ddcfda Mon Sep 17 00:00:00 2001
From: Nathan Malcolm
Date: Mon, 28 Aug 2017 12:48:02 +0100
Subject: [PATCH] Enable the AutoThrottle extension, increase crawl depth, and
 reduce HTTP cache to 24 hours.

---
 README.md                   |  2 +-
 inventus_spider/settings.py | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 48cf4e4..df684df 100644
--- a/README.md
+++ b/README.md
@@ -64,7 +64,7 @@ $ scrapy crawl inventus -a domain=facebook.com -t csv -o Facebook.csv
 
 # Configuration
 
-Configurations can be made to how Inventus behaves. For example, by default Inventus will ignore robots.txt, has a 30 second timeout, caches crawl data for a week, and has a 0.25 second delay between requests. These and more can all be changed by editing the `inventus_spider/settings.py` file. Scrapy's settings are [well documented](https://doc.scrapy.org/en/latest/topics/settings.html#aws-access-key-id) too.
+How Inventus behaves can be configured. By default Inventus ignores robots.txt, uses a 30 second timeout, caches crawl data for 24 hours, limits crawl depth to 5, and throttles requests with Scrapy's AutoThrottle extension. These and more can all be changed by editing the `inventus_spider/settings.py` file. Scrapy's settings are [well documented](https://doc.scrapy.org/en/latest/topics/settings.html#aws-access-key-id) too.
 
 # Bugs / Suggestions / Feedback
 
diff --git a/inventus_spider/settings.py b/inventus_spider/settings.py
index 49aed5a..eecfd8a 100644
--- a/inventus_spider/settings.py
+++ b/inventus_spider/settings.py
@@ -70,32 +70,32 @@
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+AUTOTHROTTLE_START_DELAY = 0.25
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+AUTOTHROTTLE_MAX_DELAY = 10
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+AUTOTHROTTLE_TARGET_CONCURRENCY = 5.0
 # Enable showing throttling stats for every response received:
 #AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 HTTPCACHE_ENABLED = True
-HTTPCACHE_EXPIRATION_SECS = 604800 # 1 week
+HTTPCACHE_EXPIRATION_SECS = 86400 # 1 day
 HTTPCACHE_DIR = 'httpcache'
 HTTPCACHE_IGNORE_HTTP_CODES = []
 HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
 # The maximum depth that will be allowed to crawl for any site.
 # If zero, no limit will be imposed.
-DEPTH_LIMIT = 3
+DEPTH_LIMIT = 5
 
 # The amount of time (in secs) that the downloader should wait before downloading consecutive pages
 # from the same website.
-DOWNLOAD_DELAY = 0.25
+DOWNLOAD_DELAY = 0.2
 
 # The amount of time (in secs) that the downloader will wait before timing out.
 DOWNLOAD_TIMEOUT = 30
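
Note, outside the diff: each value changed above can also be overridden for a single run with Scrapy's -s option, so there is no need to edit inventus_spider/settings.py just to experiment with different defaults. A quick sketch, reusing the domain argument from the README's crawl example (the specific values here are illustrative, not part of this patch):

    $ scrapy crawl inventus -a domain=facebook.com -s DEPTH_LIMIT=3 -s AUTOTHROTTLE_DEBUG=True

DEPTH_LIMIT and AUTOTHROTTLE_DEBUG are standard Scrapy settings, and the same -s pattern works for the AutoThrottle, HTTP cache, and download delay settings touched by this patch.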