'''
Roblox Continuous Game Scraper (SQLite)
by nouhidev
'''
import asyncio
import os
import random
import shutil
import sqlite3
import time

import aiohttp
import httpx

_VERSION = "0.0.2-continuous-sqlite"
# ------- [ Scraping Parameters ] -------
# https://games.roblox.com/v1/games/votes?universeIds=
# https://games.roblox.com/v1/games?universeIds=
BASE_URL = "https://games.roblox.com/v1/games?universeIds="
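# The endpoint responds with JSON shaped roughly like the sketch below
# (abridged to the fields this script actually consumes; field names are
# taken from the parsing code further down, not from official docs):
# {
#   "data": [
#     { "id": 123, "rootPlaceId": 456, "name": "...", "visits": 0,
#       "favoritedCount": 0, "genre": "..." },
#     ...
#   ]
# }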
# UIDs requested per URL (default: 100; 50 here)
BATCH_SIZE = 50
# Initial delay between requests in seconds (0.07 --> ~14 reqs/s; default: 0.05 --> 20 reqs/s)
INITIAL_REQUESTS_DELAY = 0.07
# Multiplier applied to the delay on a rate-limit error
RATE_LIMIT_DELAY_PENALTY_INCREASE = 1.1
# Divisor applied to the delay once the reset threshold below is reached
RATE_LIMIT_DELAY_PENALTY_DECREASE = 4
# Maximum allowed delay between requests in seconds
MAX_REQUESTS_DELAY = 1
# Number of consecutive requests without rate limiting required before the delay is reduced (Default: 100)
RATE_LIMIT_PENALTY_RESET_THRESHOLD = 100
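# Worked example of the adaptive delay with the values above: one rate-limit
# error raises the delay from 0.07 s to 0.07 * 1.1 = 0.077 s (capped at
# MAX_REQUESTS_DELAY). After 100 consecutive clean requests it is divided by
# 4 and floored at the initial value: max(0.077 / 4, 0.07) = 0.07 s.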
# UID to stop scraping at (Default: 5060800000)
END_ID = 5060800000
# Minimum number of seconds that must pass before returning the data (Default: 3; currently unused)
RETURN_DATA_MINIMUM_DELAY = 2
# Maximum number of times a request will retry fetching its batch
MAX_RETRIES = 30
# Concurrent open requests cap (Default: 2000)
MAX_CONCURRENT_OPEN_REQUESTS = 2000
# Base for calculating the random retry time for each failed request
MIN_RETRY_SLEEP_TIME = 1
# A random number between these bounds will be added to the base sleeping time when retrying
RANDOM_SLEEP_TIME_BOUNDS = (1, 10)
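# With the values above, each retry therefore sleeps 1 + U(1, 10), i.e. a
# random 2 to 11 seconds, before the batch is requested again.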
# Whether to make requests with httpx (True) or aiohttp (False)
USE_HTTPX = False
# Whether to log individual response times
LOG_RESPONSE_TIMES = False
# -------- [ File Output Paths ] --------
COUNT_FILE_PATH = "c_start_uid.txt"
ERRORED_BATCHES_FILE_PATH = "c_errored_batches.txt"
# -------- [ Requests Managing ] --------
# Locally used to keep track of the start of each batch
start_uid = 0
# UID the session started at (set once in main; used for stable progress math)
session_start_uid = 0
# Active delay between each request
current_requests_delay = 0
# Keeps track of the amount of consecutive requests without rate limiting
consecutive_no_rate_limit = 0
# Keeps track of the unresolved requests
unresolved_requests = 0
# Keeps track of the games added in the current session
games_added_in_session = 0
# Keeps track of the games scanned in the current session
games_scanned_in_session = 0
# Keeps track of the errored requests
errored_requests = []
# Keeps track of the recovered requests
recovered_requests = []
# Keeps track of the lost requests
lost_requests = []
# Keeps track of the resolved requests
resolved_requests = 0
# UIDs received during the last second (reset every second by the dispatch loop)
uids_per_second = 0
# ------------- [ Other ] ---------------
terminal_width = shutil.get_terminal_size().columns
equals_line = "=" * terminal_width
# -------- [ ANSI Escape Codes ] --------
RED = '\033[91m'
DARK_RED = '\033[0;31m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RESET = '\033[0m'
GRAY = '\033[90m'
BOLD = '\033[1m'
GOLDEN = '\033[93m'
UNDERLINE = '\033[4m'
CYAN = '\033[96m'
# ------- [ SQLite Integration ] -------
conn = sqlite3.connect('games.db')
cursor = conn.cursor()
cursor.execute('''
CREATE TABLE IF NOT EXISTS scraped_data (
    uid INTEGER PRIMARY KEY,
    place_id INTEGER,
    visits INTEGER,
    name TEXT,
    favoritedCount INTEGER
)
''')
# Persist the DDL right away; sqlite3 on Python 3.6+ no longer implicitly commits it
conn.commit()
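# Hypothetical read-back query (not used by the scraper itself), e.g. the ten
# most-visited games collected so far:
#   sqlite3 games.db "SELECT name, visits FROM scraped_data ORDER BY visits DESC LIMIT 10;"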
# ----- [ Get Start UID from File ] -----
try:
with open(COUNT_FILE_PATH, "r") as count_file:
count_str = count_file.read().strip()
if count_str:
start_uid = int(count_str)
except FileNotFoundError:
with open(COUNT_FILE_PATH, "w") as count_file:
count_file.write(str(start_uid))
# ----- [ ----------------------- ] -----
start_time = time.perf_counter()
def format_time(seconds):
hours, remainder = divmod(seconds, 3600)
minutes, seconds = divmod(remainder, 60)
return f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
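# Example: format_time(3723) -> "01:02:03" (1 h, 2 min, 3 s)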
def get_progress():
    terminal_width, _ = shutil.get_terminal_size()
    terminal_width -= 2
    # Measure against the UID the session started at, not the moving start_uid,
    # so the session goal stays fixed while batches are dispatched; the max()
    # guards against a zero division when starting right at END_ID
    goal = max(END_ID - session_start_uid, 1)
    progress_ratio = min(1.0, max(0.0, games_scanned_in_session / goal))
    num_symbols = int(terminal_width * progress_ratio)
    progress_bar = "(" + "-" * num_symbols + " " * (terminal_width - num_symbols) + ")"
    return (progress_bar, goal)
def print_stats(_current_requests_delay):
    os.system('cls' if os.name == 'nt' else 'clear')
    elapsed_time = time.perf_counter() - start_time
    formatted_elapsed_time = format_time(elapsed_time)
    progress_bar, goal = get_progress()
    # Session scan rate in UIDs/s; the max() guards the divisions right after startup
    scan_rate = games_scanned_in_session / max(elapsed_time, 0.001)
    uids_per_day = max(24 * 60 * 60 * scan_rate, 0.001)
    days_left = round(goal / uids_per_day)
    days_left_str = f"{days_left} days" if days_left >= 1 else "less than a day"
    print(f"{GRAY}{equals_line}{RESET}")
    print(f"{CYAN}Session Progress: {100 * min(1.0, max(0.0, games_scanned_in_session / goal)):.12f}%{RESET} {GRAY}({games_scanned_in_session:,}/{goal:,} | Total Progress: {100 * min(1.0, max(0.0, start_uid / END_ID)):.12f}%){RESET}")
    print(f"{GRAY}Estimate: {days_left_str} left{RESET}")
    print(CYAN + progress_bar + RESET)
    print(f"{GRAY}{equals_line}{RESET}")
    print(f"Ongoing requests: {unresolved_requests:,} {GRAY}(Closed requests: {resolved_requests:,} | Total requests: {unresolved_requests + resolved_requests:,}){RESET}")
    print(f"- Games added in session: {games_added_in_session:,} out of {games_scanned_in_session:,} scanned games\n")
    print(f"Average session speed: {UNDERLINE}{round(scan_rate, 3):,} UIDs/s{RESET}{GRAY} > {round(60 * scan_rate, 3):,} UIDs/min > {round(60 * 60 * scan_rate, 3):,} UIDs/h > {round(24 * 60 * 60 * scan_rate, 3):,} UIDs/d => {RESET}{round(END_ID / uids_per_day)}d for all")
    print(f"- Received {uids_per_second} UIDs last second")
    print(f"- Delay between new requests: {_current_requests_delay} seconds")
    print(f"{GRAY}{equals_line}{RESET}")
    print(f"{DARK_RED}Errored connections: {len(errored_requests)}{RESET}")
    print(f"- Recovered failed requests: {len(recovered_requests)}")
    print(f"- Lost requests: {len(lost_requests)}")
    print(f"{GRAY}{equals_line}{RESET}")
    print(f"{GRAY}{formatted_elapsed_time} | {round(1 / _current_requests_delay, 3)} reqs/s | {consecutive_no_rate_limit} sCReqs | HTTPX? {USE_HTTPX} | {BATCH_SIZE} UIDs/Url{RESET}")
    print(f"{GRAY}{equals_line}{RESET}")
async def fetch_data(session, batch_start, batch_end, request_id):
global consecutive_no_rate_limit, current_requests_delay, unresolved_requests, games_added_in_session, games_scanned_in_session, errored_requests, resolved_requests, lost_requests, uids_per_second
unresolved_requests += 1
retry_counter = 0
universe_ids = ",".join(str(i) for i in range(batch_start, batch_end))
url = BASE_URL + universe_ids
# Decrease the current delay between requests after consecutive successful requests
if consecutive_no_rate_limit >= RATE_LIMIT_PENALTY_RESET_THRESHOLD:
current_requests_delay = max(current_requests_delay / RATE_LIMIT_DELAY_PENALTY_DECREASE, INITIAL_REQUESTS_DELAY)
consecutive_no_rate_limit = 0
while retry_counter < MAX_RETRIES:
        # Try to send the HTTP request
try:
timer = time.perf_counter()
response = await session.get(url, timeout=360)
timer_end = time.perf_counter()
if LOG_RESPONSE_TIMES: print(f"Response time: {timer_end-timer} seconds")
# HTTP request error handling
except Exception as e:
retry_counter += 1
await asyncio.sleep(MIN_RETRY_SLEEP_TIME + random.uniform(RANDOM_SLEEP_TIME_BOUNDS[0], RANDOM_SLEEP_TIME_BOUNDS[1]))
continue
        # httpx exposes the status as .status_code, aiohttp as .status
        status = response.status_code if USE_HTTPX else response.status
        # Rate limit handling for the v1/games and v1/votes endpoints
        if status == 503 or status == 429:
            # Penalize the script for getting rate limited
            current_requests_delay = min(current_requests_delay * RATE_LIMIT_DELAY_PENALTY_INCREASE, MAX_REQUESTS_DELAY)
            consecutive_no_rate_limit = 0
            retry_counter += 1
            if f"_id{request_id}" not in errored_requests: errored_requests.append(f"_id{request_id}")
            await asyncio.sleep(MIN_RETRY_SLEEP_TIME + random.uniform(RANDOM_SLEEP_TIME_BOUNDS[0], RANDOM_SLEEP_TIME_BOUNDS[1]))
            continue
# Try to parse the response as JSON
try:
if USE_HTTPX:
data = response.json()
else:
data = await response.json()
            # Success! Advance the persisted progress marker, never moving it backwards
            saved_progress = 0
            with open(COUNT_FILE_PATH, "r") as count_file:
                count_str = count_file.read().strip()
                if count_str:
                    saved_progress = int(count_str)
            with open(COUNT_FILE_PATH, "w") as count_file:
                count_file.write(str(max(batch_end, saved_progress)))
consecutive_no_rate_limit += 1
documents_to_insert = []
for entry in data["data"]:
uids_per_second += 1
uid = entry["id"]
place_id = entry["rootPlaceId"]
visits = entry["visits"]
name = entry["name"]
favoritedCount = entry["favoritedCount"]
document = {
"uid": uid,
"placeId": place_id,
"visits": visits,
"name": name,
"favoritedCount": favoritedCount
}
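                # Only Horror games with at least 1,000 visits are stored;
                # every other entry still counts toward the scan totals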
if entry["genre"] == "Horror" and entry["visits"] >= 1000:
games_added_in_session += 1
documents_to_insert.append(document)
games_scanned_in_session += 1
            if documents_to_insert:
                # OR IGNORE keeps a re-scanned UID (e.g. after resuming a crashed
                # session) from raising an IntegrityError on the primary key
                cursor.executemany('''
                INSERT OR IGNORE INTO scraped_data (uid, place_id, visits, name, favoritedCount)
                VALUES (?, ?, ?, ?, ?)
                ''', [(doc["uid"], doc["placeId"], doc["visits"], doc["name"], doc["favoritedCount"]) for doc in documents_to_insert])
                conn.commit()
unresolved_requests -= 1
resolved_requests += 1
            if retry_counter > 0 and f"_id{request_id}" in errored_requests:
                recovered_requests.append(f"_id{request_id}")
                errored_requests.remove(f"_id{request_id}")
return
        # If the parsing fails, move a previously errored request to lost and retry
        except Exception as e:
            if f"_id{request_id}" in errored_requests:
                lost_requests.append(f"_id{request_id}")
                errored_requests.remove(f"_id{request_id}")
            print(e)
            retry_counter += 1
            await asyncio.sleep(MIN_RETRY_SLEEP_TIME + random.uniform(RANDOM_SLEEP_TIME_BOUNDS[0], RANDOM_SLEEP_TIME_BOUNDS[1]))
            continue
# All retries have failed
unresolved_requests -= 1
resolved_requests += 1
if f"_id{request_id}" not in lost_requests: lost_requests.append(f"_id{request_id}")
async def scrape_loop(session):
    # Shared dispatch loop for both HTTP backends: fire one fetch_data task per
    # batch, throttled by the adaptive delay, until END_ID is reached
    global start_uid, current_requests_delay, uids_per_second
    request_id = 1
    current_requests_delay = INITIAL_REQUESTS_DELAY
    last_print_time = time.perf_counter()
    while start_uid < END_ID:
        if time.perf_counter() - last_print_time >= 1:
            print_stats(current_requests_delay)
            last_print_time = time.perf_counter()
            uids_per_second = 0
        if unresolved_requests < MAX_CONCURRENT_OPEN_REQUESTS:
            batch_end = min(start_uid + BATCH_SIZE, END_ID)
            asyncio.create_task(fetch_data(session, start_uid, batch_end, request_id))
            start_uid += BATCH_SIZE
            request_id += 1
        await asyncio.sleep(current_requests_delay)
    # Let in-flight requests finish before the session is closed
    while unresolved_requests > 0:
        print_stats(current_requests_delay)
        await asyncio.sleep(1)
async def main():
    global start_uid, session_start_uid
    saved_progress = 0
    try:
        with open(COUNT_FILE_PATH, "r") as count_file:
            count_str = count_file.read().strip()
            if count_str:
                saved_progress = int(count_str)
    except FileNotFoundError:
        pass
    start_uid = saved_progress
    session_start_uid = saved_progress
    print(f"{UNDERLINE}{CYAN}Starting continuous scraper:{RESET}")
    print(f"{GRAY}- Targeting {BASE_URL}{RESET}")
    print(f"{GRAY} - with a batch size of {BATCH_SIZE} UIDs/Url{RESET}")
    print(f"{GRAY} - at the starting UID {start_uid:,}{RESET}")
    if USE_HTTPX:
        print(f"{GRAY}- Requesting using HTTPX{RESET}")
    else:
        print(f"{GRAY}- Requesting using AIOHTTP{RESET}")
    print(f"{GRAY} - with a cap of {MAX_CONCURRENT_OPEN_REQUESTS} open requests{RESET}")
    print(f"{GRAY}- Desired delay between new requests {INITIAL_REQUESTS_DELAY} seconds ({round(1/INITIAL_REQUESTS_DELAY, 2)} reqs/s){RESET}")
    await asyncio.sleep(3)
    if USE_HTTPX:
        async with httpx.AsyncClient(http2=True, limits=httpx.Limits(max_connections=None, max_keepalive_connections=0)) as session:
            await scrape_loop(session)
    else:
        async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=0, limit_per_host=0, force_close=True)) as session:
            await scrape_loop(session)
if __name__ == "__main__":
    try:
        asyncio.run(main())
    finally:
        conn.close()