-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_randos.py
35 lines (28 loc) · 855 Bytes
/
get_randos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
"""
Get metadata about an arbitrary number of random Wikipedia mainspace articles.
"""
import requests
import sys
import json
S = requests.Session()
RAND_URL = 'https://en.wikipedia.org/w/api.php?action=query&format=json&list=random&rnlimit=500&rnnamespace=0'
def get_batch():
r = S.get(url=RAND_URL)
dat = r.json()
# List of dictionaries, each having keys "id", "ns", and "title"
pgs = dat['query']['random']
return pgs
try:
n = int(sys.argv[1])
except:
print("USAGE: get_randos.py N")
sys.exit(1)
acc = []
while len(acc) < n:
acc += get_batch()
print('.', end='')
# TODO: Consider filtering out dupes? This is unlikely to be a problem. Though, because of the
# way it's implemented, if we get any dupes, we'll get a lot...
out_fname = 'pages.json'
with open(out_fname, 'w') as f:
json.dump(acc, f)