scraper.py
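"""Scrape the ViaQuatro homepage for the operation status of São Paulo Metro
and CPTM lines and append each reading to a monthly Google Sheets spreadsheet.

The job runs every 6 minutes through APScheduler. Google service-account
credentials are read from environment variables (TYPE, PROJECT_ID,
PRIVATE_KEY_ID, PRIVATE_KEY, CLIENT_EMAIL, CLIENT_ID, CLIENT_x509_CERT_URL).
"""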
import logging
import os
import requests
import time
import datetime
from bs4 import BeautifulSoup
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from email_debug import send_email
from apscheduler.schedulers.blocking import BlockingScheduler
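
# Line names used as keys when scraping the ViaQuatro operation panel.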
LINES_METRO = ['azul', 'verde', 'vermelha', 'amarela', 'lilás', 'prata']
LINES_CPTM = ['rubi', 'diamante', 'esmeralda', 'turquesa', 'coral', 'safira', 'jade']
ALL_LINES = LINES_METRO + LINES_CPTM
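
# One spreadsheet per month of data collection, keyed by month number
# (see sheet_id_from_date below).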
SPREADSHEET_IDS = {
    1: "17TRsvuMKbX3fC4tCgac0HAyuRHaXKUxwpUO-0pKG0mY",
    2: "1pkkC_4g7WArWfBt91r6jzZlsLeuiRGAuKTV2ggUnuU0",
    3: "15QZfZZ4oVvZqfPHThHDrRq9O3_ax9io8D3f-xXPd3GY",
    12: "1tIVjV5Uw4QHT0Fm4RTnVmUC36zahnkJrko6wIKTxi08"
}
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
logger.info('Starting scraper')


def get_page_html(url):
    """Fetch a page and return its HTML, or None if the request fails."""
    try:
        page = requests.get(url)
        if page.status_code == 200:
            return page.text
        return None
    except requests.RequestException:
        return None


def init_sheet(spreadsheet_id):
    """Authorize against the Google Sheets/Drive API with service-account
    credentials taken from environment variables and return the "data"
    worksheet of the given spreadsheet."""
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    creds_info = {
        "type": os.environ.get('TYPE', None),
        "project_id": os.environ.get('PROJECT_ID', None),
        "private_key_id": os.environ.get('PRIVATE_KEY_ID', None),
        # Depending on how the variable is stored, the key may need its
        # escaped "\n" sequences converted back to real newlines.
        "private_key": os.environ.get('PRIVATE_KEY', None),
        "client_email": os.environ.get('CLIENT_EMAIL', None),
        "client_id": os.environ.get('CLIENT_ID', None),
        "auth_uri": "https://accounts.google.com/o/oauth2/auth",
        "token_uri": "https://accounts.google.com/o/oauth2/token",
        "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
        "client_x509_cert_url": os.environ.get('CLIENT_x509_CERT_URL', None)
    }
    creds = ServiceAccountCredentials.from_json_keyfile_dict(creds_info, scope)
    # For local development, a JSON key file can be used instead:
    # creds = ServiceAccountCredentials.from_json_keyfile_name('client_secret.json', scope)
    client = gspread.authorize(creds)
    data_sheet = client.open_by_key(spreadsheet_id).worksheet("data")
    return data_sheet


def get_operation_status(soup, all_lines):
    """Extract the current status of every Metro/CPTM line from the ViaQuatro
    homepage soup and return a dict mapping line name to status."""
    extracted_status = {line: '' for line in all_lines}
    # The "operacao" column contains all the info we need.
    status_column = soup.find(class_="operacao")
    # The 'amarela' line is shown in a special container.
    extracted_status['amarela'] = status_column.find(class_="status").text
    # The other lines are listed in two "linhas" containers: one for Metro
    # and one for CPTM.
    lines_containers = status_column.find_all(class_="linhas")
    for container in lines_containers:
        line_info_divs = container.find_all(class_="info")
        # Each info div holds two spans: the line title and the line status.
        for div in line_info_divs:
            spans = div.find_all("span")
            line_title = spans[0].text.lower()
            line_status = spans[1].text.lower()
            extracted_status[line_title] = line_status
    logger.info('Extracted: {}'.format(extracted_status))
    return extracted_status


def get_time_data(soup):
    """Return the timestamp shown on the page (the <time> element's text)."""
    return soup.find('time').text


def check_data_missing(op_status, page):
    """Return True if any line status is shorter than 6 characters, a heuristic for incomplete data; `page` is currently unused."""
    for status in op_status.values():
        if len(status) < 6:
            return True
    return False


def sheet_id_from_date(date, spreadsheet_ids):
    """Pick the spreadsheet ID for the month in a 'dd/mm/YYYY HH:MM' date string."""
    datetime_obj = datetime.datetime.strptime(date, '%d/%m/%Y %H:%M')
    logger.info('using sheet id for month {}'.format(datetime_obj.month))
    return spreadsheet_ids[datetime_obj.month]
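

# Run timed_job every 6 minutes; BlockingScheduler keeps the process alive,
# so sched.start() below does not return under normal operation.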
sched = BlockingScheduler()
args = [SPREADSHEET_IDS, ALL_LINES]


@sched.scheduled_job('interval', minutes=6, args=args)
def timed_job(spreadsheet_ids, all_lines):
    missing_data = False
    for _ in range(3):
        vq_home = get_page_html('http://www.viaquatro.com.br')
        if vq_home is None:
            logger.error('failed getting via quatro page.')
            return
        soup = BeautifulSoup(vq_home, 'html.parser')
        time_data = get_time_data(soup)
        op_status = get_operation_status(soup, all_lines)
        if check_data_missing(op_status, vq_home):
            logger.info('not all data was gathered from html. '
                        'trying again in 10 seconds.')
            missing_data = True
            time.sleep(10)
            continue
        missing_data = False
        break
    sheet_id = sheet_id_from_date(time_data, spreadsheet_ids)
    data_sheet = init_sheet(sheet_id)
    if missing_data:
        # Forward the raw HTML for debugging when the page could not be
        # parsed completely.
        send_email(vq_home)
    for line in all_lines:
        data_sheet.append_row([time_data, line, op_status[line]])


sched.start()