-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
225 lines (179 loc) · 9.36 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#########################################################
# Name: Mohammed Ibrahim (Class of 2017)
# Project: Python script which asks for the user's interest (keyword(s)) and populates
# a Google spreadsheet using the user's email address, password and spreadsheet key
# Purpose: For intended use in timeline making
########################################################
# Below are the various libraries I imported while making the script. Most of them had to be
# downloaded, as they don't come with the standard Python library.
from Tkinter import *
import tkMessageBox
import gdata.spreadsheet.service
import gspread
import feedparser
import urlparse
import urllib
import urllib2
from HTMLParser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
from bs4 import BeautifulSoup
import warnings
# NOTE: the module-level ``global`` statements that used to be here (email,
# password, spreadsheet_url, spreadsheet_key, spreadsheet_name, keyword) were
# removed. ``global`` only has an effect inside a function body; at module
# level it declares nothing and binds nothing, and every function in this file
# receives those values as parameters instead.

# At some point the program warns about an unsafe connection to the external
# spreadsheet. This is meant to ignore that. Note that the filter is
# process-wide and hides every warning category, not just that one.
warnings.filterwarnings("ignore")
#This is the GUI
class GUIFramework(Frame):
    """Request form that collects the inputs needed to fill the spreadsheet.

    The form has five labelled text fields (email, password, spreadsheet URL,
    spreadsheet name, keyword(s)) and a Submit button. Pressing Submit calls
    ``Display``, which confirms the values and hands them to ``start``.
    """

    def __init__(self, master=None):
        """Create the frame inside ``master``, title the window and build the widgets."""
        # The base initialiser is called explicitly (no super()): the file
        # targets Python 2, where Tkinter widgets are classic classes.
        Frame.__init__(self, master)
        # Set the window title.
        self.master.title("Request Form")
        # Display the main window with a little bit of padding.
        self.grid(padx=10, pady=10)
        self.CreateWidgets()

    def CreateWidgets(self):
        """Create and lay out the labels, the entry fields and the Submit button."""
        # Labels, one per row in column 0. The email label gets its own
        # attribute name so that it is not overwritten by the ``self.email``
        # entry field created below.
        self.emailLabel = Label(self, text="Enter Email:")
        self.emailLabel.grid(row=0, column=0)
        self.Password = Label(self, text="Enter Password:")
        self.Password.grid(row=1, column=0)
        self.SpreadsheetURL = Label(self, text="Enter SpreadsheetURL:")
        self.SpreadsheetURL.grid(row=2, column=0)
        self.SpreadsheetName = Label(self, text="Enter SpreadsheetName:")
        self.SpreadsheetName.grid(row=3, column=0)
        self.Keyword = Label(self, text="Enter Keyword(s):")
        self.Keyword.grid(row=4, column=0)

        # Entry fields, spanning three columns so they are a bit wider.
        self.email = Entry(self)
        self.email.grid(row=0, column=1, columnspan=3)
        # show="*" masks the typed characters so the password is not
        # displayed in clear text on screen.
        self.password = Entry(self, show="*")
        self.password.grid(row=1, column=1, columnspan=3)
        self.spreadsheetURL = Entry(self)
        self.spreadsheetURL.grid(row=2, column=1, columnspan=3)
        self.spreadsheetName = Entry(self)
        self.spreadsheetName.grid(row=3, column=1, columnspan=3)
        self.keyword = Entry(self)
        self.keyword.grid(row=4, column=1, columnspan=3)

        # The button; ``Display`` is called when it is clicked.
        self.btnDisplay = Button(self, text="Submit", command=self.Display)
        self.btnDisplay.grid(row=5, column=4)

    def Display(self):
        """Handle a click on Submit.

        Reads the five entry fields, derives the spreadsheet key from the URL
        with ``getKey``, shows a confirmation box (the password is not
        included in it) and then calls ``start`` with the collected values
        and a fresh, empty list of rows.
        """
        rows = []
        email = self.email.get()
        password = self.password.get()
        words = self.keyword.get()
        # The feed URL takes the search terms joined by "+" instead of spaces.
        keyword = words.replace(" ", "+")
        spreadsheet_url = self.spreadsheetURL.get()
        spreadsheet_key = getKey(spreadsheet_url)
        spreadsheet_name = self.spreadsheetName.get()
        summary = (
            "Your email: %s"
            "\nYour spreadsheetKey: %s"
            "\nYour SpreadsheetName: %s"
            "\nYour Keyword(s): %s"
        ) % (email, spreadsheet_key, spreadsheet_name, words)
        tkMessageBox.showinfo("Info", summary)
        start(email, password, spreadsheet_name, spreadsheet_key, keyword, rows)
# This function does the final parsing of the RSS feed(s) contained in the feedList list. It pulls the important metadata
# out of each entry in the feed and puts it in a dictionary. Each entry's dictionary is in turn appended to a master list that holds them all.
def _articleUrl(link, maxAttempts=3):
    """Return the URL of the article that a feed entry's ``link`` refers to.

    Reddit links point at a discussion thread, so ``urlFromReddit`` is asked
    for the article link, at most ``maxAttempts`` times; if it keeps returning
    None the thread link itself is used. For other links, everything after a
    ``&url=`` marker is taken as the article URL; a link without that marker
    is returned unchanged.
    """
    if "reddit" in link:
        # Bounded retry: an unbounded loop would never finish for a thread
        # page that simply has no matching link.
        for _ in range(maxAttempts):
            url = urlFromReddit(link)
            if url is not None:
                return url
        return link
    marker = link.find("&url=")
    if marker > 0:
        return link[marker + len("&url="):]
    return link


def rssFeed(keyword, rows):
    """Collect one timeline row per entry of the configured RSS feed(s).

    Args:
        keyword: search term(s), already joined with "+"; substituted for the
            ``keyword`` placeholder in each feed URL.
        rows: list that the row dictionaries are appended to (mutated in place).

    Returns:
        The same ``rows`` list. Each appended row has the keys ``startdate``,
        ``enddate`` (always ""), ``headline``, ``text`` and ``media``.

    An entry whose publication date is missing or cannot be converted is
    skipped. An entry whose article cannot be resolved or downloaded still
    gets a row, with ``text`` set to "None" and ``media`` set to the feed link.
    """
    print("Here are the RSS results===>")
    # A list of RSS feeds of interest, e.g. Google News, Reddit...
    feedList = ["http://news.google.com/news?pz=1&cf=all&ned=us&hl=en&q=keyword&output=rss&num=100"]
    for feedUrl in feedList:
        # feedparser downloads and parses the feed into a list of entries.
        myfeed = feedparser.parse(feedUrl.replace("keyword", keyword))
        for post in myfeed.entries:
            # Convert the date outside the best-effort block below: with no
            # usable date there is nothing sensible to put on the timeline.
            try:
                startdate = dateConvert(post.published)
            except (AttributeError, TypeError, ValueError):
                print("Skipping an entry without a usable publication date")
                continue

            # Start from the fallback row and upgrade it if the article
            # itself can be fetched.
            row = {"startdate": startdate, "enddate": "", "headline": post.title,
                   "text": "None", "media": post.link}
            try:
                print(post.title)  # prints out each topic/title it finds
                url = _articleUrl(post.link)
                row["text"] = htmlContent(url).replace("\n", " ").replace("\t", " ")
                row["media"] = url
            except Exception as err:
                # Best effort: keep the fallback row, but say why. %r is used
                # so that formatting the message cannot itself fail.
                print("Could not fetch the article text: %r" % (err,))
            rows.append(row)
    # Prints a line when it is done searching.
    print("_" * 60)
    return rows
# This function finds the actual article link in a Reddit discussion thread using the Beautiful Soup library.
# It is specific to Reddit's page layout; other RSS feeds might use a different style.
def urlFromReddit(link):
    """Return the article URL linked from the Reddit thread page at ``link``.

    The page is downloaded and searched for tags that carry ``tabindex="1"``
    and an ``href``; the first such tag whose text is contained in the page's
    <title> text is taken to be the article link.

    Returns:
        The tag's ``href`` value, or None when no tag matches.

    Raises:
        AttributeError: if the downloaded page has no <html>/<title> element.
    """
    response = urllib.urlopen(link)
    try:
        htmlT = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; consider pinning one for reproducible results.
    soup = BeautifulSoup(htmlT)
    # The title does not change while scanning, so read it once.
    pageTitle = soup.html.title.text
    for tag in soup.find_all(tabindex="1", href=True):
        if tag.text in pageTitle:
            return tag['href']
    return None
#This method strips all the HTML tags from a html document effectively returning just content of the "<p>" tags. it is utilized to get the text(body) of the document.
def htmlContent(url):
    """Download ``url`` and return the concatenated text of all its <p> tags.

    The texts are joined without any separator. An empty string is returned
    when the page contains no <p> tags.
    """
    response = urllib.urlopen(url)
    try:
        markup = response.read()
    finally:
        # Always release the connection, even if reading fails.
        response.close()
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; consider pinning one for reproducible results.
    soup = BeautifulSoup(markup)
    return "".join(tag.text for tag in soup.find_all("p"))
# This function converts the date and time format of the feed entry into the format accepted by the timeline,
# month/day/year hh:mm:ss (e.g. it converts "Mon, 09 Jun 2014 00:00:00 GMT" to "6/09/2014 00:00:00").
def dateConvert(date):
    """Convert an RFC 822 style feed date to ``month/day/year hh:mm:ss``.

    For example ``"Mon, 09 Jun 2014 00:00:00 GMT"`` becomes
    ``"6/09/2014 00:00:00"``: the month is not zero-padded, the day is. The
    time is reproduced as written; the time zone is not applied.

    The date is parsed rather than sliced at fixed offsets, so a single-digit
    day ("Mon, 9 Jun 2014 ...") or a missing weekday is handled as well, and
    the result carries no trailing whitespace.

    Raises:
        ValueError: if ``date`` cannot be parsed.
    """
    # Local import: the file's import block does not provide email.utils.
    from email.utils import parsedate

    parsed = parsedate(date)
    if parsed is None:
        raise ValueError("unrecognised publication date: %r" % (date,))
    year, month, day, hour, minute, second = parsed[:6]
    return "%d/%02d/%d %02d:%02d:%02d" % (month, day, year, hour, minute, second)
#This method is meant to extract the spreadsheet key from the spreadsheet url. the key is cumbersome to type in so this saves the user time
def getKey(url):
    """Extract the spreadsheet key from a Google spreadsheet URL.

    Two URL layouts are recognised, checked in this order:

    * a ``key=`` query parameter (``.../ccc?key=<key>&...``)
    * a ``/d/`` path segment (``.../spreadsheets/d/<key>/edit``)

    The key runs up to the next URL delimiter rather than for a fixed number
    of characters, so keys of any length are returned whole.

    Raises:
        ValueError: if ``url`` matches neither layout.
    """
    # Local import: the file only imports ``sub`` from ``re``.
    import re

    match = (re.search(r"(?:^|[?&#])key=([^&#]+)", url)
             or re.search(r"/d/([^/?#]+)", url))
    if match is None:
        raise ValueError("no spreadsheet key found in URL: %r" % (url,))
    return match.group(1)
# This is the start function. It receives the email address, password, spreadsheet name, spreadsheet key and keyword(s) collected by the GUI
# and calls the functions that search the web for results. It also populates the spreadsheet with the results found. Since the Google spreadsheet API
# requires its own format of headers, the header cells are updated first and then the rows are inserted. When it is done, it calls fixHeaders, which reverts the headers to the previous format.
def start(email, password, spreadsheet_name, spreadsheet_key, keyword, rows):
    """Search the feeds for ``keyword`` and insert the results into the spreadsheet.

    Steps:
      1. Log in with gspread and rewrite the header cells A1/C1/D1/E1 to the
         lower-case names used as the keys of each inserted row.
      2. Collect the rows with ``rssFeed``.
      3. Log in with the gdata client and insert the rows one by one. A row
         that fails to insert is reported on stderr and skipped.
      4. Restore the timeline headers with ``fixHeaders``. This runs even if
         step 2 or 3 raises, so the sheet is not left with the temporary
         headers.

    Args:
        email, password: Google account credentials.
        spreadsheet_name: title of the spreadsheet (used by gspread).
        spreadsheet_key: key of the spreadsheet (used by gdata).
        keyword: search term(s), passed through to ``rssFeed``.
        rows: list that ``rssFeed`` appends the result rows to.
    """
    worksheet_id = 'od6'  # Default value for all spreadsheets
    gc = gspread.login(email, password)
    wks = gc.open(spreadsheet_name).sheet1
    wks.update_acell('A1', "startdate")
    wks.update_acell('C1', "headline")
    wks.update_acell('D1', "text")
    wks.update_acell('E1', "media")
    try:
        newRows = rssFeed(keyword, rows)
        client = gdata.spreadsheet.service.SpreadsheetsService()
        client.debug = True
        client.email = email
        client.password = password
        client.source = 'some description'
        client.ProgrammaticLogin()
        failed = 0
        for row in newRows:
            try:
                client.InsertRow(row, spreadsheet_key, worksheet_id)
            except Exception as err:
                # Keep going with the remaining rows, but do not hide the
                # failure. %r is used so that formatting cannot itself fail.
                failed += 1
                stderr.write("Could not insert row %r: %r\n" % (row.get("headline"), err))
        print("=" * 90)
        if failed:
            stderr.write("%d of %d rows could not be inserted\n" % (failed, len(newRows)))
    finally:
        fixHeaders(email, password, spreadsheet_name)
# This function simply rewrites the headers into the format the timeline maker expects.
def fixHeaders(email, password, spreadsheet_name):
    """Write the timeline-style header titles into the first sheet.

    Logs in with gspread, opens the spreadsheet called ``spreadsheet_name``
    and sets cells A1, C1, D1 and E1 of its first sheet to "Start Date",
    "Headline", "Text" and "Media" respectively.
    """
    # Cell address -> header title, written in this order.
    headerCells = (
        ('A1', "Start Date"),
        ('C1', "Headline"),
        ('D1', "Text"),
        ('E1', "Media"),
    )
    sheet = gspread.login(email, password).open(spreadsheet_name).sheet1
    for cell, title in headerCells:
        sheet.update_acell(cell, title)
# Script entry point: build the request form and hand control to Tk's event
# loop. Nothing runs when the file is imported as a module.
if __name__ == "__main__":
    guiFrame = GUIFramework()
    guiFrame.mainloop()