-
Notifications
You must be signed in to change notification settings - Fork 78
/
Copy pathExtractMails.py
65 lines (50 loc) · 2.31 KB
/
ExtractMails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import urllib
import urllib2
import re
import mechanize
import cookielib
import requests
try:
from bs4 import BeautifulSoup
except Exception as e:
print("pip install beautifulsoup4")
exit(1)
class ExtractMails:
def create_browser(self):
br = mechanize.Browser() # Create basic browser
cj = cookielib.LWPCookieJar() # Create cookiejar to handle cookies
br.set_cookiejar(cj) # Set cookie jar for our browser
#Browser options
br.set_handle_equiv(True) # Allow opening of certain files
br.set_handle_gzip(False) # Allow handling of zip files(experimental option)
br.set_handle_redirect(True) # Automatically handle auto-redirects
br.set_handle_referer(True)
br.set_handle_robots(False) # ignore anti-robots.txt
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# Necessary headers to simulate an actual browser
br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'),
('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
('Accept-Encoding', 'gzip,deflate,sdch'),
('Accept-Language', 'en-US,en;q=0.8,fr;q=0.6'),
('Connection', 'keep-alive')
]
return br
#obtain mails from a specific domain
def obtain_mails(self,domain):
br = self.create_browser()
f = open("mails.txt","wb")
print "Obtaining mail info from domain "+ domain
if domain.startswith("http") == False:
response = requests.get("http://"+domain)
else:
response = requests.get(domain)
contents = response.content
soup = BeautifulSoup(contents,'lxml')
links = soup.find('a',href=re.compile("mailto"))
if links:
print links['href'].replace("mailto:","")
f.write(links['href'].replace("mailto:","")+'\n')
f.close()