jobs_indeed.py
import os
import urllib.parse

import pandas as pd
import requests
import telepot
from bs4 import BeautifulSoup

# telegram bot info (token must be filled in before messages can be sent)
token = ''
receiver_id = 799755225

# browser-like headers so Indeed serves the regular HTML page
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

def flatten_list(list_of_lists):
    # flatten a list of lists into a single list
    return [item for sublist in list_of_lists for item in sublist]
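# e.g. flatten_list([['a'], ['b', 'c']]) -> ['a', 'b', 'c']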

def get_jobs_list(url, job_title, location, page_number):
    # build the query string appended to the base url;
    # publication date (fromage) can take one of {1, 3, 7, 14, last}:
    # (1) last 24 hours, (3) last three days, (7) last seven days,
    # (last) since last visit
    query_args = {'q': job_title, 'l': location, 'fromage': 1, 'start': page_number}
    url_cpt = url + urllib.parse.urlencode(query_args)
    print(url_cpt)
    page = requests.get(url_cpt, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'html.parser')
    # the element with class "jobsearch-ResultsList" holds one card per job
    return soup.find(class_="jobsearch-ResultsList")
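
# For reference, a composed request URL looks like this (illustrative values;
# urlencode escapes spaces as '+'):
#   https://fr.indeed.com/jobs?q=stage+data+science&l=&fromage=1&start=10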

def get_job_details(jobs_list, country, titles, companies, links, dates):
    # each job card in the results list sits in a div with class "slider_container"
    assert jobs_list is not None, 'No new jobs found'
    job_divs = jobs_list.find_all('div', class_='slider_container')
    # job titles
    titles.append([div.find('h2', class_='jobTitle').text.strip() for div in job_divs])
    # company names
    companies.append([div.find('span', class_='companyName').text.strip() for div in job_divs])
    # links to the postings (the scraped href is relative, so prepend the host)
    links.append(['https://' + country + '.indeed.com' + div.find('a')['href'] for div in job_divs])
    # posting dates; the slice strips the leading "Posted" prefix (6 characters)
    dates.append([div.find('span', class_='date').text.strip()[6:] for div in job_divs])
    return titles, companies, links, dates

def scrape_jobs(website, country, job_title, location):
    # accumulators filled in place by get_job_details
    titles, companies, links, dates = [], [], [], []
    if website == 'indeed':
        if country == 'ca':
            url = 'https://ca.indeed.com/jobs?'
        elif country == 'ma':
            url = 'https://ma.indeed.com/jobs?'
        elif country == 'fr':
            url = 'https://fr.indeed.com/jobs?'
        else:
            raise ValueError('unsupported country: ' + country)
    else:
        raise ValueError('unsupported website: ' + website)
    # Indeed paginates in steps of 10 results; fetch the first three pages
    for page_number in range(0, 21, 10):
        jobs_list = get_jobs_list(url, job_title, location, page_number)
        get_job_details(jobs_list, country, titles, companies, links, dates)
    # each page appended one sub-list per column; flatten them
    titles = flatten_list(titles)
    companies = flatten_list(companies)
    links = flatten_list(links)
    dates = flatten_list(dates)
    return pd.DataFrame({'Job Title': titles, 'Company': companies, 'Link': links, 'Date': dates})
"""
from urllib.parse import urlparse, parse_qs
URL='https://ma.indeed.com/jobs?q=data+scientist&l=Casablanca&radius=0&fromage=1&vjk=200ef90494131fbd'
parsed_url = urlparse(URL)
parse_qs(parsed_url.query)"""

def send_telegram_message(token, df):
    # send one message per job to the configured telegram chat
    bot = telepot.Bot(token)
    for index, row in df.iterrows():
        bot.sendMessage(receiver_id, row['Job Title'] + ' - ' + row['Company'] + ' - ' + row['Link'])
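
# Example call (the token below is an illustrative placeholder, not a real
# bot token; a real one is issued by @BotFather):
#   send_telegram_message('123456789:AAE...', df_new)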

def main(csv_file, job_title, location, country):
    # scrape jobs and compare them with the previously saved csv file
    # (an empty frame stands in for the first run, before the csv exists)
    df_old = pd.read_csv(csv_file) if os.path.exists(csv_file) else pd.DataFrame()
    found = False
    try:
        df = scrape_jobs('indeed', country, job_title, location)
        df.to_csv(csv_file, index=False)
        found = True
    except Exception as e:
        print('No jobs found:', e)
    if found:
        # keep=False drops rows present in both frames, so df_new holds only
        # postings that differ between the old and the new scrape
        df_new = pd.concat([df_old, df]).drop_duplicates(keep=False)
        # if not df_new.empty:
        #     send_telegram_message(token, df_new)

if __name__ == '__main__':
    job_title = 'stage data science'
    location = ''
    # main('jobs_ca.csv', job_title, location, 'ca')
    # main('jobs_ma.csv', job_title, location, 'ma')
    main('jobs_fr.csv', job_title, location, 'fr')
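
# The script is meant to be run on a schedule so the csv diff in main() picks
# up freshly posted jobs (fromage=1 requests the last 24 hours). A minimal
# sketch, assuming a Unix cron and a hypothetical install path:
#   0 9 * * * /usr/bin/python3 /path/to/jobs_indeed.py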