Web_scraping_using_Beautiful_soup
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import csv
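
# Note: Amazon often serves a robot-check page to the default requests
# User-Agent, which leaves the selectors below with nothing to match.
# A minimal sketch of passing a browser-like header (the header string is
# an assumption, not part of the original script):
#
#   headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64)"}
#   req = requests.get(url, headers=headers)
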
# Returns a list of all the genres listed on the bestsellers page.
def all_topics():
    alltopics = []
    url = "https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_pg_1?ie=UTF8&pg=1"
    req = requests.get(url)
    soup = BeautifulSoup(req.content, "html.parser")
    for d in soup.findAll('div', attrs={'class': "a-fixed-left-grid-col a-col-left"}):
        d = d.find('ul').find('ul')
        for tag in d.find_all(re.compile("^a")):
            alltopics.append(str(tag.string))
    return alltopics
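
# A minimal usage sketch for all_topics(), assuming network access to the
# live page (the genre list varies over time, so no output is shown):
#
#   genres = all_topics()
#   print(len(genres), "genres found:", genres)
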
# Scrapes the bestseller list and returns one tuple per book, with a
# header tuple first.
def book_details():
    names = ["BOOKS"]
    ratings = ["RATINGS"]
    authors = ["AUTHORS"]
    prices = ["PRICE"]
    positions = ["POPULARITY"]
    no_of_reviews = ["NUMBER OF REVIEWS"]
    image_urls = ["LINKS TO THE COVER OF THE BOOK"]
    # The top 100 books are spread across two pages; both are scraped.
    pgNo = 1
    while pgNo < 3:
        url = f"https://www.amazon.in/gp/bestsellers/books/ref=zg_bs_pg_{pgNo}?ie=UTF8&pg={pgNo}"
        pgNo += 1
        req = requests.get(url)
        soup = BeautifulSoup(req.content, "html.parser")
        # Each book sits in an <li> of the ordered bestseller list.
        for d in soup.findAll('ol', attrs={'class': "a-ordered-list a-vertical", "id": "zg-ordered-list", "role": "grid"}):
            for tag in d.find_all(re.compile("^li")):
                bookname = tag.find('div', attrs={"aria-hidden": "true", "class": "p13n-sc-truncate p13n-sc-line-clamp-1", "data-rows": "1"})
                rating = tag.find('span', attrs={"class": "a-icon-alt"})
                price = tag.find('span', attrs={"class": 'p13n-sc-price'})
                author = tag.find('a', attrs={"class": "a-size-small a-link-child"})
                position = tag.find('span', attrs={"class": 'zg-badge-text'})
                review = tag.find('a', attrs={"class": "a-size-small a-link-normal"})
                names.append(str(bookname.string).lstrip(' ').replace('\n', ''))
                positions.append(position.string)
                # Fields that may be absent fall back to "NA".
                img_url = tag.find('div', attrs={"class": "a-section a-spacing-small"})
                if img_url.find('img') is None:
                    image_urls.append("NA")
                else:
                    image_urls.append(img_url.find('img').get('src'))
                if review is None:
                    no_of_reviews.append("NA")
                else:
                    no_of_reviews.append(str(review.text).replace('\xa0', ""))
                if price is None:
                    prices.append("NA")
                else:
                    # Drop the leading currency symbol.
                    prices.append(str(price.string)[1:].replace('\xa0', ""))
                if author is None:
                    authors.append("NA")
                else:
                    authors.append(author.string)
                if rating is None:
                    ratings.append("NA")
                else:
                    ratings.append(str(rating.text).replace('\xa0', ""))
    # Strip any remaining stray whitespace from the book names.
    names = [name.strip(' ') for name in names]
    details = list(zip(positions, names, authors, ratings, prices, no_of_reviews, image_urls))
    return details
# The contents are written into a CSV file; utf-8 avoids encoding errors
# on non-ASCII titles.
with open('Top100books.csv', 'w', encoding='utf-8') as f:
    writer = csv.writer(f, lineterminator='\n')
    for tup in book_details():
        writer.writerow(tup)
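
# Quick sanity check: read the CSV back with pandas (this is what the
# pandas import above is used for; a sketch, assuming the scrape above
# succeeded). The first tuple written acts as the header row.
df = pd.read_csv('Top100books.csv')
print(df.head())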