AVBY_async_parser.py
from requests import get
from lxml import html
from json import dump
from csv import DictWriter
from math import ceil
from time import time
import asyncio
import aiohttp

HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;"
              "q=0.8,application/signed-exchange;v=b3;q=0.9",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.106 Safari/537.36"}

# test on BMW 5 (all generations for broad selection):
TARGET_URL = "https://cars.av.by/filter?brands[0][brand]=8&brands[0][model]=5865"
# uncomment the line below to specify another model (URL of a search page from AV.BY):
# TARGET_URL = input("Link to specific model >> ")


def parser(url: str, save_as_file: str) -> None:
    """Parses the search pages of a specific model on AV.BY and saves the data to .json/.csv files"""

    def total_pages() -> int:
        """Calculates the number of pages to go through"""
        resp = get(url, headers=HEADERS)
        number_of_results = int(html.fromstring(resp.text).xpath("//h3[@class='listing__title']/text()")[2])
        pages = ceil(number_of_results / 25)  # assumes 25 listings per search page
        print(f"Found {number_of_results} results on {pages} pages")
        return pages

    async def parse_page(current_page: int, storage: list) -> None:
        """Extracts information from one given page, saves to storage"""
        async with aiohttp.request("GET", url=url + f"&page={current_page}", headers=HEADERS) as response:
            content = await response.text()
        res = html.fromstring(content).xpath("//div[@class='listing__items']")[0]

        def f(name: str):
            """Helper that keeps the data-collecting block below readable"""
            return {
                "age": lambda x: int(x.split()[0]),
                "kms": lambda x: int(x.replace("\xa0", "").replace("\u2009", "").split("км")[0]),
                "byn": lambda x: int(x.replace("\xa0", "").replace("\u2009", "")[:-2]),
                "usd": lambda x: int(x.replace("\xa0", "").replace("\u2009", "")[1:-1]),
                "vol": lambda x: float(x.split()[0]),
            }[name]
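        # The lambdas above turn av.by's formatted strings into numbers. The exact raw formats
        # are assumptions inferred from the cleanup steps, roughly:
        #   "age": "2017 г."     -> 2017
        #   "kms": "250 000 км"  -> 250000  (non-breaking/thin spaces stripped first)
        #   "byn": "25 000 р."   -> 25000   (last two characters dropped)
        #   "usd": "≈9 500$"     -> 9500    (first and last characters dropped)
        #   "vol": "2.0 л"       -> 2.0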
        # collecting information:
        links = res.xpath(".//a[@class='listing-item__link']/@href")
        prices_byn = map(f("byn"), res.xpath(".//div[@class='listing-item__price']/text()"))
        prices_usd = map(f("usd"), res.xpath(".//div[@class='listing-item__priceusd']/text()"))
        year = map(f("age"), res.xpath(".//div[@class='listing-item__params']/div[1]/text()"))
        mile = map(f("kms"), res.xpath(".//div[@class='listing-item__params']/div[3]/span/text()"))
        engine_params = res.xpath(".//div[@class='listing-item__params']/div[2]/text()")
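        # Assumption inferred from the slicing below: each listing's second params <div>
        # contributes 7 text nodes, with engine volume at offset 2 and fuel type at offset 4,
        # so the strided slices [2::7] and [4::7] pick those two fields for every car on the page.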
        engine_volume = map(f("vol"), engine_params[2::7])
        engine_fuel = engine_params[4::7]
        for c, y, f, v, m, b, u in zip(links, year, engine_fuel, engine_volume, mile, prices_byn, prices_usd):
            storage.append({
                "link": "https://cars.av.by" + c,
                "year": y,
                "engine": v,
                "fuel": f,
                "mileage": m,
                "price_byn": b,
                "price_usd": u,
            })
        print(f"Processed page {current_page}")

    async def parse_all_pages() -> list:
        """Creates tasks to parse all pages, returns list of results"""
        search_result = list()
        await asyncio.gather(*[parse_page(current_page=p,
                                          storage=search_result) for p in range(1, total_pages() + 1)])
        return search_result

    # pages finish in arbitrary order, so sort the combined results by USD price, most expensive first
    data: list = sorted(asyncio.run(parse_all_pages()), key=lambda x: x["price_usd"], reverse=True)
    with open(f"{save_as_file}.json", "w") as json_fw, open(f"{save_as_file}.csv", "w", newline="") as csv_fw:
        dump(data, json_fw, indent=2)
        writer = DictWriter(csv_fw, data[0].keys(), delimiter=";")
        writer.writeheader()
        writer.writerows(data)
    print("Saved to files")


if __name__ == "__main__":
    start_time = time()
    parser(TARGET_URL, "bmw")
    end_time = time()
    print(f"Completed in {round(end_time - start_time, 2)} sec.")