style: update code to make it more type safe, rename base_crawler module
YisusChrist committed Dec 26, 2024
1 parent 5680af3 commit d6e1a7c
Showing 7 changed files with 70 additions and 31 deletions.
File renamed without changes.
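
The pattern this commit applies throughout: under bs4's type stubs, soup.find() returns Tag | NavigableString | None, so each call site must narrow the result before using Tag-only APIs. A minimal standalone sketch of the idiom (the HTML and names below are illustrative, not repo code):

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

def first_link_href(html: str) -> str | None:
    soup = BeautifulSoup(html, "html.parser")
    link: Tag | NavigableString | None = soup.find("a")
    # Narrow away None and NavigableString before touching Tag-only APIs.
    if link is None or isinstance(link, NavigableString):
        return None
    # Tag.get() is typed str | list[str] because some attributes are multi-valued.
    href: str | list[str] = link.get("href", "")
    return href[0] if isinstance(href, list) else href

print(first_link_href('<a href="https://example.com">x</a>'))  # https://example.com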
14 changes: 9 additions & 5 deletions ososedki_dl/crawlers/_common.py
@@ -9,6 +9,7 @@
 
 from aiohttp import ClientResponseError, ClientSession
 from bs4 import BeautifulSoup, ResultSet
+from bs4.element import NavigableString, Tag
 from rich import print
 from rich.progress import Progress, TaskID
 
@@ -58,7 +59,7 @@ async def process_album(
     if album_url.endswith("/"):
         album_url = album_url[:-1]
 
-    soup: BeautifulSoup = await fetch_soup(session, album_url)
+    soup: BeautifulSoup | None = await fetch_soup(session, album_url)
     if soup is None:
         return []
 
@@ -100,12 +101,15 @@ def search_ososedki_title(
     soup: BeautifulSoup, button_class: Optional[str] = None
 ) -> str:
     if button_class:
-        button_html = soup.find("a", class_=button_class)
+        button_html: Tag | NavigableString | None = soup.find("a", class_=button_class)
         if button_html:
             print(f"Found button: {button_html.text}")
             return button_html.text
 
-    text_div = soup.find("title")
+    text_div: Tag | NavigableString | None = soup.find("title")
+    if not text_div:
+        return "Unknown"
+
     text: str = text_div.text.strip()
     title: str = "Unknown"
 
@@ -165,7 +169,7 @@ def _get_article_title(soup: BeautifulSoup) -> str:
     }
 
     # Get the page article:tag and extract the title
-    article_tags: ResultSet[str] = soup.find_all("meta", {"property": "article:tag"})
+    article_tags: ResultSet[Any] = soup.find_all("meta", {"property": "article:tag"})
     for article_tag in article_tags:
         tag_content: str = article_tag.get("content", "")
         for tag in tags:
@@ -204,8 +208,8 @@ async def find_model_albums(
     soup: BeautifulSoup | None = await fetch_soup(session, model_url)
     if not soup:
         return
-    model_name: str = title_extractor(soup)
 
+    model_name: str = title_extractor(soup)
     i = 1
     albums_found = True
 
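Why ResultSet[Any] rather than ResultSet[str]: find_all() yields Tag elements, and bs4's stubs type the container as ResultSet[Any], so the old str element type was simply wrong. A small self-contained illustration (the meta tag is made up):

from typing import Any

from bs4 import BeautifulSoup, ResultSet

soup = BeautifulSoup('<meta property="article:tag" content="news">', "html.parser")
article_tags: ResultSet[Any] = soup.find_all("meta", {"property": "article:tag"})
for article_tag in article_tags:
    # Elements are Tags at runtime; .get() reads an attribute with a default.
    tag_content: str = article_tag.get("content", "")
    print(tag_content)  # -> news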
2 changes: 1 addition & 1 deletion ososedki_dl/crawlers/bunkrr_albums.py
@@ -31,7 +31,7 @@ async def find_albums(
     progress: Progress,
     task: TaskID,
 ) -> list[dict[str, str]]:
-    soup: BeautifulSoup = await fetch_soup(session, url)
+    soup: BeautifulSoup | None = await fetch_soup(session, url)
     if not soup:
         return []
 
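The one-line fix above works because fetch_soup() returns BeautifulSoup | None (None when the request or parse fails), so the annotation now matches and the early return satisfies the checker. A hedged sketch of what such a helper presumably looks like; the repo's actual implementation may differ:

from aiohttp import ClientError, ClientSession
from bs4 import BeautifulSoup

async def fetch_soup(session: ClientSession, url: str) -> BeautifulSoup | None:
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            return BeautifulSoup(await response.text(), "html.parser")
    except ClientError:
        # Network or HTTP failure: signal "no page" rather than raising.
        return None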
14 changes: 11 additions & 3 deletions ososedki_dl/crawlers/eromexxx.py
@@ -5,6 +5,7 @@
 import tldextract
 from aiohttp import ClientResponseError, ClientSession
 from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
 from rich import print
 from rich.progress import Progress, TaskID
 
@@ -30,8 +31,13 @@ async def download_profile(
     soup: BeautifulSoup = await get_soup(session, profile_url)
 
     # Get the total number of albums
-    header = soup.find("div", class_="header-title")
-    total_albums = int(header.find("span").text.split(" ")[1])
+    header: Tag | NavigableString | None = soup.find("div", class_="header-title")
+    if not header:
+        return []
+    span: Tag | NavigableString | None | int = header.find("span")
+    if not span or isinstance(span, int):
+        return []
+    total_albums = int(span.text.split(" ")[1])
     print(f"Total_albums: {total_albums}")
 
     # Get all album URLs from pagination
@@ -63,7 +69,9 @@ async def find_albums_with_pagination(
 ) -> list:
     soup: BeautifulSoup = await get_soup(session, profile_url)
     # Get pagination items
-    pagination = soup.find("ul", class_="pagination")
+    pagination: Tag | NavigableString | None = soup.find("ul", class_="pagination")
+    if not pagination or isinstance(pagination, NavigableString):
+        return []
     # Get the last page number
     try:
         last_page = int(pagination.find_all("li")[-2].text)
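The unusual `| int` in the span annotation above falls out of the union on header: a NavigableString is a str subclass, so its .find() is str.find() and returns an integer index rather than a node. A standalone demonstration of the two meanings (markup is illustrative):

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

soup = BeautifulSoup("<div class='header-title'><span>Albums 42</span></div>", "html.parser")
header = soup.find("div", class_="header-title")
assert isinstance(header, Tag)
node = header.find("span")         # Tag.find -> a matching node (or None)
text = NavigableString("a span here")
index = text.find("span")          # str.find -> int (2 here)
print(type(node).__name__, index)  # Tag 2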
35 changes: 26 additions & 9 deletions ososedki_dl/crawlers/husvjjal_blogspot.py
@@ -8,6 +8,7 @@
 
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
 from requests import Response, Session
 from rich import print
 from rich.progress import Progress, TaskID
@@ -131,26 +132,40 @@ def husvjjal_blogspot_media_filter(soup: BeautifulSoup) -> list[str]:
         if img_hostname and img_hostname == "i.postimg.cc":
             urls.append(img)
             continue
+
         soup = get_soup(session=session, url=img)
-        download_link = soup.find(
+        download_link: Tag | NavigableString | None = soup.find(
             "a",
             {"id": "download"},
         )
-        if download_link:
-            download_href: str = download_link.get("href", "").strip()
-            download_hostname: str | None = urlparse(download_href).hostname
-            if download_hostname and download_href.startswith("https://"):
-                urls.append(download_href)
+        if not download_link or isinstance(download_link, NavigableString):
+            continue
+
+        download_href: str | list[str] = download_link.get("href", "")
+        if isinstance(download_href, list):
+            download_href = download_href[0]
+        download_href = download_href.strip()
+        download_hostname: str | None = urlparse(download_href).hostname
+        if download_hostname and download_href.startswith("https://"):
+            urls.append(download_href)
 
     for vid in videos:
         soup = get_soup(session=session, url=vid)
-        js_script = soup.find(
+        js_script: Tag | NavigableString | None = soup.find(
             "script",
             {"type": "text/javascript"},
         )
-        max_stream: dict[str, str] = get_max_stream(js_script.string)
+        if not js_script or isinstance(js_script, NavigableString):
+            continue
+
+        js_script_str: str | None = js_script.string
+        if not js_script_str:
+            continue
+
+        max_stream: dict[str, str] = get_max_stream(js_script_str)
         if not max_stream:
             continue
+
         play_url: str = max_stream.get("play_url", "").strip()
         play_hostname: str | None = urlparse(play_url).hostname
         if play_hostname and play_url.startswith("https://"):
@@ -196,7 +211,9 @@ async def download_profile(
         )
         return results
 
-    soup: BeautifulSoup = await fetch_soup(session, profile_url)
+    soup: BeautifulSoup | None = await fetch_soup(session, profile_url)
+    if not soup:
+        return []
 
     album_classes: list[str] = [
         "card-image ratio o-hidden mask ratio-16:9",
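Binding js_script.string to js_script_str before calling get_max_stream() matters because Tag.string is str | None: it is None whenever the tag has anything other than exactly one string child. A tiny standalone illustration:

from bs4 import BeautifulSoup
from bs4.element import Tag

single = BeautifulSoup("<script>var a = 1;</script>", "html.parser").find("script")
mixed = BeautifulSoup("<div>text<span>more</span></div>", "html.parser").find("div")
assert isinstance(single, Tag) and isinstance(mixed, Tag)
print(single.string)  # var a = 1;  (exactly one string child)
print(mixed.string)   # None        (multiple children)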
28 changes: 18 additions & 10 deletions ososedki_dl/crawlers/wildskirts.py
@@ -5,6 +5,7 @@
 
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
 from rich.progress import Progress, TaskID
 
 from ososedki_dl.crawlers._common import download_media_items, fetch_soup
@@ -16,14 +17,19 @@
 
 
 def get_total_items(soup: BeautifulSoup, item: str) -> int:
-    try:
-        return int(
-            soup.find("div", class_=f"text-center mx-4 cursor-pointer tab-{item}")
-            .find("p")
-            .text
-        )
-    except AttributeError:
+    content_div: Tag | NavigableString | None = soup.find(
+        "div", class_=f"text-center mx-4 cursor-pointer tab-{item}"
+    )
+    if not content_div:
         return 0
+
+    paragraph: Tag | NavigableString | None | int = content_div.find("p")
+    if not paragraph:
+        return 0
+    if isinstance(paragraph, int):
+        return paragraph
+
+    return int(paragraph.text)
 
 
 def wildskirts_media_filter(soup: BeautifulSoup) -> list[str]:
Expand All @@ -43,8 +49,8 @@ def wildskirts_media_filter(soup: BeautifulSoup) -> list[str]:


async def fetch_media_urls(session: ClientSession, url: str) -> list[str]:
soup: BeautifulSoup = await fetch_soup(session, url)
return wildskirts_media_filter(soup)
soup: BeautifulSoup | None = await fetch_soup(session, url)
return wildskirts_media_filter(soup) if soup else []


@main_entry
@@ -61,7 +67,9 @@ async def download_profile(
 
     profile: str = profile_url.split("/")[-1]
 
-    soup: BeautifulSoup = await fetch_soup(session, profile_url)
+    soup: BeautifulSoup | None = await fetch_soup(session, profile_url)
+    if not soup:
+        return []
 
     total_pictures: int = get_total_items(soup, "photos")
     total_videos: int = get_total_items(soup, "videos")
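The get_total_items() rewrite trades a broad try/except AttributeError for explicit narrowing the type checker can verify. A self-contained sketch of the same shape (class name shortened for the example):

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

def total_items(html: str) -> int:
    soup = BeautifulSoup(html, "html.parser")
    div: Tag | NavigableString | None = soup.find("div", class_="tab-photos")
    if div is None or isinstance(div, NavigableString):
        return 0
    paragraph = div.find("p")
    if paragraph is None or isinstance(paragraph, int):
        # int can't occur here (div is a Tag), but this mirrors the repo's guard.
        return 0
    return int(paragraph.text)

print(total_items('<div class="tab-photos"><p>12</p></div>'))  # 12
print(total_items("<div></div>"))                              # 0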
8 changes: 5 additions & 3 deletions ososedki_dl/download.py
@@ -75,8 +75,10 @@ async def _generic_fetch(
             await sleep(5)
     except ClientResponseError as e:  # 4xx, 5xx errors
         print(f"Failed to fetch {url} with status {e.status}")
-        response = requests.get(url, timeout=MAX_TIMEOUT, **kwargs)
-        response.raise_for_status()
+        response2: requests.Response = requests.get(
+            url, timeout=MAX_TIMEOUT, **kwargs
+        )
+        response2.raise_for_status()
 
     # Dynamically access the specified response property
     if hasattr(response, response_property):
@@ -153,7 +155,7 @@ async def download_and_save_media(
         response: requests.Response = requests.head(
             url, headers=headers, timeout=MAX_TIMEOUT
         )
-        content_type: str = response.headers.get("Content-Type")
+        content_type: str | None = response.headers.get("Content-Type")
         if not content_type:
             print(f"Failed to get content type for {url}")
         else:
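The last hunk fixes an annotation that was never true: a requests header lookup returns None when the header is absent, so content_type must be str | None. A standalone sketch (the URL is illustrative):

import requests

response: requests.Response = requests.head("https://example.com", timeout=10)
content_type: str | None = response.headers.get("Content-Type")
if content_type is None:
    print("no Content-Type header")
else:
    print(content_type.split(";")[0])  # e.g. text/html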
