style: update code to make it more type safe, rename base_crawler module
YisusChrist committed Dec 26, 2024
1 parent 5680af3 commit d6e1a7c
Showing 7 changed files with 70 additions and 31 deletions.
File renamed without changes.
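
The pattern this commit applies throughout: under bs4's type stubs, soup.find() returns Tag | NavigableString | None, so each call site must narrow the result before using Tag-only APIs. A minimal standalone sketch of the idiom (the HTML and names below are illustrative, not repo code):

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

def first_link_href(html: str) -> str | None:
    soup = BeautifulSoup(html, "html.parser")
    link: Tag | NavigableString | None = soup.find("a")
    # Narrow away None and NavigableString before touching Tag-only APIs.
    if link is None or isinstance(link, NavigableString):
        return None
    # Tag.get() is typed str | list[str] because some attributes are multi-valued.
    href: str | list[str] = link.get("href", "")
    return href[0] if isinstance(href, list) else href

print(first_link_href('<a href="https://example.com">x</a>'))  # https://example.com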
14 changes: 9 additions & 5 deletions ososedki_dl/crawlers/_common.py
@@ -9,6 +9,7 @@
 
 from aiohttp import ClientResponseError, ClientSession
 from bs4 import BeautifulSoup, ResultSet
+from bs4.element import NavigableString, Tag
 from rich import print
 from rich.progress import Progress, TaskID
 
@@ -58,7 +59,7 @@ async def process_album(
     if album_url.endswith("/"):
         album_url = album_url[:-1]
 
-    soup: BeautifulSoup = await fetch_soup(session, album_url)
+    soup: BeautifulSoup | None = await fetch_soup(session, album_url)
     if soup is None:
         return []
 
@@ -100,12 +101,15 @@ def search_ososedki_title(
     soup: BeautifulSoup, button_class: Optional[str] = None
 ) -> str:
     if button_class:
-        button_html = soup.find("a", class_=button_class)
+        button_html: Tag | NavigableString | None = soup.find("a", class_=button_class)
         if button_html:
             print(f"Found button: {button_html.text}")
             return button_html.text
 
-    text_div = soup.find("title")
+    text_div: Tag | NavigableString | None = soup.find("title")
+    if not text_div:
+        return "Unknown"
+
     text: str = text_div.text.strip()
     title: str = "Unknown"
 
@@ -165,7 +169,7 @@ def _get_article_title(soup: BeautifulSoup) -> str:
     }
 
     # Get the page article:tag and extract the title
-    article_tags: ResultSet[str] = soup.find_all("meta", {"property": "article:tag"})
+    article_tags: ResultSet[Any] = soup.find_all("meta", {"property": "article:tag"})
     for article_tag in article_tags:
         tag_content: str = article_tag.get("content", "")
         for tag in tags:
@@ -204,8 +208,8 @@ async def find_model_albums(
     soup: BeautifulSoup | None = await fetch_soup(session, model_url)
     if not soup:
         return
-    model_name: str = title_extractor(soup)
 
+    model_name: str = title_extractor(soup)
     i = 1
     albums_found = True
 
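Why ResultSet[Any] rather than ResultSet[str]: find_all() yields Tag elements, and bs4's stubs type the container as ResultSet[Any], so the old str element type was simply wrong. A small self-contained illustration (the meta tag is made up):

from typing import Any

from bs4 import BeautifulSoup, ResultSet

soup = BeautifulSoup('<meta property="article:tag" content="news">', "html.parser")
article_tags: ResultSet[Any] = soup.find_all("meta", {"property": "article:tag"})
for article_tag in article_tags:
    # Elements are Tags at runtime; .get() reads an attribute with a default.
    tag_content: str = article_tag.get("content", "")
    print(tag_content)  # -> news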
2 changes: 1 addition & 1 deletion ososedki_dl/crawlers/bunkrr_albums.py
@@ -31,7 +31,7 @@ async def find_albums(
     progress: Progress,
     task: TaskID,
 ) -> list[dict[str, str]]:
-    soup: BeautifulSoup = await fetch_soup(session, url)
+    soup: BeautifulSoup | None = await fetch_soup(session, url)
     if not soup:
         return []
 
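The one-line fix above works because fetch_soup() returns BeautifulSoup | None (None when the request or parse fails), so the annotation now matches and the early return satisfies the checker. A hedged sketch of what such a helper presumably looks like; the repo's actual implementation may differ:

from aiohttp import ClientError, ClientSession
from bs4 import BeautifulSoup

async def fetch_soup(session: ClientSession, url: str) -> BeautifulSoup | None:
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            return BeautifulSoup(await response.text(), "html.parser")
    except ClientError:
        # Network or HTTP failure: signal "no page" rather than raising.
        return None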
14 changes: 11 additions & 3 deletions ososedki_dl/crawlers/eromexxx.py
@@ -5,6 +5,7 @@
 import tldextract
 from aiohttp import ClientResponseError, ClientSession
 from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
 from rich import print
 from rich.progress import Progress, TaskID
 
@@ -30,8 +31,13 @@ async def download_profile(
     soup: BeautifulSoup = await get_soup(session, profile_url)
 
     # Get the total number of albums
-    header = soup.find("div", class_="header-title")
-    total_albums = int(header.find("span").text.split(" ")[1])
+    header: Tag | NavigableString | None = soup.find("div", class_="header-title")
+    if not header:
+        return []
+    span: Tag | NavigableString | None | int = header.find("span")
+    if not span or isinstance(span, int):
+        return []
+    total_albums = int(span.text.split(" ")[1])
     print(f"Total_albums: {total_albums}")
 
     # Get all album URLs from pagination
@@ -63,7 +69,9 @@ async def find_albums_with_pagination(
 ) -> list:
     soup: BeautifulSoup = await get_soup(session, profile_url)
     # Get pagination items
-    pagination = soup.find("ul", class_="pagination")
+    pagination: Tag | NavigableString | None = soup.find("ul", class_="pagination")
+    if not pagination or isinstance(pagination, NavigableString):
+        return []
     # Get the last page number
     try:
         last_page = int(pagination.find_all("li")[-2].text)
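The unusual `| int` in the span annotation above falls out of the union on header: a NavigableString is a str subclass, so its .find() is str.find() and returns an integer index rather than a node. A standalone demonstration of the two meanings (markup is illustrative):

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

soup = BeautifulSoup("<div class='header-title'><span>Albums 42</span></div>", "html.parser")
header = soup.find("div", class_="header-title")
assert isinstance(header, Tag)
node = header.find("span")         # Tag.find -> a matching node (or None)
text = NavigableString("a span here")
index = text.find("span")          # str.find -> int (2 here)
print(type(node).__name__, index)  # Tag 2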
35 changes: 26 additions & 9 deletions ososedki_dl/crawlers/husvjjal_blogspot.py
@@ -8,6 +8,7 @@
 
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
 from requests import Response, Session
 from rich import print
 from rich.progress import Progress, TaskID
@@ -131,26 +132,40 @@ def husvjjal_blogspot_media_filter(soup: BeautifulSoup) -> list[str]:
         if img_hostname and img_hostname == "i.postimg.cc":
             urls.append(img)
             continue
+
         soup = get_soup(session=session, url=img)
-        download_link = soup.find(
+        download_link: Tag | NavigableString | None = soup.find(
             "a",
             {"id": "download"},
         )
-        if download_link:
-            download_href: str = download_link.get("href", "").strip()
-            download_hostname: str | None = urlparse(download_href).hostname
-            if download_hostname and download_href.startswith("https://"):
-                urls.append(download_href)
+        if not download_link or isinstance(download_link, NavigableString):
+            continue
+
+        download_href: str | list[str] = download_link.get("href", "")
+        if isinstance(download_href, list):
+            download_href = download_href[0]
+        download_href = download_href.strip()
+        download_hostname: str | None = urlparse(download_href).hostname
+        if download_hostname and download_href.startswith("https://"):
+            urls.append(download_href)
 
     for vid in videos:
         soup = get_soup(session=session, url=vid)
-        js_script = soup.find(
+        js_script: Tag | NavigableString | None = soup.find(
             "script",
             {"type": "text/javascript"},
         )
-        max_stream: dict[str, str] = get_max_stream(js_script.string)
+        if not js_script or isinstance(js_script, NavigableString):
+            continue
+
+        js_script_str: str | None = js_script.string
+        if not js_script_str:
+            continue
+
+        max_stream: dict[str, str] = get_max_stream(js_script_str)
         if not max_stream:
             continue
+
         play_url: str = max_stream.get("play_url", "").strip()
         play_hostname: str | None = urlparse(play_url).hostname
         if play_hostname and play_url.startswith("https://"):
@@ -196,7 +211,9 @@ async def download_profile(
         )
         return results
 
-    soup: BeautifulSoup = await fetch_soup(session, profile_url)
+    soup: BeautifulSoup | None = await fetch_soup(session, profile_url)
+    if not soup:
+        return []
 
     album_classes: list[str] = [
         "card-image ratio o-hidden mask ratio-16:9",
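Binding js_script.string to js_script_str before calling get_max_stream() matters because Tag.string is str | None: it is None whenever the tag has anything other than exactly one string child. A tiny standalone illustration:

from bs4 import BeautifulSoup
from bs4.element import Tag

single = BeautifulSoup("<script>var a = 1;</script>", "html.parser").find("script")
mixed = BeautifulSoup("<div>text<span>more</span></div>", "html.parser").find("div")
assert isinstance(single, Tag) and isinstance(mixed, Tag)
print(single.string)  # var a = 1;  (exactly one string child)
print(mixed.string)   # None        (multiple children)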
28 changes: 18 additions & 10 deletions ososedki_dl/crawlers/wildskirts.py
@@ -5,6 +5,7 @@
 
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup
+from bs4.element import NavigableString, Tag
 from rich.progress import Progress, TaskID
 
 from ososedki_dl.crawlers._common import download_media_items, fetch_soup
@@ -16,14 +17,19 @@
 
 
 def get_total_items(soup: BeautifulSoup, item: str) -> int:
-    try:
-        return int(
-            soup.find("div", class_=f"text-center mx-4 cursor-pointer tab-{item}")
-            .find("p")
-            .text
-        )
-    except AttributeError:
+    content_div: Tag | NavigableString | None = soup.find(
+        "div", class_=f"text-center mx-4 cursor-pointer tab-{item}"
+    )
+    if not content_div:
         return 0
+
+    paragraph: Tag | NavigableString | None | int = content_div.find("p")
+    if not paragraph:
+        return 0
+    if isinstance(paragraph, int):
+        return paragraph
+
+    return int(paragraph.text)
 
 
 def wildskirts_media_filter(soup: BeautifulSoup) -> list[str]:
Expand All @@ -43,8 +49,8 @@ def wildskirts_media_filter(soup: BeautifulSoup) -> list[str]:


async def fetch_media_urls(session: ClientSession, url: str) -> list[str]:
soup: BeautifulSoup = await fetch_soup(session, url)
return wildskirts_media_filter(soup)
soup: BeautifulSoup | None = await fetch_soup(session, url)
return wildskirts_media_filter(soup) if soup else []


@main_entry
@@ -61,7 +67,9 @@ async def download_profile(
 
     profile: str = profile_url.split("/")[-1]
 
-    soup: BeautifulSoup = await fetch_soup(session, profile_url)
+    soup: BeautifulSoup | None = await fetch_soup(session, profile_url)
+    if not soup:
+        return []
 
     total_pictures: int = get_total_items(soup, "photos")
     total_videos: int = get_total_items(soup, "videos")
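The get_total_items() rewrite trades a broad try/except AttributeError for explicit narrowing the type checker can verify. A self-contained sketch of the same shape (class name shortened for the example):

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag

def total_items(html: str) -> int:
    soup = BeautifulSoup(html, "html.parser")
    div: Tag | NavigableString | None = soup.find("div", class_="tab-photos")
    if div is None or isinstance(div, NavigableString):
        return 0
    paragraph = div.find("p")
    if paragraph is None or isinstance(paragraph, int):
        # int can't occur here (div is a Tag), but this mirrors the repo's guard.
        return 0
    return int(paragraph.text)

print(total_items('<div class="tab-photos"><p>12</p></div>'))  # 12
print(total_items("<div></div>"))                              # 0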
8 changes: 5 additions & 3 deletions ososedki_dl/download.py
@@ -75,8 +75,10 @@ async def _generic_fetch(
             await sleep(5)
     except ClientResponseError as e:  # 4xx, 5xx errors
         print(f"Failed to fetch {url} with status {e.status}")
-        response = requests.get(url, timeout=MAX_TIMEOUT, **kwargs)
-        response.raise_for_status()
+        response2: requests.Response = requests.get(
+            url, timeout=MAX_TIMEOUT, **kwargs
+        )
+        response2.raise_for_status()
 
     # Dynamically access the specified response property
     if hasattr(response, response_property):
@@ -153,7 +155,7 @@ async def download_and_save_media(
         response: requests.Response = requests.head(
             url, headers=headers, timeout=MAX_TIMEOUT
         )
-        content_type: str = response.headers.get("Content-Type")
+        content_type: str | None = response.headers.get("Content-Type")
         if not content_type:
             print(f"Failed to get content type for {url}")
         else:
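The last hunk fixes an annotation that was never true: a requests header lookup returns None when the header is absent, so content_type must be str | None. A standalone sketch (the URL is illustrative):

import requests

response: requests.Response = requests.head("https://example.com", timeout=10)
content_type: str | None = response.headers.get("Content-Type")
if content_type is None:
    print("no Content-Type header")
else:
    print(content_type.split(";")[0])  # e.g. text/html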
