feat: add alias support in parsing when scraping torrents. several other tweaks.
dreulavelle committed Sep 19, 2024
1 parent 2f15fbd commit 365f022
Showing 10 changed files with 441 additions and 383 deletions.
571 changes: 294 additions & 277 deletions poetry.lock

Large diffs are not rendered by default.

25 changes: 11 additions & 14 deletions src/program/content/trakt.py
@@ -76,24 +76,21 @@ def missing(self):
def run(self):
"""Fetch media from Trakt and yield Movie, Show, or MediaItem instances."""

def fetch_items(fetch_function, *args):
"""Helper function to fetch items using the provided function and arguments."""
return fetch_function(*args) if args else []

watchlist_ids = fetch_items(self._get_watchlist, self.settings.watchlist)
collection_ids = fetch_items(self._get_collection, self.settings.collection)
user_list_ids = fetch_items(self._get_list, self.settings.user_lists)
trending_ids = fetch_items(self._get_trending_items) if self.settings.fetch_trending else []
popular_ids = fetch_items(self._get_popular_items) if self.settings.fetch_popular else []
watchlist_ids = self._get_watchlist(self.settings.watchlist) if self.settings.watchlist else []
collection_ids = self._get_collection(self.settings.collection) if self.settings.collection else []
user_list_ids = self._get_list(self.settings.user_lists) if self.settings.user_lists else []
trending_ids = self._get_trending_items() if self.settings.fetch_trending else []
popular_ids = self._get_popular_items() if self.settings.fetch_popular else []

# Combine all IMDb IDs and types into a set to avoid duplicates
all_ids = set(watchlist_ids + collection_ids + user_list_ids + trending_ids + popular_ids)

items_to_yield = [
MediaItem({"imdb_id": imdb_id, "requested_by": self.key})
for imdb_id in all_ids
if imdb_id.startswith("tt")
]
items_to_yield = []
for imdb_id, _ in all_ids:
items_to_yield.append(MediaItem({"imdb_id": imdb_id, "requested_by": self.key}))

if not items_to_yield:
return

non_existing_items = _filter_existing_items(items_to_yield)
new_non_recurring_items = [
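
A note on the run() refactor above: the fetch_items helper is gone in favor of explicit conditionals, and the new loop unpacks two values per entry, which suggests the helpers now return (imdb_id, type) tuples. A minimal sketch of the deduplication pattern, using plain dicts in place of MediaItem; the tuple shape and sample IDs are assumptions, not taken from the source:

# Hypothetical sample data; each helper is assumed to yield (imdb_id, type) tuples.
watchlist_ids = [("tt0111161", "movie"), ("tt0944947", "show")]
trending_ids = [("tt0111161", "movie")]  # deliberate duplicate

# A set of tuples collapses duplicates across all sources.
all_ids = set(watchlist_ids + trending_ids)

items_to_yield = [
    {"imdb_id": imdb_id, "requested_by": "trakt"}  # stand-in for MediaItem(...)
    for imdb_id, _ in all_ids
]
assert len(items_to_yield) == 2
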
25 changes: 25 additions & 0 deletions src/program/indexers/trakt.py
@@ -150,8 +150,10 @@ def _map_item_from_data(data, item_type: str, show_genres: List[str] = None) ->

match item_type:
case "movie":
item["aliases"] = get_show_aliases(item["imdb_id"], "movies")
return Movie(item)
case "show":
item["aliases"] = get_show_aliases(item["imdb_id"], "shows")
return Show(item)
case "season":
item["number"] = data.number
@@ -175,11 +177,34 @@ def _get_formatted_date(data, item_type: str) -> Optional[datetime]:

def get_show(imdb_id: str) -> dict:
"""Wrapper for trakt.tv API show method."""
if not imdb_id:
return {}
url = f"https://api.trakt.tv/shows/{imdb_id}/seasons?extended=episodes,full"
response = get(url, timeout=30, additional_headers={"trakt-api-version": "2", "trakt-api-key": CLIENT_ID})
return response.data if response.is_ok and response.data else {}


def get_show_aliases(imdb_id: str, item_type: str) -> dict:
"""Wrapper for the trakt.tv aliases endpoint; returns alias titles grouped by country."""
if not imdb_id:
return []
url = f"https://api.trakt.tv/{item_type}/{imdb_id}/aliases"
response = get(url, timeout=30, additional_headers={"trakt-api-version": "2", "trakt-api-key": CLIENT_ID})
if response.is_ok and response.data:
aliases = {}
for ns in response.data:
country = ns.country
title = ns.title
if title.startswith("Anime-"):
title = title[len("Anime-"):]
if country not in aliases:
aliases[country] = []
if title not in aliases[country]:
aliases[country].append(title)
return aliases
return {}


def create_item_from_imdb_id(imdb_id: str, type: str = None) -> Optional[MediaItem]:
"""Wrapper for trakt.tv API search method."""
url = f"https://api.trakt.tv/search/imdb/{imdb_id}?extended=full"
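
For context, here is a standalone sketch of what the new get_show_aliases does, using requests directly instead of the project's get wrapper. The endpoint URL and the per-entry title/country shape follow the diff above; client_id and the example IMDb id are placeholders:

import requests

def fetch_aliases(imdb_id: str, item_type: str, client_id: str) -> dict:
    """Group alias titles by country, mirroring get_show_aliases."""
    url = f"https://api.trakt.tv/{item_type}/{imdb_id}/aliases"
    resp = requests.get(
        url,
        timeout=30,
        headers={"trakt-api-version": "2", "trakt-api-key": client_id},
    )
    if not resp.ok:
        return {}
    aliases: dict = {}
    for entry in resp.json():
        # Same "Anime-" prefix stripping as the diff above.
        title = entry["title"].removeprefix("Anime-")
        aliases.setdefault(entry["country"], [])
        if title not in aliases[entry["country"]]:
            aliases[entry["country"]].append(title)
    return aliases

# fetch_aliases("tt0944947", "shows", client_id) might return
# something like {"us": ["Game of Thrones"], "jp": [...]}.
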
31 changes: 17 additions & 14 deletions src/program/media/item.py
@@ -42,6 +42,7 @@ class MediaItem(db.Model):
file: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
folder: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
alternative_folder: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
aliases: Mapped[Optional[dict]] = mapped_column(sqlalchemy.JSON, default={})
is_anime: Mapped[Optional[bool]] = mapped_column(sqlalchemy.Boolean, default=False)
title: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
imdb_id: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
@@ -121,6 +122,7 @@ def __init__(self, item: dict | None) -> None:
self.aired_at = item.get("aired_at")
self.year = item.get("year")
self.genres = item.get("genres", [])
self.aliases = item.get("aliases", {})

# Plex related
self.key = item.get("key")
@@ -155,13 +157,6 @@ def blacklist_stream(self, stream: Stream):
def is_released(self) -> bool:
"""Check if an item has been released."""
if self.aired_at and self.aired_at <= datetime.now():
# time_until_release = self.aired_at - datetime.now()
# days, seconds = time_until_release.days, time_until_release.seconds
# hours = seconds // 3600
# minutes = (seconds % 3600) // 60
# seconds = seconds % 60
# time_message = f"{self.log_string} will be released in {days} days, {hours:02}:{minutes:02}:{seconds:02}"
# logger.log("ITEM", time_message)
return True
return False

@@ -297,13 +292,21 @@ def set(self, key, value):

def get_top_title(self) -> str:
"""Get the top title of the item."""
match self.__class__.__name__:
case "Season":
return self.parent.title
case "Episode":
return self.parent.parent.title
case _:
return self.title
if self.type == "season":
return self.parent.title
elif self.type == "episode":
return self.parent.parent.title
else:
return self.title

def get_aliases(self) -> dict:
"""Get the aliases of the item."""
if self.type == "season":
return self.parent.aliases
elif self.type == "episode":
return self.parent.parent.aliases
else:
return self.aliases

def __hash__(self):
return hash(self.item_id)
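
The new get_aliases mirrors get_top_title: seasons and episodes climb to the parent show rather than storing aliases of their own. A minimal sketch of that traversal; the Node class is a hypothetical stand-in, not the SQLAlchemy model:

class Node:
    """Hypothetical stand-in for MediaItem/Show/Season/Episode."""
    def __init__(self, type, title=None, aliases=None, parent=None):
        self.type, self.title = type, title
        self.aliases, self.parent = aliases or {}, parent

def get_aliases(item: Node) -> dict:
    if item.type == "season":
        return item.parent.aliases         # season -> show
    elif item.type == "episode":
        return item.parent.parent.aliases  # episode -> season -> show
    return item.aliases

show = Node("show", title="Example Show", aliases={"jp": ["Reino Sho"]})
episode = Node("episode", parent=Node("season", parent=show))
assert get_aliases(episode) == {"jp": ["Reino Sho"]}
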
2 changes: 1 addition & 1 deletion src/program/scrapers/knightcrawler.py
@@ -110,7 +110,7 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
# This should help with Special episodes and other misc. names
stream_title = stream.title.split("\n")[:-1]
joined_title = "\n".join(stream_title)
raw_title = joined_title.split("/")[-1] if isinstance(item, Episode) else joined_title.split("\n")[0]
raw_title = joined_title.split("\n")[0]

torrents[stream.infoHash] = raw_title

13 changes: 3 additions & 10 deletions src/program/scrapers/orionoid.py
@@ -28,7 +28,7 @@ def __init__(self):
self.initialized = True
else:
return
self.second_limiter = RateLimiter(max_calls=1, period=5) if self.settings.ratelimit else None
self.second_limiter = RateLimiter(max_calls=1, period=5)
logger.success("Orionoid initialized!")

def validate(self) -> bool:
@@ -106,10 +106,7 @@ def run(self, item: MediaItem) -> Dict[str, str]:
try:
return self.scrape(item)
except RateLimitExceeded:
if self.second_limiter:
self.second_limiter.limit_hit()
else:
logger.warning(f"Orionoid rate limit exceeded for item: {item.log_string}")
self.second_limiter.limit_hit()
except ConnectTimeout:
logger.warning(f"Orionoid connection timeout for item: {item.log_string}")
except ReadTimeout:
@@ -167,10 +164,7 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict, int]:
imdb_id = item.parent.parent.imdb_id
url = self.construct_url("show", imdb_id, season=item.parent.number, episode=item.number)

if self.second_limiter:
with self.second_limiter:
response = get(url, timeout=self.timeout)
else:
with self.second_limiter:
response = get(url, timeout=self.timeout)

if not response.is_ok or not hasattr(response.data, "data"):
@@ -180,7 +174,6 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict, int]:
for stream in response.data.data.streams:
if not stream.file.hash or not stream.file.name:
continue

torrents[stream.file.hash] = stream.file.name

return torrents, len(response.data.data.streams)
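
With the ratelimit toggle removed, every Orionoid request now passes through second_limiter. A sketch of that context-manager flow using a stand-in RateLimiter; the project's own class (max_calls/period arguments, context-manager entry, limit_hit backoff) is assumed from the diff, and this stand-in only approximates it:

import time

class RateLimiter:
    """Stand-in limiter: allows max_calls per period, blocking on entry."""
    def __init__(self, max_calls: int, period: float):
        self.min_interval = period / max_calls
        self._last = 0.0
    def __enter__(self):
        wait = self.min_interval - (time.monotonic() - self._last)
        if wait > 0:
            time.sleep(wait)  # block until a slot opens
        self._last = time.monotonic()
        return self
    def __exit__(self, *exc):
        return False

limiter = RateLimiter(max_calls=1, period=5)
for url in ("https://example.org/a", "https://example.org/b"):
    with limiter:  # at most one request every 5 seconds
        print("fetching", url)
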
142 changes: 83 additions & 59 deletions src/program/scrapers/shared.py
@@ -1,15 +1,13 @@
"""Shared functions for scrapers."""

from datetime import datetime
from typing import Dict, Set
from typing import Dict, Set, Union

from program.media.item import Episode, MediaItem, Movie, Season, Show
from program.media.stream import Stream
from program.settings.manager import settings_manager
from program.settings.versions import models
from RTN import RTN, Torrent, sort_torrents
from RTN import RTN, ParsedData, Torrent, sort_torrents
from RTN.exceptions import GarbageTorrent
from utils.ignore import get_ignore_hashes
from program.media.state import States
from utils.logger import logger

settings_model = settings_manager.settings.ranking
@@ -37,12 +35,11 @@ def _parse_results(item: MediaItem, results: Dict[str, str]) -> Dict[str, Stream
torrents: Set[Torrent] = set()
processed_infohashes: Set[str] = set()
correct_title: str = item.get_top_title()
ignore_hashes: set = get_ignore_hashes()

logger.log("SCRAPER", f"Processing {len(results)} results for {item.log_string}")

if isinstance(item, Show):
needed_seasons = [season.number for season in item.seasons]
if item.type in ["show", "season", "episode"]:
needed_seasons: list[int] = _get_needed_seasons(item)

for infohash, raw_title in results.items():
if infohash in processed_infohashes:
@@ -53,79 +50,106 @@
raw_title=raw_title,
infohash=infohash,
correct_title=correct_title,
remove_trash=settings_manager.settings.ranking.options["remove_all_trash"]
remove_trash=settings_manager.settings.ranking.options["remove_all_trash"],
aliases=item.get_aliases(),
)

if not torrent or not torrent.fetch:
continue

if isinstance(item, Movie):
if not torrent.data.year or not hasattr(item, "aired_at") or not item.aired_at or item.aired_at > datetime.now():
if torrent.data.country and not item.is_anime:
if _get_item_country(item) != torrent.data.country:
if settings_manager.settings.scraping.parse_debug:
logger.debug(f"Skipping torrent for incorrect country with {item.log_string}: {raw_title}")
continue

year_range = [item.aired_at.year - 1, item.aired_at.year, item.aired_at.year + 1]
if torrent.data.year not in year_range:
if item.type in ["show", "season", "episode"]:
if torrent.data.complete:
torrents.add(torrent)
processed_infohashes.add(infohash)
continue

torrents.add(torrent)
if item.type == "movie":
if _check_item_year(item, torrent.data):
torrents.add(torrent)

elif isinstance(item, Show):
if not needed_seasons:
logger.error(f"No seasons found for {item.log_string}")
break
if (
hasattr(torrent.data, "seasons")
and len(torrent.data.seasons) >= (len(needed_seasons) - 1)
and (
not hasattr(torrent.data, "episodes")
or len(torrent.data.episodes) == 0
)
or torrent.data.complete
):
elif item.type == "show":
if torrent.data.seasons and not torrent.data.episodes:
# We subtract one because Trakt doesn't always index
# shows according to uploaders
if len(torrent.data.seasons) >= (len(needed_seasons) - 1):
torrents.add(torrent)

elif item.type == "season":
# If the torrent has the needed seasons and no episodes, we can add it
if any(season in torrent.data.seasons for season in needed_seasons) and not torrent.data.episodes:
torrents.add(torrent)

elif isinstance(item, Season):
elif item.type == "episode":
# If the torrent has the season and episode numbers, we can add it
if (
len(getattr(torrent.data, "seasons", [])) == 1
and item.number in torrent.data.seasons
and (
not hasattr(torrent.data, "episodes")
or len(torrent.data.episodes) == 0
)
or torrent.data.complete
item.number in torrent.data.episodes
and item.parent.number in torrent.data.seasons
):
torrents.add(torrent)

elif isinstance(item, Episode) and (
item.number in torrent.data.episodes
and (
not hasattr(torrent.data, "seasons")
or item.parent.number in torrent.data.seasons
)
or torrent.data.complete
):
torrents.add(torrent)
# Anime edge cases where no season number is present for single season shows
elif (
len(item.parent.parent.seasons) == 1
and not torrent.data.seasons
and item.number in torrent.data.episodes
):
torrents.add(torrent)
# If no episodes are present but the needed seasons are, we'll add it
elif any(
season in torrent.data.seasons
for season in needed_seasons
) and not torrent.data.episodes:
torrents.add(torrent)

processed_infohashes.add(infohash)

except (ValueError, AttributeError):
# logger.error(f"Failed to parse: '{raw_title}' - {e}")
except (ValueError, AttributeError) as e:
# The only thing I've seen show up here is titles with a date.
# Dates can sometimes be parsed incorrectly by the Arrow library,
# so we'll just ignore them.
if settings_manager.settings.scraping.parse_debug:
logger.debug(f"Skipping torrent: '{raw_title}' - {e}")
continue
except GarbageTorrent:
# logger.debug(f"Trashing torrent {infohash}: '{raw_title}'")
except GarbageTorrent as e:
if settings_manager.settings.scraping.parse_debug:
logger.debug(f"Trashing torrent for {item.log_string}: '{raw_title}'")
continue

if torrents:
logger.log("SCRAPER", f"Processed {len(torrents)} matches for {item.log_string}")
torrents = sort_torrents(torrents)
torrents_dict = {}
for torrent in torrents.values():
stream = Stream(torrent)
if torrent.infohash in ignore_hashes:
logger.debug(f"Marking Torrent {torrent.infohash} as blacklisted for item {item.log_string}")
item.blacklisted_streams.append(stream)
continue
torrents_dict[torrent.infohash] = stream
torrents_dict[torrent.infohash] = Stream(torrent)
return torrents_dict

return {}


# helper functions

def _check_item_year(item: MediaItem, data: ParsedData) -> bool:
"""Check if the year of the torrent is within the range of the item."""
year_range = [item.aired_at.year - 1, item.aired_at.year, item.aired_at.year + 1]
if item.type == "movie" and data.year:
return data.year in year_range
return False

def _get_item_country(item: MediaItem) -> str:
"""Get the country code for a country."""
if item.type == "season":
return item.parent.country.upper()
elif item.type == "episode":
return item.parent.parent.country.upper()
return item.country.upper()

def _get_needed_seasons(item: Union[Show, Season, Episode]) -> list[int]:
"""Get the seasons that are needed for the item."""
if item.type == "show":
return [season.number for season in item.seasons if season.last_state != States.Completed]
elif item.type == "season":
return [season.number for season in item.parent.seasons if season.last_state != States.Completed]
elif item.type == "episode":
return [season.number for season in item.parent.parent.seasons if season.last_state != States.Completed]
return []
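
Tying the section together: the aliases collected by the indexer flow into rtn.rank, so a release named under a localized or alias title can still match correct_title. A hedged sketch of that call; the SettingsModel/DefaultRanking import paths follow RTN's documented defaults but may vary by version, and the infohash is a dummy value:

from RTN import RTN
from RTN.exceptions import GarbageTorrent
from RTN.models import DefaultRanking, SettingsModel  # import paths assumed

rtn = RTN(settings=SettingsModel(), ranking_model=DefaultRanking())

try:
    torrent = rtn.rank(
        raw_title="Shingeki no Kyojin S04 1080p WEB-DL",
        infohash="0123456789abcdef0123456789abcdef01234567",  # dummy 40-char hash
        correct_title="Attack on Titan",
        remove_trash=True,
        # Alias titles, keyed by country as built by get_show_aliases,
        # let the parser accept releases that use a foreign name.
        aliases={"jp": ["Shingeki no Kyojin"]},
    )
    if torrent and torrent.fetch:
        print(torrent.data.seasons, torrent.data.episodes)
except GarbageTorrent:
    pass  # trash release, skipped just as _parse_results does
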