feat: add alias support in parsing when scraping torrents. several other tweaks.
dreulavelle committed Sep 19, 2024
1 parent 2f15fbd commit 365f022
Showing 10 changed files with 441 additions and 383 deletions.
571 changes: 294 additions & 277 deletions poetry.lock

Large diffs are not rendered by default.

25 changes: 11 additions & 14 deletions src/program/content/trakt.py
@@ -76,24 +76,21 @@ def missing(self):
def run(self):
"""Fetch media from Trakt and yield Movie, Show, or MediaItem instances."""

def fetch_items(fetch_function, *args):
"""Helper function to fetch items using the provided function and arguments."""
return fetch_function(*args) if args else []

watchlist_ids = fetch_items(self._get_watchlist, self.settings.watchlist)
collection_ids = fetch_items(self._get_collection, self.settings.collection)
user_list_ids = fetch_items(self._get_list, self.settings.user_lists)
trending_ids = fetch_items(self._get_trending_items) if self.settings.fetch_trending else []
popular_ids = fetch_items(self._get_popular_items) if self.settings.fetch_popular else []
watchlist_ids = self._get_watchlist(self.settings.watchlist) if self.settings.watchlist else []
collection_ids = self._get_collection(self.settings.collection) if self.settings.collection else []
user_list_ids = self._get_list(self.settings.user_lists) if self.settings.user_lists else []
trending_ids = self._get_trending_items() if self.settings.fetch_trending else []
popular_ids = self._get_popular_items() if self.settings.fetch_popular else []

# Combine all IMDb IDs and types into a set to avoid duplicates
all_ids = set(watchlist_ids + collection_ids + user_list_ids + trending_ids + popular_ids)

items_to_yield = [
MediaItem({"imdb_id": imdb_id, "requested_by": self.key})
for imdb_id in all_ids
if imdb_id.startswith("tt")
]
items_to_yield = []
for imdb_id, _ in all_ids:
items_to_yield.append(MediaItem({"imdb_id": imdb_id, "requested_by": self.key}))

if not items_to_yield:
return

non_existing_items = _filter_existing_items(items_to_yield)
new_non_recurring_items = [
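
A note on the run() refactor above: the fetch_items helper is gone in favor of explicit conditionals, and the new loop unpacks two values per entry, which suggests the helpers now return (imdb_id, type) tuples. A minimal sketch of the deduplication pattern, using plain dicts in place of MediaItem; the tuple shape and sample IDs are assumptions, not taken from the source:

# Hypothetical sample data; each helper is assumed to yield (imdb_id, type) tuples.
watchlist_ids = [("tt0111161", "movie"), ("tt0944947", "show")]
trending_ids = [("tt0111161", "movie")]  # deliberate duplicate

# A set of tuples collapses duplicates across all sources.
all_ids = set(watchlist_ids + trending_ids)

items_to_yield = [
    {"imdb_id": imdb_id, "requested_by": "trakt"}  # stand-in for MediaItem(...)
    for imdb_id, _ in all_ids
]
assert len(items_to_yield) == 2
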
25 changes: 25 additions & 0 deletions src/program/indexers/trakt.py
@@ -150,8 +150,10 @@ def _map_item_from_data(data, item_type: str, show_genres: List[str] = None) ->

match item_type:
case "movie":
item["aliases"] = get_show_aliases(item["imdb_id"], "movies")
return Movie(item)
case "show":
item["aliases"] = get_show_aliases(item["imdb_id"], "shows")
return Show(item)
case "season":
item["number"] = data.number
@@ -175,11 +177,34 @@ def _get_formatted_date(data, item_type: str) -> Optional[datetime]:

def get_show(imdb_id: str) -> dict:
"""Wrapper for trakt.tv API show method."""
if not imdb_id:
return {}
url = f"https://api.trakt.tv/shows/{imdb_id}/seasons?extended=episodes,full"
response = get(url, timeout=30, additional_headers={"trakt-api-version": "2", "trakt-api-key": CLIENT_ID})
return response.data if response.is_ok and response.data else {}


def get_show_aliases(imdb_id: str, item_type: str) -> dict:
"""Wrapper for the trakt.tv aliases endpoint; returns alias titles grouped by country."""
if not imdb_id:
return []
url = f"https://api.trakt.tv/{item_type}/{imdb_id}/aliases"
response = get(url, timeout=30, additional_headers={"trakt-api-version": "2", "trakt-api-key": CLIENT_ID})
if response.is_ok and response.data:
aliases = {}
for ns in response.data:
country = ns.country
title = ns.title
if title.startswith("Anime-"):
title = title[len("Anime-"):]
if country not in aliases:
aliases[country] = []
if title not in aliases[country]:
aliases[country].append(title)
return aliases
return {}


def create_item_from_imdb_id(imdb_id: str, type: str = None) -> Optional[MediaItem]:
"""Wrapper for trakt.tv API search method."""
url = f"https://api.trakt.tv/search/imdb/{imdb_id}?extended=full"
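
For context, here is a standalone sketch of what the new get_show_aliases does, using requests directly instead of the project's get wrapper. The endpoint URL and the per-entry title/country shape follow the diff above; client_id and the example IMDb id are placeholders:

import requests

def fetch_aliases(imdb_id: str, item_type: str, client_id: str) -> dict:
    """Group alias titles by country, mirroring get_show_aliases."""
    url = f"https://api.trakt.tv/{item_type}/{imdb_id}/aliases"
    resp = requests.get(
        url,
        timeout=30,
        headers={"trakt-api-version": "2", "trakt-api-key": client_id},
    )
    if not resp.ok:
        return {}
    aliases: dict = {}
    for entry in resp.json():
        # Same "Anime-" prefix stripping as the diff above.
        title = entry["title"].removeprefix("Anime-")
        aliases.setdefault(entry["country"], [])
        if title not in aliases[entry["country"]]:
            aliases[entry["country"]].append(title)
    return aliases

# fetch_aliases("tt0944947", "shows", client_id) might return
# something like {"us": ["Game of Thrones"], "jp": [...]}.
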
31 changes: 17 additions & 14 deletions src/program/media/item.py
@@ -42,6 +42,7 @@ class MediaItem(db.Model):
file: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
folder: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
alternative_folder: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
aliases: Mapped[Optional[dict]] = mapped_column(sqlalchemy.JSON, default={})
is_anime: Mapped[Optional[bool]] = mapped_column(sqlalchemy.Boolean, default=False)
title: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
imdb_id: Mapped[Optional[str]] = mapped_column(sqlalchemy.String, nullable=True)
@@ -121,6 +122,7 @@ def __init__(self, item: dict | None) -> None:
self.aired_at = item.get("aired_at")
self.year = item.get("year")
self.genres = item.get("genres", [])
self.aliases = item.get("aliases", {})

# Plex related
self.key = item.get("key")
@@ -155,13 +157,6 @@ def blacklist_stream(self, stream: Stream):
def is_released(self) -> bool:
"""Check if an item has been released."""
if self.aired_at and self.aired_at <= datetime.now():
# time_until_release = self.aired_at - datetime.now()
# days, seconds = time_until_release.days, time_until_release.seconds
# hours = seconds // 3600
# minutes = (seconds % 3600) // 60
# seconds = seconds % 60
# time_message = f"{self.log_string} will be released in {days} days, {hours:02}:{minutes:02}:{seconds:02}"
# logger.log("ITEM", time_message)
return True
return False

@@ -297,13 +292,21 @@ def set(self, key, value):

def get_top_title(self) -> str:
"""Get the top title of the item."""
match self.__class__.__name__:
case "Season":
return self.parent.title
case "Episode":
return self.parent.parent.title
case _:
return self.title
if self.type == "season":
return self.parent.title
elif self.type == "episode":
return self.parent.parent.title
else:
return self.title

def get_aliases(self) -> dict:
"""Get the aliases of the item."""
if self.type == "season":
return self.parent.aliases
elif self.type == "episode":
return self.parent.parent.aliases
else:
return self.aliases

def __hash__(self):
return hash(self.item_id)
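
The new get_aliases mirrors get_top_title: seasons and episodes climb to the parent show rather than storing aliases of their own. A minimal sketch of that traversal; the Node class is a hypothetical stand-in, not the SQLAlchemy model:

class Node:
    """Hypothetical stand-in for MediaItem/Show/Season/Episode."""
    def __init__(self, type, title=None, aliases=None, parent=None):
        self.type, self.title = type, title
        self.aliases, self.parent = aliases or {}, parent

def get_aliases(item: Node) -> dict:
    if item.type == "season":
        return item.parent.aliases         # season -> show
    elif item.type == "episode":
        return item.parent.parent.aliases  # episode -> season -> show
    return item.aliases

show = Node("show", title="Example Show", aliases={"jp": ["Reino Sho"]})
episode = Node("episode", parent=Node("season", parent=show))
assert get_aliases(episode) == {"jp": ["Reino Sho"]}
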
2 changes: 1 addition & 1 deletion src/program/scrapers/knightcrawler.py
@@ -110,7 +110,7 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
# This should help with Special episodes and other misc. names
stream_title = stream.title.split("\n")[:-1]
joined_title = "\n".join(stream_title)
raw_title = joined_title.split("/")[-1] if isinstance(item, Episode) else joined_title.split("\n")[0]
raw_title = joined_title.split("\n")[0]

torrents[stream.infoHash] = raw_title

13 changes: 3 additions & 10 deletions src/program/scrapers/orionoid.py
@@ -28,7 +28,7 @@ def __init__(self):
self.initialized = True
else:
return
self.second_limiter = RateLimiter(max_calls=1, period=5) if self.settings.ratelimit else None
self.second_limiter = RateLimiter(max_calls=1, period=5)
logger.success("Orionoid initialized!")

def validate(self) -> bool:
@@ -106,10 +106,7 @@ def run(self, item: MediaItem) -> Dict[str, str]:
try:
return self.scrape(item)
except RateLimitExceeded:
if self.second_limiter:
self.second_limiter.limit_hit()
else:
logger.warning(f"Orionoid rate limit exceeded for item: {item.log_string}")
self.second_limiter.limit_hit()
except ConnectTimeout:
logger.warning(f"Orionoid connection timeout for item: {item.log_string}")
except ReadTimeout:
@@ -167,10 +164,7 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict, int]:
imdb_id = item.parent.parent.imdb_id
url = self.construct_url("show", imdb_id, season=item.parent.number, episode=item.number)

if self.second_limiter:
with self.second_limiter:
response = get(url, timeout=self.timeout)
else:
with self.second_limiter:
response = get(url, timeout=self.timeout)

if not response.is_ok or not hasattr(response.data, "data"):
@@ -180,7 +174,6 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict, int]:
for stream in response.data.data.streams:
if not stream.file.hash or not stream.file.name:
continue

torrents[stream.file.hash] = stream.file.name

return torrents, len(response.data.data.streams)
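
With the ratelimit toggle removed, every Orionoid request now passes through second_limiter. A sketch of that context-manager flow using a stand-in RateLimiter; the project's own class (max_calls/period arguments, context-manager entry, limit_hit backoff) is assumed from the diff, and this stand-in only approximates it:

import time

class RateLimiter:
    """Stand-in limiter: allows max_calls per period, blocking on entry."""
    def __init__(self, max_calls: int, period: float):
        self.min_interval = period / max_calls
        self._last = 0.0
    def __enter__(self):
        wait = self.min_interval - (time.monotonic() - self._last)
        if wait > 0:
            time.sleep(wait)  # block until a slot opens
        self._last = time.monotonic()
        return self
    def __exit__(self, *exc):
        return False

limiter = RateLimiter(max_calls=1, period=5)
for url in ("https://example.org/a", "https://example.org/b"):
    with limiter:  # at most one request every 5 seconds
        print("fetching", url)
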
142 changes: 83 additions & 59 deletions src/program/scrapers/shared.py
@@ -1,15 +1,13 @@
"""Shared functions for scrapers."""

from datetime import datetime
from typing import Dict, Set
from typing import Dict, Set, Union

from program.media.item import Episode, MediaItem, Movie, Season, Show
from program.media.stream import Stream
from program.settings.manager import settings_manager
from program.settings.versions import models
from RTN import RTN, Torrent, sort_torrents
from RTN import RTN, ParsedData, Torrent, sort_torrents
from RTN.exceptions import GarbageTorrent
from utils.ignore import get_ignore_hashes
from program.media.state import States
from utils.logger import logger

settings_model = settings_manager.settings.ranking
@@ -37,12 +35,11 @@ def _parse_results(item: MediaItem, results: Dict[str, str]) -> Dict[str, Stream
torrents: Set[Torrent] = set()
processed_infohashes: Set[str] = set()
correct_title: str = item.get_top_title()
ignore_hashes: set = get_ignore_hashes()

logger.log("SCRAPER", f"Processing {len(results)} results for {item.log_string}")

if isinstance(item, Show):
needed_seasons = [season.number for season in item.seasons]
if item.type in ["show", "season", "episode"]:
needed_seasons: list[int] = _get_needed_seasons(item)

for infohash, raw_title in results.items():
if infohash in processed_infohashes:
@@ -53,79 +50,106 @@
raw_title=raw_title,
infohash=infohash,
correct_title=correct_title,
remove_trash=settings_manager.settings.ranking.options["remove_all_trash"]
remove_trash=settings_manager.settings.ranking.options["remove_all_trash"],
aliases=item.get_aliases(),
)

if not torrent or not torrent.fetch:
continue

if isinstance(item, Movie):
if not torrent.data.year or not hasattr(item, "aired_at") or not item.aired_at or item.aired_at > datetime.now():
if torrent.data.country and not item.is_anime:
if _get_item_country(item) != torrent.data.country:
if settings_manager.settings.scraping.parse_debug:
logger.debug(f"Skipping torrent for incorrect country with {item.log_string}: {raw_title}")
continue

year_range = [item.aired_at.year - 1, item.aired_at.year, item.aired_at.year + 1]
if torrent.data.year not in year_range:
if item.type in ["show", "season", "episode"]:
if torrent.data.complete:
torrents.add(torrent)
processed_infohashes.add(infohash)
continue

torrents.add(torrent)
if item.type == "movie":
if _check_item_year(item, torrent.data):
torrents.add(torrent)

elif isinstance(item, Show):
if not needed_seasons:
logger.error(f"No seasons found for {item.log_string}")
break
if (
hasattr(torrent.data, "seasons")
and len(torrent.data.seasons) >= (len(needed_seasons) - 1)
and (
not hasattr(torrent.data, "episodes")
or len(torrent.data.episodes) == 0
)
or torrent.data.complete
):
elif item.type == "show":
if torrent.data.seasons and not torrent.data.episodes:
# We subtract one because Trakt doesn't always index
# shows according to uploaders
if len(torrent.data.seasons) >= (len(needed_seasons) - 1):
torrents.add(torrent)

elif item.type == "season":
# If the torrent has the needed seasons and no episodes, we can add it
if any(season in torrent.data.seasons for season in needed_seasons) and not torrent.data.episodes:
torrents.add(torrent)

elif isinstance(item, Season):
elif item.type == "episode":
# If the torrent has the season and episode numbers, we can add it
if (
len(getattr(torrent.data, "seasons", [])) == 1
and item.number in torrent.data.seasons
and (
not hasattr(torrent.data, "episodes")
or len(torrent.data.episodes) == 0
)
or torrent.data.complete
item.number in torrent.data.episodes
and item.parent.number in torrent.data.seasons
):
torrents.add(torrent)

elif isinstance(item, Episode) and (
item.number in torrent.data.episodes
and (
not hasattr(torrent.data, "seasons")
or item.parent.number in torrent.data.seasons
)
or torrent.data.complete
):
torrents.add(torrent)
# Anime edge cases where no season number is present for single season shows
elif (
len(item.parent.parent.seasons) == 1
and not torrent.data.seasons
and item.number in torrent.data.episodes
):
torrents.add(torrent)
# If no episodes are present but the needed seasons are, we'll add it
elif any(
season in torrent.data.seasons
for season in needed_seasons
) and not torrent.data.episodes:
torrents.add(torrent)

processed_infohashes.add(infohash)

except (ValueError, AttributeError):
# logger.error(f"Failed to parse: '{raw_title}' - {e}")
except (ValueError, AttributeError) as e:
# The only thing I've seen show up here is titles with a date.
# Dates can sometimes be parsed incorrectly by the Arrow library,
# so we'll just ignore them.
if settings_manager.settings.scraping.parse_debug:
logger.debug(f"Skipping torrent: '{raw_title}' - {e}")
continue
except GarbageTorrent:
# logger.debug(f"Trashing torrent {infohash}: '{raw_title}'")
except GarbageTorrent as e:
if settings_manager.settings.scraping.parse_debug:
logger.debug(f"Trashing torrent for {item.log_string}: '{raw_title}'")
continue

if torrents:
logger.log("SCRAPER", f"Processed {len(torrents)} matches for {item.log_string}")
torrents = sort_torrents(torrents)
torrents_dict = {}
for torrent in torrents.values():
stream = Stream(torrent)
if torrent.infohash in ignore_hashes:
logger.debug(f"Marking Torrent {torrent.infohash} as blacklisted for item {item.log_string}")
item.blacklisted_streams.append(stream)
continue
torrents_dict[torrent.infohash] = stream
torrents_dict[torrent.infohash] = Stream(torrent)
return torrents_dict

return {}


# helper functions

def _check_item_year(item: MediaItem, data: ParsedData) -> bool:
"""Check if the year of the torrent is within the range of the item."""
year_range = [item.aired_at.year - 1, item.aired_at.year, item.aired_at.year + 1]
if item.type == "movie" and data.year:
return data.year in year_range
return False

def _get_item_country(item: MediaItem) -> str:
"""Get the country code for a country."""
if item.type == "season":
return item.parent.country.upper()
elif item.type == "episode":
return item.parent.parent.country.upper()
return item.country.upper()

def _get_needed_seasons(item: Union[Show, Season, Episode]) -> list[int]:
"""Get the seasons that are needed for the item."""
if item.type == "show":
return [season.number for season in item.seasons if season.last_state != States.Completed]
elif item.type == "season":
return [season.number for season in item.parent.seasons if season.last_state != States.Completed]
elif item.type == "episode":
return [season.number for season in item.parent.parent.seasons if season.last_state != States.Completed]
return []
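
Tying the section together: the aliases collected by the indexer flow into rtn.rank, so a release named under a localized or alias title can still match correct_title. A hedged sketch of that call; the SettingsModel/DefaultRanking import paths follow RTN's documented defaults but may vary by version, and the infohash is a dummy value:

from RTN import RTN
from RTN.exceptions import GarbageTorrent
from RTN.models import DefaultRanking, SettingsModel  # import paths assumed

rtn = RTN(settings=SettingsModel(), ranking_model=DefaultRanking())

try:
    torrent = rtn.rank(
        raw_title="Shingeki no Kyojin S04 1080p WEB-DL",
        infohash="0123456789abcdef0123456789abcdef01234567",  # dummy 40-char hash
        correct_title="Attack on Titan",
        remove_trash=True,
        # Alias titles, keyed by country as built by get_show_aliases,
        # let the parser accept releases that use a foreign name.
        aliases={"jp": ["Shingeki no Kyojin"]},
    )
    if torrent and torrent.fetch:
        print(torrent.data.seasons, torrent.data.episodes)
except GarbageTorrent:
    pass  # trash release, skipped just as _parse_results does
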