-
-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: Update linting config, refactor package
Skip without token
- Loading branch information
Showing
14 changed files
with
871 additions
and
397 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,111 +1 @@ | ||
import re | ||
|
||
from requests import session | ||
from bs4 import BeautifulSoup | ||
from feedgen.feed import FeedGenerator | ||
|
||
match_imdb = re.compile(r"^https?://www.imdb.com") | ||
match_tmdb = re.compile(r"^https?://www.themoviedb.org") | ||
|
||
base_url = "https://letterboxd.com/" | ||
|
||
s = session() | ||
|
||
|
||
def process(args):
    """Fetch a Letterboxd watchlist and write it out as an RSS feed.

    Accepts a full watchlist URL, a profile URL, or a bare username in
    ``args.letterboxd_url`` and normalizes it to
    ``https://letterboxd.com/<user>/watchlist/``.

    Args:
        args: Parsed CLI arguments providing ``letterboxd_url`` (str),
            ``max_length`` (int; values <= 0 mean "no limit"), and
            ``output`` (str, path of the RSS file to write).
    """
    watchlist_url = args.letterboxd_url.rstrip("/")
    if not watchlist_url.startswith("https://"):
        watchlist_url = f"{base_url}{watchlist_url}"
    if not watchlist_url.endswith("watchlist"):
        watchlist_url += "/watchlist"
    watchlist_url += "/"

    feedlen = args.max_length
    output_file = args.output

    # Get first page, gather general data
    r = s.get(watchlist_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Derive the feed title from the page's og:title, falling back to a
    # generic title; previously the placeholder was baked into the feed
    # before the real title was parsed, so the fetched title was unused.
    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
    page_title = (
        watchlist_title.attrs["content"]
        if watchlist_title is not None
        else "The Dude's Watchlist"
    )

    feed = FeedGenerator()
    feed.title(page_title)
    feed.id(watchlist_url)
    feed.link(href=watchlist_url, rel="alternate")
    feed.description(page_title + " from Letterboxd")

    m = soup.find("span", attrs={"class": "js-watchlist-count"})
    if m is not None:  # len(...) on a missing tag would raise TypeError
        total_movies = int(m.text.split()[0])
        print(f"Found a total of {total_movies} movies")

    paginator = soup.find_all("li", attrs={"class": "paginate-page"})
    page_count = int(paginator[-1].text) if paginator else 1

    movies_added = 0
    for page in range(1, page_count + 1):
        if page > 1:
            # watchlist_url already ends with "/" -- don't add another
            # slash before the page path.
            r = s.get(f"{watchlist_url}page/{page}/")
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            print()

        ul = soup.find("ul", attrs={"class": "poster-list"})
        movies = ul.find_all("li")
        print(f"Gathering on page {page} (contains {len(movies)} movies)\n")

        for movie in movies:
            # Update total counter
            movies_added += extract_metadata(movie, feed)
            if feedlen > 0 and movies_added >= feedlen:
                print("\nReached desired maximum feed length")
                break

        if feedlen > 0 and movies_added >= feedlen:
            break

    if movies_added > 0:
        print(f"Writing feed to {output_file}")
        feed.rss_file(output_file)
def extract_metadata(movie, feed):
    """Resolve one watchlist entry into a feed item.

    Fetches the film's Letterboxd page, extracts the title, an external
    (IMDb/TMDb) link, and the description, and appends an item to *feed*.

    Args:
        movie: A ``<li>`` Tag from the watchlist's poster list whose
            ``div`` carries a ``data-film-slug`` attribute.
        feed: The ``FeedGenerator`` to append the item to.

    Returns:
        1 if an item was added, 0 if the page could not be parsed.
    """
    movie_url = base_url + "film/" + movie.div.attrs["data-film-slug"]
    movie_page = s.get(movie_url)
    movie_soup = BeautifulSoup(movie_page.text, "html.parser")

    try:
        movie_title = movie_soup.find("meta", attrs={"property": "og:title"}).attrs[
            "content"
        ]
        print("Adding", movie_title)
        movie_link = movie_soup.find(
            "a", attrs={"href": [match_imdb, match_tmdb]}
        ).attrs["href"]
        if movie_link.endswith("/maindetails"):
            movie_link = movie_link[: -len("/maindetails")]
        movie_description = movie_soup.find(
            "meta", attrs={"property": "og:description"}
        )
        if movie_description is not None:
            # <meta> is a void element: its payload lives in the "content"
            # attribute; .text on it is always the empty string.
            movie_description = movie_description.attrs.get("content", "").strip()

        item = feed.add_item()
        item.title(movie_title)
        item.description(movie_description)
        item.link(href=movie_link, rel="alternate")
        item.guid(movie_link)

        return 1
    except Exception:
        # Best-effort scraping: skip entries whose pages we cannot parse.
        print("Parsing failed on", movie_url)

    return 0
# Package version (used by the CLI and the HTTP User-Agent string).
__version__ = "v0.3.0"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from __future__ import annotations | ||
|
||
from concurrent.futures import Future, ThreadPoolExecutor, wait | ||
from typing import TYPE_CHECKING, Dict, List, Optional | ||
|
||
from bs4 import BeautifulSoup | ||
from bs4.element import Tag | ||
|
||
from letterboxd_rss.feed import create_feed | ||
from letterboxd_rss.parsing import parse_page | ||
from letterboxd_rss.session import session | ||
from letterboxd_rss.utils import make_watchlist_url | ||
|
||
if TYPE_CHECKING: | ||
from feedgen.feed import FeedEntry | ||
|
||
|
||
def process(
    letterboxd_url: str,
    output_file: str,
    max_length: int,
) -> None:
    """Scrape a Letterboxd watchlist and write it to *output_file* as RSS.

    Pages through the watchlist, resolving each movie's details
    concurrently on a 4-worker thread pool, until *max_length* entries
    have been queued or no further pages exist.

    Args:
        letterboxd_url: Watchlist URL, profile URL, or bare username.
        output_file: Path of the RSS file to write.
        max_length: Maximum number of feed entries to collect.
    """
    page_title = "The Dude's Watchlist"
    watchlist_url = make_watchlist_url(letterboxd_url)
    next_url: Optional[str] = watchlist_url + "page/1/"
    remaining_count = max_length
    # Guard: if max_length <= 0 the loop never runs and no page is fetched.
    soup: Optional[BeautifulSoup] = None
    with ThreadPoolExecutor(max_workers=4) as pool:
        future_to_url: Dict[Future[FeedEntry], str] = {}

        while next_url and remaining_count > 0:
            r = session.get_and_raise(next_url)
            soup = BeautifulSoup(r.text, "html.parser")

            next_url, _futures = parse_page(soup, max_movies=remaining_count, pool=pool)
            future_to_url.update(_futures)
            remaining_count -= len(_futures)

        entries: List[FeedEntry] = []
        for future in wait(future_to_url).done:
            url = future_to_url[future]
            try:
                entry = future.result()
            except Exception as exc:
                # Best-effort: report the failed movie URL, keep the rest.
                print("%r generated an exception: %s" % (url, exc))
            else:
                entries.append(entry)

    # Derive the feed title from the last fetched page's og:title;
    # previously this dereferenced `soup` even when no page was fetched.
    if soup is not None:
        watchlist_title = soup.find("meta", attrs={"property": "og:title"})
        if isinstance(watchlist_title, Tag):
            page_title = watchlist_title.attrs["content"]

    if entries:
        create_feed(
            entries,
            page_title=page_title,
            watchlist_url=watchlist_url,
            output_file=output_file,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from letterboxd_rss import __version__

PROG_NAME = "letterboxd-rss"
# Identify ourselves to letterboxd.com, linking back to the project repo.
USER_AGENT = f"{PROG_NAME}/{__version__} (/~https://github.com/janw/{PROG_NAME})"

# Timeout (seconds) applied to every outgoing HTTP request.
REQUESTS_TIMEOUT = 30


# Root of all Letterboxd URLs (no trailing slash).
BASE_URL = "https://letterboxd.com"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from __future__ import annotations | ||
|
||
from typing import List | ||
|
||
from feedgen.feed import FeedEntry, FeedGenerator | ||
|
||
|
||
def create_feed(entries: List[FeedEntry], page_title: str, watchlist_url: str, output_file: str) -> None:
    """Assemble an RSS feed from pre-built entries and write it to disk.

    Args:
        entries: Feed entries to include, already fully populated.
        page_title: Title (and description prefix) for the feed.
        watchlist_url: Canonical watchlist URL used as feed id/link.
        output_file: Path the RSS XML is written to.
    """
    generator = FeedGenerator()
    generator.title(page_title)
    generator.id(watchlist_url)
    generator.link(href=watchlist_url, rel="alternate")
    generator.description(page_title + " from Letterboxd")

    for feed_entry in entries:
        generator.add_entry(feed_entry)

    print(f"Writing feed to {output_file}")
    generator.rss_file(output_file)
Oops, something went wrong.