Skip to content

Commit

Permalink
refactor: Update linting config, refactor package
Browse files Browse the repository at this point in the history
Skip without token#
  • Loading branch information
janw committed Oct 30, 2024
1 parent 401f189 commit dba4b3d
Show file tree
Hide file tree
Showing 14 changed files with 871 additions and 397 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/bump-version.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@ jobs:
bump-version:
uses: janw/workflows/.github/workflows/commitizen-bump-version.yaml@main
secrets:
personal-access-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
personal-access-token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}
gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
46 changes: 46 additions & 0 deletions .github/workflows/linters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,49 @@ on:
jobs:
commitizen:
uses: janw/workflows/.github/workflows/commitizen.yaml@main

# Runs the pre-commit hooks against the whole repository and commits any
# changes the hooks leave behind.
pre-commit:
  runs-on: ubuntu-latest
  steps:
    - name: Check out
      uses: actions/checkout@v3
      with:
        token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}

    - name: Install poetry
      run: pipx install poetry

    - name: Set up python environment
      # The id is required so the cache key below can resolve
      # steps.setup-python.outputs.python-version; without it that
      # expression evaluates to an empty string.
      id: setup-python
      uses: actions/setup-python@v5
      with:
        cache: poetry
        python-version: 3.x

    - name: Install dependencies
      run: poetry install --sync

    - id: cache-restore
      uses: actions/cache/restore@v4
      with:
        path: ~/.cache/pre-commit
        key: pre-commit-v0|${{ steps.setup-python.outputs.python-version }}|${{ hashFiles('.pre-commit-config.yaml') }}

    - run: poetry run pre-commit run --show-diff-on-failure --color=always --all-files
      shell: bash

    # Runs even when the hooks failed (always()), but not when the triggering
    # commit is itself an autofix commit -- presumably to avoid a commit loop.
    - uses: stefanzweifel/git-auto-commit-action@v5
      if: >
        always()
        && !startsWith(github.event.head_commit.message, 'build(autofix):')
      with:
        commit_message: "build(autofix): Auto-fix linting issues"
        commit_user_name: "Jan Willhaus [bot]"
        commit_user_email: "bot@janw.xyz"
        commit_author: Jan Willhaus [bot] <bot@janw.xyz>

    # Only save when the restore step did not report an exact cache hit.
    - id: cache-save
      uses: actions/cache/save@v4
      if: always() && steps.cache-restore.outputs.cache-hit != 'true'
      with:
        key: ${{ steps.cache-restore.outputs.cache-primary-key }}
        path: ~/.cache/pre-commit
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ ipython_config.py

# pyenv
.python-version
.tool-versions

# celery beat schedule file
celerybeat-schedule
Expand Down
34 changes: 23 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
ci:
autoupdate_commit_msg: 'build(pre-commit): pre-commit.ci autoupdate'
autoupdate_schedule: weekly
autofix_commit_msg: 'ci(pre-commit): auto fixes from pre-commit hooks'
autofix_prs: true

default_install_hook_types:
- pre-commit
default_stages:
- pre-commit
repos:
- repo: meta
hooks:
- id: check-hooks-apply

- repo: /~https://github.com/janw/pre-commit-hooks
rev: v0.1.0
hooks:
- id: sync_ruff_version

- repo: /~https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.6.9'
rev: 'v0.7.1'
hooks:
- id: ruff
args: [ --fix, --exit-non-zero-on-fix ]
Expand All @@ -28,4 +27,17 @@ repos:
- repo: /~https://github.com/python-poetry/poetry
rev: '1.8.0'
hooks:
- id: poetry-lock
args:
- --no-update
- id: poetry-check

- repo: local
hooks:
- id: mypy
name: mypy
entry: poetry run mypy
language: system
require_serial: true
pass_filenames: false
types: [python]
112 changes: 1 addition & 111 deletions letterboxd_rss/__init__.py
Original file line number Diff line number Diff line change
@@ -1,111 +1 @@
import re

from requests import session
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator

# Patterns for links that point at a film's IMDb / TMDB page. The dots are
# escaped so they match a literal "." instead of any character.
match_imdb = re.compile(r"^https?://www\.imdb\.com")
match_tmdb = re.compile(r"^https?://www\.themoviedb\.org")

# Root URL of the Letterboxd site. It ends with a slash, and the functions in
# this module append relative paths to it directly.
base_url = "https://letterboxd.com/"

# Module-level requests session used for the HTTP requests made in this module.
s = session()


def process(args):
    """Build an RSS feed from a Letterboxd watchlist and write it to disk.

    Args:
        args: Parsed command-line namespace providing ``letterboxd_url``
            (a full ``https://`` URL or a path appended to ``base_url``),
            ``max_length`` (maximum number of feed items; values <= 0 mean
            no limit) and ``output`` (path of the RSS file to write).

    Raises:
        requests.HTTPError: If a watchlist page responds with an error status.
    """
    # Normalise the input to ".../watchlist/" (always with a trailing slash).
    watchlist_url = args.letterboxd_url.rstrip("/")
    if not watchlist_url.startswith("https://"):
        watchlist_url = f"{base_url}{watchlist_url}"
    if not watchlist_url.endswith("watchlist"):
        watchlist_url += "/watchlist"
    watchlist_url += "/"

    feedlen = args.max_length
    output_file = args.output

    # Get first page, gather general data
    r = s.get(watchlist_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # Prefer the title advertised by the page; fall back to the default when
    # the og:title meta tag is missing or empty.
    page_title = "The Dude's Watchlist"
    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
    if watchlist_title is not None and watchlist_title.attrs.get("content"):
        page_title = watchlist_title.attrs["content"]

    # The feed is configured only after the first page was fetched so that the
    # scraped title is actually used.
    feed = FeedGenerator()
    feed.title(page_title)
    feed.id(watchlist_url)
    feed.link(href=watchlist_url, rel="alternate")
    feed.description(page_title + " from Letterboxd")

    # The counter is informational only; skip it when the element is absent.
    m = soup.find("span", attrs={"class": "js-watchlist-count"})
    if m is not None and m.text.strip():
        total_movies = int(m.text.split()[0])
        print(f"Found a total of {total_movies} movies")

    paginator = soup.find_all("li", attrs={"class": "paginate-page"})
    page_count = int(paginator[-1].text) if paginator else 1

    movies_added = 0
    limit_reached = False
    for page in range(1, page_count + 1):
        if page > 1:
            # watchlist_url already ends with "/", so no extra separator here.
            r = s.get(f"{watchlist_url}page/{page}/")
            r.raise_for_status()
            soup = BeautifulSoup(r.text, "html.parser")
            print()

        ul = soup.find("ul", attrs={"class": "poster-list"})
        # A page without a poster list is treated as containing no movies.
        movies = ul.find_all("li") if ul is not None else []
        movies_on_page = len(movies)

        print(f"Gathering on page {page} (contains {movies_on_page} movies)\n")

        for movie in movies:
            # Update total counter
            movies_added += extract_metadata(movie, feed)
            if feedlen > 0 and movies_added >= feedlen:
                print("\nReached desired maximum feed length")
                limit_reached = True
                break

        if limit_reached:
            break

    if movies_added > 0:
        print(f"Writing feed to {output_file}")
        feed.rss_file(output_file)


def extract_metadata(movie, feed):
    """Fetch one film's page and add it to *feed* as a new item.

    Args:
        movie: Watchlist list element whose inner ``div`` carries the
            ``data-film-slug`` attribute.
        feed: Feed object providing ``add_item()``; the new item is added to it.

    Returns:
        1 if an item was added to the feed, 0 if the film page could not be
        parsed (a message is printed in that case).
    """
    movie_url = base_url + "film/" + movie.div.attrs["data-film-slug"]
    movie_page = s.get(movie_url)
    movie_soup = BeautifulSoup(movie_page.text, "html.parser")

    # Deliberately best-effort: one film that cannot be parsed must not abort
    # the whole feed, so any failure below is reported and counted as 0.
    try:
        title_tag = movie_soup.find("meta", attrs={"property": "og:title"})
        movie_title = title_tag.attrs["content"]
        print("Adding", movie_title)

        link_tag = movie_soup.find("a", attrs={"href": [match_imdb, match_tmdb]})
        movie_link = link_tag.attrs["href"]
        if movie_link.endswith("/maindetails"):
            # Drop "maindetails" but keep the trailing slash before it.
            movie_link = movie_link[: -len("maindetails")]

        # A <meta> tag has no text content; its value is in the "content"
        # attribute.
        description_tag = movie_soup.find("meta", attrs={"property": "og:description"})
        movie_description = None
        if description_tag is not None:
            movie_description = description_tag.attrs.get("content", "").strip()

        # The item is only created once all required data was extracted, so a
        # parsing failure never leaves a half-filled item in the feed.
        item = feed.add_item()
        item.title(movie_title)
        if movie_description:
            item.description(movie_description)
        item.link(href=movie_link, rel="alternate")
        item.guid(movie_link)

        return 1
    except Exception:
        print("Parsing failed on", movie_url)

    return 0
__version__ = "v0.3.0"
57 changes: 57 additions & 0 deletions letterboxd_rss/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import TYPE_CHECKING, Dict, List, Optional

from bs4 import BeautifulSoup
from bs4.element import Tag

from letterboxd_rss.feed import create_feed
from letterboxd_rss.parsing import parse_page
from letterboxd_rss.session import session
from letterboxd_rss.utils import make_watchlist_url

if TYPE_CHECKING:
from feedgen.feed import FeedEntry


def process(
    letterboxd_url: str,
    output_file: str,
    max_length: int,
) -> None:
    """Collect watchlist entries and write them to an RSS file.

    Watchlist pages are fetched one after another while the per-film work
    submitted by ``parse_page`` runs on a thread pool. A feed is written only
    if at least one entry was produced.

    Args:
        letterboxd_url: Watchlist location, normalised by ``make_watchlist_url``.
        output_file: Path the RSS feed is written to.
        max_length: Maximum number of films to request; with a value <= 0 no
            page is fetched and no feed is written.
    """
    watchlist_url = make_watchlist_url(letterboxd_url)
    next_url: Optional[str] = watchlist_url + "page/1/"
    remaining_count = max_length
    # Stays None when the loop below never runs (e.g. max_length <= 0).
    soup: Optional[BeautifulSoup] = None
    entries: List[FeedEntry] = []

    with ThreadPoolExecutor(max_workers=4) as pool:
        future_to_url: Dict[Future[FeedEntry], str] = {}

        while next_url and remaining_count > 0:
            r = session.get_and_raise(next_url)
            soup = BeautifulSoup(r.text, "html.parser")

            # NOTE(review): parse_page is expected to return the next page URL
            # (falsy when there is none) and a mapping of futures to film
            # URLs -- confirm against letterboxd_rss.parsing.
            next_url, _futures = parse_page(soup, max_movies=remaining_count, pool=pool)
            future_to_url.update(_futures)
            remaining_count -= len(_futures)

        # Walk the futures in insertion order rather than the unordered set
        # returned by wait(), so the feed order is deterministic;
        # Future.result() blocks until that future has finished.
        for future, url in future_to_url.items():
            try:
                entry = future.result()
            except Exception as exc:
                print("%r generated an exception: %s" % (url, exc))
            else:
                entries.append(entry)

    # The title is taken from the last page that was fetched, if any.
    watchlist_title = soup.find("meta", attrs={"property": "og:title"}) if soup is not None else None
    page_title = watchlist_title.attrs["content"] if isinstance(watchlist_title, Tag) else "The Dude's Watchlist"

    if entries:
        create_feed(
            entries,
            page_title=page_title,
            watchlist_url=watchlist_url,
            output_file=output_file,
        )
18 changes: 11 additions & 7 deletions letterboxd_rss/__main__.py → letterboxd_rss/cli.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import sys
from __future__ import annotations

import argparse
from letterboxd_rss import process
from typing import List, Optional

from letterboxd_rss.base import process


def main(argv=None):
def main(argv: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
"letterboxd_url",
Expand All @@ -26,7 +29,8 @@ def main(argv=None):
help="Maximum number of watchlist items to keep in the feed",
)
args = parser.parse_args(argv)
process(args)


main(sys.argv[1:])
process(
letterboxd_url=args.letterboxd_url,
output_file=args.output,
max_length=args.max_length,
)
9 changes: 9 additions & 0 deletions letterboxd_rss/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from letterboxd_rss import __version__

# Program name; also the repository name used in the User-Agent URL below.
PROG_NAME = "letterboxd-rss"
# User-Agent value in the form "<name>/<version> (<repository URL>)".
USER_AGENT = f"{PROG_NAME}/{__version__} (https://github.com/janw/{PROG_NAME})"

# NOTE(review): presumably the timeout in seconds for HTTP requests -- confirm
# where this constant is consumed.
REQUESTS_TIMEOUT = 30


# Root URL of the Letterboxd site (no trailing slash).
BASE_URL = "https://letterboxd.com"
18 changes: 18 additions & 0 deletions letterboxd_rss/feed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from __future__ import annotations

from typing import List

from feedgen.feed import FeedEntry, FeedGenerator


def create_feed(entries: List[FeedEntry], page_title: str, watchlist_url: str, output_file: str) -> None:
    """Assemble an RSS feed from prepared entries and write it to a file.

    Args:
        entries: Feed entries to add, in the order given.
        page_title: Feed title; also used to build the feed description.
        watchlist_url: Used as both the feed id and its alternate link.
        output_file: Path the RSS document is written to.
    """
    generator = FeedGenerator()

    # Feed-level metadata.
    generator.title(page_title)
    generator.id(watchlist_url)
    generator.link(href=watchlist_url, rel="alternate")
    summary = f"{page_title} from Letterboxd"
    generator.description(summary)

    for item in entries:
        generator.add_entry(item)

    print(f"Writing feed to {output_file}")
    generator.rss_file(output_file)
Loading

0 comments on commit dba4b3d

Please sign in to comment.