diff --git a/poetry.lock b/poetry.lock index d906a06..3b1d20d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "annotated-types" @@ -951,6 +951,41 @@ html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] source = ["Cython (>=3.0.7)"] +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "more-itertools" version = "10.2.0" @@ -1345,6 +1380,21 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pygments" +version = "2.17.2" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pygments-2.17.2-py3-none-any.whl", hash = "sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c"}, + {file = "pygments-2.17.2.tar.gz", hash = "sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367"}, +] + +[package.extras] +plugins = ["importlib-metadata"] +windows-terminal = ["colorama (>=0.4.6)"] + [[package]] name = "pyproject-hooks" version = "1.0.0" @@ -1665,6 +1715,24 @@ files = [ [package.dependencies] requests = ">=2.0.1,<3.0.0" +[[package]] +name = "rich" +version = "13.7.1" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.7.1-py3-none-any.whl", hash = "sha256:4edbae314f59eb482f54e9e30bf00d33350aaa94f4bfcd4e9e3110e64d0d7222"}, + {file = "rich-13.7.1.tar.gz", hash = "sha256:9be308cb1fe2f1f57d67ce99e95af38a1e2bc71ad9813b0e247cf7ffbcc3a432"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + [[package]] name = "ruff" version = "0.3.4" @@ -1991,4 +2059,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "aa56e4f019a1ccda47f234b0c3d337e928c7690fbddbf4deff2a770540cfadfc" +content-hash = "e5c1864cf14a1dd79f9bac5ad89080be871154fd3efc9440e4b3de0df42402ae" diff --git a/pyproject.toml b/pyproject.toml index b336fc9..c11b946 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ courlan = "^1.0.0" trafilatura = "^1.8.0" httpx = "^0.27.0" hishel = "^0.0.24" +rich = "^13.7.1" [tool.poetry.group.dev.dependencies] black = "^24.3.0" diff --git a/scrapework/core/collector.py b/scrapework/core/collector.py index 4f93982..54b8b52 100644 --- a/scrapework/core/collector.py +++ b/scrapework/core/collector.py @@ -1,9 +1,19 @@ +import datetime from dataclasses import dataclass, field +from typing import List + + +@dataclass +class JobCollector: + url: str + duration: datetime.timedelta + items_count: int @dataclass class MetadataCollector: metadata: dict = field(default_factory=dict) + jobs: List[JobCollector] = field(default_factory=list) def set(self, key, value): self.metadata[key] = value diff --git a/scrapework/reporter.py b/scrapework/reporter.py index 9d5ccb7..a897664 100644 --- a/scrapework/reporter.py +++ b/scrapework/reporter.py @@ -1,6 +1,9 @@ +import datetime from abc import abstractmethod import httpx +from rich.console import Console +from rich.table import Table from scrapework.core.context import Context from scrapework.module import Module @@ -15,11 +18,35 @@ def report(self, ctx: Context): class LoggerReporter(Reporter): def report(self, ctx: Context): + duration = ctx.collector.get("duration") + duration_str = str(duration) + if isinstance(duration, datetime.timedelta): + duration_str = str(duration.total_seconds()) + self.logger.info( - f"Processed {ctx.collector.get('items_count')} items in {ctx.collector.get('duration')}s." + f"Processed {ctx.collector.get('items_count')} items in {duration_str} seconds." ) +class RichReporter(Reporter): + def report(self, ctx: Context): + + table = Table(title="Parsing Results") + table.add_column("URL", style="blue", no_wrap=True) + table.add_column("Duration", justify="right", style="magenta", no_wrap=True) + table.add_column("Items", justify="right", style="green", no_wrap=True) + + for job in ctx.collector.jobs: + table.add_row( + job.url, + str(job.duration.total_seconds()), + str(job.items_count), + ) + + console = Console() + console.print(table) + + class SlackReporter(Reporter): def __init__(self, webhook_url): self.webhook_url = webhook_url diff --git a/scrapework/scraper.py b/scrapework/scraper.py index 3ca24bb..184f0b6 100644 --- a/scrapework/scraper.py +++ b/scrapework/scraper.py @@ -5,7 +5,7 @@ from httpx import Response -from scrapework.core.collector import MetadataCollector +from scrapework.core.collector import JobCollector, MetadataCollector from scrapework.core.config import EnvConfig from scrapework.core.context import Context from scrapework.core.logger import Logger @@ -127,9 +127,10 @@ def run(self): items = [] + begin_time = datetime.datetime.now() while self.urls_to_visit: + iter_begin_time = datetime.datetime.now() url_with_callback = self.urls_to_visit.pop(0) - begin_time = datetime.datetime.now() response = self.make_request(ctx, url_with_callback.url) @@ -143,9 +144,19 @@ def run(self): self.visited_urls.append(url_with_callback.url) - items += list(url_with_callback.extract(ctx, response)) - - ctx.collector.set("items_count", len(items)) + new_items = list(url_with_callback.extract(ctx, response)) + items += new_items + + iter_end_time = datetime.datetime.now() + items_count = len(items) + ctx.collector.set("items_count", items_count) + ctx.collector.jobs.append( + JobCollector( + url=url_with_callback.url, + duration=iter_end_time - iter_begin_time, + items_count=len(new_items), + ) + ) for handler in self.handlers: handler.process_items(ctx, items) @@ -153,10 +164,10 @@ def run(self): end_time = datetime.datetime.now() ctx.collector.set("duration", end_time - begin_time) + self.logger.info("Scraping complete") for reporter in self.reporters: reporter.report(ctx) - self.logger.info("Scraping complete") def make_request(self, ctx: Context, url: str) -> Optional[Response]: request = Request(url=url, logger=self.logger)