Commit: add job details

sbusso committed Mar 25, 2024
1 parent 1d3edb1 commit 199c4df
Showing 5 changed files with 126 additions and 9 deletions.
72 changes: 70 additions & 2 deletions poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -19,6 +19,7 @@ courlan = "^1.0.0"
 trafilatura = "^1.8.0"
 httpx = "^0.27.0"
 hishel = "^0.0.24"
+rich = "^13.7.1"

 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
10 changes: 10 additions & 0 deletions scrapework/core/collector.py
@@ -1,9 +1,19 @@
+import datetime
 from dataclasses import dataclass, field
+from typing import List


+@dataclass
+class JobCollector:
+    url: str
+    duration: datetime.timedelta
+    items_count: int
+
+
 @dataclass
 class MetadataCollector:
     metadata: dict = field(default_factory=dict)
+    jobs: List[JobCollector] = field(default_factory=list)

     def set(self, key, value):
         self.metadata[key] = value
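For context, a minimal sketch of how the new data model fits together (a hypothetical snippet, not part of this commit): MetadataCollector keeps run-wide key/value metadata, while each scraped URL gets its own JobCollector entry.

    import datetime

    from scrapework.core.collector import JobCollector, MetadataCollector

    collector = MetadataCollector()
    collector.set("items_count", 42)  # run-wide metadata
    collector.jobs.append(
        JobCollector(
            url="https://example.com",  # illustrative URL
            duration=datetime.timedelta(seconds=1.5),
            items_count=42,
        )
    )
    print(collector.metadata, len(collector.jobs))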
29 changes: 28 additions & 1 deletion scrapework/reporter.py
@@ -1,6 +1,9 @@
+import datetime
 from abc import abstractmethod

 import httpx
+from rich.console import Console
+from rich.table import Table

 from scrapework.core.context import Context
 from scrapework.module import Module
@@ -15,11 +18,35 @@ def report(self, ctx: Context):

 class LoggerReporter(Reporter):
     def report(self, ctx: Context):
+        duration = ctx.collector.get("duration")
+        duration_str = str(duration)
+        if isinstance(duration, datetime.timedelta):
+            duration_str = str(duration.total_seconds())
+
         self.logger.info(
-            f"Processed {ctx.collector.get('items_count')} items in {ctx.collector.get('duration')}s."
+            f"Processed {ctx.collector.get('items_count')} items in {duration_str} seconds."
         )


+class RichReporter(Reporter):
+    def report(self, ctx: Context):
+
+        table = Table(title="Parsing Results")
+        table.add_column("URL", style="blue", no_wrap=True)
+        table.add_column("Duration", justify="right", style="magenta", no_wrap=True)
+        table.add_column("Items", justify="right", style="green", no_wrap=True)
+
+        for job in ctx.collector.jobs:
+            table.add_row(
+                job.url,
+                str(job.duration.total_seconds()),
+                str(job.items_count),
+            )
+
+        console = Console()
+        console.print(table)
+
+
 class SlackReporter(Reporter):
     def __init__(self, webhook_url):
         self.webhook_url = webhook_url
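A sketch of how the new reporter might be used. This assumes a reporter instance can simply be appended to the scraper's reporters list, which run() iterates at the end of a crawl; the actual registration API in scrapework may differ.

    from scrapework.reporter import RichReporter

    scraper = MyScraper()  # hypothetical Scraper subclass defined elsewhere
    scraper.reporters.append(RichReporter())  # run() calls report(ctx) on each reporter
    scraper.run()  # prints the "Parsing Results" table, one row per scraped URL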
23 changes: 17 additions & 6 deletions scrapework/scraper.py
@@ -5,7 +5,7 @@

 from httpx import Response

-from scrapework.core.collector import MetadataCollector
+from scrapework.core.collector import JobCollector, MetadataCollector
 from scrapework.core.config import EnvConfig
 from scrapework.core.context import Context
 from scrapework.core.logger import Logger
@@ -127,9 +127,10 @@ def run(self):

         items = []

+        begin_time = datetime.datetime.now()
         while self.urls_to_visit:
+            iter_begin_time = datetime.datetime.now()
             url_with_callback = self.urls_to_visit.pop(0)
-            begin_time = datetime.datetime.now()

             response = self.make_request(ctx, url_with_callback.url)

@@ -143,20 +144,30 @@

             self.visited_urls.append(url_with_callback.url)

-            items += list(url_with_callback.extract(ctx, response))
-
-            ctx.collector.set("items_count", len(items))
+            new_items = list(url_with_callback.extract(ctx, response))
+            items += new_items
+
+            iter_end_time = datetime.datetime.now()
+            items_count = len(items)
+            ctx.collector.set("items_count", items_count)
+            ctx.collector.jobs.append(
+                JobCollector(
+                    url=url_with_callback.url,
+                    duration=iter_end_time - iter_begin_time,
+                    items_count=len(new_items),
+                )
+            )

         for handler in self.handlers:
             handler.process_items(ctx, items)

         end_time = datetime.datetime.now()

         ctx.collector.set("duration", end_time - begin_time)
-        self.logger.info("Scraping complete")

         for reporter in self.reporters:
             reporter.report(ctx)
+        self.logger.info("Scraping complete")

     def make_request(self, ctx: Context, url: str) -> Optional[Response]:
         request = Request(url=url, logger=self.logger)
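The run() changes follow a common two-clock pattern: one wall-clock window around the whole crawl for the total duration, plus one per iteration so each job gets its own timing. A self-contained sketch of that pattern (illustrative names, independent of the codebase):

    import datetime
    import time

    begin_time = datetime.datetime.now()
    per_job = []
    for url in ["https://a.example", "https://b.example"]:
        iter_begin_time = datetime.datetime.now()
        time.sleep(0.1)  # stand-in for fetch + extract work
        per_job.append((url, datetime.datetime.now() - iter_begin_time))

    total = datetime.datetime.now() - begin_time
    for url, duration in per_job:
        print(f"{url}: {duration.total_seconds():.2f}s")
    print(f"total: {total.total_seconds():.2f}s")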
