diff --git a/pyproject.toml b/pyproject.toml
index 5454c4b..50474d3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapework"
-version = "0.4.1"
+version = "0.4.2"
 description = "simple scraping framework"
 authors = ["Stéphane Busso "]
 license = "MIT"
diff --git a/scrapework/core/context.py b/scrapework/core/context.py
index ebe5e74..adcdaf3 100644
--- a/scrapework/core/context.py
+++ b/scrapework/core/context.py
@@ -4,6 +4,7 @@
 from httpx import Response
 
 from scrapework.core.collector import MetadataCollector
+from scrapework.request import Request
 
 
 @dataclass
@@ -11,3 +12,10 @@ class Context:
     collector: MetadataCollector = field(default_factory=MetadataCollector)
     variables: Dict = field(default_factory=dict)
     response: Response | None = None
+    request: Request | None = None
+
+    def urljoin(self, url: str) -> str:
+        if not self.request:
+            return url
+        else:
+            return self.request.urljoin(url)
diff --git a/scrapework/core/http_client.py b/scrapework/core/http_client.py
new file mode 100644
index 0000000..8cb22fb
--- /dev/null
+++ b/scrapework/core/http_client.py
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+
+import httpx
+
+from scrapework.core.logger import Logger
+
+
+class HTTPClient(ABC):
+
+    @classmethod
+    @abstractmethod
+    def build_client(cls, **kwargs) -> httpx.Client:
+        pass
+
+
+class HttpxClient(HTTPClient):
+    @classmethod
+    def build_client(cls, **kwargs) -> httpx.Client:
+        Logger().get_logger().debug("Building httpx client")
+        return httpx.Client(**kwargs)
diff --git a/scrapework/request.py b/scrapework/request.py
index 1650f13..864c7a2 100644
--- a/scrapework/request.py
+++ b/scrapework/request.py
@@ -1,27 +1,10 @@
 import logging
-from abc import ABC, abstractmethod
 from typing import Any, Dict
 
 import httpx
 from httpx import URL, HTTPError, TimeoutException
 
-from scrapework.core.context import Context
-from scrapework.core.logger import Logger
-
-
-class HTTPClient(ABC):
-
-    @classmethod
-    @abstractmethod
-    def build_client(cls, ctx: Context, **kwargs) -> httpx.Client:
-        pass
-
-
-class HttpxClient(HTTPClient):
-    @classmethod
-    def build_client(cls, **kwargs) -> httpx.Client:
-        Logger().get_logger().debug("Building httpx client")
-        return httpx.Client(**kwargs)
+from scrapework.core.http_client import HTTPClient, HttpxClient
 
 
 class Request:
@@ -53,6 +36,9 @@ def __init__(self, url: str, **kwargs):
     class Config:
         arbitrary_types_allowed = True
 
+    def urljoin(self, url: str) -> str:
+        return str(URL(self.url).join(URL(url)))
+
     def fetch(self) -> httpx.Response:
         """
         Fetches the HTML content of a given URL.
diff --git a/scrapework/scraper.py b/scrapework/scraper.py
index 53d44dc..5d46ddc 100644
--- a/scrapework/scraper.py
+++ b/scrapework/scraper.py
@@ -149,8 +149,6 @@ def run(
 
             self.visited_urls.append(url_with_callback.url)
 
-            ctx.response = response
-
             new_items = list(url_with_callback.extract(ctx, Selector(response.text)))
             items += new_items
 
@@ -188,4 +186,7 @@ def make_request(self, ctx: Context, url: str) -> Optional[Response]:
 
         self.logger.info(f"Received response with status code {response.status_code}")
 
+        ctx.response = response
+        ctx.request = request
+
         return response