Skip to content

Commit

Permalink
add urljoin helper
Browse files: browse the repository at this point in the history
  • Loading branch information
sbusso committed Mar 27, 2024
1 parent 79d6d97 commit 144e24c
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 21 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scrapework"
version = "0.4.1"
version = "0.4.2"
description = "simple scraping framework"
authors = ["Stéphane Busso <stephane.busso@gmail.com>"]
license = "MIT"
Expand Down
8 changes: 8 additions & 0 deletions scrapework/core/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,18 @@
from httpx import Response

from scrapework.core.collector import MetadataCollector
from scrapework.request import Request


@dataclass
class Context:
    """Mutable bag of state carried through a scrape.

    Every field has a default, so a bare ``Context()`` is valid: a new
    ``MetadataCollector``, an empty ``variables`` dict, and no response or
    request attached.
    """

    # Built per instance through default_factory.
    collector: MetadataCollector = field(default_factory=MetadataCollector)
    # Free-form mapping; each instance gets its own empty dict.
    variables: Dict = field(default_factory=dict)
    # Both stay None until something assigns them.
    response: Response | None = None
    request: Request | None = None

    def urljoin(self, url: str) -> str:
        """Resolve ``url`` against the attached request, if there is one.

        Args:
            url: The URL (possibly relative) to resolve.

        Returns:
            The result of ``self.request.urljoin(url)`` when a request is
            attached; otherwise ``url`` unchanged, since there is no base
            to resolve against.
        """
        if self.request:
            return self.request.urljoin(url)
        return url
20 changes: 20 additions & 0 deletions scrapework/core/http_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from abc import ABC, abstractmethod

import httpx

from scrapework.core.logger import Logger


class HTTPClient(ABC):
    """Abstract interface for classes that build ``httpx.Client`` objects."""

    @classmethod
    @abstractmethod
    def build_client(cls, **kwargs) -> httpx.Client:
        """Build and return an HTTP client.

        Args:
            **kwargs: Client options; their interpretation is left to the
                concrete subclass.

        Returns:
            An ``httpx.Client`` in concrete subclasses. This abstract stub
            has no implementation and returns ``None`` if called directly.
        """
        ...


class HttpxClient(HTTPClient):
    """``HTTPClient`` implementation that produces plain ``httpx.Client`` objects."""

    @classmethod
    def build_client(cls, **kwargs) -> httpx.Client:
        """Log a debug message and construct an ``httpx.Client``.

        Args:
            **kwargs: Forwarded unchanged to the ``httpx.Client`` constructor.

        Returns:
            The newly constructed ``httpx.Client``.
        """
        log = Logger().get_logger()
        log.debug("Building httpx client")
        client = httpx.Client(**kwargs)
        return client
22 changes: 4 additions & 18 deletions scrapework/request.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,10 @@
import logging
from abc import ABC, abstractmethod
from typing import Any, Dict

import httpx
from httpx import URL, HTTPError, TimeoutException

from scrapework.core.context import Context
from scrapework.core.logger import Logger


class HTTPClient(ABC):
    """Abstract interface for classes that build ``httpx.Client`` objects."""

    @classmethod
    @abstractmethod
    def build_client(cls, ctx: Context, **kwargs) -> httpx.Client:
        """Build and return an HTTP client.

        Args:
            ctx: Scrape context; unused by this abstract stub.
            **kwargs: Client options; their interpretation is left to the
                concrete subclass.

        Returns:
            An ``httpx.Client`` in concrete subclasses. This abstract stub
            has no implementation and returns ``None`` if called directly.
        """
        # NOTE(review): the concrete HttpxClient in this module declares
        # build_client(cls, **kwargs) with no ``ctx`` parameter, so the two
        # signatures disagree.
        ...


class HttpxClient(HTTPClient):
    """``HTTPClient`` implementation that produces plain ``httpx.Client`` objects."""

    @classmethod
    def build_client(cls, **kwargs) -> httpx.Client:
        """Log a debug message and construct an ``httpx.Client``.

        Args:
            **kwargs: Forwarded unchanged to the ``httpx.Client`` constructor.

        Returns:
            The newly constructed ``httpx.Client``.
        """
        log = Logger().get_logger()
        log.debug("Building httpx client")
        client = httpx.Client(**kwargs)
        return client
from scrapework.core.http_client import HTTPClient, HttpxClient


class Request:
Expand Down Expand Up @@ -53,6 +36,9 @@ def __init__(self, url: str, **kwargs):
# Nested configuration class for the enclosing Request class.
# NOTE(review): `arbitrary_types_allowed` looks like a pydantic model option;
# confirm that the enclosing class is actually a pydantic model.
class Config:
arbitrary_types_allowed = True

def urljoin(self, url: str) -> str:
    """Resolve ``url`` against this request's own URL.

    Both values are wrapped in ``httpx.URL`` and combined with
    ``URL.join``.

    Args:
        url: The URL (possibly relative) to resolve.

    Returns:
        The joined URL converted to ``str``.
    """
    base = URL(self.url)
    target = URL(url)
    return str(base.join(target))

def fetch(self) -> httpx.Response:
"""
Fetches the HTML content of a given URL.
Expand Down
5 changes: 3 additions & 2 deletions scrapework/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,6 @@ def run(

self.visited_urls.append(url_with_callback.url)

ctx.response = response

new_items = list(url_with_callback.extract(ctx, Selector(response.text)))
items += new_items

Expand Down Expand Up @@ -188,4 +186,7 @@ def make_request(self, ctx: Context, url: str) -> Optional[Response]:

self.logger.info(f"Received response with status code {response.status_code}")

ctx.response = response
ctx.request = request

return response

0 comments on commit 144e24c

Please sign in to comment.