Skip to content

Commit

Permalink
fix cache
Browse files: browse the repository at this point in the history
  • Loading branch information
sbusso committed Mar 24, 2024
1 parent fd8116e commit 245802e
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 14 deletions.
14 changes: 10 additions & 4 deletions scrapework/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@

import hishel

from scrapework.context import Context
from scrapework.middleware import Middleware
from scrapework.request import HTTPClient, Request


class HishelClient(HTTPClient):
@classmethod
def build_client(cls, **kwargs) -> hishel.CacheClient:
def build_client(cls, ctx: Context, **kwargs) -> hishel.CacheClient:
ctx.logger.debug("Building cache http client.")
return hishel.CacheClient(**kwargs)


Expand All @@ -19,8 +21,8 @@ class CacheMiddleware(Middleware):
storage: Optional[hishel.FileStorage] = None
cache_dir: Optional[str] = None

def __init__(self, cache_dir: str, ttl: int = 3600):
super().__init__()
def __init__(self, context: Context, cache_dir: str, ttl: int = 3600):
super().__init__(context=context)
self.controller = hishel.Controller(
# Cache only GET and POST methods
cacheable_methods=["GET", "POST"],
Expand All @@ -42,7 +44,7 @@ def __init__(self, cache_dir: str, ttl: int = 3600):
serializer = hishel.PickleSerializer()

self.storage = hishel.FileStorage(
base_path=Path(cache_dir_path), check_ttl_every=ttl
serializer=serializer, base_path=Path(cache_dir_path), check_ttl_every=ttl
)

self.cache_dir = cache_dir
Expand All @@ -51,7 +53,11 @@ class Config:
arbitrary_types_allowed = True

def process_request(self, request: Request):
self.context.logger.debug(
f"Using cache middleware with cache dir: {self.cache_dir}"
)
request.cls_client = HishelClient
request.client_kwargs["controller"] = self.controller
request.client_kwargs["storage"] = self.storage
request.request_kwargs["extensions"] = {"force_cache": True}
return request
7 changes: 7 additions & 0 deletions scrapework/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List
from urllib.parse import urlencode

from scrapework.context import Context
from scrapework.request import Request


Expand All @@ -17,6 +18,12 @@ def __init__(self, url: str):


class Middleware(ABC):
context: Context

def __init__(self, context: Context) -> None:
self.context = context
self.context.logger.info(f"Using middleware: {self.__class__.__name__}")

@abstractmethod
def process_request(self, request: Request):
raise NotImplementedError
Expand Down
18 changes: 10 additions & 8 deletions scrapework/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,21 @@
import httpx
from httpx import HTTPError, TimeoutException

from scrapework.context import Context


class HTTPClient(ABC):

@classmethod
@abstractmethod
def build_client(cls, **kwargs) -> httpx.Client:
def build_client(cls, ctx: Context, **kwargs) -> httpx.Client:
pass


class HttpxClient(HTTPClient):
@classmethod
def build_client(cls, **kwargs) -> httpx.Client:
def build_client(cls, ctx: Context, **kwargs) -> httpx.Client:
ctx.logger.debug(f"Building httpx client with kwargs: {kwargs}")
return httpx.Client(**kwargs)


Expand All @@ -30,6 +33,7 @@ class Request:
retries: int = 0
cls_client: type[HTTPClient] = HttpxClient
client_kwargs: Dict[str, Any] = {}
request_kwargs: Dict[str, Any] = {}

def __init__(self, url: str, **kwargs):
self.url = url
Expand All @@ -41,6 +45,7 @@ def __init__(self, url: str, **kwargs):
self.retries = kwargs.get("retries", 0)
self.cls_client = kwargs.get("cls_client", HttpxClient)
self.client_kwargs = kwargs.get("client_kwargs", {})
self.request_kwargs = kwargs.get("request_kwargs", {})

class Config:
arbitrary_types_allowed = True
Expand All @@ -62,6 +67,7 @@ def fetch(self) -> httpx.Response:
else:
mounts = {}
client = self.cls_client.build_client(
ctx=Context(logger=self.logger, filename=""),
headers=self.headers,
timeout=self.timeout,
follow_redirects=self.follow_redirects,
Expand All @@ -70,13 +76,9 @@ def fetch(self) -> httpx.Response:
)
try:

request = client.build_request(
"GET",
response = client.get(
self.url,
)

response = client.send(
request,
**self.request_kwargs,
)

return response
Expand Down
5 changes: 3 additions & 2 deletions scrapework/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ class SpiderConfig(EnvConfig):
class Config:
arbitrary_types_allowed = True

def use(self, middleware: Middleware):
self.middlewares.append(middleware)
def use(self, cls: type[Middleware], **kwargs) -> None:
cls_instance = cls(context=self.context, **kwargs) # type: ignore
self.middlewares.append(cls_instance)

@abstractmethod
def extract(self, response) -> Union[Dict[str, Any], Iterable[Dict[str, Any]]]:
Expand Down

0 comments on commit 245802e

Please sign in to comment.