Commit 6fe91b7

make start urls a runtime parameter
sbusso committed Mar 26, 2024
1 parent a269d45 commit 6fe91b7
Showing 2 changed files with 8 additions and 16 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "scrapework"
-version = "0.3.3"
+version = "0.4.0"
 description = "simple scraping framework"
 authors = ["Stéphane Busso <stephane.busso@gmail.com>"]
 license = "MIT"
22 changes: 7 additions & 15 deletions scrapework/scraper.py
@@ -30,7 +30,7 @@ class Scraper(ABC):
     """Scraper base class"""
 
     name: ClassVar[str] = "base_scraper"
-    start_urls: List[str] = []
+    # start_urls: List[str] = []
     visited_urls: List[str] = []
     urls_to_visit: List[ExtractCallback] = []
     base_url: str = ""
@@ -42,7 +42,7 @@ class Scraper(ABC):
     handlers: List[Handler] = []
     middlewares: List[RequestMiddleware] = []
     reporters: List[Reporter] = []
-
+    modules: List[Module] = [LoggerReporter()]
     config: EnvConfig
 
     def __init__(self, **args):
@@ -52,17 +52,9 @@ def __init__(self, **args):
 
         self.config = self.SpiderConfig.create_config()
 
-        if not self.base_url and self.start_urls:
-            self.base_url = self.start_urls[0]
-
         if not self.filename:
             self.filename = f"{self.name}.json"
 
-        # start_urls
-        args_start_urls = args.get("start_urls")
-        if args_start_urls and isinstance(args_start_urls, list):
-            self.start_urls = args_start_urls
-
         self.logger = Logger(self.name).get_logger()
 
         self.configuration()
@@ -74,11 +66,11 @@ class Config:
         arbitrary_types_allowed = True
 
     def use_modules(self) -> List[Module]:
-        return [LoggerReporter()]
+        return []
 
     def configuration(self) -> None:
 
-        for module in self.use_modules():
+        for module in [*self.modules, *self.use_modules()]:
             self.use(module)
 
     def use(self, module: Module) -> None:
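
With this hunk, a scraper's modules come from two places: the class-level modules default (now [LoggerReporter()]) and whatever a subclass returns from use_modules(), which configuration() concatenates and registers via use(). A minimal sketch of a subclass under that contract; the import paths and the MyScraper class are assumptions for illustration, not part of this commit, and any abstract hooks Scraper may require are omitted:

    from typing import List

    from scrapework.scraper import Scraper          # module path shown in this diff
    from scrapework.module import Module            # assumed import path
    from scrapework.reporter import LoggerReporter  # assumed import path


    class MyScraper(Scraper):  # hypothetical subclass
        name = "my_scraper"

        def use_modules(self) -> List[Module]:
            # Registered after the class-level defaults, since
            # configuration() iterates [*self.modules, *self.use_modules()]
            # and calls self.use() on each.
            return [LoggerReporter()]
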
@@ -112,17 +104,17 @@ def to_visit(
 
         self.urls_to_visit.append(ExtractCallback(url, extract))
 
-    def run(self):
+    def run(self, start_urls: List[str]):
         self.logger.info("Scraping started")
         ctx = Context(
             variables=self.variables(),
             collector=MetadataCollector(),
         )
 
-        if not self.start_urls:
+        if not start_urls:
             raise ValueError("No start_urls provided")
 
-        for url in self.start_urls:
+        for url in start_urls:
             self.to_visit(url)
 
         items = []
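
Taken together with the __init__ hunk above, this moves start URLs from class state to a run() argument: the same scraper instance can be pointed at a different URL set on each run, and an empty list fails fast. A hedged usage sketch, reusing the hypothetical MyScraper from above and example.com/example.org as placeholder URLs:

    scraper = MyScraper()

    # Before this commit, URLs were declared on the class:
    #     class MyScraper(Scraper):
    #         start_urls = ["https://example.com"]
    #     scraper.run()

    # After this commit, each run supplies its own URLs:
    scraper.run(start_urls=["https://example.com"])
    scraper.run(start_urls=["https://example.org/news"])

    # An empty list raises ValueError("No start_urls provided").
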
