# web_to_text.py

import logging
import re
from typing import Any, List

import requests
# import pdfplumber  # Uncomment if you plan to work with PDF files
import streamlit as st

from scrap_data import Scraper
# Configure logging for the application: log level, message format, and date format
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Module-level logger for error and information messages
logger = logging.getLogger(__name__)


@st.cache_data  # Cache the extracted data so repeated runs with the same URL are fast
def extract_all_pages(url: str) -> List[Any]:
    """
    Extract all pages from a given URL by invoking the Scraper class from the 'scrap_data' module.

    Args:
        url (str): The URL to scrape data from.

    Returns:
        List[Any]: A list containing the data extracted from the pages.
    """
    # Initialize the scraper and call its post_scraper method to fetch the data
    data = Scraper(url).post_scraper()
    return data
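
# Note (an assumption about Streamlit's caching API, not part of the original
# module): functions wrapped with @st.cache_data are memoized by argument value,
# and the cached entry can be dropped with extract_all_pages.clear() when a
# fresh scrape of the same URL is needed.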


def is_valid_url(url: str) -> bool:
    """
    Validate the format of the provided URL and check that it is reachable.

    Args:
        url (str): The URL to validate.

    Returns:
        bool: True if the URL is well-formed and reachable; False otherwise.
    """
    # Basic URL shape check using a regular expression
    url_pattern = re.compile(
        r'^(https?://)?'       # optional scheme: http:// or https://
        r'([a-zA-Z0-9-]+\.)+'  # domain labels (e.g., example.)
        r'[a-zA-Z]{2,}'        # top-level domain (e.g., com, org)
        r'(/.*)?$'             # optional path
    )
    if not url_pattern.match(url):
        return False

    # requests needs an explicit scheme, so default to https:// when it is missing
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # Confirm the URL is reachable by sending a GET request
    try:
        response = requests.get(url, timeout=5)  # 5-second timeout
        return response.status_code == 200
    except requests.exceptions.RequestException:
        # Any request error (DNS failure, connection error, timeout) means invalid
        return False
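

# --- Usage sketch ---
# A minimal sketch of how the two helpers above might be wired into a Streamlit
# page, assuming this module is launched with `streamlit run web_to_text.py`.
# The page title, widget label, and main() wrapper are illustrative additions;
# only is_valid_url and extract_all_pages come from this module.
def main() -> None:
    st.title("Web to Text")  # hypothetical page title
    url = st.text_input("Enter a URL to extract text from")
    if url:
        if not is_valid_url(url):
            st.error("The URL appears to be malformed or unreachable.")
        else:
            pages = extract_all_pages(url)  # cached across reruns by st.cache_data
            st.write(pages)


if __name__ == "__main__":
    main()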