Skip to content

Commit

Permalink
[CssSelectorBridge] Improvements (#3537) (#3573)
Browse files Browse the repository at this point in the history
* [CssSelectorBridge] Improvements (#3537)

* Improve parameter documentation / add tooltips
* Allow extracting content from home page instead of article page
* Keep titles from home page when every page <title> is the same

* [CssSelectorBridge] Code linting

* [CssSelectorBridge] Code linting (2)

* [CssSelectorBridge] Code linting (3)
  • Loading branch information
ORelio authored Jul 26, 2023
1 parent 556bca5 commit 977c0db
Showing 1 changed file with 80 additions and 23 deletions.
103 changes: 80 additions & 23 deletions bridges/CssSelectorBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,40 @@ class CssSelectorBridge extends BridgeAbstract
],
'url_selector' => [
'name' => 'Selector for article links or their parent elements',
'title' => <<<EOT
This bridge works using CSS selectors, e.g. "a.article" will match all <a class="article"
href="URL">TITLE</a> on home page, each one being treated as a feed item. &#10;&#13;
Instead of just a link you can selet one of its parent element. Everything inside that
element becomes feed item content, e.g. image and summary present on home page.
When doing so, the first link inside the selected element becomes feed item URL/Title.
EOT,
'exampleValue' => 'a.article',
'required' => true
],
'url_pattern' => [
'name' => '[Optional] Pattern for site URLs to keep in feed',
'exampleValue' => 'https://example.com/article/.*',
'title' => 'Optionally filter items by applying a regular expression on their URL',
'exampleValue' => '/blog/article/.*',
],
'content_selector' => [
'name' => '[Optional] Selector to extract each article content',
'name' => '[Optional] Selector to expand each article content',
'title' => <<<EOT
When specified, the bridge will fetch each article from its URL
and extract content using the provided selector (Slower!)
EOT,
'exampleValue' => 'article.content',
],
'content_cleanup' => [
'name' => '[Optional] Content cleanup: List of items to remove',
'title' => 'Selector for unnecessary elements to remove inside article contents.',
'exampleValue' => 'div.ads, div.comments',
],
'title_cleanup' => [
'name' => '[Optional] Text to remove from expanded article title',
'title' => <<<EOT
When fetching each article page, feed item title comes from page title.
Specify here some text from page title that need to be removed, e.g. " | BlogName".
EOT,
'exampleValue' => ' | BlogName',
],
'limit' => self::LIMIT
Expand Down Expand Up @@ -69,7 +86,7 @@ public function collectData()

$html = defaultLinkTo(getSimpleHTMLDOM($url), $url);
$this->feedName = $this->getPageTitle($html, $title_cleanup);
$items = $this->htmlFindLinks($html, $url_selector, $url_pattern, $limit);
$items = $this->htmlFindEntries($html, $url_selector, $url_pattern, $limit, $content_cleanup);

if (empty($content_selector)) {
$this->items = $items;
Expand All @@ -79,7 +96,8 @@ public function collectData()
$item['uri'],
$content_selector,
$content_cleanup,
$title_cleanup
$title_cleanup,
$item['title']
);
}
}
Expand Down Expand Up @@ -127,41 +145,79 @@ protected function getPageTitle($page, $title_cleanup = null)
}

/**
 * Remove all elements matching the cleanup selector from HTML content.
 *
 * Accepts either an already-parsed DOM object or a raw HTML string. A string is
 * parsed with str_get_html(), cleaned, then serialized back to a string, so the
 * caller always gets back the same type it passed in.
 *
 * @param string|object $content HTML content as HTML object or string
 * @param string|null $cleanup_selector Optional selector for elements to remove,
 *                                      e.g. "div.ads, div.comments". Nothing is removed when empty.
 * @return string|object Cleaned content (same type as input). Content that cannot be
 *                       parsed into a DOM is returned unchanged.
 */
protected function cleanArticleContent($content, $cleanup_selector)
{
    $string_convert = is_string($content);
    $dom = $string_convert ? str_get_html($content) : $content;

    // str_get_html() can hand back false instead of a DOM (simple_html_dom does so
    // for empty or oversized input), and callers may pass null/false as content.
    // Calling find() on such a value would be a fatal error, so there is nothing
    // to clean: return the input untouched.
    if (!is_object($dom)) {
        return $content;
    }

    if (!empty($cleanup_selector)) {
        foreach ($dom->find($cleanup_selector) as $item_to_clean) {
            // Blanking outertext drops the element from the serialized output.
            $item_to_clean->outertext = '';
        }
    }

    // Preserve the input type: strings in, strings out.
    return $string_convert ? $dom->outertext : $dom;
}

/**
* Retrieve first N link+title+truncated-content from webpage URL or DOM satisfying the specified criteria
* @param string|object $page URL or DOM to retrieve feed items from
* @param string $url_selector DOM selector for matching links or their parent element
* @param string $url_pattern Optional filter to keep only links matching the pattern
* @param int $limit Optional maximum amount of URLs to return
* @return array of minimal feed items {'uri': entry_url, 'title', entry_title}
* @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
* @return array of items {'uri': entry_url, 'title': entry_title, ['content': when present in DOM] }
*/
protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit = 0)
protected function htmlFindEntries($page, $url_selector, $url_pattern = '', $limit = 0, $content_cleanup = null)
{
if (is_string($page)) {
$page = getSimpleHTMLDOM($page);
}

$links = $page->find($url_selector);

if (empty($links)) {
returnClientError('No results for URL selector');
}

$link_to_title = [];
$link_to_item = [];
foreach ($links as $link) {
$item = [];
if ($link->innertext != $link->plaintext) {
$item['content'] = $link->innertext;
}
if ($link->tag != 'a') {
$link = $link->find('a', 0);
}
$link_to_title[$link->href] = $link->plaintext;
$item['uri'] = $link->href;
$item['title'] = $link->plaintext;
if (isset($item['content'])) {
$item['content'] = convertLazyLoading($item['content']);
$item['content'] = defaultLinkTo($item['content'], $item['uri']);
$item['content'] = $this->cleanArticleContent($item['content'], $content_cleanup);
}
$link_to_item[$link->href] = $item;
}

$links = $this->filterUrlList(array_keys($link_to_title), $url_pattern, $limit);
$links = $this->filterUrlList(array_keys($link_to_item), $url_pattern, $limit);

if (empty($links)) {
returnClientError('No results for URL pattern');
}

$items = [];
foreach ($links as $link) {
$item = [];
$item['uri'] = $link;
$item['title'] = $link_to_title[$link];
$items[] = $item;
$items[] = $link_to_item[$link];
}

return $items;
Expand All @@ -173,9 +229,10 @@ protected function htmlFindLinks($page, $url_selector, $url_pattern = '', $limit
* @param string $content_selector HTML selector for extracting content, e.g. "article.content"
* @param string $content_cleanup Optional selector for removing elements, e.g. "div.ads, div.comments"
* @param string $title_cleanup Optional string to remove from article title, e.g. " | BlogName"
* @param string $title_default Optional title to use when could not extract title reliably
* @return array Entry data: uri, title, content
*/
protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null)
protected function expandEntryWithSelector($entry_url, $content_selector, $content_cleanup = null, $title_cleanup = null, $title_default = null)
{
if (empty($content_selector)) {
returnClientError('Please specify a content selector');
Expand All @@ -190,18 +247,18 @@ protected function expandEntryWithSelector($entry_url, $content_selector, $conte
returnClientError('Could not find content selector at URL: ' . $entry_url);
}

if (!empty($content_cleanup)) {
foreach ($article_content->find($content_cleanup) as $item_to_clean) {
$item_to_clean->outertext = '';
}
}

$article_content = convertLazyLoading($article_content);
$article_content = defaultLinkTo($article_content, $entry_url);
$article_content = $this->cleanArticleContent($article_content, $content_cleanup);

$article_title = $this->getPageTitle($entry_html, $title_cleanup);
if (!empty($title_default) && (empty($article_title) || $article_title === $this->feedName)) {
$article_title = $title_default;
}

$item = [];
$item['uri'] = $entry_url;
$item['title'] = $this->getPageTitle($entry_html, $title_cleanup);
$item['title'] = $article_title;
$item['content'] = $article_content;
return $item;
}
Expand Down

0 comments on commit 977c0db

Please sign in to comment.