Skip to content

Commit

Permalink
[Core] Add html/convertLazyLoading (+ document stripRecursiveHTMLSect…
Browse files Browse the repository at this point in the history
…ion) (#3157)

* [core] Add html/convertLazyLoading($dom)

Looks for lazy-loading attributes such as 'data-src' and converts
them back to regular ones such as 'src', easier for RSS readers.
It also converts <picture> elements to plain <img> elements.

* [core] Document html/stripRecursiveHTMLSection()

Add documentation for that function (no code changes).

* [WordPressBridge] Use convertLazyLoading()

* [WordPressBridge] Unwrap image figures

<img> inside <figure> may not display on RSS readers.
This converts them back to <img>, without losing caption if present.

* [ZDNet] Convert lazy loading images

* [core] html/stripRecursiveHTMLSection: Fix typo
  • Loading branch information
ORelio authored Nov 20, 2022
1 parent 2f7f13d commit d592e2c
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 15 deletions.
19 changes: 6 additions & 13 deletions bridges/WordPressBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -74,20 +74,8 @@ protected function parseItem($newItem)
}
}

// Convert lazy-loading images and iframes (videos...)
foreach ($article->find('img, iframe') as $img) {
if (!empty($img->getAttribute('data-src'))) {
$img->src = $img->getAttribute('data-src');
} elseif (!empty($img->getAttribute('data-srcset'))) {
$img->src = explode(' ', $img->getAttribute('data-srcset'))[0];
} elseif (!empty($img->getAttribute('data-lazy-src'))) {
$img->src = $img->getAttribute('data-lazy-src');
} elseif (!empty($img->getAttribute('srcset'))) {
$img->src = explode(' ', $img->getAttribute('srcset'))[0];
}
}

// Find article main image
$article = convertLazyLoading($article);
$article_image = $article_html->find('img.wp-post-image', 0);
if (!empty($item['content']) && (!is_object($article_image) || empty($article_image->src))) {
$article_image = str_get_html($item['content'])->find('img.wp-post-image', 0);
Expand All @@ -105,6 +93,11 @@ protected function parseItem($newItem)
}
}

// Unwrap image figures
foreach ($article->find('figure.wp-block-image') as $figure) {
$figure->outertext = $figure->innertext;
}

if (!is_null($article)) {
$item['content'] = $this->cleanContent($article->innertext);
$item['content'] = defaultLinkTo($item['content'], $item['uri']);
Expand Down
2 changes: 1 addition & 1 deletion bridges/ZDNetBridge.php
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ protected function parseItem($item)
$contents = stripWithDelimiters($contents, '<meta itemprop="image"', '>');
$contents = stripWithDelimiters($contents, '<svg class="svg-symbol', '</svg>');
$contents = trim(stripWithDelimiters($contents, '<section class="sharethrough-top', '</section>'));
$item['content'] = $contents;
$item['content'] = convertLazyLoading($contents);

return $item;
}
Expand Down
85 changes: 84 additions & 1 deletion lib/html.php
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,69 @@ function defaultLinkTo($dom, $url)
return $dom;
}

/**
 * Convert lazy-loading images and frames (video embeds) into static elements
 *
 * This function looks for lazy-loading attributes such as 'data-src' and converts
 * them back to regular ones such as 'src', making them loadable in RSS readers.
 * It also converts <picture> elements to plain <img> elements.
 *
 * Content that cannot be processed is returned unchanged: an empty string, a value
 * that is neither a string nor an object (e.g. null), or a string for which
 * str_get_html() does not return an object.
 *
 * @param string|object $dom The HTML content. Supports HTML objects or string objects
 * @return string|object Content with fixed image/frame URLs (same type as input).
 */
function convertLazyLoading($dom)
{
    // Nothing to convert: return the input as-is instead of failing on ->find() below
    if ($dom === '' || (!is_string($dom) && !is_object($dom))) {
        return $dom;
    }

    $string_convert = is_string($dom);
    if ($string_convert) {
        $parsed = str_get_html($dom);
        if (!is_object($parsed)) {
            // The parser rejected the input: keep the original string
            return $dom;
        }
        $dom = $parsed;
    }

    // Retrieve the first URL of a srcset-like value, e.g. "a.jpg 1x, b.jpg 2x" => "a.jpg"
    // Surrounding whitespace is ignored, and the comma separating candidates is dropped
    // when the first candidate has no descriptor, e.g. "a.jpg, b.jpg 2x"
    $first_srcset_url = function ($srcset) {
        $tokens = preg_split('/\s+/', trim($srcset), 2);
        return rtrim($tokens[0], ',');
    };

    // Process standalone images, embeds and picture sources
    foreach ($dom->find('img, iframe, source') as $img) {
        if (!empty($img->getAttribute('data-src'))) {
            $src = $img->getAttribute('data-src');
        } elseif (!empty($img->getAttribute('data-srcset'))) {
            $src = $first_srcset_url($img->getAttribute('data-srcset'));
        } elseif (!empty($img->getAttribute('data-lazy-src'))) {
            $src = $img->getAttribute('data-lazy-src');
        } elseif (!empty($img->getAttribute('srcset'))) {
            $src = $first_srcset_url($img->getAttribute('srcset'));
        } else {
            continue; // Proceed to next element without removing attributes
        }

        if ($src === '') {
            continue; // No usable URL found: leave the element untouched
        }
        $img->src = $src;

        // Remove attributes that are no longer necessary or that may override "src" on the client
        foreach (['loading', 'decoding', 'srcset', 'data-src', 'data-srcset', 'data-lazy-src'] as $attr) {
            if ($img->hasAttribute($attr)) {
                $img->removeAttribute($attr);
            }
        }
    }

    // Convert complex HTML5 pictures to plain, standalone images
    // <img> and <source> tags already have their "src" attribute set at this point,
    // so we replace the whole <picture> with a standalone <img> from within the <picture>
    foreach ($dom->find('picture') as $picture) {
        $img = $picture->find('img, source', 0);
        if (!empty($img)) {
            if ($img->tag == 'source') {
                $img->tag = 'img';
            }
            // Adding/removing node would change its position inside the parent element,
            // so instead we rewrite the node in-place through the outertext attribute
            $picture->outertext = $img->outertext;
        }
    }

    $html = $dom->outertext;
    if ($string_convert) {
        return $html;
    }

    // If the expected return type is object, reload the DOM to make sure
    // all $picture->outertext rewritten above are converted back to objects.
    // Should the reload fail, fall back to the processed object rather than a non-object
    $reloaded = str_get_html($html);
    return is_object($reloaded) ? $reloaded : $dom;
}

/**
* Extract the first part of a string matching the specified start and end delimiters
*
Expand Down Expand Up @@ -245,27 +308,47 @@ function stripWithDelimiters($string, $start, $end)
* @param string $tag_start Start of the HTML tag to remove, e.g. `<div class="ads">`
* @return string Cleaned String, e.g. `foobar`
*
* @todo This function needs more documentation to make it maintainable.
* This function works by locating the desired tag start, then finding the appropriate
* end by counting opening and ending tags until the amount of open tags reaches zero:
*
* ```
* Amount of open tags:
* 1 2 1 0
* |---------------||---| |----| |----|
* <div class="ads"><div>ads</div>ads</div>bar
* | <-------- Section to remove -------> |
* ```
*/
function stripRecursiveHTMLSection($string, $tag_name, $tag_start)
{
$open_tag = '<' . $tag_name;
$close_tag = '</' . $tag_name . '>';
$close_tag_length = strlen($close_tag);

// Make sure the provided $tag_start argument matches the provided $tag_name argument
if (strpos($tag_start, $open_tag) === 0) {
// While tag_start is present, there is at least one remaining section to remove
while (strpos($string, $tag_start) !== false) {
// In order to locate the end of the section, we attempt each closing tag until we find the right one
// We know we found the right one when the amount of "<tag" is the same as amount of "</tag"
// When the attempted "</tag" is not the correct one, we increase $search_offset to skip it
// and retry unless $max_recursion is reached (prevents infinite loop on malformed HTML)
$max_recursion = 100;
$section_to_remove = null;
$section_start = strpos($string, $tag_start);
$search_offset = $section_start;
do {
$max_recursion--;
// Move on to the next occurrence of "</tag"
$section_end = strpos($string, $close_tag, $search_offset);
$search_offset = $section_end + $close_tag_length;
// If the next "</tag" is the correct one, then this is the section we must remove:
$section_to_remove = substr($string, $section_start, $section_end - $section_start + $close_tag_length);
// Count amount of "<tag" and "</tag" in the section to remove
$open_tag_count = substr_count($section_to_remove, $open_tag);
$close_tag_count = substr_count($section_to_remove, $close_tag);
} while ($open_tag_count > $close_tag_count && $max_recursion > 0);
// We exited the loop, let's remove the section
$string = str_replace($section_to_remove, '', $string);
}
}
Expand Down

0 comments on commit d592e2c

Please sign in to comment.