From 547829f971d21422d3aacb115ccc7a9f80f4d0a6 Mon Sep 17 00:00:00 2001 From: ORelio Date: Fri, 29 Oct 2021 22:24:19 +0200 Subject: [PATCH] [FuturaSciences] Improve content extraction (#2317) - Fix tracking removal in URL - Fix images broken due to new lazy loading mechanism - Remove headline, articles do not have it anymore - Improve article cleanup --- bridges/FuturaSciencesBridge.php | 64 ++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/bridges/FuturaSciencesBridge.php b/bridges/FuturaSciencesBridge.php index 79c0588098c..4a8674f3db0 100644 --- a/bridges/FuturaSciencesBridge.php +++ b/bridges/FuturaSciencesBridge.php @@ -85,7 +85,7 @@ public function collectData(){ protected function parseItem($newsItem){ $item = parent::parseItem($newsItem); - $item['uri'] = str_replace('#xtor=RSS-8', '', $item['uri']); + $item['uri'] = str_replace('#xtor%3DRSS-8', '', $item['uri']); $article = getSimpleHTMLDOMCached($item['uri']) or returnServerError('Could not request Futura-Sciences: ' . $item['uri']); $item['content'] = $this->extractArticleContent($article); @@ -96,31 +96,47 @@ protected function parseItem($newsItem){ } private function extractArticleContent($article){ - $contents = $article->find('section.article-text', 1)->innertext; - $headline = trim($article->find('p.description', 0)->plaintext); - if(!empty($headline)) - $headline = '

' . $headline . '

'; + $contents = $article->find('section.article-text', 1); - foreach (array( - '
find('img') as $img) { + if(!empty($img->getAttribute('data-src'))) { + $img->src = $img->getAttribute('data-src'); + } } + foreach($contents->find('a.tooltip-link') as $a) { + $a->outertext = $a->plaintext; + } + + foreach(array( + 'clear', + 'sharebar2', + 'diaporamafullscreen', + 'module.social-button', + 'module.social-share', + 'ficheprevnext', + 'addthis_toolbox', + 'noprint', + 'hubbottom', + 'hubbottom2' + ) as $div_class_remove) { + foreach($contents->find('div.' . $div_class_remove) as $div) { + $keep_div = false; + foreach(array( + 'didyouknow' + ) as $div_class_dont_remove) { + if(strpos($div->getAttribute('class'), $div_class_dont_remove) !== false) { + $keep_div = true; + } + } + if(!$keep_div) { + $div->outertext = ''; + } + } + } + + $contents = $contents->innertext; + $contents = stripWithDelimiters($contents, '
'); $contents = stripWithDelimiters($contents, '