[ClacoForm] strip HTML in export

claroline · Mar 24, 2023 · 5ebccc3 · 5ebccc3
1 parent dc796c4
commit 5ebccc3
Show file tree

Hide file tree

Showing 19 changed files with 189 additions and 693 deletions.
diff --git a/src/main/app/Resources/modules/intl/date/index.js b/src/main/app/Resources/modules/intl/date/index.js
@@ -186,10 +186,6 @@ function getTimeDiff(startDate, endDate) {
   return moment.duration(diff).asSeconds()
 }
 
-function nowAdd(addition, local = true) {
-  return local ? moment().utc().local().add(addition).format(getApiFormat()) : moment().utc().add(addition).format(getApiFormat())
-}
-
 export {
   getApiFormat,
   getDisplayFormat,
@@ -201,7 +197,6 @@ export {
   dateToDisplayFormat,
   computeElapsedTime,
   getTimeDiff,
-  nowAdd,
   displayDuration,
   displayTime,
   displayDateRange

diff --git a/src/main/core/Library/Normalizer/TextNormalizer.php b/src/main/core/Library/Normalizer/TextNormalizer.php
@@ -4,15 +4,8 @@
 
 class TextNormalizer
 {
-    /**
-     * @param $string
-     *
-     * @return string
-     */
-    public static function stripDiacritics($string)
+    public static function stripDiacritics(string $string): string
     {
-        $string = (string) $string;
-
         if (!preg_match('/[\x80-\xff]/', $string)) {
             return $string;
         }
@@ -82,9 +75,8 @@ public static function stripDiacritics($string)
             // grave accent
             'Ǜ' => 'U', 'ǜ' => 'u',
         ];
-        $string = str_replace(array_keys($transliteration), array_values($transliteration), $string);
 
-        return $string;
+        return str_replace(array_keys($transliteration), array_values($transliteration), $string);
     }
 
     public static function toKey($string, int $length = null)
@@ -118,6 +110,9 @@ public static function toUtf8(string $string): string
         return $string;
     }
 
+    /**
+     * Converts a string to UTF-8 and normalizes its line endings to PHP_EOL.
+     */
     public static function sanitize(string $string): string
     {
         // If encoding not UTF-8 then convert it to UTF-8
@@ -126,6 +121,7 @@ public static function sanitize(string $string): string
         // normalize end of lines
         $string = str_replace("\r\n", PHP_EOL, $string);
         $string = str_replace("\r", PHP_EOL, $string);
+        $string = str_replace("\n", PHP_EOL, $string);
 
         return $string;
     }
@@ -135,87 +131,127 @@ public static function sanitize(string $string): string
      */
     public static function resumeHtml(string $text, int $nbCharacter, ?string $readMoreText = ''): string
     {
-        if (is_numeric($nbCharacter)) {
-            $lengthBeforeWithoutHtml = strlen(trim(strip_tags($text)));
-            $htmlSplitMask = '#</?([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?>#';
-            $htmlMatchMask = '#<(?:/([a-zA-Z1-6]+)|([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?)>#';
+        $lengthBeforeWithoutHtml = strlen(trim(strip_tags($text)));
+        $htmlSplitMask = '#</?([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?>#';
+        $htmlMatchMask = '#<(?:/([a-zA-Z1-6]+)|([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?)>#';
+        $text .= ' ';
+        $textPieces = preg_split($htmlSplitMask, $text, -1, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_NO_EMPTY);
+        $pieceNumber = count($textPieces);
+
+        if (1 === $pieceNumber) {
             $text .= ' ';
-            $textPieces = preg_split($htmlSplitMask, $text, -1, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_NO_EMPTY);
-            $pieceNumber = count($textPieces);
+            $lengthBefore = strlen($text);
+            $text = substr($text, 0, strpos($text, ' ', $lengthBefore > $nbCharacter ? $nbCharacter : $lengthBefore));
 
-            if (1 === $pieceNumber) {
-                $text .= ' ';
-                $lengthBefore = strlen($text);
-                $text = substr($text, 0, strpos($text, ' ', $lengthBefore > $nbCharacter ? $nbCharacter : $lengthBefore));
+            if ('' != $readMoreText && $lengthBefore > $nbCharacter) {
+                $text .= $readMoreText;
+            }
+        } else {
+            $length = 0;
+            $indexLastPiece = $pieceNumber - 1;
+            $position = $textPieces[$indexLastPiece][1] + strlen($textPieces[$indexLastPiece][0]) - 1;
+            $indexPiece = $indexLastPiece;
+            $searchSpace = true;
 
-                if ('' != $readMoreText && $lengthBefore > $nbCharacter) {
-                    $text .= $readMoreText;
-                }
-            } else {
-                $length = 0;
-                $indexLastPiece = $pieceNumber - 1;
-                $position = $textPieces[$indexLastPiece][1] + strlen($textPieces[$indexLastPiece][0]) - 1;
-                $indexPiece = $indexLastPiece;
-                $searchSpace = true;
-
-                foreach ($textPieces as $index => $bout) {
-                    $length += strlen($bout[0]);
-                    if ($length >= $nbCharacter) {
-                        $positionEndPiece = $bout[1] + strlen($bout[0]) - 1;
-                        $position = $positionEndPiece - ($length - $nbCharacter);
-
-                        $positionSpace = strpos($bout[0], ' ', $position - $bout[1]);
-                        if (false !== $positionSpace) {
-                            $position = $bout[1] + $positionSpace;
-                            $searchSpace = false;
-                        }
-                        if ($index != $indexLastPiece) {
-                            $indexPiece = $index + 1;
-                        }
-                        break;
+            foreach ($textPieces as $index => $bout) {
+                $length += strlen($bout[0]);
+                if ($length >= $nbCharacter) {
+                    $positionEndPiece = $bout[1] + strlen($bout[0]) - 1;
+                    $position = $positionEndPiece - ($length - $nbCharacter);
+
+                    $positionSpace = strpos($bout[0], ' ', $position - $bout[1]);
+                    if (false !== $positionSpace) {
+                        $position = $bout[1] + $positionSpace;
+                        $searchSpace = false;
                     }
+                    if ($index != $indexLastPiece) {
+                        $indexPiece = $index + 1;
+                    }
+                    break;
                 }
+            }
 
-                if (true === $searchSpace) {
-                    for ($i = $indexPiece; $i <= $indexLastPiece; ++$i) {
-                        $position = $textPieces[$i][1];
-                        $positionSpace = strpos($textPieces[$i][0], ' ');
-                        if (false !== $positionSpace) {
-                            $position += $positionSpace;
-                            break;
-                        }
+            if (true === $searchSpace) {
+                for ($i = $indexPiece; $i <= $indexLastPiece; ++$i) {
+                    $position = $textPieces[$i][1];
+                    $positionSpace = strpos($textPieces[$i][0], ' ');
+                    if (false !== $positionSpace) {
+                        $position += $positionSpace;
+                        break;
                     }
                 }
+            }
 
-                $text = substr($text, 0, $position);
-                preg_match_all($htmlMatchMask, $text, $return, PREG_OFFSET_CAPTURE);
-                $tagPieces = [];
+            $text = substr($text, 0, $position);
+            preg_match_all($htmlMatchMask, $text, $return, PREG_OFFSET_CAPTURE);
+            $tagPieces = [];
 
-                foreach ($return[0] as $index => $tag) {
-                    if (isset($return[3][$index][0])) {
-                        continue;
-                    }
-                    if ('/' != $return[0][$index][0][1]) {
-                        array_unshift($tagPieces, $return[2][$index][0]);
-                    } else {
-                        array_shift($tagPieces);
-                    }
+            foreach ($return[0] as $index => $tag) {
+                if (isset($return[3][$index][0])) {
+                    continue;
                 }
-
-                if (!empty($tagPieces)) {
-                    foreach ($tagPieces as $tag) {
-                        $text .= '</'.$tag.'>';
-                    }
+                if ('/' != $return[0][$index][0][1]) {
+                    array_unshift($tagPieces, $return[2][$index][0]);
+                } else {
+                    array_shift($tagPieces);
                 }
+            }
 
-                if ('' != $readMoreText && $lengthBeforeWithoutHtml > $nbCharacter) {
-                    $text .= 'SuspensionPoint';
-                    $pattern = '#((</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>)[\n\t\r ]*SuspensionPoint)#i';
-                    $text = preg_replace($pattern, $readMoreText.'${2}${3}${4}${5}${6}', $text);
+            if (!empty($tagPieces)) {
+                foreach ($tagPieces as $tag) {
+                    $text .= '</'.$tag.'>';
                 }
             }
+
+            if ('' != $readMoreText && $lengthBeforeWithoutHtml > $nbCharacter) {
+                $text .= 'SuspensionPoint';
+                $pattern = '#((</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>)[\n\t\r ]*SuspensionPoint)#i';
+                $text = preg_replace($pattern, $readMoreText.'${2}${3}${4}${5}${6}', $text);
+            }
         }
 
         return $text;
     }
+
+    /**
+     * Converts an HTML fragment into single-line plain text.
+     *
+     * The input is first passed through self::sanitize(), then every tag is removed and
+     * each run of whitespace is collapsed into a single space.
+     *
+     * When $preserveMedia is true, media elements are replaced by a bracketed placeholder
+     * before the tags are stripped: `<img src="a.png">` becomes `[img src="a.png"]`, and each
+     * `<video>`/`<audio>` element becomes `[video ...]`/`[audio ...]` listing every `src` found
+     * on the element itself and inside it (e.g. on nested `<source>` tags).
+     *
+     * @param string    $htmlStr       the HTML to convert
+     * @param bool|null $preserveMedia whether to keep a textual trace of media elements
+     *
+     * @return string the trimmed plain text
+     */
+    public static function stripHtml(string $htmlStr, ?bool $preserveMedia = false): string
+    {
+        $csvStr = self::sanitize($htmlStr);
+        if ($preserveMedia) {
+            $csvStr = strip_tags($csvStr, '<img><embed><video><audio><source>');
+            // On Image and Embed objects, keep src.
+            // preg_* functions return null on failure; fall back to the previous value in that case.
+            $csvStr = preg_replace(
+                '/<(img|embed)([^>]+src=[\'"]([^\'"]+)[\'"])*[^\/>]*\/?>/i',
+                '[$1 src="$3"]',
+                $csvStr
+            ) ?? $csvStr;
+            // On Video and Audio keep sources.
+            // The element content is matched lazily so that each element is converted on its own;
+            // a greedy match would span from the first opening tag to the last closing tag and
+            // swallow any text located between two media elements.
+            $csvStr = preg_replace_callback(
+                '/<(video|audio)([^>]+src=[\'"]([^\'"]+)[\'"])*[^\/>]*\/?>([\s\S]*?)<\/\1>/i',
+                static function (array $matches): string {
+                    return self::mediaSrcExtractor($matches);
+                },
+                $csvStr
+            ) ?? $csvStr;
+        }
+        // Strip any remaining tags
+        $csvStr = strip_tags($csvStr);
+
+        // Collapse whitespace runs and trim (keep the un-collapsed text if the regex engine fails)
+        return trim(preg_replace('/\s+/', ' ', $csvStr) ?? $csvStr);
+    }
+
+    /**
+     * Builds the bracketed placeholder for one `<video>`/`<audio>` element matched by stripHtml().
+     *
+     * @param array $matches regex groups: [1] tag name, [3] `src` attribute of the element itself
+     *                       (empty when absent), [4] inner HTML of the element
+     *
+     * @return string e.g. `[video src="a.mp4" src="b.webm"]`
+     */
+    private static function mediaSrcExtractor(array $matches): string
+    {
+        $ret = '['.$matches[1];
+        // Compare against '' rather than using empty(), which would also drop a src of "0".
+        if ('' !== ($matches[3] ?? '')) {
+            $ret .= ' src="'.$matches[3].'"';
+        }
+        if ('' !== ($matches[4] ?? '')) {
+            $srcs = [];
+            // Case-insensitive, consistently with the patterns used in stripHtml().
+            preg_match_all('/src=[\'"]([^\'"]+)[\'"]/i', $matches[4], $srcs);
+            foreach ($srcs[1] as $src) {
+                $ret .= ' src="'.$src.'"';
+            }
+        }
+        $ret .= ']';
+
+        return $ret;
+    }
 }
diff --git a/src/main/core/Library/Utilities/ClaroUtilities.php b/src/main/core/Library/Utilities/ClaroUtilities.php
@@ -1,60 +1,14 @@
 <?php
 
-/*
- * This file is part of the Claroline Connect package.
- *
- * (c) Claroline Consortium <consortium@claroline.net>
- *
- * For the full copyright and license information, please view the LICENSE
- * file that was distributed with this source code.
- */
-
 namespace Claroline\CoreBundle\Library\Utilities;
 
+use Claroline\CoreBundle\Library\Normalizer\TextNormalizer;
+
 class ClaroUtilities
 {
-    private $hasIntl;
-
-    public function __construct()
+    public function html2Csv(string $htmlStr, ?bool $preserveMedia = false): string
     {
-        $this->hasIntl = extension_loaded('intl');
-    }
-
-    /**
-     * Detect if encoding is UTF-8, ASCII, ISO-8859-1 or Windows-1252.
-     *
-     * @param $string
-     *
-     * @return bool|string
-     */
-    public function detectEncoding($string)
-    {
-        static $enclist = ['UTF-8', 'ASCII', 'ISO-8859-1', 'Windows-1252'];
-
-        if (function_exists('mb_detect_encoding')) {
-            return mb_detect_encoding($string, $enclist, true);
-        }
-
-        $result = false;
-
-        foreach ($enclist as $item) {
-            try {
-                $sample = iconv($item, $item, $string);
-                if (md5($sample) === md5($string)) {
-                    $result = $item;
-                    break;
-                }
-            } catch (\Exception $e) {
-                unset($e);
-            }
-        }
-
-        return $result;
-    }
-
-    public function html2Csv($htmlStr, $preserveMedia = false)
-    {
-        $csvStr = $this->formatCsvOutput($htmlStr);
+        $csvStr = TextNormalizer::sanitize($htmlStr);
         if ($preserveMedia) {
             $csvStr = strip_tags($csvStr, '<img><embed><video><audio><source>');
             // On Image and Embed objects, keep src
@@ -74,13 +28,12 @@ function ($matches) {
         }
         // Strip any remaining tags
         $csvStr = strip_tags($csvStr);
-        // Trim spaces
-        $csvStr = trim(preg_replace('/\s+/', ' ', $csvStr));
 
-        return $csvStr;
+        // Trim spaces
+        return trim(preg_replace('/\s+/', ' ', $csvStr));
     }
 
-    private function mediaSrcExtractor($matches)
+    private function mediaSrcExtractor(array $matches): string
     {
         $ret = '['.$matches[1].(empty($matches[3]) ? '' : ' src="'.$matches[3].'"');
         if (!empty($matches[4])) {
@@ -94,26 +47,4 @@ private function mediaSrcExtractor($matches)
 
         return $ret;
     }
-
-    private function formatCsvOutput($data)
-    {
-        // If encoding not UTF-8 then convert it to UTF-8
-        $data = $this->stringToUtf8($data);
-        $data = str_replace("\r\n", PHP_EOL, $data);
-        $data = str_replace("\r", PHP_EOL, $data);
-        $data = str_replace("\n", PHP_EOL, $data);
-
-        return $data;
-    }
-
-    private function stringToUtf8($string)
-    {
-        // If encoding not UTF-8 then convert it to UTF-8
-        $encoding = $this->detectEncoding($string);
-        if ($encoding && 'UTF-8' !== $encoding) {
-            $string = iconv($encoding, 'UTF-8', $string);
-        }
-
-        return $string;
-    }
 }