Skip to content

Commit

Permalink
[ClacoForm] strip HTML in export
Browse files Browse the repository at this point in the history
  • Loading branch information
Elorfin committed Mar 24, 2023
1 parent dc796c4 commit 5ebccc3
Show file tree
Hide file tree
Showing 19 changed files with 189 additions and 693 deletions.
5 changes: 0 additions & 5 deletions src/main/app/Resources/modules/intl/date/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,6 @@ function getTimeDiff(startDate, endDate) {
return moment.duration(diff).asSeconds()
}

/**
 * Returns the current date shifted by `addition`, formatted with the API format.
 *
 * @param {*}       addition - the amount to add, forwarded as-is to moment's `add()`
 * @param {boolean} local    - when true the date is switched to local mode before
 *                             formatting, otherwise it stays in UTC mode
 *
 * @return {string} the shifted date rendered with `getApiFormat()`
 */
function nowAdd(addition, local = true) {
  // both variants start from "now" in UTC mode; only the local one is switched back
  const now = local ? moment().utc().local() : moment().utc()

  return now.add(addition).format(getApiFormat())
}

export {
getApiFormat,
getDisplayFormat,
Expand All @@ -201,7 +197,6 @@ export {
dateToDisplayFormat,
computeElapsedTime,
getTimeDiff,
nowAdd,
displayDuration,
displayTime,
displayDateRange
Expand Down
184 changes: 110 additions & 74 deletions src/main/core/Library/Normalizer/TextNormalizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,8 @@

class TextNormalizer
{
/**
* @param $string
*
* @return string
*/
public static function stripDiacritics($string)
public static function stripDiacritics(string $string): string
{
$string = (string) $string;

if (!preg_match('/[\x80-\xff]/', $string)) {
return $string;
}
Expand Down Expand Up @@ -82,9 +75,8 @@ public static function stripDiacritics($string)
// grave accent
'Ǜ' => 'U', 'ǜ' => 'u',
];
$string = str_replace(array_keys($transliteration), array_values($transliteration), $string);

return $string;
return str_replace(array_keys($transliteration), array_values($transliteration), $string);
}

public static function toKey($string, int $length = null)
Expand Down Expand Up @@ -118,6 +110,9 @@ public static function toUtf8(string $string): string
return $string;
}

/**
* Converts a string into UTF-8 and replaces EOL by PHP ones.
*/
public static function sanitize(string $string): string
{
// If encoding not UTF-8 then convert it to UTF-8
Expand All @@ -126,6 +121,7 @@ public static function sanitize(string $string): string
// normalize end of lines
$string = str_replace("\r\n", PHP_EOL, $string);
$string = str_replace("\r", PHP_EOL, $string);
$string = str_replace("\n", PHP_EOL, $string);

return $string;
}
Expand All @@ -135,87 +131,127 @@ public static function sanitize(string $string): string
*/
public static function resumeHtml(string $text, int $nbCharacter, ?string $readMoreText = ''): string
{
if (is_numeric($nbCharacter)) {
$lengthBeforeWithoutHtml = strlen(trim(strip_tags($text)));
$htmlSplitMask = '#</?([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?>#';
$htmlMatchMask = '#<(?:/([a-zA-Z1-6]+)|([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?)>#';
$lengthBeforeWithoutHtml = strlen(trim(strip_tags($text)));
$htmlSplitMask = '#</?([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?>#';
$htmlMatchMask = '#<(?:/([a-zA-Z1-6]+)|([a-zA-Z1-6]+)(?: +[a-zA-Z]+="[^"]*")*( ?/)?)>#';
$text .= ' ';
$textPieces = preg_split($htmlSplitMask, $text, -1, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_NO_EMPTY);
$pieceNumber = count($textPieces);

if (1 === $pieceNumber) {
$text .= ' ';
$textPieces = preg_split($htmlSplitMask, $text, -1, PREG_SPLIT_OFFSET_CAPTURE | PREG_SPLIT_NO_EMPTY);
$pieceNumber = count($textPieces);
$lengthBefore = strlen($text);
$text = substr($text, 0, strpos($text, ' ', $lengthBefore > $nbCharacter ? $nbCharacter : $lengthBefore));

if (1 === $pieceNumber) {
$text .= ' ';
$lengthBefore = strlen($text);
$text = substr($text, 0, strpos($text, ' ', $lengthBefore > $nbCharacter ? $nbCharacter : $lengthBefore));
if ('' != $readMoreText && $lengthBefore > $nbCharacter) {
$text .= $readMoreText;
}
} else {
$length = 0;
$indexLastPiece = $pieceNumber - 1;
$position = $textPieces[$indexLastPiece][1] + strlen($textPieces[$indexLastPiece][0]) - 1;
$indexPiece = $indexLastPiece;
$searchSpace = true;

if ('' != $readMoreText && $lengthBefore > $nbCharacter) {
$text .= $readMoreText;
}
} else {
$length = 0;
$indexLastPiece = $pieceNumber - 1;
$position = $textPieces[$indexLastPiece][1] + strlen($textPieces[$indexLastPiece][0]) - 1;
$indexPiece = $indexLastPiece;
$searchSpace = true;

foreach ($textPieces as $index => $bout) {
$length += strlen($bout[0]);
if ($length >= $nbCharacter) {
$positionEndPiece = $bout[1] + strlen($bout[0]) - 1;
$position = $positionEndPiece - ($length - $nbCharacter);

$positionSpace = strpos($bout[0], ' ', $position - $bout[1]);
if (false !== $positionSpace) {
$position = $bout[1] + $positionSpace;
$searchSpace = false;
}
if ($index != $indexLastPiece) {
$indexPiece = $index + 1;
}
break;
foreach ($textPieces as $index => $bout) {
$length += strlen($bout[0]);
if ($length >= $nbCharacter) {
$positionEndPiece = $bout[1] + strlen($bout[0]) - 1;
$position = $positionEndPiece - ($length - $nbCharacter);

$positionSpace = strpos($bout[0], ' ', $position - $bout[1]);
if (false !== $positionSpace) {
$position = $bout[1] + $positionSpace;
$searchSpace = false;
}
if ($index != $indexLastPiece) {
$indexPiece = $index + 1;
}
break;
}
}

if (true === $searchSpace) {
for ($i = $indexPiece; $i <= $indexLastPiece; ++$i) {
$position = $textPieces[$i][1];
$positionSpace = strpos($textPieces[$i][0], ' ');
if (false !== $positionSpace) {
$position += $positionSpace;
break;
}
if (true === $searchSpace) {
for ($i = $indexPiece; $i <= $indexLastPiece; ++$i) {
$position = $textPieces[$i][1];
$positionSpace = strpos($textPieces[$i][0], ' ');
if (false !== $positionSpace) {
$position += $positionSpace;
break;
}
}
}

$text = substr($text, 0, $position);
preg_match_all($htmlMatchMask, $text, $return, PREG_OFFSET_CAPTURE);
$tagPieces = [];
$text = substr($text, 0, $position);
preg_match_all($htmlMatchMask, $text, $return, PREG_OFFSET_CAPTURE);
$tagPieces = [];

foreach ($return[0] as $index => $tag) {
if (isset($return[3][$index][0])) {
continue;
}
if ('/' != $return[0][$index][0][1]) {
array_unshift($tagPieces, $return[2][$index][0]);
} else {
array_shift($tagPieces);
}
foreach ($return[0] as $index => $tag) {
if (isset($return[3][$index][0])) {
continue;
}

if (!empty($tagPieces)) {
foreach ($tagPieces as $tag) {
$text .= '</'.$tag.'>';
}
if ('/' != $return[0][$index][0][1]) {
array_unshift($tagPieces, $return[2][$index][0]);
} else {
array_shift($tagPieces);
}
}

if ('' != $readMoreText && $lengthBeforeWithoutHtml > $nbCharacter) {
$text .= 'SuspensionPoint';
$pattern = '#((</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>)[\n\t\r ]*SuspensionPoint)#i';
$text = preg_replace($pattern, $readMoreText.'${2}${3}${4}${5}${6}', $text);
if (!empty($tagPieces)) {
foreach ($tagPieces as $tag) {
$text .= '</'.$tag.'>';
}
}

if ('' != $readMoreText && $lengthBeforeWithoutHtml > $nbCharacter) {
$text .= 'SuspensionPoint';
$pattern = '#((</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>[\n\t\r ]*)?(</[^>]*>)[\n\t\r ]*SuspensionPoint)#i';
$text = preg_replace($pattern, $readMoreText.'${2}${3}${4}${5}${6}', $text);
}
}

return $text;
}

/**
 * Turns an HTML fragment into a single line of plain text.
 *
 * The input first goes through self::sanitize(), then all tags are removed and
 * every run of whitespace is collapsed into one space (the result is trimmed).
 *
 * When $preserveMedia is truthy, img/embed/video/audio tags are replaced by
 * bracketed placeholders carrying their sources (e.g. [img src="..."]) before
 * the remaining tags are stripped.
 */
public static function stripHtml(string $htmlStr, ?bool $preserveMedia = false): string
{
    $plain = self::sanitize($htmlStr);

    if ($preserveMedia) {
        // Drop everything but the media tags we know how to summarize
        $plain = strip_tags($plain, '<img><embed><video><audio><source>');

        // Images and embedded objects: only their src is kept
        $singleTagPattern = '/<(img|embed)([^>]+src=[\'"]([^\'"]+)[\'"])*[^\/>]*\/?>/i';
        $plain = preg_replace($singleTagPattern, '[$1 src="$3"]', $plain);

        // Videos and audios: keep the src of the tag and of its nested sources
        $containerPattern = '/<(video|audio)([^>]+src=[\'"]([^\'"]+)[\'"])*[^\/>]*\/?>([\s\S]*)<\/\1>/i';
        $plain = preg_replace_callback(
            $containerPattern,
            static function ($captures) {
                return self::mediaSrcExtractor($captures);
            },
            $plain
        );
    }

    // Whatever tag is still there gets removed
    $plain = strip_tags($plain);

    // Collapse whitespace runs, then trim both ends
    $plain = preg_replace('/\s+/', ' ', $plain);

    return trim($plain);
}

/**
 * Builds the bracketed placeholder which replaces a media tag.
 *
 * @param array $matches regex captures: [1] is the tag name, [3] the src found on
 *                       the tag itself (may be empty), [4] the content of the tag
 *                       in which additional src attributes are searched
 *
 * @return string something like `[video src="a" src="b"]`
 */
private static function mediaSrcExtractor(array $matches): string
{
    $placeholder = '['.$matches[1];

    // src declared directly on the media tag
    if (!empty($matches[3])) {
        $placeholder .= ' src="'.$matches[3].'"';
    }

    // every src declared inside the tag
    if (!empty($matches[4])) {
        preg_match_all('/src=[\'"]([^\'"]+)[\'"]/', $matches[4], $found);

        $placeholder .= implode('', array_map(static function ($src) {
            return ' src="'.$src.'"';
        }, $found[1]));
    }

    return $placeholder.']';
}
}
83 changes: 7 additions & 76 deletions src/main/core/Library/Utilities/ClaroUtilities.php
Original file line number Diff line number Diff line change
@@ -1,60 +1,14 @@
<?php

/*
* This file is part of the Claroline Connect package.
*
* (c) Claroline Consortium <consortium@claroline.net>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace Claroline\CoreBundle\Library\Utilities;

use Claroline\CoreBundle\Library\Normalizer\TextNormalizer;

class ClaroUtilities
{
private $hasIntl;

public function __construct()
public function html2Csv(string $htmlStr, ?bool $preserveMedia = false): string
{
$this->hasIntl = extension_loaded('intl');
}

/**
 * Guesses the encoding of a string among UTF-8, ASCII, ISO-8859-1 and Windows-1252.
 *
 * mbstring (strict mode) is used when available. Otherwise each candidate is
 * tried in order with an iconv round-trip, and the first one which leaves the
 * string unchanged is returned.
 *
 * @param $string
 *
 * @return bool|string the detected encoding name, or false when none matches
 */
public function detectEncoding($string)
{
    static $candidates = ['UTF-8', 'ASCII', 'ISO-8859-1', 'Windows-1252'];

    if (function_exists('mb_detect_encoding')) {
        return mb_detect_encoding($string, $candidates, true);
    }

    foreach ($candidates as $candidate) {
        try {
            // a string valid in the candidate encoding survives the round-trip untouched
            if (md5(iconv($candidate, $candidate, $string)) === md5($string)) {
                return $candidate;
            }
        } catch (\Exception $ignored) {
            // this candidate is not usable, try the next one
        }
    }

    return false;
}

public function html2Csv($htmlStr, $preserveMedia = false)
{
$csvStr = $this->formatCsvOutput($htmlStr);
$csvStr = TextNormalizer::sanitize($htmlStr);
if ($preserveMedia) {
$csvStr = strip_tags($csvStr, '<img><embed><video><audio><source>');
// On Image and Embed objects, keep src
Expand All @@ -74,13 +28,12 @@ function ($matches) {
}
// Strip any remaining tags
$csvStr = strip_tags($csvStr);
// Trim spaces
$csvStr = trim(preg_replace('/\s+/', ' ', $csvStr));

return $csvStr;
// Trim spaces
return trim(preg_replace('/\s+/', ' ', $csvStr));
}

private function mediaSrcExtractor($matches)
private function mediaSrcExtractor(array $matches): string
{
$ret = '['.$matches[1].(empty($matches[3]) ? '' : ' src="'.$matches[3].'"');
if (!empty($matches[4])) {
Expand All @@ -94,26 +47,4 @@ private function mediaSrcExtractor($matches)

return $ret;
}

/**
 * Converts the data to UTF-8 and normalizes its line endings to PHP_EOL.
 *
 * @param $data
 *
 * @return string
 */
private function formatCsvOutput($data)
{
    // If encoding not UTF-8 then convert it to UTF-8
    $data = $this->stringToUtf8($data);

    // Collapse every EOL flavour to "\n" first ("\r\n" must be handled before the lone "\r").
    // Replacing each flavour by PHP_EOL one after the other corrupts the output when
    // PHP_EOL is not "\n" : the "\r" and "\n" written by one pass get replaced again
    // by the next ones (ex. "\r\n" ends up as "\r\r\n\r\n" when PHP_EOL is "\r\n").
    $data = str_replace(["\r\n", "\r"], "\n", $data);

    // A single pass is now enough to switch to the platform EOL
    return str_replace("\n", PHP_EOL, $data);
}

/**
 * Converts a string to UTF-8 when another encoding has been detected for it.
 *
 * The string is returned untouched when the detection fails or when it is
 * already reported as UTF-8.
 */
private function stringToUtf8($string)
{
    $detected = $this->detectEncoding($string);

    if (!$detected || 'UTF-8' === $detected) {
        return $string;
    }

    return iconv($detected, 'UTF-8', $string);
}
}
Loading

0 comments on commit 5ebccc3

Please sign in to comment.