Skip to content

Commit

Permalink
Reimplement HTML Serializer for better ns cleaning
Browse files Browse the repository at this point in the history
  • Loading branch information
theseer committed Dec 1, 2023
1 parent d0013f2 commit 01cb296
Show file tree
Hide file tree
Showing 4 changed files with 180 additions and 27 deletions.
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
"php" : "^8.2",
"ext-dom" : "*",
"ext-libxml": "*",
"ext-xmlwriter": "*",
"theseer/css2xpath": "^2.0"
},
"autoload": {
Expand Down
1 change: 0 additions & 1 deletion src/autoload.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ function($class) {
'templado\\engine\\mergelistexception' => '/merger/MergeListException.php',
'templado\\engine\\merger' => '/merger/Merger.php',
'templado\\engine\\mergerexception' => '/merger/MergerException.php',
'templado\\engine\\namespacecleaningtransformation' => '/transformation/NamespaceCleaningTransformation.php',
'templado\\engine\\notdefined' => '/viewmodel/NotDefined.php',
'templado\\engine\\parsingexception' => '/document/ParsingException.php',
'templado\\engine\\remove' => '/viewmodel/Remove.php',
Expand Down
160 changes: 136 additions & 24 deletions src/serializer/HTMLSerializer.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@
*/
namespace Templado\Engine;

use DOMAttr;
use DOMDocument;
use DOMElement;
use DOMNameSpaceNode;
use DOMNode;
use DOMXPath;
use XMLWriter;
use function assert;
use const LIBXML_NOEMPTYTAG;
use const LIBXML_NOXMLDECL;

class HTMLSerializer implements Serializer {
private bool $stripRDFaFlag = false;
Expand All @@ -20,6 +29,10 @@ class HTMLSerializer implements Serializer {

private bool $withDoctypeFlag = true;

private const HTMLNS = 'http://www.w3.org/1999/xhtml';

private bool $isFirst;

/** @psalm-var list<Filter> */
private array $filters = [];

Expand Down Expand Up @@ -63,13 +76,6 @@ public function addFilter(Filter $filter): self {
}

public function serialize(DOMDocument $document): string {
if ($this->namespaceCleaningFlag) {
$this->transformations[] = new NamespaceCleaningTransformation();
}

if ($this->stripRDFaFlag) {
$this->transformations[] = new StripRDFaAttributesTransformation;
}

if (!empty($this->transformations)) {
(new TransformationProcessor())->process(
Expand All @@ -78,34 +84,140 @@ public function serialize(DOMDocument $document): string {
);
}

$xmlString = $this->namespaceCleaningFlag ?
$this->serializeToCleanedString($document) :
$this->serializeToBasicString($document);

$this->filters[] = new EmptyElementsFilter();

foreach ($this->filters as $filter) {
$xmlString = $filter->apply($xmlString);
}

return $xmlString;
}

private function serializeToCleanedString(DOMDocument $document): string {
$writer = new XMLWriter();
$writer->openMemory();
$writer->setIndent(true);
$writer->setIndentString(' ');

if ($this->keepXMLHeaderFlag) {
$writer->startDocument();
}

if ($this->withDoctypeFlag) {
$document = $this->enforceHTML5DocType($document);
$writer->writeDtd('html');
}

$document->formatOutput = true;
$xmlString = $document->saveXML(options: LIBXML_NOEMPTYTAG);
$this->isFirst = true;

$this->filters[] = new EmptyElementsFilter();
$this->walk($writer, $document->documentElement, []);

if (!$this->keepXMLHeaderFlag) {
$this->filters[] = new XMLHeaderFilter();
if ($this->keepXMLHeaderFlag) {
$writer->endDocument();
}

foreach ($this->filters as $filter) {
$xmlString = $filter->apply($xmlString);
return $writer->outputMemory();
}

/**
 * Recursively writes $node and its subtree to $writer, declaring namespaces
 * only at the point where they are first needed.
 *
 * - Non-element nodes are copied verbatim via the owner document's saveXML().
 * - Elements in the XHTML namespace (or in no namespace) are written by their
 *   local name; only the first such element gets the default XHTML xmlns.
 * - Other elements keep their qualified name and get a matching declaration
 *   unless the prefix is already bound to the same URI by an ancestor.
 * - When $this->stripRDFaFlag is set, the attributes property, resource,
 *   prefix, typeof and vocab are dropped.
 *
 * @psalm-param array<string, string> $knownPrefixes prefix => namespace URI
 *              already declared on the path from the root. Passed by value,
 *              so declarations made in one subtree are not visible to siblings.
 */
private function walk(XMLWriter $writer, DOMNode $node, array $knownPrefixes): void {
    assert($node->ownerDocument instanceof DOMDocument);

    if (!$node instanceof DOMElement) {
        $writer->writeRaw(
            $node->ownerDocument->saveXML($node)
        );

        return;
    }

    if ($node->namespaceURI === self::HTMLNS || empty($node->namespaceURI)) {
        $writer->startElement($node->localName);

        if ($this->isFirst) {
            $writer->writeAttribute('xmlns', self::HTMLNS);
            $this->isFirst = false;
        }
    } else {
        $writer->startElement($node->nodeName);

        if (empty($node->prefix)) {
            $writer->writeAttribute('xmlns', $node->namespaceURI);
        } elseif (($knownPrefixes[$node->prefix] ?? null) !== $node->namespaceURI) {
            // Declare the prefix when it is unknown OR re-bound to another URI
            // in this scope; otherwise the element would change its namespace.
            $writer->writeAttribute('xmlns:' . $node->prefix, $node->namespaceURI);
            $knownPrefixes[$node->prefix] = $node->namespaceURI;
        }
    }

    foreach ($node->attributes as $attribute) {
        assert($attribute instanceof DOMAttr);

        if ($this->stripRDFaFlag && in_array($attribute->name, ['property', 'resource', 'prefix', 'typeof', 'vocab'], true)) {
            continue;
        }

        if (empty($attribute->prefix)) {
            $writer->writeAttribute($attribute->name, $attribute->value);

            continue;
        }

        // The "xml" prefix is bound implicitly and is never declared, which
        // mirrors the handling of namespace nodes further down.
        if ($attribute->prefix !== 'xml') {
            $namespaceURI = $node->lookupNamespaceURI($attribute->prefix);

            if ($namespaceURI !== null && ($knownPrefixes[$attribute->prefix] ?? null) !== $namespaceURI) {
                $writer->writeAttribute('xmlns:' . $attribute->prefix, $namespaceURI);
                $knownPrefixes[$attribute->prefix] = $namespaceURI;
            }
        }

        $writer->writeAttribute(
            $attribute->nodeName,
            $attribute->value
        );
    }

    // Carry over the remaining in-scope namespace declarations that were not
    // already emitted for the element itself or one of its attributes.
    foreach ((new DOMXPath($node->ownerDocument))->query('./namespace::*', $node) as $nsNode) {
        assert($nsNode instanceof DOMNameSpaceNode);

        if (empty($nsNode->prefix) || $nsNode->prefix === 'xml') {
            continue;
        }

        if ($nsNode->nodeValue === self::HTMLNS) {
            continue;
        }

        if (($knownPrefixes[$nsNode->prefix] ?? null) === $nsNode->nodeValue) {
            continue;
        }

        assert($nsNode->nodeValue !== null);
        $writer->writeAttribute('xmlns:' . $nsNode->prefix, $nsNode->nodeValue);
        $knownPrefixes[$nsNode->prefix] = $nsNode->nodeValue;
    }

    if ($node->hasChildNodes()) {
        foreach ($node->childNodes as $childNode) {
            $this->walk($writer, $childNode, $knownPrefixes);
        }
    }

    // fullEndElement() always emits an explicit closing tag, never <foo/>.
    $writer->fullEndElement();
}

private function enforceHTML5DocType(DOMDocument $document): DOMDocument {
$tmp = new DOMDocument();
$tmp->loadXML('<?xml version="1.0" ?><!DOCTYPE html><html />');
$tmp->replaceChild(
$tmp->importNode($document->documentElement, true),
$tmp->documentElement
);
private function serializeToBasicString(DOMDocument $document): string {
$document->formatOutput = true;
$xmlString = $document->saveXML($document->documentElement, options: LIBXML_NOEMPTYTAG);

return $tmp;
if ($this->withDoctypeFlag) {
$xmlString = "<!DOCTYPE html>\n" . $xmlString;
}

if ($this->keepXMLHeaderFlag) {
$xmlString = sprintf(
'<?xml version="1.0" encoding="%s" ?>',
$document->encoding ?? 'utf-8'
) . "\n" . $xmlString;
}

return $xmlString . "\n";
}

}
45 changes: 43 additions & 2 deletions tests/serializer/HTMLSerializerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@
use PHPUnit\Framework\Attributes\CoversClass;
use PHPUnit\Framework\Attributes\UsesClass;
use PHPUnit\Framework\TestCase;
use const LIBXML_NOEMPTYTAG;

#[CoversClass(HTMLSerializer::class)]
#[UsesClass(Document::class)]
#[UsesClass(EmptyElementsFilter::class)]
#[UsesClass(NamespaceCleaningTransformation::class)]
#[UsesClass(Selection::class)]
#[UsesClass(StaticNodeList::class)]
#[UsesClass(TransformationProcessor::class)]
#[UsesClass(XMLHeaderFilter::class)]
#[UsesClass(XPathSelector::class)]
#[UsesClass(StripRDFaAttributesTransformation::class)]
class HTMLSerializerTest extends TestCase {
Expand Down Expand Up @@ -77,6 +76,48 @@ public function testAddedTransformationGetsApplies(): void {
);
}

/**
 * With namespace cleaning disabled and the HTML5 doctype turned off,
 * keepXMLHeader() must put an XML declaration (including the encoding)
 * in front of the serialized root element.
 */
public function testXMLHeaderIsKeptWhenNotCleaning(): void {
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadXML('<html xmlns="http://www.w3.org/1999/xhtml" />');

    $this->assertSame(
        '<?xml version="1.0" encoding="utf-8" ?>' . "\n" . '<html xmlns="http://www.w3.org/1999/xhtml"></html>' . "\n",
        (new HTMLSerializer())->keepXMLHeader()->noHtml5Doctype()->disableNamespaceCleaning()->serialize($dom)
    );
}

/**
 * A prefixed attribute on the root element must be serialized together
 * with the xmlns declaration of its prefix.
 */
public function testNamespacedAttributesGetSerializedCorrectly(): void {
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadXML('<?xml version="1.0" ?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a" a:attr="value" />');

    $this->assertSame(
        '<?xml version="1.0"?>' . "\n" . '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a" a:attr="value"></html>' . "\n",
        (new HTMLSerializer())->keepXMLHeader()->noHtml5Doctype()->serialize($dom)
    );
}

/**
 * Child elements in foreign namespaces must keep their namespace binding:
 * one using a prefix declared on the root, one declaring its own prefix and
 * one switching the default namespace.
 */
public function testNamespacedElementsGetSerializedCorrectly(): void {
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false;
    $dom->loadXML('<?xml version="1.0" ?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a"><a:foo /><b:foo xmlns:b="urn:b" /><c xmlns="urn:c" /></html>');

    $this->assertSame(
        implode("\n", [
            '<?xml version="1.0"?>',
            '<html xmlns="http://www.w3.org/1999/xhtml" xmlns:a="urn:a">',
            ' <a:foo></a:foo>',
            ' <b:foo xmlns:b="urn:b"></b:foo>',
            ' <c xmlns="urn:c"></c>',
            '</html>' . "\n"
        ]),
        (new HTMLSerializer())->keepXMLHeader()->noHtml5Doctype()->serialize($dom)
    );
}



private function createInputDocument(): Document {
return Document::fromString(file_get_contents(__DIR__ . '/../_data/serializer/input.xml'));
}
Expand Down

0 comments on commit 01cb296

Please sign in to comment.