From f8517093da973f5bd7eeb0045fbed7bb32d15d92 Mon Sep 17 00:00:00 2001 From: Dan Allen Date: Thu, 22 Jun 2023 23:54:31 -0600 Subject: [PATCH] backport fix for #2430 remove null character enclosed by XML tag when sanitizing text --- CHANGELOG.adoc | 1 + lib/asciidoctor/pdf/sanitizer.rb | 4 ++-- spec/outline_spec.rb | 8 ++++++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.adoc b/CHANGELOG.adoc index fe9d8be2e..0571ac6b4 100644 --- a/CHANGELOG.adoc +++ b/CHANGELOG.adoc @@ -9,6 +9,7 @@ For a detailed view of what has changed, refer to the {url-repo}/commits/main[co Bug Fixes:: +* remove null character enclosed in XML tag when sanitizing text; fixes invisible text in outline when heading contains index term (#2430) * alias `File.exists?` to `File.exist?` when loading RGhost optimizer to patch incompatibility when using Ruby 3.2 Build / Infrastructure:: diff --git a/lib/asciidoctor/pdf/sanitizer.rb b/lib/asciidoctor/pdf/sanitizer.rb index 8c009ca95..e55744143 100644 --- a/lib/asciidoctor/pdf/sanitizer.rb +++ b/lib/asciidoctor/pdf/sanitizer.rb @@ -19,11 +19,11 @@ module Sanitizer 'nbsp' => ' ', 'quot' => '"', }).default = '?' - SanitizeXMLRx = /<[^>]+>/ + SanitizeXMLRx = /<[^>]+>\0?/ CharRefRx = /&(?:amp;)?(?:([a-z][a-z]+\d{0,2})|#(?:(\d\d\d{0,4})|x(\h\h\h{0,3})));/ UnescapedAmpersandRx = /&(?!(?:[a-z][a-z]+\d{0,2}|#(?:\d\d\d{0,4}|x\h\h\h{0,3}));)/ - # Strip leading, trailing and repeating whitespace, remove XML tags and + # Strip leading, trailing and repeating whitespace, remove XML tags along with an enclosed null character, and # resolve all entities in the specified string. 
# # FIXME: move to a module so we can mix it in elsewhere diff --git a/spec/outline_spec.rb b/spec/outline_spec.rb index 34363ba59..79f9056c1 100644 --- a/spec/outline_spec.rb +++ b/spec/outline_spec.rb @@ -749,14 +749,18 @@ pdf = to_pdf <<~'EOS' = _Document_ *Title* :doctype: book + :sectnums: == _First_ *Chapter* + + == ((Wetland Birds)) EOS outline = extract_outline pdf - (expect outline).to have_size 2 + (expect outline).to have_size 3 (expect outline[0][:title]).to eql 'Document Title' - (expect outline[1][:title]).to eql 'First Chapter' + (expect outline[1][:title]).to eql 'Chapter 1. First Chapter' + (expect outline[2][:title]).to eql 'Chapter 2. Wetland Birds' end it 'should decode character references in entries' do