From fc8b049c48d81e215e006698b5c64296de213fcf Mon Sep 17 00:00:00 2001 From: Yufei Huang Date: Sat, 25 Nov 2023 11:43:09 +0800 Subject: [PATCH] fix: only allow h1 to be the title --- .../Conceptual/BuildConceptualDocument.cs | 56 +++++++++++++++---- .../Conceptual/HtmlDocumentUtility.cs | 49 ---------------- .../Conceptual/SeparatedHtmlInfo.cs | 13 ----- .../{CountWord.cs => WordCounter.cs} | 25 --------- 4 files changed, 44 insertions(+), 99 deletions(-) delete mode 100644 src/Docfx.Build/Conceptual/HtmlDocumentUtility.cs delete mode 100644 src/Docfx.Build/Conceptual/SeparatedHtmlInfo.cs rename src/Docfx.Build/Conceptual/{CountWord.cs => WordCounter.cs} (70%) diff --git a/src/Docfx.Build/Conceptual/BuildConceptualDocument.cs b/src/Docfx.Build/Conceptual/BuildConceptualDocument.cs index 072c5ae6ece..a6c438076d1 100644 --- a/src/Docfx.Build/Conceptual/BuildConceptualDocument.cs +++ b/src/Docfx.Build/Conceptual/BuildConceptualDocument.cs @@ -3,18 +3,18 @@ using System.Collections.Immutable; using System.Composition; - +using System.Net; using Docfx.Build.Common; using Docfx.Common; using Docfx.DataContracts.Common; using Docfx.Plugins; +using HtmlAgilityPack; namespace Docfx.Build.ConceptualDocuments; [Export(nameof(ConceptualDocumentProcessor), typeof(IDocumentBuildStep))] class BuildConceptualDocument : BaseDocumentBuildStep { - private const string ConceptualKey = Constants.PropertyName.Conceptual; private const string DocumentTypeKey = "documentType"; public override string Name => nameof(BuildConceptualDocument); @@ -28,16 +28,17 @@ public override void Build(FileModel model, IHostService host) return; } var content = (Dictionary)model.Content; - var markdown = (string)content[ConceptualKey]; + var markdown = (string)content[Constants.PropertyName.Conceptual]; var result = host.Markup(markdown, model.OriginalFileAndType, false); - var htmlInfo = HtmlDocumentUtility.SeparateHtml(result.Html); - content["rawTitle"] = htmlInfo.RawTitle; - if (!string.IsNullOrEmpty(htmlInfo.RawTitle)) + var (h1, h1Raw, conceptual) = ExtractH1(result.Html); + content["rawTitle"] = h1Raw; + if (!string.IsNullOrEmpty(h1Raw)) { - model.ManifestProperties.rawTitle = htmlInfo.RawTitle; + model.ManifestProperties.rawTitle = h1Raw; } - content[ConceptualKey] = htmlInfo.Content; + content[Constants.PropertyName.Conceptual] = conceptual; + content["wordCount"] = WordCounter.CountWord(conceptual); if (result.YamlHeader?.Count > 0) { @@ -47,13 +48,14 @@ public override void Build(FileModel model, IHostService host) } } - (content[Constants.PropertyName.Title], model.Properties.IsUserDefinedTitle) = GetTitle(result.YamlHeader, htmlInfo); + (content[Constants.PropertyName.Title], model.Properties.IsUserDefinedTitle) = GetTitle(result.YamlHeader, h1); model.LinkToFiles = result.LinkToFiles.ToImmutableHashSet(); model.LinkToUids = result.LinkToUids; model.FileLinkSources = result.FileLinkSources; model.UidLinkSources = result.UidLinkSources; model.Properties.XrefSpec = null; + if (model.Uids.Length > 0) { var title = content[Constants.PropertyName.Title] as string; @@ -108,7 +110,7 @@ void HandleYamlHeaderPair(string key, object value) } } - (string title, bool isUserDefined) GetTitle(ImmutableDictionary yamlHeader, SeparatedHtmlInfo info) + (string title, bool isUserDefined) GetTitle(ImmutableDictionary yamlHeader, string h1) { // title from YAML header if (yamlHeader != null @@ -124,9 +126,9 @@ void HandleYamlHeaderPair(string key, object value) } // title from H1 - if (!string.IsNullOrEmpty(info.Title)) + if (!string.IsNullOrEmpty(h1)) { - return (info.Title, false); + return (h1, false); } // title from globalMetadata or fileMetadata @@ -152,4 +154,34 @@ bool TryGetStringValue(IDictionary dictionary, string key, out s } } } + + static (string h1, string h1Raw, string body) ExtractH1(string contentHtml) + { + ArgumentNullException.ThrowIfNull(contentHtml); + + var document = new HtmlDocument(); + document.LoadHtml(contentHtml); + + // InnerText in HtmlAgilityPack is not decoded, should be a bug + var h1Node = document.DocumentNode.SelectSingleNode("//h1"); + var h1 = WebUtility.HtmlDecode(h1Node?.InnerText); + var h1Raw = ""; + if (h1Node != null && GetFirstNoneCommentChild(document.DocumentNode) == h1Node) + { + h1Raw = h1Node.OuterHtml; + h1Node.Remove(); + } + + return (h1, h1Raw, document.DocumentNode.OuterHtml); + + static HtmlNode GetFirstNoneCommentChild(HtmlNode node) + { + var result = node.FirstChild; + while (result != null && (result.NodeType == HtmlNodeType.Comment || string.IsNullOrWhiteSpace(result.OuterHtml))) + { + result = result.NextSibling; + } + return result; + } + } } diff --git a/src/Docfx.Build/Conceptual/HtmlDocumentUtility.cs b/src/Docfx.Build/Conceptual/HtmlDocumentUtility.cs deleted file mode 100644 index 4428c836ad6..00000000000 --- a/src/Docfx.Build/Conceptual/HtmlDocumentUtility.cs +++ /dev/null @@ -1,49 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -using System.Net; -using HtmlAgilityPack; - -namespace Docfx.Build.ConceptualDocuments; - -static class HtmlDocumentUtility -{ - public static SeparatedHtmlInfo SeparateHtml(string contentHtml) - { - ArgumentNullException.ThrowIfNull(contentHtml); - - var content = new SeparatedHtmlInfo(); - - var document = new HtmlDocument(); - document.LoadHtml(contentHtml); - - // TODO: how to get TITLE - // InnerText in HtmlAgilityPack is not decoded, should be a bug - var headerNode = document.DocumentNode.SelectSingleNode("//h1|//h2|//h3"); - content.Title = WebUtility.HtmlDecode(headerNode?.InnerText); - - if (headerNode != null && GetFirstNoneCommentChild(document.DocumentNode) == headerNode) - { - content.RawTitle = headerNode.OuterHtml; - headerNode.Remove(); - } - else - { - content.RawTitle = string.Empty; - } - - content.Content = document.DocumentNode.OuterHtml; - - return content; - } - - private static HtmlNode GetFirstNoneCommentChild(HtmlNode node) - { - var result = node.FirstChild; - while (result != null && (result.NodeType == HtmlNodeType.Comment || string.IsNullOrWhiteSpace(result.OuterHtml))) - { - result = result.NextSibling; - } - return result; - } -} diff --git a/src/Docfx.Build/Conceptual/SeparatedHtmlInfo.cs b/src/Docfx.Build/Conceptual/SeparatedHtmlInfo.cs deleted file mode 100644 index fc90dbf3ae9..00000000000 --- a/src/Docfx.Build/Conceptual/SeparatedHtmlInfo.cs +++ /dev/null @@ -1,13 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. - -namespace Docfx.Build.ConceptualDocuments; - -class SeparatedHtmlInfo -{ - public string Title { get; set; } - - public string RawTitle { get; set; } - - public string Content { get; set; } -} diff --git a/src/Docfx.Build/Conceptual/CountWord.cs b/src/Docfx.Build/Conceptual/WordCounter.cs similarity index 70% rename from src/Docfx.Build/Conceptual/CountWord.cs rename to src/Docfx.Build/Conceptual/WordCounter.cs index 2bc5acb2aa6..837f7dce171 100644 --- a/src/Docfx.Build/Conceptual/CountWord.cs +++ b/src/Docfx.Build/Conceptual/WordCounter.cs @@ -1,35 +1,10 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. -using System.Collections.Immutable; -using System.Composition; -using Docfx.Build.Common; -using Docfx.DataContracts.Common; -using Docfx.Plugins; using HtmlAgilityPack; namespace Docfx.Build.ConceptualDocuments; -[Export(nameof(ConceptualDocumentProcessor), typeof(IDocumentBuildStep))] -class CountWord : BaseDocumentBuildStep -{ - public override string Name => nameof(CountWord); - - public override int BuildOrder => 1; - - public override void Postbuild(ImmutableList models, IHostService host) - { - foreach (var model in models) - { - if (model.Type == DocumentType.Article) - { - var content = (Dictionary)model.Content; - content["wordCount"] = WordCounter.CountWord((string)content[Constants.PropertyName.Conceptual]); - } - } - } -} - internal static class WordCounter { private static readonly string[] ExcludeNodeXPaths = { "//title" };