Skip to content

Commit

Permalink
Improve buffer management
Browse files Browse the repository at this point in the history
  • Loading branch information
itboy87 committed Aug 23, 2024
1 parent 5ba3212 commit e321c46
Show file tree
Hide file tree
Showing 11 changed files with 384 additions and 226 deletions.
69 changes: 37 additions & 32 deletions ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ package com.fleeksoft.ksoup.parser
import com.fleeksoft.ksoup.BuildConfig
import com.fleeksoft.ksoup.Platform
import com.fleeksoft.ksoup.TestHelper
import com.fleeksoft.ksoup.isJS
import com.fleeksoft.ksoup.internal.StringUtil
import com.fleeksoft.ksoup.isJsOrWasm
import com.fleeksoft.ksoup.ported.exception.UncheckedIOException
import com.fleeksoft.ksoup.ported.io.Charsets
import com.fleeksoft.ksoup.ported.io.StringReader
Expand All @@ -13,6 +14,7 @@ import korlibs.io.lang.substr
import kotlinx.coroutines.test.runTest
import kotlin.test.*


/**
* Test suite for character reader.
*
Expand All @@ -22,7 +24,7 @@ class CharacterReaderTest {

@Test
fun testUtf16BE() = runTest {
if (BuildConfig.isKotlinx && Platform.isJS()) {
if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) {
// not supported in kotlinx for JS or Wasm targets
return@runTest
}
Expand All @@ -37,7 +39,7 @@ class CharacterReaderTest {

@Test
fun testUtf16LE() = runTest {
if (BuildConfig.isKotlinx && Platform.isJS()) {
if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) {
// not supported in kotlinx for JS or Wasm targets
return@runTest
}
Expand All @@ -55,7 +57,7 @@ class CharacterReaderTest {
@Test
fun testReadMixSpecialChar() {
val input = "ä<a>ä</a>"
val charReader = CharacterReader(StringReader(input), sz = 1)
val charReader = CharacterReader(StringReader(input))
input.forEachIndexed { index, char ->
assertEquals(index, charReader.pos())
assertEquals(char, charReader.consume())
Expand Down Expand Up @@ -424,23 +426,24 @@ class CharacterReaderTest {

@Test
fun notEmptyAtBufferSplitPoint() {
val r = CharacterReader("How about now".toReader(), sz = 3)
assertEquals("How", r.consumeTo(' '))
assertFalse(r.isEmpty(), "Should not be empty")
assertEquals(' ', r.consume())
assertFalse(r.isEmpty())
assertEquals(4, r.pos())
assertEquals('a', r.consume())
assertEquals(5, r.pos())
assertEquals('b', r.consume())
assertEquals('o', r.consume())
assertEquals('u', r.consume())
assertEquals('t', r.consume())
assertEquals(' ', r.consume())
assertEquals('n', r.consume())
assertEquals('o', r.consume())
assertEquals('w', r.consume())
val len = CharacterReader.BufferSize * 12
val builder: StringBuilder = StringUtil.borrowBuilder()
while (builder.length <= len) builder.append('!')
val r = CharacterReader(builder.toString())
StringUtil.releaseBuilder(builder)


// consume through
for (pos in 0 until len) {
assertEquals(pos, r.pos())
assertFalse(r.isEmpty())
assertEquals('!', r.consume())
assertEquals(pos + 1, r.pos())
assertFalse(r.isEmpty())
}
assertEquals('!', r.consume())
assertTrue(r.isEmpty())
assertEquals(CharacterReader.EOF, r.consume())
}

@Test
Expand Down Expand Up @@ -477,7 +480,7 @@ class CharacterReaderTest {
fun canTrackNewlines() {
val builder = StringBuilder()
builder.append("<foo>\n<bar>\n<qux>\n")
while (builder.length < CharacterReader.maxBufferLen) {
while (builder.length < CharacterReader.BufferSize) {
builder.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit.")
}
builder.append("[foo]\n[bar]")
Expand All @@ -496,10 +499,10 @@ class CharacterReaderTest {
assertEquals("1:13", noTrack.posLineCol())
// read past the buffer-sized padding, up to the "[foo]" marker
while (!noTrack.matches("[foo]")) noTrack.consumeTo("[foo]")
assertEquals(32778, noTrack.pos())
assertEquals(2090, noTrack.pos())
assertEquals(1, noTrack.lineNumber())
assertEquals(noTrack.pos() + 1, noTrack.columnNumber())
assertEquals("1:32779", noTrack.posLineCol())
assertEquals("1:2091", noTrack.posLineCol())

val track = CharacterReader(content)
track.trackNewlines(true)
Expand Down Expand Up @@ -527,12 +530,12 @@ class CharacterReaderTest {
assertEquals("3:6", track.posLineCol())
// read past the buffer-sized padding, up to the "[foo]" marker
while (!track.matches("[foo]")) track.consumeTo("[foo]")
assertEquals(32778, track.pos())
assertEquals(2090, track.pos())
assertEquals(4, track.lineNumber())
assertEquals(32761, track.columnNumber())
assertEquals("4:32761", track.posLineCol())
assertEquals(2073, track.columnNumber())
assertEquals("4:2073", track.posLineCol())
track.consumeTo('\n')
assertEquals("4:32766", track.posLineCol())
assertEquals("4:2078", track.posLineCol())
track.consumeTo("[bar]")
assertEquals(5, track.lineNumber())
assertEquals("5:1", track.posLineCol())
Expand All @@ -543,19 +546,21 @@ class CharacterReaderTest {
@Test
fun countsColumnsOverBufferWhenNoNewlines() {
val builder = StringBuilder()
while (builder.length < CharacterReader.maxBufferLen * 4) builder.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit.")
while (builder.length < CharacterReader.BufferSize * 4) builder.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit.")
val content = builder.toString()
val reader = CharacterReader(content)
reader.trackNewlines(true)
assertEquals("1:1", reader.posLineCol())
while (!reader.isEmpty()) reader.consume()
assertEquals(131096, reader.pos())
val seen = StringBuilder()
while (!reader.isEmpty()) seen.append(reader.consume())
assertEquals(content, seen.toString())
assertEquals(content.length, reader.pos())
assertEquals(reader.pos() + 1, reader.columnNumber())
assertEquals(1, reader.lineNumber())
}

@Test
fun linenumbersAgreeWithEditor() = runTest {
fun lineNumbersAgreeWithEditor() = runTest {
val content: String = TestHelper.getFileAsString(
TestHelper.getResourceAbsolutePath("htmltests/large.html.gz").uniVfs
)
Expand Down Expand Up @@ -606,7 +611,7 @@ class CharacterReaderTest {
companion object {
fun bufferBuster(content: String): String {
val builder = StringBuilder()
while (builder.length < CharacterReader.maxBufferLen) builder.append(content)
while (builder.length < CharacterReader.BufferSize) builder.append(content)
return builder.toString()
}
}
Expand Down
8 changes: 4 additions & 4 deletions ksoup-test/test/com/fleeksoft/ksoup/parser/HtmlParserTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ class HtmlParserTest {
@Test
fun handlesCdataAcrossBuffer() {
val sb = StringBuilder()
while (sb.length <= CharacterReader.maxBufferLen) {
while (sb.length <= CharacterReader.BufferSize) {
sb.append("A suitable amount of CData.\n")
}
val cdata = sb.toString()
Expand Down Expand Up @@ -1290,7 +1290,7 @@ class HtmlParserTest {
@Test
fun testInvalidTableContents() = runTest {
val resourceName = "htmltests/table-invalid-elements.html"
val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) {
val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) {
val source = TestHelper.readResource(resourceName)
Ksoup.parse(sourceReader = source, baseUri = resourceName, charsetName = "UTF-8")
} else {
Expand Down Expand Up @@ -1507,7 +1507,7 @@ class HtmlParserTest {
@Test
fun testTemplateInsideTable() = runTest {
val resourceName = "htmltests/table-polymer-template.html"
val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) {
val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) {
val source = TestHelper.readResource(resourceName)
Ksoup.parse(sourceReader = source, baseUri = resourceName, charsetName = "UTF-8")
} else {
Expand Down Expand Up @@ -1550,7 +1550,7 @@ class HtmlParserTest {
@Test
fun handlesXmlDeclAndCommentsBeforeDoctype() = runTest {
val resourceName = "htmltests/comments.html"
val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) {
val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) {
val source = TestHelper.readResource(resourceName)
Ksoup.parse(sourceReader = source, baseUri = resourceName, charsetName = "UTF-8")
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,12 @@ class TokeniserStateTest {
fun testUnconsumeAtBufferBoundary() {
val triggeringSnippet = "<a href=\"\"foo"
val padding =
CharArray(CharacterReader.readAheadLimit - triggeringSnippet.length + 2) // The "foo" part must be just at the limit.
CharArray(CharacterReader.RefillPoint - triggeringSnippet.length + 2) // The "foo" part must be just at the limit.
padding.fill(' ')
val paddedSnippet = padding.concatToString() + triggeringSnippet
val errorList = ParseErrorList.tracking(1)
Parser.parseFragment(paddedSnippet, null, "", errorList)
assertEquals(CharacterReader.readAheadLimit - 1, errorList[0].pos)
assertEquals(CharacterReader.RefillPoint - 1, errorList[0].pos)
}

@Test
Expand All @@ -214,7 +214,7 @@ class TokeniserStateTest {
val triggeringSnippet = "<title>One <span>Two"
val padding =
CharArray(
CharacterReader.readAheadLimit - triggeringSnippet.length + 8,
CharacterReader.RefillPoint - triggeringSnippet.length + 8,
) // The "<span" part must be just at the limit. The "containsIgnoreCase" scan does a bufferUp, losing the unconsume
padding.fill(' ')
val paddedSnippet = padding.concatToString() + triggeringSnippet
Expand Down
32 changes: 16 additions & 16 deletions ksoup-test/test/com/fleeksoft/ksoup/parser/TokeniserTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class TokeniserTest {
val preamble = "<img src=$quote"
val tail = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
val sb = StringBuilder(preamble)
val charsToFillBuffer = CharacterReader.maxBufferLen - preamble.length
val charsToFillBuffer = CharacterReader.BufferSize - preamble.length
for (i in 0 until charsToFillBuffer) {
sb.append('a')
}
Expand All @@ -37,10 +37,10 @@ class TokeniserTest {
@Test
fun handleSuperLargeTagNames() {
// unlikely, but valid. so who knows.
val sb = StringBuilder(CharacterReader.maxBufferLen)
val sb = StringBuilder(CharacterReader.BufferSize)
do {
sb.append("LargeTagName")
} while (sb.length < CharacterReader.maxBufferLen)
} while (sb.length < CharacterReader.BufferSize)
val tag = sb.toString()
val html = "<$tag>One</$tag>"
val doc = Parser.htmlParser().settings(ParseSettings.preserveCase).parseInput(html, "")
Expand All @@ -54,10 +54,10 @@ class TokeniserTest {

@Test
fun handleSuperLargeAttributeName() {
val sb = StringBuilder(CharacterReader.maxBufferLen)
val sb = StringBuilder(CharacterReader.BufferSize)
do {
sb.append("LargAttributeName")
} while (sb.length < CharacterReader.maxBufferLen)
} while (sb.length < CharacterReader.BufferSize)
val attrName = sb.toString()
val html = "<p $attrName=foo>One</p>"
val doc = Ksoup.parse(html)
Expand All @@ -73,10 +73,10 @@ class TokeniserTest {

@Test
fun handleLargeText() {
val sb = StringBuilder(CharacterReader.maxBufferLen)
val sb = StringBuilder(CharacterReader.BufferSize)
do {
sb.append("A Large Amount of Text")
} while (sb.length < CharacterReader.maxBufferLen)
} while (sb.length < CharacterReader.BufferSize)
val text = sb.toString()
val html = "<p>$text</p>"
val doc = Ksoup.parse(html)
Expand All @@ -89,10 +89,10 @@ class TokeniserTest {

@Test
fun handleLargeComment() {
val sb = StringBuilder(CharacterReader.maxBufferLen)
val sb = StringBuilder(CharacterReader.BufferSize)
do {
sb.append("Quite a comment ")
} while (sb.length < CharacterReader.maxBufferLen)
} while (sb.length < CharacterReader.BufferSize)
val comment = sb.toString()
val html = "<p><!-- $comment --></p>"
val doc = Ksoup.parse(html)
Expand All @@ -106,10 +106,10 @@ class TokeniserTest {

@Test
fun handleLargeCdata() {
val sb = StringBuilder(CharacterReader.maxBufferLen)
val sb = StringBuilder(CharacterReader.BufferSize)
do {
sb.append("Quite a lot of CDATA <><><><>")
} while (sb.length < CharacterReader.maxBufferLen)
} while (sb.length < CharacterReader.BufferSize)
val cdata = sb.toString()
val html = "<p><![CDATA[$cdata]]></p>"
val doc = Ksoup.parse(html)
Expand All @@ -124,10 +124,10 @@ class TokeniserTest {

@Test
fun handleLargeTitle() {
val sb = StringBuilder(CharacterReader.maxBufferLen)
val sb = StringBuilder(CharacterReader.BufferSize)
do {
sb.append("Quite a long title")
} while (sb.length < CharacterReader.maxBufferLen)
} while (sb.length < CharacterReader.BufferSize)
val title = sb.toString()
val html = "<title>$title</title>"
val doc = Ksoup.parse(html)
Expand Down Expand Up @@ -174,10 +174,10 @@ class TokeniserTest {

@Test
fun canParseVeryLongBogusComment() {
val commentData = StringBuilder(CharacterReader.maxBufferLen)
val commentData = StringBuilder(CharacterReader.BufferSize)
do {
commentData.append("blah blah blah blah ")
} while (commentData.length < CharacterReader.maxBufferLen)
} while (commentData.length < CharacterReader.BufferSize)
val expectedCommentData = commentData.toString()
val testMarkup = "<html><body><!$expectedCommentData></body></html>"
val parser = Parser(HtmlTreeBuilder())
Expand All @@ -192,7 +192,7 @@ class TokeniserTest {
val cdataStart = "<![CDATA["
val cdataEnd = "]]>"
val bufLen =
CharacterReader.maxBufferLen - cdataStart.length - 1 // also breaks with -2, but not with -3 or 0
CharacterReader.BufferSize - cdataStart.length - 1 // also breaks with -2, but not with -3 or 0
val cdataContentsArray = CharArray(bufLen)
cdataContentsArray.fill('x')
val cdataContents = cdataContentsArray.concatToString()
Expand Down
Loading

0 comments on commit e321c46

Please sign in to comment.