From e321c467ef6d64e6b4deb0c2924c1ddc948b6fb9 Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 10:04:25 +0500 Subject: [PATCH 01/10] Improve buffer management --- .../ksoup/parser/CharacterReaderTest.kt | 69 +++---- .../fleeksoft/ksoup/parser/HtmlParserTest.kt | 8 +- .../ksoup/parser/TokeniserStateTest.kt | 6 +- .../fleeksoft/ksoup/parser/TokeniserTest.kt | 32 ++-- .../fleeksoft/ksoup/internal/SoftPoolTest.kt | 132 +++++++++++++ .../com/fleeksoft/ksoup/helper/DataUtil.kt | 88 ++++----- .../ksoup/internal/SharedConstants.kt | 2 +- .../com/fleeksoft/ksoup/internal/SoftPool.kt | 52 ++++++ .../fleeksoft/ksoup/internal/StringUtil.kt | 34 ++-- .../fleeksoft/ksoup/parser/CharacterReader.kt | 173 +++++++++--------- .../com/fleeksoft/ksoup/parser/Tokeniser.kt | 14 +- 11 files changed, 384 insertions(+), 226 deletions(-) create mode 100644 ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/internal/SoftPoolTest.kt create mode 100644 ksoup/src/com/fleeksoft/ksoup/internal/SoftPool.kt diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt index cf0eebd1..8291750f 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt @@ -3,7 +3,8 @@ package com.fleeksoft.ksoup.parser import com.fleeksoft.ksoup.BuildConfig import com.fleeksoft.ksoup.Platform import com.fleeksoft.ksoup.TestHelper -import com.fleeksoft.ksoup.isJS +import com.fleeksoft.ksoup.internal.StringUtil +import com.fleeksoft.ksoup.isJsOrWasm import com.fleeksoft.ksoup.ported.exception.UncheckedIOException import com.fleeksoft.ksoup.ported.io.Charsets import com.fleeksoft.ksoup.ported.io.StringReader @@ -13,6 +14,7 @@ import korlibs.io.lang.substr import kotlinx.coroutines.test.runTest import kotlin.test.* + /** * Test suite for character reader. * @@ -22,7 +24,7 @@ class CharacterReaderTest { @Test fun testUtf16BE() = runTest { - if (BuildConfig.isKotlinx && Platform.isJS()) { + if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { // not supported in kotlinx for js return@runTest } @@ -37,7 +39,7 @@ class CharacterReaderTest { @Test fun testUtf16LE() = runTest { - if (BuildConfig.isKotlinx && Platform.isJS()) { + if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { // not supported in kotlinx for js return@runTest } @@ -55,7 +57,7 @@ class CharacterReaderTest { @Test fun testReadMixSpecialChar() { val input = "ää" - val charReader = CharacterReader(StringReader(input), sz = 1) + val charReader = CharacterReader(StringReader(input)) input.forEachIndexed { index, char -> assertEquals(index, charReader.pos()) assertEquals(char, charReader.consume()) @@ -424,23 +426,24 @@ class CharacterReaderTest { @Test fun notEmptyAtBufferSplitPoint() { - val r = CharacterReader("How about now".toReader(), sz = 3) - assertEquals("How", r.consumeTo(' ')) - assertFalse(r.isEmpty(), "Should not be empty") - assertEquals(' ', r.consume()) - assertFalse(r.isEmpty()) - assertEquals(4, r.pos()) - assertEquals('a', r.consume()) - assertEquals(5, r.pos()) - assertEquals('b', r.consume()) - assertEquals('o', r.consume()) - assertEquals('u', r.consume()) - assertEquals('t', r.consume()) - assertEquals(' ', r.consume()) - assertEquals('n', r.consume()) - assertEquals('o', r.consume()) - assertEquals('w', r.consume()) + val len = CharacterReader.BufferSize * 12 + val builder: StringBuilder = StringUtil.borrowBuilder() + while (builder.length <= len) builder.append('!') + val r = CharacterReader(builder.toString()) + StringUtil.releaseBuilder(builder) + + + // consume through + for (pos in 0 until len) { + assertEquals(pos, r.pos()) + assertFalse(r.isEmpty()) + assertEquals('!', r.consume()) + assertEquals(pos + 1, r.pos()) + assertFalse(r.isEmpty()) + } + assertEquals('!', r.consume()) assertTrue(r.isEmpty()) + assertEquals(CharacterReader.EOF, r.consume()) } @Test @@ -477,7 +480,7 @@ class CharacterReaderTest { fun canTrackNewlines() { val builder = StringBuilder() builder.append("\n\n\n") - while (builder.length < CharacterReader.maxBufferLen) { + while (builder.length < CharacterReader.BufferSize) { builder.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit.") } builder.append("[foo]\n[bar]") @@ -496,10 +499,10 @@ class CharacterReaderTest { assertEquals("1:13", noTrack.posLineCol()) // get over the buffer while (!noTrack.matches("[foo]")) noTrack.consumeTo("[foo]") - assertEquals(32778, noTrack.pos()) + assertEquals(2090, noTrack.pos()) assertEquals(1, noTrack.lineNumber()) assertEquals(noTrack.pos() + 1, noTrack.columnNumber()) - assertEquals("1:32779", noTrack.posLineCol()) + assertEquals("1:2091", noTrack.posLineCol()) val track = CharacterReader(content) track.trackNewlines(true) @@ -527,12 +530,12 @@ class CharacterReaderTest { assertEquals("3:6", track.posLineCol()) // get over the buffer while (!track.matches("[foo]")) track.consumeTo("[foo]") - assertEquals(32778, track.pos()) + assertEquals(2090, track.pos()) assertEquals(4, track.lineNumber()) - assertEquals(32761, track.columnNumber()) - assertEquals("4:32761", track.posLineCol()) + assertEquals(2073, track.columnNumber()) + assertEquals("4:2073", track.posLineCol()) track.consumeTo('\n') - assertEquals("4:32766", track.posLineCol()) + assertEquals("4:2078", track.posLineCol()) track.consumeTo("[bar]") assertEquals(5, track.lineNumber()) assertEquals("5:1", track.posLineCol()) @@ -543,19 +546,21 @@ class CharacterReaderTest { @Test fun countsColumnsOverBufferWhenNoNewlines() { val builder = StringBuilder() - while (builder.length < CharacterReader.maxBufferLen * 4) builder.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit.") + while (builder.length < CharacterReader.BufferSize * 4) builder.append("Lorem ipsum dolor sit amet, consectetur adipiscing elit.") val content = builder.toString() val reader = CharacterReader(content) reader.trackNewlines(true) assertEquals("1:1", reader.posLineCol()) - while (!reader.isEmpty()) reader.consume() - assertEquals(131096, reader.pos()) + val seen = StringBuilder() + while (!reader.isEmpty()) seen.append(reader.consume()) + assertEquals(content, seen.toString()) + assertEquals(content.length, reader.pos()) assertEquals(reader.pos() + 1, reader.columnNumber()) assertEquals(1, reader.lineNumber()) } @Test - fun linenumbersAgreeWithEditor() = runTest { + fun lineNumbersAgreeWithEditor() = runTest { val content: String = TestHelper.getFileAsString( TestHelper.getResourceAbsolutePath("htmltests/large.html.gz").uniVfs ) @@ -606,7 +611,7 @@ class CharacterReaderTest { companion object { fun bufferBuster(content: String): String { val builder = StringBuilder() - while (builder.length < CharacterReader.maxBufferLen) builder.append(content) + while (builder.length < CharacterReader.BufferSize) builder.append(content) return builder.toString() } } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/HtmlParserTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/HtmlParserTest.kt index 09314fc4..aeeda564 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/HtmlParserTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/HtmlParserTest.kt @@ -397,7 +397,7 @@ class HtmlParserTest { @Test fun handlesCdataAcrossBuffer() { val sb = StringBuilder() - while (sb.length <= CharacterReader.maxBufferLen) { + while (sb.length <= CharacterReader.BufferSize) { sb.append("A suitable amount of CData.\n") } val cdata = sb.toString() @@ -1290,7 +1290,7 @@ class HtmlParserTest { @Test fun testInvalidTableContents() = runTest { val resourceName = "htmltests/table-invalid-elements.html" - val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) { + val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { val source = TestHelper.readResource(resourceName) Ksoup.parse(sourceReader = source, baseUri = resourceName, charsetName = "UTF-8") } else { @@ -1507,7 +1507,7 @@ class HtmlParserTest { @Test fun testTemplateInsideTable() = runTest { val resourceName = "htmltests/table-polymer-template.html" - val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) { + val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { val source = TestHelper.readResource(resourceName) Ksoup.parse(sourceReader = source, baseUri = resourceName, charsetName = "UTF-8") } else { @@ -1550,7 +1550,7 @@ class HtmlParserTest { @Test fun handlesXmlDeclAndCommentsBeforeDoctype() = runTest { val resourceName = "htmltests/comments.html" - val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) { + val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { val source = TestHelper.readResource(resourceName) Ksoup.parse(sourceReader = source, baseUri = resourceName, charsetName = "UTF-8") } else { diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/TokeniserStateTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/TokeniserStateTest.kt index cfcb266c..07607691 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/TokeniserStateTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/TokeniserStateTest.kt @@ -200,12 +200,12 @@ class TokeniserStateTest { fun testUnconsumeAtBufferBoundary() { val triggeringSnippet = "<><><>") - } while (sb.length < CharacterReader.maxBufferLen) + } while (sb.length < CharacterReader.BufferSize) val cdata = sb.toString() val html = "

" val doc = Ksoup.parse(html) @@ -124,10 +124,10 @@ class TokeniserTest { @Test fun handleLargeTitle() { - val sb = StringBuilder(CharacterReader.maxBufferLen) + val sb = StringBuilder(CharacterReader.BufferSize) do { sb.append("Quite a long title") - } while (sb.length < CharacterReader.maxBufferLen) + } while (sb.length < CharacterReader.BufferSize) val title = sb.toString() val html = "$title" val doc = Ksoup.parse(html) @@ -174,10 +174,10 @@ class TokeniserTest { @Test fun canParseVeryLongBogusComment() { - val commentData = StringBuilder(CharacterReader.maxBufferLen) + val commentData = StringBuilder(CharacterReader.BufferSize) do { commentData.append("blah blah blah blah ") - } while (commentData.length < CharacterReader.maxBufferLen) + } while (commentData.length < CharacterReader.BufferSize) val expectedCommentData = commentData.toString() val testMarkup = "" val parser = Parser(HtmlTreeBuilder()) @@ -192,7 +192,7 @@ class TokeniserTest { val cdataStart = "" val bufLen = - CharacterReader.maxBufferLen - cdataStart.length - 1 // also breaks with -2, but not with -3 or 0 + CharacterReader.BufferSize - cdataStart.length - 1 // also breaks with -2, but not with -3 or 0 val cdataContentsArray = CharArray(bufLen) cdataContentsArray.fill('x') val cdataContents = cdataContentsArray.concatToString() diff --git a/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/internal/SoftPoolTest.kt b/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/internal/SoftPoolTest.kt new file mode 100644 index 00000000..de57270d --- /dev/null +++ b/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/internal/SoftPoolTest.kt @@ -0,0 +1,132 @@ +package com.fleeksoft.ksoup.internal + +import kotlin.test.* + +class SoftPoolTest { + @Test + fun testSoftLocalPool() { + val softLocalPool: SoftPool = SoftPool { CharArray(BufSize) } + + val executorService: java.util.concurrent.ExecutorService = java.util.concurrent.Executors.newFixedThreadPool(NumThreads) + val latch: java.util.concurrent.CountDownLatch = java.util.concurrent.CountDownLatch(NumThreads) + + fun getNThreads(): List> { + return buildList { + repeat(NumThreads) { + add(HashSet()) + } + } + } + + val allBuffers: MutableSet = java.util.HashSet() + val threadLocalBuffers: Array> = arrayOf(*getNThreads().toTypedArray()) + + val threadCount: java.util.concurrent.atomic.AtomicInteger = java.util.concurrent.atomic.AtomicInteger() + + val task = Runnable { + try { + val threadIndex: Int = threadCount.getAndIncrement() + val localBuffers: MutableSet = java.util.HashSet() + // First borrow + for (i in 0 until NumObjects) { + val buffer = softLocalPool.borrow() + assertEquals(BufSize, buffer.size) + localBuffers.add(buffer) + } + + // Release buffers back to the pool + for (buffer in localBuffers) { + softLocalPool.release(buffer) + } + + // Borrow again and ensure buffers are reused + for (i in 0 until NumObjects) { + val buffer = softLocalPool.borrow() + assertTrue(localBuffers.contains(buffer), "Buffer was not reused in the same thread") + threadLocalBuffers[threadIndex].add(buffer) + } + + synchronized(allBuffers) { + allBuffers.addAll(threadLocalBuffers[threadIndex]) + } + } finally { + latch.countDown() + } + } + + // Run the tasks + for (i in 0 until NumThreads) { + executorService.submit { task.run() } + } + + // Wait for all threads to complete + latch.await() + executorService.shutdown() + + // Ensure no buffers are shared between threads + val uniqueBuffers: MutableSet = java.util.HashSet() + for (bufferSet in threadLocalBuffers) { + for (buffer in bufferSet) { + assertTrue(uniqueBuffers.add(buffer), "Buffer was shared between threads") + } + } + } + + @Test + fun testSoftReferenceBehavior() { + val softLocalPool: SoftPool = SoftPool { CharArray(BufSize) } + + // Borrow and release an object + val buffer = softLocalPool.borrow() + assertEquals(BufSize, buffer.size) + softLocalPool.release(buffer) + + // Fake a GC + softLocalPool.threadLocalStack.get().clear() + + // Ensure the object is garbage collected +// assertNull(softLocalPool.threadLocalStack.get()) // IN KMP we don't have weak reference + + val second = softLocalPool.borrow() + // should be different, but same size + assertNotEquals(buffer, second) + assertEquals(BufSize, second.size) + } + + @Test + fun testBorrowFromEmptyPool() { + val softLocalPool: SoftPool = SoftPool { CharArray(BufSize) } + + // Borrow from an empty pool + val buffer = softLocalPool.borrow() + assertNotNull(buffer, "Borrowed null from an empty pool") + assertEquals(BufSize, buffer.size) + } + + @Test + fun testReleaseMoreThanMaxIdle() { + val softLocalPool: SoftPool = SoftPool { CharArray(BufSize) } + + // Borrow more than MaxIdle objects + val borrowedBuffers: MutableList = java.util.ArrayList() + for (i in 0 until SoftPool.MaxIdle + 5) { + val buffer = softLocalPool.borrow() + borrowedBuffers.add(buffer) + } + + // Release all borrowed objects back to the pool + for (buffer in borrowedBuffers) { + softLocalPool.release(buffer) + } + + // Ensure the pool size does not exceed MaxIdle + val stack = softLocalPool.stack + assertTrue(stack.size <= SoftPool.MaxIdle, "Pool size exceeded MaxIdle limit") + } + + companion object { + private const val BufSize = 12 + private const val NumThreads = 5 + private const val NumObjects = 3 + } +} \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt b/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt index 3daa0893..2ef4f5e0 100644 --- a/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt +++ b/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt @@ -1,11 +1,9 @@ package com.fleeksoft.ksoup.helper -import com.fleeksoft.ksoup.Platform import com.fleeksoft.ksoup.internal.SharedConstants import com.fleeksoft.ksoup.internal.StringUtil import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.io.SourceReader -import com.fleeksoft.ksoup.isApple import com.fleeksoft.ksoup.nodes.Comment import com.fleeksoft.ksoup.nodes.Document import com.fleeksoft.ksoup.nodes.Node @@ -17,7 +15,6 @@ import com.fleeksoft.ksoup.ported.exception.UncheckedIOException import com.fleeksoft.ksoup.ported.io.* import com.fleeksoft.ksoup.ported.isCharsetSupported import com.fleeksoft.ksoup.select.Elements -import kotlin.math.min import kotlin.random.Random /** @@ -67,7 +64,6 @@ public object DataUtil { val charsetName: String? = charset?.name val charsetDoc: CharsetDoc = detectCharset(sourceReader, baseUri, charsetName, parser) val reader = BufferedReader(InputSourceReader(source = charsetDoc.input, charset = charsetDoc.charset), SharedConstants.DefaultBufferSize) - maybeSkipBom(reader, charsetDoc) streamer.parse(reader, baseUri) // initializes the parse and the document, but does not step() it return streamer @@ -77,7 +73,7 @@ public object DataUtil { val doc: Document var charsetDoc: CharsetDoc? = null try { - charsetDoc = detectCharset(sourceReader = sourceReader, baseUri = baseUri, charsetName = charsetName, parser = parser) + charsetDoc = detectCharset(inputSource = sourceReader, baseUri = baseUri, charsetName = charsetName, parser = parser) doc = parseInputSource(charsetDoc = charsetDoc, baseUri = baseUri, parser = parser) } finally { sourceReader.close() @@ -90,36 +86,33 @@ public object DataUtil { val charset: Charset, var doc: Document?, val input: SourceReader, - val skip: Boolean ) - private fun detectCharset(sourceReader: SourceReader, baseUri: String, charsetName: String?, parser: Parser): CharsetDoc { + private fun detectCharset(inputSource: SourceReader, baseUri: String, charsetName: String?, parser: Parser): CharsetDoc { var effectiveCharsetName: String? = charsetName var doc: Document? = null - // read the start of the stream and look for a BOM or meta charset - - sourceReader.mark(SharedConstants.DefaultBufferSize.toLong()) - // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. - val firstBytes: ByteArray = readToByteBuffer(sourceReader, firstReadBufferSize - 1) - val fullyRead = sourceReader.exhausted() - sourceReader.reset() - - + // read the start of the stream and look for a BOM or meta charset: // look for BOM - overrides any other header or input - val bomCharset: BomCharset? = detectCharsetFromBom(firstBytes) - if (bomCharset != null) effectiveCharsetName = bomCharset.charset + val bomCharset = detectCharsetFromBom(inputSource) // resets / consumes appropriately + if (bomCharset != null) effectiveCharsetName = bomCharset - if (effectiveCharsetName == null) { // determine from meta. safe first parse as UTF-8 - doc = try { - parser.parseInput(firstBytes.decodeToString(), baseUri) + if (effectiveCharsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8 + // @TODO:// implement it limit the max reading size +// input.max(firstReadBufferSize) + inputSource.mark(firstReadBufferSize) + try { + val reader = InputSourceReader(source = inputSource, charset = Charsets.UTF8) + doc = parser.parseInput(reader, baseUri) + inputSource.reset() +// input.max(origMax); // reset for a full read if required // @TODO implement it } catch (e: UncheckedIOException) { throw e } // look for or HTML5 - val metaElements: Elements = doc!!.select("meta[http-equiv=content-type], meta[charset]") + val metaElements: Elements = doc.select("meta[http-equiv=content-type], meta[charset]") var foundCharset: String? = null // if not found, will keep utf-8 as best attempt for (meta in metaElements) { if (meta.hasAttr("http-equiv")) { @@ -150,7 +143,9 @@ public object DataUtil { foundCharset = foundCharset.trim { it <= ' ' }.replace("[\"']".toRegex(), "") effectiveCharsetName = foundCharset doc = null - } else if (!fullyRead) { + } else if (inputSource.exhausted()) { // if we have read fully, and the charset was correct, keep that current parse + inputSource.close() + } else { doc = null } } else { // specified by content type header (or by user on file load) @@ -163,14 +158,10 @@ public object DataUtil { // finally: prepare the return struct if (effectiveCharsetName == null) effectiveCharsetName = defaultCharsetName val charset: Charset = if (effectiveCharsetName == defaultCharsetName) Charsets.UTF8 else Charsets.forName(effectiveCharsetName) - val skip = bomCharset != null && bomCharset.offset // skip 1 if the BOM is there and needs offset - // if consumer needs to parse the input; prep it if there's a BOM. Can't skip in inputstream as wrapping buffer will ignore the pos - return CharsetDoc(charset, doc, sourceReader, skip) + return CharsetDoc(charset = charset, doc = doc, input = inputSource) } public fun parseInputSource(charsetDoc: CharsetDoc, baseUri: String, parser: Parser): Document { - - // if doc != null it was fully parsed during charset detection; so just return that if (charsetDoc.doc != null) return charsetDoc.doc!! @@ -178,8 +169,7 @@ public object DataUtil { val doc: Document val charset: Charset = charsetDoc.charset - val reader = BufferedReader(InputSourceReader(input, charset), SharedConstants.DefaultBufferSize) - maybeSkipBom(reader, charsetDoc) + val reader = InputSourceReader(input, charset) try { doc = parser.parseInput(reader, baseUri) } catch (e: UncheckedIOException) { @@ -232,7 +222,6 @@ public object DataUtil { return null } - // @Nullable private fun validateCharset(cs: String?): String? { if (cs.isNullOrEmpty()) return null val cleanedStr = cs.trim { it <= ' ' }.replace("[\"']".toRegex(), "") @@ -259,34 +248,29 @@ public object DataUtil { return StringUtil.releaseBuilder(mime) } - private fun detectCharsetFromBom(firstByteArray: ByteArray): BomCharset? { - // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat - val bom = if (firstByteArray.size >= 4) { - firstByteArray.copyOf(4) - } else { - ByteArray(4) - } + private fun detectCharsetFromBom(sourceReader: SourceReader): String? { + val bom = ByteArray(4) + sourceReader.mark(bom.size.toLong()) + sourceReader.read(bom, 0, bom.size) + sourceReader.reset() + // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here if (bom[0].toInt() == 0x00 && bom[1].toInt() == 0x00 && bom[2] == 0xFE.toByte() && bom[3] == 0xFF.toByte()) { // BE - return BomCharset("UTF-32BE", Platform.isApple()) // and I hope it's on your system + sourceReader.read(bom, 0, 4) // consume BOM + return "UTF-32BE" // and I hope it's on your system } else if (bom[0] == 0xFF.toByte() && bom[1] == 0xFE.toByte() && bom[2].toInt() == 0x00 && bom[3].toInt() == 0x00) { // LE - return BomCharset("UTF-32LE", Platform.isApple()) // and I hope it's on your system + sourceReader.read(bom, 0, 4) // consume BOM + return "UTF-32LE" // and I hope it's on your system } else if (bom[0] == 0xFE.toByte() && bom[1] == 0xFF.toByte()) { // BE - return BomCharset("UTF-16BE", true) // in all Javas + sourceReader.read(bom, 0, 2) // consume BOM + return "UTF-16BE" // in all Javas } else if (bom[0] == 0xFF.toByte() && bom[1] == 0xFE.toByte()) { // LE - return BomCharset("UTF-16LE", true) // in all Javas + sourceReader.read(bom, 0, 2) // consume BOM + return "UTF-16LE" // in all Javas } else if (bom[0] == 0xEF.toByte() && bom[1] == 0xBB.toByte() && bom[2] == 0xBF.toByte()) { - return BomCharset("UTF-8", true) // in all Javas - // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here + sourceReader.read(bom, 0, 3) // consume the UTF-8 BOM + return "UTF-8" } return null } - - private class BomCharset(val charset: String, val offset: Boolean) - - private fun maybeSkipBom(reader: Reader, charsetDoc: CharsetDoc) { - if (charsetDoc.skip) { - reader.skip(1) - } - } } diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/SharedConstants.kt b/ksoup/src/com/fleeksoft/ksoup/internal/SharedConstants.kt index 2434e731..07c95cec 100644 --- a/ksoup/src/com/fleeksoft/ksoup/internal/SharedConstants.kt +++ b/ksoup/src/com/fleeksoft/ksoup/internal/SharedConstants.kt @@ -13,7 +13,7 @@ public object SharedConstants { public const val RangeKey: String = "ksoup.start" public const val EndRangeKey: String = "ksoup.end" - public const val DefaultBufferSize: Int = 1024 * 32 + public const val DefaultBufferSize: Int = 8192 const val DEFAULT_CHAR_BUFFER_SIZE: Int = 8192 var DEFAULT_BYTE_BUFFER_SIZE: Int = 8192 diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/SoftPool.kt b/ksoup/src/com/fleeksoft/ksoup/internal/SoftPool.kt new file mode 100644 index 00000000..8e5927ec --- /dev/null +++ b/ksoup/src/com/fleeksoft/ksoup/internal/SoftPool.kt @@ -0,0 +1,52 @@ +package com.fleeksoft.ksoup.internal + +import com.fleeksoft.ksoup.ported.ThreadLocal + +/** + * A SoftPool is a ThreadLocal that holds a SoftReference to a pool of initializable objects. This allows us to reuse + * expensive objects (buffers, etc.) between invocations (the ThreadLocal), but also for those objects to be reaped if + * they are no longer in use. + * + * Like a ThreadLocal, should be stored in a static field. + * @param the type of object to pool. + * @since 1.18.2 + */ +class SoftPool(private val initializer: () -> T) { + val threadLocalStack: ThreadLocal> = ThreadLocal { ArrayDeque() } + + /** + * Borrow an object from the pool, creating a new one if the pool is empty. Make sure to release it back to the pool + * when done, so that it can be reused. + * @return an object from the pool, as defined by the initializer. + */ + fun borrow(): T { + val stack: ArrayDeque = stack + if (!stack.isEmpty()) { + return stack.removeFirst() + } + return initializer() + } + + /** + * Release an object back to the pool. If the pool is full, the object is not retained. If you don't want to reuse a + * borrowed object (for e.g. a StringBuilder that grew too large), just don't release it. + * @param value the object to release back to the pool. + */ + fun release(value: T) { + val stack: ArrayDeque = stack + if (stack.size < MaxIdle) { + stack.addFirst(value) + } + } + + val stack: ArrayDeque + get() = threadLocalStack.get() + + companion object { + /** + * How many total uses of the creating object might be instantiated on the same thread at once. More than this and + * those objects aren't recycled. Doesn't need to be too conservative, as they can still be GCed as SoftRefs. + */ + const val MaxIdle: Int = 12 + } +} diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt b/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt index 59af064c..753d0e3b 100644 --- a/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt +++ b/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt @@ -1,7 +1,6 @@ package com.fleeksoft.ksoup.internal import com.fleeksoft.ksoup.ported.Character -import com.fleeksoft.ksoup.ported.ThreadLocal import com.fleeksoft.ksoup.ported.resolveOrNull import de.cketti.codepoints.deluxe.CodePoint import de.cketti.codepoints.deluxe.appendCodePoint @@ -241,7 +240,9 @@ public object StringUtil { return input.replace(controlChars, "") } - private val stringLocalBuilders: ThreadLocal> = ThreadLocal { ArrayDeque() } + private const val InitBuilderSize: Int = 1024 + private const val MaxBuilderSize: Int = 8 * 1024 + private val StringBuilderPool: SoftPool = SoftPool { StringBuilder(InitBuilderSize) } /** * Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is @@ -252,9 +253,7 @@ public object StringUtil { * @return an empty StringBuilder */ public fun borrowBuilder(): StringBuilder { - return StringBuilder(MaxCachedBuilderSize) - val stringBuilder = stringLocalBuilders.get() - return if (stringBuilder.isEmpty()) StringBuilder(MaxCachedBuilderSize) else stringBuilder.last() + return StringBuilderPool.borrow() } /** @@ -263,28 +262,17 @@ public object StringUtil { * @param sb the StringBuilder to release. * @return the string value of the released String Builder (as an incentive to release it!). */ - - // TODO: replace this public fun releaseBuilder(sb: StringBuilder): String { - return sb.toString() + val str = sb.toString() - var stringBuilder: StringBuilder = sb - val string: String = stringBuilder.toString() - if (stringBuilder.length > MaxCachedBuilderSize) { - stringBuilder = StringBuilder(MaxCachedBuilderSize) // make sure it hasn't grown too big - } else { - stringBuilder.clear() // make sure it's emptied on release - } - val builders = stringLocalBuilders.get() - builders.add(stringBuilder) - while (builders.size > MaxIdleBuilders) { - builders.removeLast() + // if it hasn't grown too big, reset it and return it to the pool: + if (sb.length <= MaxBuilderSize) { + sb.clear() // make sure it's emptied on release + StringBuilderPool.release(sb) } - return string - } - private const val MaxCachedBuilderSize = 8 * 1024 - private const val MaxIdleBuilders = 8 + return str + } /** * A StringJoiner allows incremental / filtered joining of a set of stringable objects. diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt b/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt index 1ef95594..b15702dd 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt @@ -1,5 +1,6 @@ package com.fleeksoft.ksoup.parser +import com.fleeksoft.ksoup.internal.SoftPool import com.fleeksoft.ksoup.ported.buildString import com.fleeksoft.ksoup.ported.exception.IOException import com.fleeksoft.ksoup.ported.exception.UncheckedIOException @@ -12,88 +13,107 @@ import kotlin.math.min * CharacterReader consumes tokens off a string. Used internally by com.fleeksoft.ksoup. API subject to changes. */ public class CharacterReader { - private var charBuf: CharArray? - private var charReader: Reader? = null - private var bufLength = 0 - private var bufSplitPoint = 0 - private var bufPos = 0 - private var readerPos: Int = 0 - private var bufMark = -1 - private var close: Boolean = false - private var stringCache: Array? = arrayOfNulls(stringCacheSize) // holds reused strings in this doc, to lessen garbage - - // optionally track the pos() position of newlines - scans during bufferUp() - private var newlinePositions: ArrayList? = null + private var stringCache: Array? = null // holds reused strings in this doc, to lessen garbage + + private var reader: Reader? = null // underlying Reader, will be backed by a buffered+controlled input source, or StringReader + private var charBuf: CharArray? // character buffer we consume from; filled from Reader + private var bufPos = 0 // position in charBuf that's been consumed to + private var bufLength = 0 // the num of characters actually buffered in charBuf, <= charBuf.length + private var fillPoint = 0 // how far into the charBuf we read before re-filling. 0.5 of charBuf.length after bufferUp + private var consumed: Int = 0 // how many characters total have been consumed from this CharacterReader (less the current bufPos) + private var bufMark = -1 // if not -1, the marked rewind position + private var readFully = false // if the underlying stream has been completely read, no value in further buffering + + private var newlinePositions: ArrayList? = null // optionally track the pos() position of newlines - scans during bufferUp() private var lineNumberOffset = 1 // line numbers start at 1; += newlinePosition[indexof(pos)] - var counter = 1 - public constructor(reader: Reader, sz: Int = maxBufferLen) { - this.charReader = reader - charBuf = CharArray(min(sz, maxBufferLen)) + public constructor(reader: Reader) { + this.reader = reader + charBuf = BufferPool.borrow() + stringCache = StringPool.borrow() bufferUp() } - public constructor(html: String) : this(StringReader(html), html.length) + public constructor(html: String) : this(StringReader(html)) - public fun isClosed(): Boolean = close + public fun isClosed(): Boolean = reader == null public fun close() { - close = true try { - charReader = null + reader?.close() } catch (ignored: IOException) { } finally { + reader = null + charBuf?.fill(0.toChar()) // before release, clear the buffer. Not required, but acts as a safety net, and makes debug view clearer + charBuf?.let { BufferPool.release(it) } charBuf = null + stringCache?.let { StringPool.release(it) } // conversely, we don't clear the string cache, so we can reuse the contents stringCache = null } } - // if the underlying stream has been completely read, no value in further buffering - private var readFully = false - private fun bufferUp() { - if (readFully || bufPos < bufSplitPoint) return - - val (pos, offset) = if (bufMark != -1) { - Pair(bufMark.toLong(), bufPos - bufMark) - } else { - Pair(bufPos.toLong(), 0) - } - - if (pos > 0) { - charReader!!.skip(pos) - } + if (readFully || bufPos < fillPoint || bufMark != -1) return - charReader!!.mark(maxBufferLen) - var read: Int = 0 - while (read <= minReadAheadLen) { - val toReadSize = charBuf!!.size - read - val thisRead = charReader!!.read(charBuf!!, offset = read, length = toReadSize) + doBufferUp() // structured so bufferUp may become an intrinsic candidate + } - if (thisRead == -1) readFully = true - if (thisRead <= 0) break - read += thisRead - } - charReader!!.reset() - - if (read > 0) { - bufLength = read - readerPos += pos.toInt() - bufPos = offset - if (bufMark != -1) bufMark = 0 - bufSplitPoint = minOf(bufLength, readAheadLimit) + private fun doBufferUp() { + /* + The flow: + - if read fully, or if bufPos < fillPoint, or if marked - do not fill. + - update readerPos (total amount consumed from this CharacterReader) += bufPos + - shift charBuf contents such that bufPos = 0; set next read offset (bufLength) -= shift amount + - loop read the Reader until we fill charBuf. bufLength += read. + - readFully = true when read = -1 + */ + consumed += bufPos + bufLength -= bufPos + if (bufLength > 0) charBuf?.copyInto(destination = charBuf!!, destinationOffset = 0, startIndex = bufPos, endIndex = bufPos + bufLength) + bufPos = 0 + while (bufLength < BufferSize) { + try { + val read = reader!!.read(cbuf = charBuf!!, offset = bufLength, length = charBuf!!.size - bufLength) + if (read == -1) { + readFully = true + break + } + bufLength += read + } catch (e: IOException) { + throw UncheckedIOException(e) + } } + fillPoint = min(bufLength, RefillPoint) scanBufferForNewlines() // if enabled, we index newline positions for line number tracking lastIcSeq = null // cache for last containsIgnoreCase(seq) } + fun mark() { + // make sure there is enough look ahead capacity + if (bufLength - bufPos < RewindLimit) fillPoint = 0 + + bufferUp() + bufMark = bufPos + } + + fun unmark() { + bufMark = -1 + } + + fun rewindToMark() { + if (bufMark == -1) throw UncheckedIOException(IOException("Mark invalid")) + + bufPos = bufMark + unmark() + } + /** * Gets the position currently read to in the content. Starts at 0. * @return current position */ public fun pos(): Int { - return readerPos + bufPos + return consumed + bufPos } /** Tests if the buffer has been fully read. */ @@ -110,7 +130,7 @@ public class CharacterReader { */ public fun trackNewlines(track: Boolean) { if (track && newlinePositions == null) { - newlinePositions = ArrayList(maxBufferLen / 80) // rough guess of likely count + newlinePositions = ArrayList(BufferSize / 80) // rough guess of likely count scanBufferForNewlines() // first pass when enabled; subsequently called during bufferUp } else if (!track) { newlinePositions = null @@ -173,9 +193,9 @@ public class CharacterReader { */ private fun scanBufferForNewlines() { if (!isTrackNewlines()) return - if (newlinePositions!!.size > 0) { + if (newlinePositions!!.isNotEmpty()) { // work out the line number that we have read up to (as we have likely scanned past this point) - var index = lineNumIndex(readerPos) + var index = lineNumIndex(consumed) if (index == -1) index = 0 // first line val linePos: Int = newlinePositions!![index] lineNumberOffset += index // the num lines we've read up to @@ -183,7 +203,7 @@ public class CharacterReader { newlinePositions!!.add(linePos) // roll the last read pos to first, for cursor num after buffer } for (i in bufPos until bufLength) { - if (charBuf!![i] == '\n') newlinePositions!!.add(1 + readerPos + i) + if (charBuf!![i] == '\n') newlinePositions!!.add(1 + consumed + i) } } @@ -229,23 +249,6 @@ public class CharacterReader { bufPos++ } - public fun mark() { - // make sure there is enough look ahead capacity - if (bufLength - bufPos < minReadAheadLen) bufSplitPoint = 0 - bufferUp() - bufMark = bufPos - } - - public fun unmark() { - bufMark = -1 - } - - public fun rewindToMark() { - if (bufMark == -1) throw UncheckedIOException(IOException("Mark invalid")) - bufPos = bufMark - unmark() - } - /** * Returns the number of characters between the current position and the next instance of the input char * @param c scan target @@ -642,13 +645,14 @@ public class CharacterReader { public companion object { public const val EOF: Char = (-1).toChar() - private const val maxStringCacheLen = 12 - public const val maxBufferLen: Int = 1024 * 32 // visible for testing - public const val readAheadLimit: Int = (maxBufferLen * 0.75).toInt() // visible for testing + private const val MaxStringCacheLen = 12 + private const val StringCacheSize = 512 + private val StringPool: SoftPool> = SoftPool { arrayOfNulls(StringCacheSize) } + private val BufferPool: SoftPool = SoftPool { CharArray(BufferSize) } // recycled char buffer - // the minimum mark length supported. No HTML entities can be larger than this. - private const val minReadAheadLen = 1024 - private const val stringCacheSize = 512 + public const val BufferSize: Int = 1024 * 2 // visible for testing + public const val RefillPoint: Int = BufferSize / 2 // when bufPos characters read, refill; visible for testing; + private const val RewindLimit = 1024 // the maximum we can rewind. No HTML entities can be larger than this. /** * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks. @@ -664,18 +668,19 @@ public class CharacterReader { start: Int, count: Int, ): String { - // limit (no cache): - if (count > maxStringCacheLen) return String.buildString(charBuf!!, start, count) + // don't cache strings that are too big + if (count > MaxStringCacheLen) return String.buildString(charBuf!!, start, count) if (count < 1) return "" // calculate hash: var hash = 0 - for (i in 0 until count) { - hash = 31 * hash + charBuf!![start + i].code + val end = count + start + for (i in start until end) { + hash = 31 * hash + charBuf!![i].code } // get from cache - val index = hash and stringCacheSize - 1 + val index = hash and StringCacheSize - 1 var cached = stringCache!![index] if (cached != null && rangeEquals(charBuf, start, count, cached)) { // positive hit diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt b/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt index 210249b5..b4f5ba32 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt @@ -181,7 +181,7 @@ public class Tokeniser(treeBuilder: TreeBuilder) { if (charval == -1 || charval > 0x10FFFF) { characterReferenceError("character [$charval] outside of valid range") - codeRef[0] = Tokeniser.replacementChar.code + codeRef[0] = Tokeniser.ReplacementChar.code } else { if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.size) { characterReferenceError("character [$charval] is not a valid unicode code point") @@ -337,9 +337,8 @@ public class Tokeniser(treeBuilder: TreeBuilder) { } public companion object { - public const val replacementChar: Char = '\uFFFD' // replaces null character - private val notCharRefCharsSorted: CharArray = - charArrayOf('\t', '\n', '\r', '\u000c', ' ', '<', '&').sortedArray() + public const val ReplacementChar: Char = '\uFFFD' // replaces null character + private val notCharRefCharsSorted: CharArray = charArrayOf('\t', '\n', '\r', '\u000c', ' ', '<', '&').sortedArray() // Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034 // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state @@ -352,12 +351,5 @@ public class Tokeniser(treeBuilder: TreeBuilder) { 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, ) - - public fun currentNodeInHtmlNS(): Boolean { - // todo: implement namespaces correctly - return true - // Element currentNode = currentNode(); - // return currentNode != null && currentNode.namespace().equals("HTML"); - } } } From 931ac8d2649f570d7ccd1a9b849fce6eb6dbceeb Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 10:05:03 +0500 Subject: [PATCH 02/10] attribute escape value check for null --- ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt index 4895ac77..8fde9a14 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt @@ -1,11 +1,11 @@ package com.fleeksoft.ksoup.nodes -import com.fleeksoft.ksoup.ported.exception.SerializationException import com.fleeksoft.ksoup.helper.Validate import com.fleeksoft.ksoup.internal.StringUtil import com.fleeksoft.ksoup.nodes.Document.OutputSettings.Syntax import com.fleeksoft.ksoup.ported.KCloneable import com.fleeksoft.ksoup.ported.exception.IOException +import com.fleeksoft.ksoup.ported.exception.SerializationException /** * A single key + value attribute. (Only used for presentation.) @@ -247,7 +247,7 @@ public open class Attribute : Map.Entry, KCloneable accum.append(key) if (!shouldCollapseAttribute(key, value, out)) { accum.append("=\"") - Entities.escape(accum = accum, string = value ?: "", out = out, options = Entities.ForAttribute) + Entities.escape(accum = accum, data = Attributes.checkNotNull(value), out = out, options = Entities.ForAttribute) accum.append('"') } } From 2947ba3f205788d7515c7c8110a9f37f80a40450 Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 10:07:21 +0500 Subject: [PATCH 03/10] Tidied up Nullable array values --- .../com/fleeksoft/ksoup/nodes/Attributes.kt | 68 ++++++++----------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Attributes.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Attributes.kt index 4908c98e..47f13412 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Attributes.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Attributes.kt @@ -1,14 +1,13 @@ package com.fleeksoft.ksoup.nodes -import com.fleeksoft.ksoup.ported.exception.SerializationException import com.fleeksoft.ksoup.helper.Validate -import com.fleeksoft.ksoup.internal.Normalizer.lowerCase import com.fleeksoft.ksoup.internal.SharedConstants import com.fleeksoft.ksoup.internal.StringUtil import com.fleeksoft.ksoup.nodes.Range.AttributeRange.Companion.UntrackedAttr import com.fleeksoft.ksoup.parser.ParseSettings import com.fleeksoft.ksoup.ported.KCloneable import com.fleeksoft.ksoup.ported.exception.IOException +import com.fleeksoft.ksoup.ported.exception.SerializationException /** * The attributes of an Element. @@ -28,10 +27,8 @@ import com.fleeksoft.ksoup.ported.exception.IOException public class Attributes : Iterable, KCloneable { // the number of instance fields is kept as low as possible giving an object size of 24 bytes private var size = 0 // number of slots used (not total capacity, which is keys.length) - internal var keys: Array = arrayOfNulls(InitialCapacity) - - // Genericish: all non-internal attribute values must be Strings and are cast on access. - internal var vals = arrayOfNulls(InitialCapacity) + internal var keys: Array = arrayOfNulls(InitialCapacity) // keys is not null, but contents may be. Same for vals + internal var vals = arrayOfNulls(InitialCapacity) // Genericish: all non-internal attribute values must be Strings and are cast on access. // todo - make keys iterable without creating Attribute objects // check there's room for more @@ -198,7 +195,8 @@ public class Attributes : Iterable, KCloneable { val i = indexOfKeyIgnoreCase(key) if (i != NotFound) { vals[i] = value - if (keys[i] != key) { + val old = keys[i] + if (old != null && old != key) { // case changed, update keys[i] = key } @@ -370,8 +368,9 @@ public class Attributes : Iterable, KCloneable { override fun hasNext(): Boolean { checkModified() while (i < size) { - if (isInternalKey(keys[i])) { - // skip over internal keys + val key = keys[i] + require(key != null) + if (isInternalKey(key)) { // skip over internal keys i++ } else { break @@ -410,8 +409,9 @@ public class Attributes : Iterable, KCloneable { public fun asList(): List { val list: ArrayList = ArrayList(size) for (i in 0 until size) { - if (isInternalKey(keys[i])) continue // skip internal keys - val attr = Attribute(keys[i]!!, vals[i] as String?, this@Attributes) + val key = keys[i]!! + if (isInternalKey(key)) continue // skip internal keys + val attr = Attribute(key, vals[i] as String?, this@Attributes) list.add(attr) } return list.toList() @@ -433,10 +433,7 @@ public class Attributes : Iterable, KCloneable { public fun html(): String { val sb: StringBuilder = StringUtil.borrowBuilder() try { - html( - sb, - Document("").outputSettings(), - ) // output settings a bit funky, but this html() seldom used + html(sb, Document("").outputSettings()) // output settings a bit funky, but this html() seldom used } catch (e: IOException) { // ought never happen throw SerializationException(e) @@ -444,21 +441,14 @@ public class Attributes : Iterable, KCloneable { return StringUtil.releaseBuilder(sb) } - public fun html( - accum: Appendable, - out: Document.OutputSettings, - ) { + public fun html(accum: Appendable, out: Document.OutputSettings) { val sz = size for (i in 0 until sz) { - if (isInternalKey(keys[i])) continue - val key: String? = keys[i]?.let { Attribute.getValidKey(it, out.syntax()) } - if (key != null) { - Attribute.htmlNoValidate( - key, - vals[i] as String?, - accum.append(' '), - out, - ) + val key = keys[i]!! + if (isInternalKey(key)) continue + val validated = Attribute.getValidKey(key, out.syntax()) + if (validated != null) { + Attribute.htmlNoValidate(validated, vals[i] as String?, accum.append(' '), out) } } } @@ -513,7 +503,8 @@ public class Attributes : Iterable, KCloneable { */ public fun normalize() { for (i in 0 until size) { - if (!isInternalKey(keys[i])) keys[i] = lowerCase(keys[i]) + val key = keys[i]!! + if (!isInternalKey(key)) keys[i] = key.lowercase() } } @@ -526,16 +517,11 @@ public class Attributes : Iterable, KCloneable { if (isEmpty()) return 0 val preserve: Boolean = settings.preserveAttributeCase() var dupes = 0 - OUTER@ for (i in keys.indices) { + for (i in 0 until size) { + val keyI = keys[i] var j = i + 1 - while (j < keys.size) { - if (keys[j] == null) continue@OUTER // keys.length doesn't shrink when removing, so re-test - if (preserve && keys[i] == keys[j] || !preserve && - keys[i].equals( - keys[j], - ignoreCase = true, - ) - ) { + while (j < size) { + if ((preserve && keyI == keys[j]) || (!preserve && keyI.equals(keys[j], ignoreCase = true))) { dupes++ remove(j) j-- @@ -595,7 +581,7 @@ public class Attributes : Iterable, KCloneable { // we track boolean attributes as null in values - they're just keys. so returns empty for consumers // casts to String, so only for non-internal attributes public fun checkNotNull(value: Any?): String { - return if (value == null) EmptyString else (value as String?)!! + return if (value == null) EmptyString else (value as String) } private fun dataKey(key: String): String { @@ -606,8 +592,8 @@ public class Attributes : Iterable, KCloneable { return "$InternalPrefix$key" } - public fun isInternalKey(key: String?): Boolean { - return key != null && key.length > 1 && key[0] == InternalPrefix + public fun isInternalKey(key: String): Boolean { + return key.length > 1 && key[0] == InternalPrefix } } } From 14c0e30fe8580dd6b1dfe17a6ff62d1472837d4f Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 10:15:50 +0500 Subject: [PATCH 04/10] Minor code tweaks --- .../com/fleeksoft/ksoup/nodes/ElementTest.kt | 53 +++++------------- .../com/fleeksoft/ksoup/nodes/CDataNode.kt | 4 +- .../src/com/fleeksoft/ksoup/nodes/Document.kt | 56 +++++++++---------- .../src/com/fleeksoft/ksoup/nodes/Element.kt | 31 ++++------ .../com/fleeksoft/ksoup/nodes/FormElement.kt | 6 +- .../com/fleeksoft/ksoup/nodes/NodeUtils.kt | 5 +- ksoup/src/com/fleeksoft/ksoup/nodes/Range.kt | 7 ++- .../ksoup/parser/HtmlTreeBuilderState.kt | 31 +++++----- .../src/com/fleeksoft/ksoup/parser/Parser.kt | 5 +- ksoup/src/com/fleeksoft/ksoup/parser/Token.kt | 4 +- .../fleeksoft/ksoup/parser/TokeniserState.kt | 9 +-- .../com/fleeksoft/ksoup/parser/TreeBuilder.kt | 11 ++-- 12 files changed, 90 insertions(+), 132 deletions(-) diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementTest.kt index 6ca55fe4..944fc0fb 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementTest.kt @@ -2760,21 +2760,21 @@ Three assertEquals("2one", els[1].attr("href")) } - @Test - fun getElementsByAttributeValueMatchingValidation() { - val doc = Ksoup.parse(reference) - if (Platform.isJS()) { - val ex: Throwable = assertFails { doc.getElementsByAttributeValueMatching("key", "\\x") } - assertContains(ex.message ?: "", "Invalid regular expression: /\\x/gu", ignoreCase = true) + private fun checkRegexExceptionFunc(func: () -> Unit) { + if (Platform.isJsOrWasm()) { + val ex: Throwable = assertFails { func() } + val checkStr = if (Platform.isWasmJs()) "Invalid hexadecimal escape sequence near index" else "Invalid regular expression: /\\x/gu" + assertContains(ex.message ?: "", checkStr, ignoreCase = true) } else { - val ex: Throwable = assertFailsWith { doc.getElementsByAttributeValueMatching("key", "\\x") } + val ex: Throwable = assertFailsWith { func() } assertContains(ex.message ?: "", "hexadecimal escape sequence near index", ignoreCase = true) } - /*if (Platform.isApple() || Platform.isWindows()) { - assertEquals("Invalid hexadecimal escape sequence near index: 0\n\\x\n^", ex.message) - } else { - assertEquals("Illegal hexadecimal escape sequence near index 2\n\\x", ex.message) - }*/ + } + + @Test + fun getElementsByAttributeValueMatchingValidation() { + val doc = Ksoup.parse(reference) + checkRegexExceptionFunc { doc.getElementsByAttributeValueMatching("key", "\\x") } } @Test @@ -2807,18 +2807,7 @@ Three fun getElementsMatchingTextValidation() { val doc = Ksoup.parse(reference) - if (Platform.isJS()) { - val ex: Throwable = assertFails { doc.getElementsMatchingText("\\x") } - assertContains(ex.message ?: "", "Invalid regular expression: /\\x/gu", ignoreCase = true) - } else { - val ex: Throwable = assertFailsWith { doc.getElementsMatchingText("\\x") } - assertContains(ex.message ?: "", "hexadecimal escape sequence near index", ignoreCase = true) - } - /*if (Platform.isApple() || Platform.isWindows()) { - assertEquals("Invalid hexadecimal escape sequence near index: 0\n\\x\n^", ex.message) - } else { - assertEquals("Illegal hexadecimal escape sequence near index 2\n\\x", ex.message) - }*/ + checkRegexExceptionFunc { doc.getElementsMatchingText("\\x") } } @Test @@ -2841,24 +2830,12 @@ Three @Test fun getElementsMatchingOwnTextValidation() { val doc = Ksoup.parse(reference) - if (Platform.isJS()) { - val ex: Throwable = assertFails { doc.getElementsMatchingOwnText("\\x") } - assertContains(ex.message ?: "", "Invalid regular expression: /\\x/gu", ignoreCase = true) - } else { - val ex: Throwable = assertFailsWith { doc.getElementsMatchingOwnText("\\x") } - assertContains(ex.message ?: "", "hexadecimal escape sequence near index", ignoreCase = true) - } - /*if (Platform.isApple() || Platform.isWindows()) { - assertEquals("Invalid hexadecimal escape sequence near index: 0\n\\x\n^", ex.message) - } else { - assertEquals("Illegal hexadecimal escape sequence near index 2\n\\x", ex.message) - }*/ + checkRegexExceptionFunc { doc.getElementsMatchingOwnText("\\x") } } @Test fun hasText() { - val doc = - Ksoup.parse("

One

Two
") + val doc = Ksoup.parse("

One

Two
") assertTrue(doc.getElementById("1")!!.hasText()) assertTrue(doc.getElementById("2")!!.hasText()) assertFalse(doc.getElementById("3")!!.hasText()) diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/CDataNode.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/CDataNode.kt index 0cde9a9d..be0b7380 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/CDataNode.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/CDataNode.kt @@ -9,8 +9,8 @@ public class CDataNode(text: String?) : TextNode(text!!) { } /** - * Get the unencoded, **non-normalized** text content of this CDataNode. - * @return unencoded, non-normalized text + * Get the un-encoded, **non-normalized** text content of this CDataNode. + * @return un-encoded, non-normalized text */ override fun text(): String { return getWholeText() diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Document.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Document.kt index 7cad10b7..fe485c77 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Document.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Document.kt @@ -2,11 +2,11 @@ package com.fleeksoft.ksoup.nodes import com.fleeksoft.ksoup.helper.Validate import com.fleeksoft.ksoup.internal.StringUtil +import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.parser.ParseSettings import com.fleeksoft.ksoup.parser.Parser import com.fleeksoft.ksoup.parser.Tag import com.fleeksoft.ksoup.ported.KCloneable -import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.ported.io.Charsets import com.fleeksoft.ksoup.select.Elements import com.fleeksoft.ksoup.select.Evaluator @@ -85,13 +85,12 @@ public class Document(private val namespace: String, private val location: Strin } /** - * Get this document's `head` element. - * - * - * As a side-effect, if this Document does not already have a HTML structure, it will be created. If you do not want - * that, use `#selectFirst("head")` instead. - * - * @return `head` element. + Get this document's {@code head} element. +

+ As a side effect, if this Document does not already have an HTML structure, it will be created. If you do not want + that, use {@code #selectFirst("head")} instead. + + @return {@code head} element. */ public fun head(): Element { val html: Element = htmlEl() @@ -104,13 +103,13 @@ public class Document(private val namespace: String, private val location: Strin } /** - * Get this document's `` or `` element. - * - * - * As a **side-effect**, if this Document does not already have a HTML structure, it will be created with a `` element. If you do not want that, use `#selectFirst("body")` instead. - * - * @return `body` element for documents with a ``, a new `` element if the document - * had no contents, or the outermost ` element` for frameset documents. + Get this document's {@code } or {@code } element. +

+ As a side effect, if this Document does not already have an HTML structure, it will be created with a {@code + } element. If you do not want that, use {@code #selectFirst("body")} instead. + + @return {@code body} element for documents with a {@code }, a new {@code } element if the document + had no contents, or the outermost {@code element} for frameset documents. */ public fun body(): Element { val html: Element = htmlEl() @@ -194,7 +193,7 @@ public class Document(private val namespace: String, private val location: Strin /** * Set the text of the `body` of this document. Any existing nodes within the body will be cleared. - * @param text unencoded text + * @param text un-encoded text * @return this document */ override fun text(text: String): Element { @@ -355,7 +354,6 @@ public class Document(private val namespace: String, private val location: Strin public data class OutputSettings( private var escapeMode: Entities.EscapeMode = Entities.EscapeMode.base, private var charset: Charset = Charsets.UTF8, - var coreCharset: Entities.CoreCharset = Entities.CoreCharset.byName(charset.name), // fast encoders for ascii and utf8 private var prettyPrint: Boolean = true, private var outline: Boolean = false, private var indentAmount: Int = 1, @@ -365,19 +363,19 @@ public class Document(private val namespace: String, private val location: Strin /** * The output serialization syntax. */ - public enum class Syntax { - html, - xml, - } + public enum class Syntax { html, xml } /** - * Get the document's current HTML escape mode: `base`, which provides a limited set of named HTML - * entities and escapes other characters as numbered entities for maximum compatibility; or `extended`, - * which uses the complete set of HTML named entities. - * - * - * The default escape mode is `base`. - * @return the document's current escape mode + Get the document's current entity escape mode: +

    +
  • xhtml, the minimal named entities in XHTML / XML
  • +
  • base, which provides a limited set of named HTML + entities and escapes other characters as numbered entities for maximum compatibility
  • +
  • extended, + which uses the complete set of HTML named entities.
  • +
+

The default escape mode is base. + @return the document's current escape mode */ public fun escapeMode(): Entities.EscapeMode { return escapeMode @@ -414,7 +412,6 @@ public class Document(private val namespace: String, private val location: Strin */ public fun charset(charset: Charset): OutputSettings { this.charset = charset - coreCharset = Entities.CoreCharset.byName(charset.name) return this } @@ -607,7 +604,6 @@ public class Document(private val namespace: String, private val location: Strin */ public fun createShell(baseUri: String): Document { val doc = Document(baseUri) - doc.parser = doc.parser() val html: Element = doc.appendElement("html") html.appendElement("head") html.appendElement("body") diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Element.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Element.kt index 09f544c6..06fc9dfc 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Element.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Element.kt @@ -1171,29 +1171,22 @@ public open class Element : Node { * @param pattern compiled regular expression to match against attribute values * @return elements that have attributes matching this regular expression */ - public fun getElementsByAttributeValueMatching( - key: String, - regex: Regex, - ): Elements { + public fun getElementsByAttributeValueMatching(key: String, regex: Regex): Elements { return Collector.collect(Evaluator.AttributeWithValueMatching(key, regex), this) } /** * Find elements that have attributes whose values match the supplied regular expression. * @param key name of the attribute - * @param regex regular expression to match against attribute values. You can use [embedded flags](http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded) (such as (?i) and (?m) to control regex options. + * @param regex regular expression to match against attribute values. You can use embedded flags (such as {@code (?i)} and {@code (?m)}) to control regex options. * @return elements that have attributes matching this regular expression */ - public fun getElementsByAttributeValueMatching( - key: String, - regex: String, - ): Elements { - val pattern: Regex = - try { - jsSupportedRegex(regex) - } catch (e: PatternSyntaxException) { - throw IllegalArgumentException("Pattern syntax error: $regex", e) - } + public fun getElementsByAttributeValueMatching(key: String, regex: String): Elements { + val pattern: Regex = try { + jsSupportedRegex(regex) + } catch (e: PatternSyntaxException) { + throw IllegalArgumentException("Pattern syntax error: $regex", e) + } return getElementsByAttributeValueMatching(key, pattern) } @@ -1258,9 +1251,9 @@ public open class Element : Node { /** * Find elements whose text matches the supplied regular expression. - * @param regex regular expression to match text against. You can use [embedded flags](http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded) (such as (?i) and (?m) to control regex options. + * @param regex regular expression to match text against. You can use embedded flags (such as {@code (?i)} and {@code (?m)}) to control regex options. * @return elements matching the supplied regular expression. - * @see Element.text + * @see Element#text() */ public fun getElementsMatchingText(regex: String): Elements { val pattern: Regex = @@ -1284,9 +1277,9 @@ public open class Element : Node { /** * Find elements whose own text matches the supplied regular expression. - * @param regex regular expression to match text against. You can use [embedded flags](http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded) (such as (?i) and (?m) to control regex options. + * @param regex regular expression to match text against. You can use embedded flags (such as {@code (?i)} and {@code (?m)}) to control regex options. * @return elements matching the supplied regular expression. - * @see Element.ownText + * @see Element#ownText() */ public fun getElementsMatchingOwnText(regex: String): Elements { val pattern: Regex = diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/FormElement.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/FormElement.kt index 2c6f253c..0bbf992b 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/FormElement.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/FormElement.kt @@ -7,7 +7,7 @@ import com.fleeksoft.ksoup.select.Elements import com.fleeksoft.ksoup.select.QueryParser /** - * A HTML Form Element provides ready access to the form fields/controls that are associated with it. It also allows a + * An HTML Form Element provides ready access to the form fields/controls that are associated with it. It also allows a * form to easily be submitted. * Create a new, standalone form element. * @@ -19,7 +19,7 @@ public class FormElement(tag: Tag, baseUri: String?, attributes: Attributes?) : private val linkedEls: Elements = Elements() // contains form submittable elements that were linked during the parse (and due to parse rules, may no longer be a child of this form) - val submitable = QueryParser.parse(StringUtil.join(SharedConstants.FormSubmitTags.toList(), ", ")) + private val submittable = QueryParser.parse(StringUtil.join(SharedConstants.FormSubmitTags.toList(), ", ")) /** * Get the list of form control elements associated with this form. @@ -27,7 +27,7 @@ public class FormElement(tag: Tag, baseUri: String?, attributes: Attributes?) : */ public fun elements(): Elements { // As elements may have been added or removed from the DOM after parse, prepare a new list that unions them: - val els = select(submitable) // current form children + val els = select(submittable) // current form children linkedEls.forEach { linkedEl -> if (linkedEl.ownerDocument() != null && !els.contains(linkedEl)) { els.add(linkedEl); // adds previously linked elements, that weren't previously removed from the DOM diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/NodeUtils.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/NodeUtils.kt index bd892f2c..fc07bc11 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/NodeUtils.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/NodeUtils.kt @@ -27,10 +27,7 @@ internal object NodeUtils { } /** Creates a Stream, starting with the supplied node. */ - fun stream( - start: Node, - type: KClass, - ): Sequence { + fun stream(start: Node, type: KClass): Sequence { val iterator: NodeIterator = NodeIterator(start, type) return iterator.asSequence() } diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Range.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Range.kt index 1ba38b44..7a9792f3 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Range.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Range.kt @@ -1,6 +1,7 @@ package com.fleeksoft.ksoup.nodes import com.fleeksoft.ksoup.internal.SharedConstants +import com.fleeksoft.ksoup.internal.StringUtil /** * A Range object tracks the character positions in the original input source where a Node starts or ends. If you want to @@ -134,7 +135,11 @@ public data class Range(private val start: Position, private val end: Position) * `line,column:pos-line,column:pos=line,column:pos-line,column:pos` (name start - name end = val start - val end). * . */ override fun toString(): String { - return "${nameRange()}=${valueRange()}" + val sb = StringUtil.borrowBuilder() + .append(nameRange) + .append('=') + .append(valueRange) + return StringUtil.releaseBuilder(sb) } public companion object { diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/HtmlTreeBuilderState.kt b/ksoup/src/com/fleeksoft/ksoup/parser/HtmlTreeBuilderState.kt index 310e0526..954e341b 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/HtmlTreeBuilderState.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/HtmlTreeBuilderState.kt @@ -1114,12 +1114,12 @@ public enum class HtmlTreeBuilderState { return tb.process(t, InHead) } else if (name == "input") { if (!( - startTag.hasAttributes() && - startTag.attributes!!["type"].equals( - "hidden", - ignoreCase = true, + startTag.hasAttributes() && + startTag.attributes!!["type"].equals( + "hidden", + ignoreCase = true, + ) ) - ) ) { return anythingElse(t, tb) } else { @@ -1237,10 +1237,10 @@ public enum class HtmlTreeBuilderState { tb.transition(InTable) } } else if (( - t.isStartTag() && - StringUtil.inSorted(t.asStartTag().retrieveNormalName(), Constants.InCellCol) || - t.isEndTag() && t.asEndTag().retrieveNormalName() == "table" - ) + t.isStartTag() && + StringUtil.inSorted(t.asStartTag().retrieveNormalName(), Constants.InCellCol) || + t.isEndTag() && t.asEndTag().retrieveNormalName() == "table" + ) ) { // same as above but processes after transition if (!tb.inTableScope("caption")) { // fragment case @@ -1946,10 +1946,10 @@ public enum class HtmlTreeBuilderState { return processAsHtml(t, tb) } if (start.normalName.equals("font") && ( - start.hasAttributeIgnoreCase("color") || - start.hasAttributeIgnoreCase("face") || - start.hasAttributeIgnoreCase("size") - ) + start.hasAttributeIgnoreCase("color") || + start.hasAttributeIgnoreCase("face") || + start.hasAttributeIgnoreCase("size") + ) ) { return processAsHtml(t, tb) } @@ -2012,10 +2012,7 @@ public enum class HtmlTreeBuilderState { }, ; - public abstract fun process( - t: Token, - tb: HtmlTreeBuilder, - ): Boolean + public abstract fun process(t: Token, tb: HtmlTreeBuilder): Boolean // lists of tags to search through public object Constants { diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/Parser.kt b/ksoup/src/com/fleeksoft/ksoup/parser/Parser.kt index 3dc45ea9..8609b57d 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/Parser.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/Parser.kt @@ -50,10 +50,7 @@ public class Parser { isTrackPosition = copy.isTrackPosition } - public fun parseInput( - input: String, - baseUri: String, - ): Document { + public fun parseInput(input: String, baseUri: String): Document { return treeBuilder.parse(StringReader(input), baseUri, this) } diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt b/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt index 4383320e..1c23b651 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt @@ -242,7 +242,7 @@ public abstract class Token private constructor(public var type: TokenType) { // these appenders are rarely hit in not null state-- caused by null chars. public fun appendTagName(append: String) { // might have null chars - need to replace with null replacement character - val replacedAppend = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar) + val replacedAppend = append.replace(TokeniserState.nullChar, Tokeniser.ReplacementChar) tagName = if (tagName == null) replacedAppend else tagName + replacedAppend normalName = ParseSettings.normalName(tagName) } @@ -257,7 +257,7 @@ public abstract class Token private constructor(public var type: TokenType) { endPos: Int, ) { // might have null chars because we eat in one pass - need to replace with null replacement character - val resultAppend = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar) + val resultAppend = append.replace(TokeniserState.nullChar, Tokeniser.ReplacementChar) ensureAttrName(startPos, endPos) if (attrNameSb.isEmpty()) { diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/TokeniserState.kt b/ksoup/src/com/fleeksoft/ksoup/parser/TokeniserState.kt index b059fa70..5e839869 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/TokeniserState.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/TokeniserState.kt @@ -8,10 +8,7 @@ import com.fleeksoft.ksoup.nodes.DocumentType public enum class TokeniserState { Data { // in data state, gather characters until a character reference or tag is found - override fun read( - t: Tokeniser, - r: CharacterReader, - ) { + override fun read(t: Tokeniser, r: CharacterReader) { when (r.current()) { '&' -> t.advanceTransition(CharacterReferenceInData) '<' -> t.advanceTransition(TagOpen) @@ -1796,8 +1793,8 @@ public enum class TokeniserState { '>', '`', ) - private const val replacementChar: Char = Tokeniser.replacementChar - private const val replacementStr: String = Tokeniser.replacementChar.toString() + private const val replacementChar: Char = Tokeniser.ReplacementChar + private const val replacementStr: String = Tokeniser.ReplacementChar.toString() private const val eof: Char = CharacterReader.EOF /** diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/TreeBuilder.kt b/ksoup/src/com/fleeksoft/ksoup/parser/TreeBuilder.kt index fcade3cd..a1085941 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/TreeBuilder.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/TreeBuilder.kt @@ -51,9 +51,9 @@ public abstract class TreeBuilder { settings = parser.settings() reader = CharacterReader(input) trackSourceRange = parser.isTrackPosition - reader.trackNewlines( - parser.isTrackErrors() || trackSourceRange, - ) // when tracking errors or source ranges, enable newline tracking for better legibility + + // when tracking errors or source ranges, enable newline tracking for better legibility + reader.trackNewlines(parser.isTrackErrors() || trackSourceRange) tokeniser = Tokeniser(this) _stack = ArrayList(32) seenTags = HashMap() @@ -115,7 +115,7 @@ public abstract class TreeBuilder { fun stepParser(): Boolean { // if we have reached the end already, step by popping off the stack, to hit nodeRemoved callbacks: - if (currentToken?.type === Token.TokenType.EOF) { + if (currentToken?.type == Token.TokenType.EOF) { if (_stack == null) { return false } else if (_stack?.isEmpty() == true) { @@ -326,8 +326,7 @@ public abstract class TreeBuilder { } } - val startPosition: Range.Position = - Range.Position(startPos, reader.lineNumber(startPos), reader.columnNumber(startPos)) + val startPosition: Range.Position = Range.Position(startPos, reader.lineNumber(startPos), reader.columnNumber(startPos)) val endPosition: Range.Position = Range.Position(endPos, reader.lineNumber(endPos), reader.columnNumber(endPos)) val range = Range(startPosition, endPosition) node.attributes().userData(if (isStart) SharedConstants.RangeKey else SharedConstants.EndRangeKey, range) From e8e9e8a8549da35987fe278f7319df512ed1e590 Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 10:17:59 +0500 Subject: [PATCH 05/10] Minor tweaks to LeafNode --- .../ksoup/parser/XmlTreeBuilderTest.kt | 8 +- .../src/com/fleeksoft/ksoup/nodes/Comment.kt | 31 ++--- .../src/com/fleeksoft/ksoup/nodes/DataNode.kt | 25 +--- .../com/fleeksoft/ksoup/nodes/DocumentType.kt | 48 ++++---- .../src/com/fleeksoft/ksoup/nodes/Entities.kt | 112 +++++++++--------- .../src/com/fleeksoft/ksoup/nodes/LeafNode.kt | 16 ++- ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt | 56 ++++----- .../src/com/fleeksoft/ksoup/nodes/TextNode.kt | 11 +- .../fleeksoft/ksoup/nodes/XmlDeclaration.kt | 19 +-- 9 files changed, 129 insertions(+), 197 deletions(-) diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/XmlTreeBuilderTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/XmlTreeBuilderTest.kt index 9c105f00..d6e65b42 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/XmlTreeBuilderTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/XmlTreeBuilderTest.kt @@ -153,8 +153,7 @@ class XmlTreeBuilderTest { @Test fun testParseDeclarationWithoutAttributes() { - val xml = - "\n" + val xml = "\n" val doc = Ksoup.parse(html = xml, baseUri = "", parser = Parser.xmlParser()) val decl = doc.childNode(2) as XmlDeclaration assertEquals("myProcessingInstruction", decl.name()) @@ -257,10 +256,7 @@ class XmlTreeBuilderTest { fun handlesLTinScript() { val html = "" val doc = Ksoup.parse(html = html, baseUri = "", parser = Parser.xmlParser()) - assertEquals( - "", - doc.html(), - ) // converted from pseudo xmldecl to comment + assertEquals("", doc.html()) // converted from pseudo xmldecl to comment } @Test diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Comment.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Comment.kt index 28ce64a7..0d11b949 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Comment.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Comment.kt @@ -8,15 +8,7 @@ import com.fleeksoft.ksoup.parser.Parser * * @author Sabeeh, fleeksoft@gmail.com */ -public class Comment(data: String) : LeafNode() { - /** - * Create a new comment node. - * @param data The contents of the comment - */ - init { - value = data - } - +public class Comment(data: String) : LeafNode(data) { override fun nodeName(): String { return "#comment" } @@ -34,10 +26,10 @@ public class Comment(data: String) : LeafNode() { out: Document.OutputSettings, ) { if (out.prettyPrint() && ( - isEffectivelyFirst() && _parentNode is Element && - (_parentNode as Element).tag() - .formatAsBlock() || out.outline() - ) + isEffectivelyFirst() && _parentNode is Element && + (_parentNode as Element).tag() + .formatAsBlock() || out.outline() + ) ) { indent(accum, depth, out) } @@ -47,15 +39,7 @@ public class Comment(data: String) : LeafNode() { .append("-->") } - override fun outerHtmlTail( - accum: Appendable, - depth: Int, - out: Document.OutputSettings, - ) { - } - - override fun toString(): String { - return outerHtml() + override fun outerHtmlTail(accum: Appendable, depth: Int, out: Document.OutputSettings) { } override fun createClone(): Node { @@ -84,8 +68,7 @@ public class Comment(data: String) : LeafNode() { if (isXmlDeclarationData(declContent)) return null val fragment = "<$declContent>" // use the HTML parser not XML, so we don't get into a recursive XML Declaration on contrived data - val doc: Document = - Parser.htmlParser().settings(ParseSettings.preserveCase).parseInput(fragment, baseUri()) + val doc: Document = Parser.htmlParser().settings(ParseSettings.preserveCase).parseInput(fragment, baseUri()) if (doc.body().childrenSize() > 0) { val el: Element = doc.body().child(0) decl = diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/DataNode.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/DataNode.kt index 3f7d781c..e41fa8de 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/DataNode.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/DataNode.kt @@ -6,10 +6,7 @@ package com.fleeksoft.ksoup.nodes * * @param data data contents */ -public class DataNode(data: String) : LeafNode() { - init { - value = data - } +public class DataNode(data: String) : LeafNode(data) { override fun nodeName(): String { return "#data" @@ -19,7 +16,7 @@ public class DataNode(data: String) : LeafNode() { /** * Set the data contents of this node. - * @param data unencoded data + * @param data un-encoded data * @return this node, for chaining */ public fun setWholeData(data: String?): DataNode { @@ -27,13 +24,9 @@ public class DataNode(data: String) : LeafNode() { return this } - public override fun outerHtmlHead( - accum: Appendable, - depth: Int, - out: Document.OutputSettings, - ) { + public override fun outerHtmlHead(accum: Appendable, depth: Int, out: Document.OutputSettings) { /* For XML output, escape the DataNode in a CData section. The data may contain pseudo-CData content if it was - parsed as HTML, so don't double up Cdata. Output in polygot HTML / XHTML / XML format. */ + parsed as HTML, so don't double up Cdata. Output in polyglot HTML / XHTML / XML format. */ val data = getWholeData() if (out.syntax() === Document.OutputSettings.Syntax.xml && !data.contains(" 0 && out.prettyPrint()) accum.append('\n') - if (out.syntax() === Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) { + + if (out.syntax() == Syntax.html && !has(PublicId) && !has(SystemId)) { // looks like a html5 doctype, go lowercase for aesthetics accum.append("') } @@ -99,9 +98,10 @@ public class DocumentType(private val name: String, private val publicId: String // todo needs a bit of a chunky cleanup. this level of detail isn't needed public const val PUBLIC_KEY: String = "PUBLIC" public const val SYSTEM_KEY: String = "SYSTEM" - private const val NAME: String = "name" - private const val PUB_SYS_KEY: String = "pubSysKey" // PUBLIC or SYSTEM - private const val PUBLIC_ID: String = "publicId" - private const val SYSTEM_ID: String = "systemId" + private const val Name: String = "#doctype" + private const val PubSysKey: String = "pubSysKey" // PUBLIC or SYSTEM + private const val PublicId: String = "publicId" + private const val SystemId: String = "systemId" + // todo: quirk mode from publicId and systemId } } diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt index 6ff652c6..72a6ab68 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt @@ -4,7 +4,9 @@ package com.fleeksoft.ksoup.nodes import com.fleeksoft.ksoup.helper.Validate import com.fleeksoft.ksoup.internal.StringUtil +import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.nodes.Document.OutputSettings +import com.fleeksoft.ksoup.nodes.Document.OutputSettings.Syntax import com.fleeksoft.ksoup.nodes.Entities.EscapeMode.base import com.fleeksoft.ksoup.nodes.Entities.EscapeMode.extended import com.fleeksoft.ksoup.parser.CharacterReader @@ -13,7 +15,7 @@ import com.fleeksoft.ksoup.ported.Character import com.fleeksoft.ksoup.ported.ThreadLocal import com.fleeksoft.ksoup.ported.exception.IOException import com.fleeksoft.ksoup.ported.exception.SerializationException -import com.fleeksoft.ksoup.io.Charset +import com.fleeksoft.ksoup.ported.io.Charsets import de.cketti.codepoints.deluxe.CodePoint import de.cketti.codepoints.deluxe.codePointAt @@ -96,60 +98,55 @@ public object Entities { /** HTML escape an input string. That is, {@code <} is returned as {@code <}. The escaped string is suitable for use both in attributes and in text data. - @param string the un-escaped string to escape + @param data the un-escaped string to escape @param out the output settings to use. This configures the character set escaped against (that is, if a character is supported in the output character set, it doesn't have to be escaped), and also HTML or XML settings. @return the escaped string */ - public fun escape( - string: String?, - out: OutputSettings?, - ): String { - if (string == null) return "" - val accum: StringBuilder = StringUtil.borrowBuilder() + public fun escape(data: String?, out: OutputSettings): String { + return escapeString(data, out.escapeMode(), out.syntax(), out.charset()) + } + + /** + HTML escape an input string, using the default settings (UTF-8, base entities, HTML syntax). That is, {@code <} is + returned as {@code <}. The escaped string is suitable for use both in attributes and in text data. + @param data the un-escaped string to escape + @return the escaped string + @see #escape(String, OutputSettings) + */ + public fun escape(data: String?): String { + return escapeString(data, base, Syntax.html, Charsets.UTF8) + } + + public fun escapeString(data: String?, escapeMode: EscapeMode, syntax: Syntax, charset: Charset): String { + if (data == null) return "" + val accum = StringUtil.borrowBuilder() try { - escape( - accum, - string, - out, - ForText or ForAttribute - ) // for text and for attribute; preserve whitespaces + doEscape(data, accum, escapeMode, syntax, charset, ForText or ForAttribute) } catch (e: IOException) { throw SerializationException(e) // doesn't happen } + return StringUtil.releaseBuilder(accum) } - /** - * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as - * {@code <}. The escaped string is suitable for use both in attributes and in text data. - * - * @param string the un-escaped string to escape - * @return the escaped string - * @see #escape(String, OutputSettings) - */ - public fun escape(string: String?): String { - if (DefaultOutput == null) DefaultOutput = OutputSettings() - return escape(string, DefaultOutput) + fun escape(accum: Appendable, data: String, out: OutputSettings, options: Int) { + doEscape(data, accum, out.escapeMode(), out.syntax(), out.charset(), options) } - private var DefaultOutput: OutputSettings? = - null // lazy-init, to break circular dependency with OutputSettings - // this method does a lot, but other breakups cause rescanning and stringbuilder generations - public fun escape(accum: Appendable, string: String, out: OutputSettings?, options: Int) { - val escapeMode: EscapeMode = out!!.escapeMode() - val encoder: Charset = out.encoder() - val coreCharset: CoreCharset = out.coreCharset // init in out.prepareEncoder() - val length = string.length + private fun doEscape(data: String, accum: Appendable, mode: EscapeMode, syntax: Syntax, charset: Charset, options: Int) { + val fallback: Charset = charset + val coreCharset: CoreCharset = CoreCharset.byName(charset.name) + val length = data.length var codePoint: CodePoint var lastWasWhite = false var reachedNonWhite = false var skipped = false var offset = 0 while (offset < length) { - codePoint = string.codePointAt(offset) + codePoint = data.codePointAt(offset) if (options and Normalise != 0) { if (StringUtil.isWhitespace(codePoint.value)) { @@ -180,18 +177,24 @@ public object Entities { } } - appendEscaped(accum, out, options, codePoint, escapeMode, encoder, coreCharset) + appendEscaped(codePoint, accum, options, mode, syntax, coreCharset, fallback) // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): offset += codePoint.charCount } } private fun appendEscaped( - accum: Appendable, out: OutputSettings, options: Int, - codePoint: CodePoint, escapeMode: EscapeMode, encoder: Charset, coreCharset: CoreCharset + codePoint: CodePoint, + accum: Appendable, + options: Int, + escapeMode: EscapeMode, + syntax: Syntax, + coreCharset: CoreCharset, + fallback: Charset ) { + // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]): val c = codePoint.value.toChar() - if (codePoint.value < Character.MIN_SUPPLEMENTARY_CODE_POINT || encoder.name.uppercase() == "ASCII" || encoder.name.uppercase() == "US-ASCII" || encoder.name.uppercase() == "ISO-8859-1") { + if (codePoint.value < Character.MIN_SUPPLEMENTARY_CODE_POINT || fallback.name.uppercase() == "ASCII" || fallback.name.uppercase() == "US-ASCII" || fallback.name.uppercase() == "ISO-8859-1") { when { c == '&' -> { accum.append("&") @@ -209,7 +212,7 @@ public object Entities { c == '<' -> { // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val - appendLt(accum, options, escapeMode, out) + appendLt(accum, options, escapeMode, syntax) /*if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() === OutputSettings.Syntax.xml) { accum.append("<") } else { @@ -238,19 +241,15 @@ public object Entities { } else -> { - if (c.code < 0x20 || !canEncode(coreCharset, c, encoder)) { - appendEncoded( - accum, - escapeMode, - codePoint.value, - ) + if (c.code < 0x20 || !canEncode(coreCharset, c, fallback)) { + appendEncoded(accum, escapeMode, codePoint.value) } else { accum.append(c) } } } } else { - if (encoder.canEncode(codePoint.toChars().concatToString())) { + if (fallback.canEncode(codePoint.toChars().concatToString())) { val chars = charBuf.get() val len = codePoint.toChars(chars, 0) if (accum is StringBuilder) { @@ -271,10 +270,12 @@ public object Entities { else accum.append(" ") } - private fun appendLt(accum: Appendable, options: Int, escapeMode: EscapeMode, out: OutputSettings) { - if ((options and ForText) != 0 || (escapeMode == EscapeMode.xhtml) || (out.syntax() === OutputSettings.Syntax.xml)) { + private fun appendLt(accum: Appendable, options: Int, escapeMode: EscapeMode, syntax: Syntax) { + if ((options and ForText) != 0 || (escapeMode == EscapeMode.xhtml) || (syntax === Syntax.xml)) { accum.append("<") - } else accum.append('<') + } else { + accum.append('<') // no need to escape < when in an HTML attribute + } } private fun appendApos(accum: Appendable, options: Int, escapeMode: EscapeMode) { @@ -297,11 +298,8 @@ public object Entities { accum.append('&').append(name).append(';') } else { accum.append("&#x") - .append( - codePoint.toHexString( - HexFormat { number { removeLeadingZeros = true } }, - ), - ).append(';') + .append(codePoint.toHexString(HexFormat { number { removeLeadingZeros = true } })) + .append(';') } } @@ -354,11 +352,7 @@ public object Entities { } } - private fun load( - e: EscapeMode, - pointsData: String, - size: Int, - ) { + private fun load(e: EscapeMode, pointsData: String, size: Int) { e.nameKeys = arrayOfNulls(size) e.codeVals = IntArray(size) e.codeKeys = IntArray(size) diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/LeafNode.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/LeafNode.kt index c268ee9d..d9552c95 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/LeafNode.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/LeafNode.kt @@ -3,10 +3,18 @@ package com.fleeksoft.ksoup.nodes /** A node that does not hold any children. E.g.: {@link TextNode}, {@link DataNode}, {@link Comment}. */ -public abstract class LeafNode : Node() { +public abstract class LeafNode : Node { // either a string value, or an attribute map (in the rare case multiple attributes are set) internal var value: Any? = null + constructor() { + value = "" + } + + constructor(coreValue: String) { + value = coreValue + } + override fun hasAttributes(): Boolean { return value is Attributes } @@ -18,10 +26,10 @@ public abstract class LeafNode : Node() { private fun ensureAttributes() { if (!hasAttributes()) { - val coreValue = value + val coreValue = value as? String val attributes = Attributes() value = attributes - if (coreValue != null) attributes.put(nodeName(), coreValue as String?) + attributes.put(nodeName(), coreValue) } } @@ -70,7 +78,7 @@ public abstract class LeafNode : Node() { } override fun baseUri(): String { - return if (hasParent()) parent()!!.baseUri() else "" + return if (_parentNode != null) _parentNode!!.baseUri() else "" } protected override fun doSetBaseUri(baseUri: String?) { diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt index 9f295721..67f4b17b 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt @@ -1,10 +1,12 @@ package com.fleeksoft.ksoup.nodes -import com.fleeksoft.ksoup.ported.exception.SerializationException import com.fleeksoft.ksoup.helper.Validate import com.fleeksoft.ksoup.internal.StringUtil -import com.fleeksoft.ksoup.ported.* +import com.fleeksoft.ksoup.ported.Consumer +import com.fleeksoft.ksoup.ported.KCloneable +import com.fleeksoft.ksoup.ported.LinkedList import com.fleeksoft.ksoup.ported.exception.IOException +import com.fleeksoft.ksoup.ported.exception.SerializationException import com.fleeksoft.ksoup.select.NodeFilter import com.fleeksoft.ksoup.select.NodeTraversor import com.fleeksoft.ksoup.select.NodeVisitor @@ -392,11 +394,9 @@ public abstract class Node protected constructor() : KCloneable { html: String, ) { Validate.notNull(_parentNode) - val context: Element? = if (parent() is Element) parent() as Element? else null - val nodes: List = - NodeUtils.parser(this) - .parseFragmentInput(html, context, baseUri()) - _parentNode!!.addChildren(index, *nodes.toTypedArray()) + val context: Element? = if (_parentNode is Element) _parentNode as Element? else null + val nodes: List = NodeUtils.parser(this).parseFragmentInput(html, context, baseUri()) + _parentNode?.addChildren(index, *nodes.toTypedArray()) } /** @@ -462,16 +462,6 @@ public abstract class Node protected constructor() : KCloneable { return firstChild } - private fun getDeepChild(el: Element): Element { - var resultEl = el - var child = resultEl.firstElementChild() - while (child != null) { - resultEl = child - child = child.firstElementChild() - } - return resultEl - } - internal open fun nodelistChanged() { // Element overrides this to clear its shadow children elements } @@ -880,22 +870,9 @@ public abstract class Node protected constructor() : KCloneable { return clone } - private class OuterHtmlVisitor( - accum: Appendable, - out: Document.OutputSettings, - ) : NodeVisitor { - private val accum: Appendable - private val out: Document.OutputSettings - - init { - this.accum = accum - this.out = out - } + private class OuterHtmlVisitor(private val accum: Appendable, private val out: Document.OutputSettings) : NodeVisitor { - override fun head( - node: Node, - depth: Int, - ) { + override fun head(node: Node, depth: Int) { try { node.outerHtmlHead(accum, depth, out) } catch (exception: IOException) { @@ -903,10 +880,7 @@ public abstract class Node protected constructor() : KCloneable { } } - override fun tail( - node: Node, - depth: Int, - ) { + override fun tail(node: Node, depth: Int) { if (node.nodeName() != "#text") { // saves a void hit. try { node.outerHtmlTail(accum, depth, out) @@ -920,5 +894,15 @@ public abstract class Node protected constructor() : KCloneable { public companion object { public val EmptyNodes: MutableList = mutableListOf() public const val EmptyString: String = "" + + private fun getDeepChild(el: Element): Element { + var resultEl = el + var child = resultEl.firstElementChild() + while (child != null) { + resultEl = child + child = child.firstElementChild() + } + return resultEl + } } } diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/TextNode.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/TextNode.kt index 2cbb963f..4e881365 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/TextNode.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/TextNode.kt @@ -8,16 +8,7 @@ import com.fleeksoft.ksoup.internal.StringUtil * * @author Sabeeh, fleeksoft@gmail.com */ -public open class TextNode(text: String) : LeafNode() { - /** - * Create a new TextNode representing the supplied (unencoded) text). - * - * @param text raw text - * @see .createFromEncoded - */ - init { - value = text - } +public open class TextNode(text: String) : LeafNode(text) { override fun nodeName(): String { return "#text" diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/XmlDeclaration.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/XmlDeclaration.kt index d4d4147c..8f84b849 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/XmlDeclaration.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/XmlDeclaration.kt @@ -1,25 +1,16 @@ package com.fleeksoft.ksoup.nodes -import com.fleeksoft.ksoup.ported.exception.SerializationException import com.fleeksoft.ksoup.internal.StringUtil import com.fleeksoft.ksoup.ported.exception.IOException +import com.fleeksoft.ksoup.ported.exception.SerializationException /** * An XML Declaration. */ -public class XmlDeclaration(name: String, isProcessingInstruction: Boolean) : LeafNode() { - // todo this impl isn't really right, the data shouldn't be attributes, just a run of text after the name - private val isProcessingInstruction: Boolean - - /** - * Create a new XML declaration - * @param name of declaration - * @param isProcessingInstruction is processing instruction - */ - init { - value = name - this.isProcessingInstruction = isProcessingInstruction - } +public class XmlDeclaration( + name: String, + private val isProcessingInstruction: Boolean // Date: Fri, 23 Aug 2024 10:19:16 +0500 Subject: [PATCH 06/10] Add isJsOrWasm flag --- .../test/com/fleeksoft/ksoup/PlatformTest.kt | 5 +- .../test/com/fleeksoft/ksoup/TestHelper.kt | 3 +- .../fleeksoft/ksoup/helper/DataUtilTest.kt | 54 +++++++++---------- .../fleeksoft/ksoup/integration/ParseTest.kt | 4 +- .../fleeksoft/ksoup/issues/GithubIssue19.kt | 2 +- .../com/fleeksoft/ksoup/nodes/DocumentTest.kt | 4 +- .../com/fleeksoft/ksoup/nodes/ElementIT.kt | 8 +-- .../com/fleeksoft/ksoup/parser/ParserIT.kt | 2 +- ksoup/src/com/fleeksoft/ksoup/PlatformExt.kt | 2 +- .../fleeksoft/ksoup/ported/CoreFunctions.kt | 4 +- 10 files changed, 42 insertions(+), 46 deletions(-) diff --git a/ksoup-test/test/com/fleeksoft/ksoup/PlatformTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/PlatformTest.kt index 3b18aa00..8d99eaf4 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/PlatformTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/PlatformTest.kt @@ -1,7 +1,6 @@ package com.fleeksoft.ksoup import com.fleeksoft.ksoup.ported.jsSupportedRegex -import kotlin.test.BeforeTest import kotlin.test.Test import kotlin.test.assertEquals @@ -10,9 +9,9 @@ class PlatformTest { @Test fun testJsSupportedRegex() { val regex2 = jsSupportedRegex("img[src~=(?i)\\.(png|jpe?g)]") - val expected2 = if (Platform.isJS()) """img[src~=\.(png|jpe?g)]""" else """img[src~=(?i)\.(png|jpe?g)]""" + val expected2 = if (Platform.isJsOrWasm()) """img[src~=\.(png|jpe?g)]""" else """img[src~=(?i)\.(png|jpe?g)]""" assertEquals(expected2, regex2.pattern) - if (Platform.isJS()) { + if (Platform.isJsOrWasm()) { assertEquals(RegexOption.IGNORE_CASE, regex2.options.first()) } } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt b/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt index fbc8b573..722919a7 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt @@ -1,6 +1,5 @@ package com.fleeksoft.ksoup -import com.fleeksoft.ksoup.io.FileSource import com.fleeksoft.ksoup.io.SourceReader import com.fleeksoft.ksoup.ported.openSourceReader import korlibs.io.compression.deflate.GZIP @@ -27,7 +26,7 @@ object TestHelper { fun getResourceAbsolutePath(resourceName: String): String { if (Platform.isWindows()) { return "../../../../testResources/$resourceName" - } else if (Platform.isJS()) { + } else if (Platform.isJsOrWasm()) { return "https://raw.githubusercontent.com/fleeksoft/ksoup/release/ksoup-test/testResources/$resourceName" } return "${BuildConfig.PROJECT_ROOT}/ksoup-test/testResources/$resourceName" diff --git a/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt index 22bc3ff5..834c35da 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt @@ -129,7 +129,7 @@ class DataUtilTest { @Test fun secondMetaElementWithContentTypeContainsCharsetParameter() { - if (Platform.isJS() || Platform.isApple() || Platform.isWindows()) { + if (Platform.isJsOrWasm() || Platform.isApple() || Platform.isWindows()) { // FIXME: euc-kr charset not supported return } @@ -167,38 +167,38 @@ class DataUtilTest { @Test fun supportsBOMinFiles() = runTest { - if (BuildConfig.isKotlinx && Platform.isJS()) { + if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { // FIXME: UTF-16 charset not supported return@runTest } var input = TestHelper.getResourceAbsolutePath("bomtests/bom_utf16be.html") var doc: Document = Ksoup.parseFile(filePath = input, baseUri = "http://example.com", charsetName = null) - assertTrue(doc.title().contains("UTF-16BE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-16BE") + assertContains(doc.text(), "가각갂갃간갅") input = TestHelper.getResourceAbsolutePath("bomtests/bom_utf16le.html") doc = Ksoup.parseFile(filePath = input, baseUri = "http://example.com", charsetName = null) - assertTrue(doc.title().contains("UTF-16LE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-16LE") + assertContains(doc.text(), "가각갂갃간갅") - if (Platform.isJS() || Platform.isWindows() || Platform.isLinux()) { + if (Platform.isJsOrWasm() || Platform.isWindows() || Platform.isLinux()) { // FIXME: UTF-32 charset not supported return@runTest } input = TestHelper.getResourceAbsolutePath("bomtests/bom_utf32be.html") doc = Ksoup.parseFile(filePath = input, baseUri = "http://example.com", charsetName = null) - assertTrue(doc.title().contains("UTF-32BE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-32BE") + assertContains(doc.text(), "가각갂갃간갅") input = TestHelper.getResourceAbsolutePath("bomtests/bom_utf32le.html") doc = Ksoup.parseFile(filePath = input, baseUri = "http://example.com", charsetName = null) - assertTrue(doc.title().contains("UTF-32LE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-32LE") + assertContains(doc.text(), "가각갂갃간갅") } @Test fun streamerSupportsBOMinFiles() = runTest { - if (BuildConfig.isKotlinx && Platform.isJS()) { + if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { // FIXME: UTF-16 charset not supported return@runTest } @@ -208,16 +208,16 @@ class DataUtilTest { var doc: Document = DataUtil.streamParser(sourceReader = source, baseUri = "http://example.com", charset = null, parser = parser) .complete() - assertTrue(doc.title().contains("UTF-16BE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-16BE") + assertContains(doc.text(), "가각갂갃간갅") source = TestHelper.readResource("bomtests/bom_utf16le.html") doc = DataUtil.streamParser(sourceReader = source, baseUri = "http://example.com", charset = null, parser = parser) .complete() - assertTrue(doc.title().contains("UTF-16LE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-16LE") + assertContains(doc.text(), "가각갂갃간갅") - if (Platform.isJS() || Platform.isWindows() || Platform.isLinux()) { + if (Platform.isJsOrWasm() || Platform.isWindows() || Platform.isLinux()) { // FIXME: UTF-32 charset not supported return@runTest } @@ -225,14 +225,14 @@ class DataUtilTest { source = TestHelper.readResource("bomtests/bom_utf32be.html") doc = DataUtil.streamParser(sourceReader = source, baseUri = "http://example.com", charset = null, parser = parser) .complete() - assertTrue(doc.title().contains("UTF-32BE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-32BE") + assertContains(doc.text(), "가각갂갃간갅") source = TestHelper.readResource("bomtests/bom_utf32le.html") doc = DataUtil.streamParser(sourceReader = source, baseUri = "http://example.com", charset = null, parser = parser) .complete() - assertTrue(doc.title().contains("UTF-32LE")) - assertTrue(doc.text().contains("가각갂갃간갅")) + assertContains(doc.title(), "UTF-32LE") + assertContains(doc.text(), "가각갂갃간갅") } @Test @@ -286,13 +286,11 @@ class DataUtilTest { @Test fun supportsXmlCharsetDeclaration() { val encoding = "iso-8859-1" - val soup = - ( - "" + - "" + - "Hellö Wörld!" - ) - .toByteArray(Charsets.forName(encoding)).openSourceReader() + val soup = ( + "" + + "" + + "Hellö Wörld!" + ).toByteArray(Charsets.forName(encoding)).openSourceReader() val doc: Document = Ksoup.parse(soup, baseUri = "", charsetName = null) assertEquals("Hellö Wörld!", doc.body().text()) } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt index a7f2b5dd..4337aa70 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt @@ -19,7 +19,7 @@ class ParseTest { @Test fun testHtml5Charset() = runTest { - if (Platform.isApple() || Platform.isWindows() || (BuildConfig.isKotlinx && Platform.isJS())) { + if (Platform.isApple() || Platform.isWindows() || (BuildConfig.isKotlinx && Platform.isJsOrWasm())) { // don't support gb2312 or gbk return@runTest } @@ -75,7 +75,7 @@ class ParseTest { @Test fun testLowercaseUtf8Charset() = runTest { val resourceName = "htmltests/lowercase-charset-test.html" - val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) { + val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { val source = TestHelper.readResource(resourceName) Ksoup.parse(sourceReader = source, baseUri = resourceName) } else { diff --git a/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssue19.kt b/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssue19.kt index 184b17ce..08ecee29 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssue19.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssue19.kt @@ -14,7 +14,7 @@ class GithubIssue19 { @Test fun testAttributeIncorrectMixCharsetIssue() = runTest { - if (Platform.isJS()) { + if (Platform.isJsOrWasm()) { // timeout issue return@runTest } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/DocumentTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/DocumentTest.kt index c7767372..8ea69772 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/DocumentTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/DocumentTest.kt @@ -150,7 +150,7 @@ class DocumentTest { fun testLocation() = runTest { // tests location vs base href val resourceName = "htmltests/basehref.html" - val doc: Document = if (BuildConfig.isKotlinx && Platform.isJS()) { + val doc: Document = if (BuildConfig.isKotlinx && Platform.isJsOrWasm()) { val source = TestHelper.readResource(resourceName) Ksoup.parse(sourceReader = source, baseUri = "http://example.com/", charsetName = "UTF-8") } else { @@ -468,7 +468,7 @@ class DocumentTest { @Test fun testShiftJisRoundtrip() { - if (Platform.isJS()) { + if (Platform.isJsOrWasm()) { // Shift_JIS not supported return } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementIT.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementIT.kt index 0c30d877..9f0d1b60 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementIT.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/ElementIT.kt @@ -42,7 +42,7 @@ class ElementIT { @Test fun testFastReparentExistingContent() { - if (Platform.isJS() && BuildConfig.isGithubActions) { + if (Platform.isJsOrWasm() && BuildConfig.isGithubActions) { // failing on github action return } @@ -81,7 +81,7 @@ class ElementIT { // These overflow tests take a couple seconds to run, so are in the slow tests @Test fun hasTextNoOverflow() { - if (Platform.isJS()) { + if (Platform.isJsOrWasm()) { // FIXME: timeout error for js return } @@ -100,7 +100,7 @@ class ElementIT { @Test fun dataNoOverflow() { - if (Platform.isJS()) { + if (Platform.isJsOrWasm()) { // FIXME: timeout error for js return } @@ -120,7 +120,7 @@ class ElementIT { @Test fun parentsNoOverflow() { - if (Platform.isJS()) { + if (Platform.isJsOrWasm()) { // FIXME: timeout error for js return } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/ParserIT.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/ParserIT.kt index 17983fb7..2aab36fc 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/ParserIT.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/ParserIT.kt @@ -25,7 +25,7 @@ class ParserIT { @Test fun handlesDeepStack() { - if (Platform.isJS() && BuildConfig.isGithubActions) { + if (Platform.isJsOrWasm() && BuildConfig.isGithubActions) { // The GitHub action is taking too much time. return } diff --git a/ksoup/src/com/fleeksoft/ksoup/PlatformExt.kt b/ksoup/src/com/fleeksoft/ksoup/PlatformExt.kt index e4d37869..cc364ae2 100644 --- a/ksoup/src/com/fleeksoft/ksoup/PlatformExt.kt +++ b/ksoup/src/com/fleeksoft/ksoup/PlatformExt.kt @@ -10,7 +10,7 @@ public fun Platform.isJvmOrAndroid(): Boolean = this.current == PlatformType.JVM public fun Platform.isJvm(): Boolean = this.current == PlatformType.JVM -public fun Platform.isJS(): Boolean = this.current == PlatformType.JS || this.current == PlatformType.WASM_JS +public fun Platform.isJsOrWasm(): Boolean = this.current == PlatformType.JS || this.current == PlatformType.WASM_JS public fun Platform.isWasmJs(): Boolean = this.current == PlatformType.WASM_JS diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/CoreFunctions.kt b/ksoup/src/com/fleeksoft/ksoup/ported/CoreFunctions.kt index faa77bf7..c82057c6 100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/CoreFunctions.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/CoreFunctions.kt @@ -1,12 +1,12 @@ package com.fleeksoft.ksoup.ported import com.fleeksoft.ksoup.Platform -import com.fleeksoft.ksoup.isJS +import com.fleeksoft.ksoup.isJsOrWasm // js don't support ?i public fun jsSupportedRegex(regex: String): Regex { - return if (Platform.isJS() && regex.contains("(?i)")) { + return if (Platform.isJsOrWasm() && regex.contains("(?i)")) { Regex(regex.replace("(?i)", ""), RegexOption.IGNORE_CASE) } else { Regex(regex) From f105979075a8da775a0b09f42000356b5a59dcc9 Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 10:19:48 +0500 Subject: [PATCH 07/10] Add jsAndWasm target --- ksoup-network/module.yaml | 3 ++- .../com/fleeksoft/ksoup/network/ProvideHttpClientEngineJs.kt | 0 2 files changed, 2 insertions(+), 1 deletion(-) rename ksoup-network/{src@js => src@jsAndWasm}/com/fleeksoft/ksoup/network/ProvideHttpClientEngineJs.kt (100%) diff --git a/ksoup-network/module.yaml b/ksoup-network/module.yaml index 82491b9a..65bff510 100644 --- a/ksoup-network/module.yaml +++ b/ksoup-network/module.yaml @@ -7,6 +7,7 @@ apply: [ ../common.module-template.yaml ] aliases: - jvmAndAndroid: [ jvm, android ] - concurrent: [ jvm, android, linuxX64, linuxArm64, tvosArm64, tvosX64, tvosSimulatorArm64, macosX64, macosArm64, iosArm64, iosSimulatorArm64, iosX64, mingwX64 ] + - jsAndWasm: [js, wasm] repositories: - mavenLocal @@ -23,7 +24,7 @@ dependencies@jvmAndAndroid: dependencies@apple: - $libs.ktor.client.darwin -dependencies@js: +dependencies@jsAndWasm: - $libs.ktor.client.js dependencies@mingw: diff --git a/ksoup-network/src@js/com/fleeksoft/ksoup/network/ProvideHttpClientEngineJs.kt b/ksoup-network/src@jsAndWasm/com/fleeksoft/ksoup/network/ProvideHttpClientEngineJs.kt similarity index 100% rename from ksoup-network/src@js/com/fleeksoft/ksoup/network/ProvideHttpClientEngineJs.kt rename to ksoup-network/src@jsAndWasm/com/fleeksoft/ksoup/network/ProvideHttpClientEngineJs.kt From cb07e07acba1b1fe134b5bcdaa19975fb04239ae Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 14:22:40 +0500 Subject: [PATCH 08/10] project scripts --- publishToLocal.sh | 9 ++++++++- runAllTests.sh | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100755 runAllTests.sh diff --git a/publishToLocal.sh b/publishToLocal.sh index 9f9643cc..f8a98841 100755 --- a/publishToLocal.sh +++ b/publishToLocal.sh @@ -1,3 +1,8 @@ +#!/bin/bash + +# Stop the script if any command fails +set -e + ./gradlew clean ./gradlew :ksoup-engine-common:publishToMavenLocal ./gradlew :ksoup-engine-kotlinx:publishToMavenLocal @@ -9,4 +14,6 @@ ./gradlew clean ./gradlew :ksoup:publishToMavenLocal -PisKorlibs=true -./gradlew :ksoup-network-korlibs:publishToMavenLocal -PisKorlibs=true \ No newline at end of file +./gradlew :ksoup-network-korlibs:publishToMavenLocal -PisKorlibs=true + +echo "Publishing completed successfully." \ No newline at end of file diff --git a/runAllTests.sh b/runAllTests.sh new file mode 100755 index 00000000..498307a0 --- /dev/null +++ b/runAllTests.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Stop the script if any command fails +set -e + +# Function to run tests for a specific configuration +run_tests() { + local isKorlibs=$1 + + echo "Running tests with isKorlibs=$isKorlibs..." + + ./gradlew clean + ./gradlew jvmTest testDebugUnitTest testReleaseUnitTest -PisKorlibs=$isKorlibs + ./gradlew iosX64Test iosSimulatorArm64Test macosX64Test macosArm64Test tvosX64Test tvosSimulatorArm64Test -PisKorlibs=$isKorlibs + ./gradlew jsTest wasmTest -PisKorlibs=$isKorlibs +} + +# Run tests for isKorlibs=false +run_tests false + +# Run tests for isKorlibs=true +run_tests true + +echo "All tests run successfully!" \ No newline at end of file From f54b8ba7a583ecd52ea293351a896415532c07d8 Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 14:23:01 +0500 Subject: [PATCH 09/10] bump amper dev version --- settings.gradle.kts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/settings.gradle.kts b/settings.gradle.kts index ca1a2c7e..6fdbbc7d 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -10,7 +10,7 @@ pluginManagement { } plugins { - id("org.jetbrains.amper.settings.plugin").version("0.5.0-dev-940") + id("org.jetbrains.amper.settings.plugin").version("0.5.0-dev-947") } @@ -19,4 +19,5 @@ include("ksoup-engine-common") include("ksoup-engine-kotlinx", "ksoup-network") include("ksoup-engine-korlibs", "ksoup-network-korlibs") include("ksoup-test") -//include("sample:shared", "sample:android", "sample:desktop", "sample:ios") \ No newline at end of file +include("sample:shared", "sample:desktop") +//include("sample:android", "sample:ios") \ No newline at end of file From d9345a13c21607846c28cc01da733a12cf75275e Mon Sep 17 00:00:00 2001 From: sabeeh Date: Fri, 23 Aug 2024 14:24:16 +0500 Subject: [PATCH 10/10] update test resource fetch --- ksoup-test/module.yaml | 1 + .../test/com/fleeksoft/ksoup/TestHelper.kt | 65 ++++++++++--------- .../fleeksoft/ksoup/helper/DataUtilTest.kt | 12 ++-- .../fleeksoft/ksoup/integration/ParseTest.kt | 7 +- .../ksoup/issues/GithubIssuesTests.kt | 3 +- .../ksoup/parser/CharacterReaderTest.kt | 5 +- .../ksoup/parser/StreamParserTest.kt | 12 ++-- .../ksoup/PerformanceComparisonTest.kt | 4 +- sample/shared/module.yaml | 4 +- 9 files changed, 56 insertions(+), 57 deletions(-) diff --git a/ksoup-test/module.yaml b/ksoup-test/module.yaml index 46bbb985..1da14f68 100644 --- a/ksoup-test/module.yaml +++ b/ksoup-test/module.yaml @@ -13,6 +13,7 @@ repositories: test-dependencies: - ../ksoup - $libs.korlibs.io + - $libs.kotlinx.io - $libs.codepoints - $libs.kotlinx.coroutines.test - $libs.kotlinx.datetime diff --git a/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt b/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt index 722919a7..ad50aefb 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt @@ -4,27 +4,27 @@ import com.fleeksoft.ksoup.io.SourceReader import com.fleeksoft.ksoup.ported.openSourceReader import korlibs.io.compression.deflate.GZIP import korlibs.io.compression.uncompress -import korlibs.io.file.VfsFile -import korlibs.io.file.fullName -import korlibs.io.file.readAsSyncStream import korlibs.io.file.std.uniVfs -import korlibs.io.stream.readAll +import kotlinx.io.buffered +import kotlinx.io.files.Path +import kotlinx.io.files.SystemFileSystem +import kotlinx.io.readByteArray object TestHelper { - suspend fun readGzipResource(file: String): SourceReader { - return readGzipFile(getResourceAbsolutePath(file).uniVfs) + suspend fun readGzipResource(resource: String): SourceReader { + return readGzipFile(resource) } - suspend fun readResource(file: String): SourceReader { - if (file.endsWith(".gz") || file.endsWith(".z")) { - return readGzipResource(file) + suspend fun readResource(resource: String): SourceReader { + if (resource.endsWith(".gz") || resource.endsWith(".z")) { + return readGzipResource(resource) } - return readFile(getResourceAbsolutePath(file).uniVfs) + return readFile(resource) } - fun getResourceAbsolutePath(resourceName: String): String { - if (Platform.isWindows()) { + fun getResourceAbsolutePath(resourceName: String, absForWindows: Boolean = true): String { + if (Platform.isWindows() && !BuildConfig.isKotlinx && absForWindows) { return "../../../../testResources/$resourceName" } else if (Platform.isJsOrWasm()) { return "https://raw.githubusercontent.com/fleeksoft/ksoup/release/ksoup-test/testResources/$resourceName" @@ -32,33 +32,40 @@ object TestHelper { return "${BuildConfig.PROJECT_ROOT}/ksoup-test/testResources/$resourceName" } - suspend fun getFileAsString(file: VfsFile): String { - val bytes: ByteArray = if (file.fullName.endsWith(".gz")) { - readGzipFile(file).readAllBytes() + suspend fun readResourceAsString(resourceName: String): String { + val bytes: ByteArray = if (resourceName.endsWith(".gz")) { + readGzipFile(resourceName).readAllBytes() } else { - readFile(file).readAllBytes() + readFile(resourceName).readAllBytes() } return bytes.decodeToString() } - suspend fun resourceFilePathToStream(path: String): SourceReader { - val file = this.getResourceAbsolutePath(path).uniVfs - return pathToStream(file) - } - - suspend fun pathToStream(file: VfsFile): SourceReader { - return if (file.fullName.endsWith(".gz") || file.fullName.endsWith(".z")) { - readGzipFile(file) + suspend fun resourceFilePathToStream(resource: String): SourceReader { + return if (resource.endsWith(".gz") || resource.endsWith(".z")) { + readGzipFile(resource) } else { - readFile(file) + readFile(resource) } } - suspend fun readFile(file: VfsFile): SourceReader { - return file.readAll().openSourceReader() + private suspend fun readFile(resource: String): SourceReader { + val abs = getResourceAbsolutePath(resource, absForWindows = false) + val bytes = if (Platform.isJsOrWasm()) { + abs.uniVfs.readAll() + } else { + SystemFileSystem.source(Path(abs)).buffered().readByteArray() + } + return bytes.openSourceReader() } - suspend fun readGzipFile(file: VfsFile): SourceReader { - return file.readAsSyncStream().readAll().uncompress(GZIP).openSourceReader() + private suspend fun readGzipFile(resource: String): SourceReader { + val abs = getResourceAbsolutePath(resource, absForWindows = false) + val bytes = if (Platform.isJsOrWasm()) { + abs.uniVfs.readAll() + } else { + SystemFileSystem.source(Path(abs)).buffered().readByteArray() + } + return bytes.uncompress(GZIP).openSourceReader() } } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt index 834c35da..33533359 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt @@ -8,7 +8,6 @@ import com.fleeksoft.ksoup.ported.io.Charsets import com.fleeksoft.ksoup.ported.openSourceReader import com.fleeksoft.ksoup.ported.toByteArray import com.fleeksoft.ksoup.ported.toSourceFile -import korlibs.io.file.std.uniVfs import kotlinx.coroutines.test.runTest import kotlin.test.* @@ -339,9 +338,10 @@ class DataUtilTest { // kotlinx module not support gzip return@runTest } - val resourceFile = TestHelper.getResourceAbsolutePath("htmltests/large.html.gz") + val resourceName = "htmltests/large.html.gz" + val resourceFile = TestHelper.getResourceAbsolutePath(resourceName) val inputFile = resourceFile.toSourceFile() - val input: String = TestHelper.getFileAsString(resourceFile.uniVfs) + val input: String = TestHelper.readResourceAsString(resourceName) val expected = Ksoup.parse(input, "https://example.com") val doc: Document = Ksoup.parseFile(inputFile, baseUri = "https://example.com", charsetName = null) @@ -352,8 +352,7 @@ class DataUtilTest { @Test fun testStringVsSourceReaderParse() = runTest { - val resourceFile = TestHelper.getResourceAbsolutePath("htmltests/large.html.gz") - val input: String = TestHelper.getFileAsString(resourceFile.uniVfs) + val input: String = TestHelper.readResourceAsString("htmltests/large.html.gz") val expected = Ksoup.parse(input, "https://example.com") val doc: Document = Ksoup.parse(sourceReader = input.openSourceReader(), baseUri = "https://example.com", charsetName = null) @@ -363,8 +362,7 @@ class DataUtilTest { @Test fun handlesUnlimitedRead() = runTest { - val inputFile: String = TestHelper.getResourceAbsolutePath("htmltests/large.html.gz") - val input: String = TestHelper.getFileAsString(inputFile.uniVfs) + val input: String = TestHelper.readResourceAsString("htmltests/large.html.gz") val byteBuffer: ByteArray = DataUtil.readToByteBuffer(input.openSourceReader(), 0) val read = byteBuffer.decodeToString() assertEquals(input, read) diff --git a/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt index 4337aa70..84bc81ec 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt @@ -6,7 +6,6 @@ import com.fleeksoft.ksoup.Ksoup.parseFile import com.fleeksoft.ksoup.nodes.Document import com.fleeksoft.ksoup.parser.Parser import com.fleeksoft.ksoup.ported.openSourceReader -import korlibs.io.file.std.uniVfs import kotlinx.coroutines.test.runTest import kotlin.test.* @@ -130,8 +129,7 @@ class ParseTest { @Test fun testWikiExpandedFromString() = runTest { - val input = TestHelper.getResourceAbsolutePath("htmltests/xwiki-edit.html.gz") - val html = TestHelper.getFileAsString(input.uniVfs) + val html = TestHelper.readResourceAsString("htmltests/xwiki-edit.html.gz") val doc = parse(html) assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()) val wantHtml = @@ -141,8 +139,7 @@ class ParseTest { @Test fun testWikiFromString() = runTest { - val input = TestHelper.getResourceAbsolutePath("htmltests/xwiki-1324.html.gz") - val html = TestHelper.getFileAsString(input.uniVfs) + val html = TestHelper.readResourceAsString("htmltests/xwiki-1324.html.gz") val doc = parse(html) assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()) val wantHtml = diff --git a/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssuesTests.kt b/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssuesTests.kt index 47222aaa..cc3d6d8c 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssuesTests.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/issues/GithubIssuesTests.kt @@ -2,7 +2,6 @@ package com.fleeksoft.ksoup.issues import com.fleeksoft.ksoup.Ksoup import com.fleeksoft.ksoup.TestHelper -import korlibs.io.file.std.uniVfs import kotlinx.coroutines.test.runTest import kotlin.test.Test import kotlin.test.assertEquals @@ -11,7 +10,7 @@ class GithubIssuesTests { @Test fun testIssue20DuplicateElements() = runTest { // /~https://github.com/fleeksoft/ksoup/issues/20 - Ksoup.parse(TestHelper.getFileAsString(TestHelper.getResourceAbsolutePath("htmltests/issue20.html.gz").uniVfs)) + Ksoup.parse(TestHelper.readResourceAsString("htmltests/issue20.html.gz")) // Ksoup.parseGetRequest("https://www.dm530w.org/") .apply { body().select("div[class=firs l]") diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt index 8291750f..f6e2ec4a 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/CharacterReaderTest.kt @@ -9,7 +9,6 @@ import com.fleeksoft.ksoup.ported.exception.UncheckedIOException import com.fleeksoft.ksoup.ported.io.Charsets import com.fleeksoft.ksoup.ported.io.StringReader import com.fleeksoft.ksoup.ported.toReader -import korlibs.io.file.std.uniVfs import korlibs.io.lang.substr import kotlinx.coroutines.test.runTest import kotlin.test.* @@ -561,9 +560,7 @@ class CharacterReaderTest { @Test fun lineNumbersAgreeWithEditor() = runTest { - val content: String = TestHelper.getFileAsString( - TestHelper.getResourceAbsolutePath("htmltests/large.html.gz").uniVfs - ) + val content: String = TestHelper.readResourceAsString("htmltests/large.html.gz") val reader = CharacterReader(content) reader.trackNewlines(true) val scan = "

VESTIBULUM" // near the end of the file diff --git a/ksoup-test/test/com/fleeksoft/ksoup/parser/StreamParserTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/parser/StreamParserTest.kt index 0ba5074b..ab1fa8a5 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/parser/StreamParserTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/parser/StreamParserTest.kt @@ -8,7 +8,6 @@ import com.fleeksoft.ksoup.nodes.Node import com.fleeksoft.ksoup.ported.io.Charsets import com.fleeksoft.ksoup.ported.toReader import com.fleeksoft.ksoup.select.Elements -import korlibs.io.file.std.uniVfs import kotlinx.coroutines.test.runTest import kotlin.test.* @@ -293,12 +292,11 @@ class StreamParserTest { } @Test - fun canParseFileReader() = runTest() { - val file = TestHelper.getResourceAbsolutePath("htmltests/large.html.gz").uniVfs - - - val reader = TestHelper.readGzipFile(file).toReader() - val streamer: StreamParser = StreamParser(Parser.htmlParser()).parse(reader, file.absolutePath) + fun canParseFileReader() = runTest { + val resourceName = "htmltests/large.html.gz" + val file = TestHelper.getResourceAbsolutePath(resourceName) + val reader = TestHelper.readGzipResource(resourceName).toReader() + val streamer: StreamParser = StreamParser(Parser.htmlParser()).parse(reader, file) var last: Element? = null var e: Element? diff --git a/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/PerformanceComparisonTest.kt b/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/PerformanceComparisonTest.kt index 95604358..8f27c424 100644 --- a/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/PerformanceComparisonTest.kt +++ b/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/PerformanceComparisonTest.kt @@ -2,6 +2,7 @@ package com.fleeksoft.ksoup import com.fleeksoft.ksoup.nodes.Document import korlibs.io.file.std.uniVfs +import kotlinx.coroutines.delay import kotlinx.coroutines.test.runTest import org.jsoup.Jsoup import kotlin.system.measureTimeMillis @@ -11,9 +12,10 @@ import kotlin.test.Test class PerformanceComparisonTest { @Test - @Ignore +// @Ignore fun compareWithJsoup() = runTest { + delay(8000) if (BuildConfig.isGithubActions) { return@runTest } diff --git a/sample/shared/module.yaml b/sample/shared/module.yaml index 00ab887c..86282fd3 100644 --- a/sample/shared/module.yaml +++ b/sample/shared/module.yaml @@ -5,8 +5,8 @@ product: dependencies: - $compose.foundation: exported - $compose.material3: exported - - ../../ksoup-korlibs - - ../../ksoup-network-korlibs + - ../../ksoup + - ../../ksoup-network dependencies@android: # Compose integration with Android activities