diff --git a/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt b/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt index 46887edb..c19e43fc 100644 --- a/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt +++ b/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt @@ -5,7 +5,6 @@ import com.fleeksoft.ksoup.io.FileSource import com.fleeksoft.ksoup.io.SourceReader interface KsoupEngine { - fun urlResolveOrNull(base: String, relUrl: String): String? fun openSourceReader(content: String, charset: Charset? = null): SourceReader diff --git a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index d39476c1..d6d17996 100644 --- a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -2,12 +2,8 @@ package com.fleeksoft.ksoup.engine import com.fleeksoft.ksoup.io.* import korlibs.io.lang.Charsets -import korlibs.io.net.URL object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URL.resolveOrNull(base = base, access = relUrl) - } override fun openSourceReader(content: String, charset: Charset?): SourceReader { return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) diff --git a/ksoup-engine-kotlinx/module.yaml b/ksoup-engine-kotlinx/module.yaml index 8ab31086..aac34660 100644 --- a/ksoup-engine-kotlinx/module.yaml +++ b/ksoup-engine-kotlinx/module.yaml @@ -10,5 +10,4 @@ aliases: dependencies: - ../ksoup-engine-common - $libs.kotlinx.io: exported - - $libs.ktor.io - - $libs.ktor.http \ No newline at end of file + - $libs.ktor.io \ No newline at end of file diff --git a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index 5b495d6c..ca5ba236 100644 --- a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -6,10 +6,6 @@ import io.ktor.utils.io.charsets.* object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URLUtil.urlResolveOrNull(base = base, relUrl = relUrl) - } - override fun openSourceReader(content: String, charset: Charset?): SourceReader { return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) } diff --git a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/URLUtil.kt b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/URLUtil.kt deleted file mode 100644 index 7ba490bf..00000000 --- a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/URLUtil.kt +++ /dev/null @@ -1,153 +0,0 @@ -package com.fleeksoft.ksoup.engine - -import io.ktor.http.* - -object URLUtil { - private fun String.isValidResourceUrl() = - this.startsWith("http", ignoreCase = true) || this.startsWith("ftp://", ignoreCase = true) || - this.startsWith("ftps://", ignoreCase = true) || - this.startsWith("file:/", ignoreCase = true) || - this.startsWith("//") - - private fun String.isAbsResource(): Boolean = Regex("\\w+:").containsMatchIn(this) - private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() - - private fun URLBuilder.appendRelativePath(relativePath: String): URLBuilder { - val segments = this.encodedPathSegments.toMutableList() - - val isLastSlash = segments.isNotEmpty() && segments.last() == "" - - // clear / its already joining with / - segments.removeAll { it.isEmpty() } - - val relativePathParts: MutableList = - if (relativePath.contains("?")) { - handleQueryParams(relativePath, "?") - } else if (relativePath.contains("#")) { - handleQueryParams(relativePath, "#") - } else { - relativePath.split("/").toMutableList() - } - - if (relativePathParts.size > 1 && relativePathParts.last() == "/") { - relativePathParts.removeLast() - } - - if (relativePathParts.isNotEmpty() && segments.isNotEmpty() && !isLastSlash && - relativePathParts.first().startsWith("?") - ) { - segments.add("${segments.removeLast()}${relativePathParts.removeFirst()}") - } - -// in files when file://etc/var/message + /var/message = file://var/message -// etc 
considered as host - - if (this.protocol == URLProtocol.createOrDefault("file")) { - if (relativePathParts.size > 1 && relativePathParts.firstOrNull() == "") { - segments.clear() - // remove first / space - relativePathParts.removeFirst() - this.host = relativePathParts.removeFirst() - } - } - - var isNewPathAdded = false - relativePathParts.forEachIndexed { index, path -> - when (path) { - "" -> { - if (index == 0) { - segments.clear() - } else { - segments.add("") - } - } - - "." -> { -// if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ - if (index == relativePathParts.size - 1 && segments[index] != "") { - segments.add("") - } else if (!isLastSlash && !isNewPathAdded) { -// isNewPathAdded use to avoid /b/c/d + g/./h here . will not remove last path because its already added new - segments.removeLastOrNull() - } - } - - ".." -> { - // Clean up last path if exist - if (index == 0 && !isLastSlash) { - segments.removeLastOrNull() - } - if (segments.isNotEmpty()) { - segments.removeLast() - } - } - - else -> { -// remove last trailing path if not query or fragment g.com/a/b to g.com/a - if (index == 0 && segments.isNotEmpty() && - !isLastSlash && !path.startsWith("?") && !path.startsWith("#") - ) { - segments.removeLast() - } - isNewPathAdded = true - segments.add(path) - } - } - } - this.encodedPathSegments = segments - - return this - } - - - private fun handleQueryParams( - relativePath: String, - separator: String, - ): MutableList { - val querySplit = relativePath.split(separator).toMutableList() - val firstQueryPath = querySplit.removeFirst() - val relativePathParts = firstQueryPath.split("/").toMutableList() - if (querySplit.isNotEmpty()) { - relativePathParts.add( - "${relativePathParts.removeLastOrNull() ?: ""}$separator${querySplit.joinToString(separator)}", - ) - } - return relativePathParts - } - - private fun resolve(base: Url, cleanedRelUrl: String): Url { - - if (cleanedRelUrl.isEmpty()) { - return base - } - - if 
(cleanedRelUrl.isValidResourceUrl()) { - return URLBuilder(cleanedRelUrl).apply { - if (cleanedRelUrl.startsWith("//")) { - protocol = base.protocol - } - }.build() - } - - return URLBuilder( - protocol = base.protocol, - host = base.host, - port = base.port, - pathSegments = base.pathSegments - ).appendRelativePath(cleanedRelUrl).build() - } - - fun urlResolveOrNull(base: String, relUrl: String): String? { - // mailto, tel, geo, about etc.. - if (relUrl.isAbsResource()) { - return relUrl - } - return if (base.isValidResourceUrl()) { - resolve(Url(base), relUrl).toString() - } else if (relUrl.isValidResourceUrl()) { - Url(relUrl).toString() - } else { - if (validUriScheme.matches(relUrl)) relUrl else null - } - } -} \ No newline at end of file diff --git a/ksoup-engine-ktor2/module.yaml b/ksoup-engine-ktor2/module.yaml index 0cfbffb3..d81e7a74 100644 --- a/ksoup-engine-ktor2/module.yaml +++ b/ksoup-engine-ktor2/module.yaml @@ -10,5 +10,4 @@ aliases: dependencies: - ../ksoup-engine-common - $libs.kotlinx.io: exported - - $libs.ktor2.io - - $libs.ktor2.http \ No newline at end of file + - $libs.ktor2.io \ No newline at end of file diff --git a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index 5b495d6c..ca5ba236 100644 --- a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -6,10 +6,6 @@ import io.ktor.utils.io.charsets.* object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URLUtil.urlResolveOrNull(base = base, relUrl = relUrl) - } - override fun openSourceReader(content: String, charset: Charset?): SourceReader { return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) } diff --git a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/URLUtil.kt b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/URLUtil.kt deleted file mode 100644 index 7ba490bf..00000000 --- a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/URLUtil.kt +++ /dev/null @@ -1,153 +0,0 @@ -package com.fleeksoft.ksoup.engine - -import io.ktor.http.* - -object URLUtil { - private fun String.isValidResourceUrl() = - this.startsWith("http", ignoreCase = true) || this.startsWith("ftp://", ignoreCase = true) || - this.startsWith("ftps://", ignoreCase = true) || - this.startsWith("file:/", ignoreCase = true) || - this.startsWith("//") - - private fun String.isAbsResource(): Boolean = Regex("\\w+:").containsMatchIn(this) - private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() - - private fun URLBuilder.appendRelativePath(relativePath: String): URLBuilder { - val segments = this.encodedPathSegments.toMutableList() - - val isLastSlash = segments.isNotEmpty() && segments.last() == "" - - // clear / its already joining with / - segments.removeAll { it.isEmpty() } - - val relativePathParts: MutableList = - if (relativePath.contains("?")) { - handleQueryParams(relativePath, "?") - } else if (relativePath.contains("#")) { - handleQueryParams(relativePath, "#") - } else { - relativePath.split("/").toMutableList() - } - - if (relativePathParts.size > 1 && relativePathParts.last() == "/") { - relativePathParts.removeLast() - } - - if (relativePathParts.isNotEmpty() && segments.isNotEmpty() && !isLastSlash && - relativePathParts.first().startsWith("?") - ) { - segments.add("${segments.removeLast()}${relativePathParts.removeFirst()}") - } - -// in files when file://etc/var/message + /var/message = file://var/message -// etc 
considered as host - - if (this.protocol == URLProtocol.createOrDefault("file")) { - if (relativePathParts.size > 1 && relativePathParts.firstOrNull() == "") { - segments.clear() - // remove first / space - relativePathParts.removeFirst() - this.host = relativePathParts.removeFirst() - } - } - - var isNewPathAdded = false - relativePathParts.forEachIndexed { index, path -> - when (path) { - "" -> { - if (index == 0) { - segments.clear() - } else { - segments.add("") - } - } - - "." -> { -// if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ - if (index == relativePathParts.size - 1 && segments[index] != "") { - segments.add("") - } else if (!isLastSlash && !isNewPathAdded) { -// isNewPathAdded use to avoid /b/c/d + g/./h here . will not remove last path because its already added new - segments.removeLastOrNull() - } - } - - ".." -> { - // Clean up last path if exist - if (index == 0 && !isLastSlash) { - segments.removeLastOrNull() - } - if (segments.isNotEmpty()) { - segments.removeLast() - } - } - - else -> { -// remove last trailing path if not query or fragment g.com/a/b to g.com/a - if (index == 0 && segments.isNotEmpty() && - !isLastSlash && !path.startsWith("?") && !path.startsWith("#") - ) { - segments.removeLast() - } - isNewPathAdded = true - segments.add(path) - } - } - } - this.encodedPathSegments = segments - - return this - } - - - private fun handleQueryParams( - relativePath: String, - separator: String, - ): MutableList { - val querySplit = relativePath.split(separator).toMutableList() - val firstQueryPath = querySplit.removeFirst() - val relativePathParts = firstQueryPath.split("/").toMutableList() - if (querySplit.isNotEmpty()) { - relativePathParts.add( - "${relativePathParts.removeLastOrNull() ?: ""}$separator${querySplit.joinToString(separator)}", - ) - } - return relativePathParts - } - - private fun resolve(base: Url, cleanedRelUrl: String): Url { - - if (cleanedRelUrl.isEmpty()) { - return base - } - - if 
(cleanedRelUrl.isValidResourceUrl()) { - return URLBuilder(cleanedRelUrl).apply { - if (cleanedRelUrl.startsWith("//")) { - protocol = base.protocol - } - }.build() - } - - return URLBuilder( - protocol = base.protocol, - host = base.host, - port = base.port, - pathSegments = base.pathSegments - ).appendRelativePath(cleanedRelUrl).build() - } - - fun urlResolveOrNull(base: String, relUrl: String): String? { - // mailto, tel, geo, about etc.. - if (relUrl.isAbsResource()) { - return relUrl - } - return if (base.isValidResourceUrl()) { - resolve(Url(base), relUrl).toString() - } else if (relUrl.isValidResourceUrl()) { - Url(relUrl).toString() - } else { - if (validUriScheme.matches(relUrl)) relUrl else null - } - } -} \ No newline at end of file diff --git a/ksoup-engine-okio/module.yaml b/ksoup-engine-okio/module.yaml index 1af9de96..dcecaa51 100644 --- a/ksoup-engine-okio/module.yaml +++ b/ksoup-engine-okio/module.yaml @@ -12,7 +12,6 @@ dependencies: - ../ksoup-engine-common - $libs.okio: exported - $libs.ktor2.io - - $libs.ktor2.http dependencies@js: - $libs.okio.nodefilesystem \ No newline at end of file diff --git a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index 5b495d6c..ca5ba236 100644 --- a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -6,10 +6,6 @@ import io.ktor.utils.io.charsets.* object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URLUtil.urlResolveOrNull(base = base, relUrl = relUrl) - } - override fun openSourceReader(content: String, charset: Charset?): SourceReader { return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) } diff --git a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/URLUtil.kt b/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/URLUtil.kt deleted file mode 100644 index 7ba490bf..00000000 --- a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/URLUtil.kt +++ /dev/null @@ -1,153 +0,0 @@ -package com.fleeksoft.ksoup.engine - -import io.ktor.http.* - -object URLUtil { - private fun String.isValidResourceUrl() = - this.startsWith("http", ignoreCase = true) || this.startsWith("ftp://", ignoreCase = true) || - this.startsWith("ftps://", ignoreCase = true) || - this.startsWith("file:/", ignoreCase = true) || - this.startsWith("//") - - private fun String.isAbsResource(): Boolean = Regex("\\w+:").containsMatchIn(this) - private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() - - private fun URLBuilder.appendRelativePath(relativePath: String): URLBuilder { - val segments = this.encodedPathSegments.toMutableList() - - val isLastSlash = segments.isNotEmpty() && segments.last() == "" - - // clear / its already joining with / - segments.removeAll { it.isEmpty() } - - val relativePathParts: MutableList = - if (relativePath.contains("?")) { - handleQueryParams(relativePath, "?") - } else if (relativePath.contains("#")) { - handleQueryParams(relativePath, "#") - } else { - relativePath.split("/").toMutableList() - } - - if (relativePathParts.size > 1 && relativePathParts.last() == "/") { - relativePathParts.removeLast() - } - - if (relativePathParts.isNotEmpty() && segments.isNotEmpty() && !isLastSlash && - relativePathParts.first().startsWith("?") - ) { - segments.add("${segments.removeLast()}${relativePathParts.removeFirst()}") - } - -// in files when file://etc/var/message + /var/message = file://var/message -// etc 
considered as host - - if (this.protocol == URLProtocol.createOrDefault("file")) { - if (relativePathParts.size > 1 && relativePathParts.firstOrNull() == "") { - segments.clear() - // remove first / space - relativePathParts.removeFirst() - this.host = relativePathParts.removeFirst() - } - } - - var isNewPathAdded = false - relativePathParts.forEachIndexed { index, path -> - when (path) { - "" -> { - if (index == 0) { - segments.clear() - } else { - segments.add("") - } - } - - "." -> { -// if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ - if (index == relativePathParts.size - 1 && segments[index] != "") { - segments.add("") - } else if (!isLastSlash && !isNewPathAdded) { -// isNewPathAdded use to avoid /b/c/d + g/./h here . will not remove last path because its already added new - segments.removeLastOrNull() - } - } - - ".." -> { - // Clean up last path if exist - if (index == 0 && !isLastSlash) { - segments.removeLastOrNull() - } - if (segments.isNotEmpty()) { - segments.removeLast() - } - } - - else -> { -// remove last trailing path if not query or fragment g.com/a/b to g.com/a - if (index == 0 && segments.isNotEmpty() && - !isLastSlash && !path.startsWith("?") && !path.startsWith("#") - ) { - segments.removeLast() - } - isNewPathAdded = true - segments.add(path) - } - } - } - this.encodedPathSegments = segments - - return this - } - - - private fun handleQueryParams( - relativePath: String, - separator: String, - ): MutableList { - val querySplit = relativePath.split(separator).toMutableList() - val firstQueryPath = querySplit.removeFirst() - val relativePathParts = firstQueryPath.split("/").toMutableList() - if (querySplit.isNotEmpty()) { - relativePathParts.add( - "${relativePathParts.removeLastOrNull() ?: ""}$separator${querySplit.joinToString(separator)}", - ) - } - return relativePathParts - } - - private fun resolve(base: Url, cleanedRelUrl: String): Url { - - if (cleanedRelUrl.isEmpty()) { - return base - } - - if 
(cleanedRelUrl.isValidResourceUrl()) { - return URLBuilder(cleanedRelUrl).apply { - if (cleanedRelUrl.startsWith("//")) { - protocol = base.protocol - } - }.build() - } - - return URLBuilder( - protocol = base.protocol, - host = base.host, - port = base.port, - pathSegments = base.pathSegments - ).appendRelativePath(cleanedRelUrl).build() - } - - fun urlResolveOrNull(base: String, relUrl: String): String? { - // mailto, tel, geo, about etc.. - if (relUrl.isAbsResource()) { - return relUrl - } - return if (base.isValidResourceUrl()) { - resolve(Url(base), relUrl).toString() - } else if (relUrl.isValidResourceUrl()) { - Url(relUrl).toString() - } else { - if (validUriScheme.matches(relUrl)) relUrl else null - } - } -} \ No newline at end of file diff --git a/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt index c5569d58..0120cba5 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt @@ -129,7 +129,7 @@ class StringUtilTest { assertEquals("https://example.com/one", StringUtil.resolve("https://example.com/one", "")) assertEquals("https://example.com/one/two.c", StringUtil.resolve("https://example.com/one/two/", "../two.c")) assertEquals("https://example.com/two.c", StringUtil.resolve("https://example.com/one/two", "../two.c")) -// assertEquals("", StringUtil.resolve("wrong", "also wrong")) + assertEquals("", StringUtil.resolve("wrong", "also wrong")) assertEquals("ftp://example.com/one", StringUtil.resolve("ftp://example.com/two/", "../one")) assertEquals("ftp://example.com/one/two.c", StringUtil.resolve("ftp://example.com/one/", "./two.c")) assertEquals("ftp://example.com/one/two.c", StringUtil.resolve("ftp://example.com/one/", "two.c")) diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt index f8d8bd9d..fec42335 100644 --- 
a/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt @@ -32,11 +32,7 @@ class NodeTest { assertEquals("", withBase.absUrl("noval")) val dodgyBase = Element(tag, "wtf://no-such-protocol/", attribs) assertEquals("http://bar/qux", dodgyBase.absUrl("absHref")) // base fails, but href good, so get that - if (BuildConfig.isKorlibs) { - assertEquals("wtf://no-such-protocol/foo", dodgyBase.absUrl("relHref")) // invalid protocol but still can be resolved - } else { - assertEquals("", dodgyBase.absUrl("relHref")) // base fails, only rel href, so return nothing - } + assertEquals("wtf://no-such-protocol/foo", dodgyBase.absUrl("relHref")) // invalid protocol but still can be resolved } @Test @@ -91,23 +87,18 @@ class NodeTest { @Test fun handleAbsOnFileUris() { - val doc = Ksoup.parse("One/a>Two", "file:///etc/") + val doc = Ksoup.parse("One/a>Two", "file:/etc/") val one = doc.select("a").first() - assertEquals("file:///etc/password", one!!.absUrl("href")) + assertEquals("file:/etc/password", one!!.absUrl("href")) val two = doc.select("a")[1] - if (BuildConfig.isKorlibs) { - assertEquals("file:///var/log/messages", two.absUrl("href")) - } else { - // fixme: in kotlinx its different behaviour - assertEquals("file://var/log/messages", two.absUrl("href")) - } + assertEquals("file:/var/log/messages", two.absUrl("href")) } @Test fun handleAbsOnLocalhostFileUris() { val doc = Ksoup.parse("One/a>Two", "file:///localhost/etc/") val one = doc.select("a").first()!! 
- assertEquals("file:///localhost/etc/password", one.absUrl("href")) + assertEquals("file://localhost/etc/password", one.absUrl("href")) } @Test diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt b/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt index 753d0e3b..d694011d 100644 --- a/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt +++ b/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt @@ -1,7 +1,6 @@ package com.fleeksoft.ksoup.internal import com.fleeksoft.ksoup.ported.Character -import com.fleeksoft.ksoup.ported.resolveOrNull import de.cketti.codepoints.deluxe.CodePoint import de.cketti.codepoints.deluxe.appendCodePoint import de.cketti.codepoints.deluxe.codePointAt @@ -231,7 +230,7 @@ public object StringUtil { // if access url is relative protocol then copy it val cleanedBaseUrl = stripControlChars(baseUrl) val cleanedRelUrl = stripControlChars(relUrl) - return cleanedBaseUrl.resolveOrNull(cleanedRelUrl) ?: "" + return URLUtil.resolve(base = cleanedBaseUrl, relative = cleanedRelUrl) } private val controlChars: Regex = Regex("[\\x00-\\x1f]*") // matches ascii 0 - 31, to strip from url diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/URLUtil.kt b/ksoup/src/com/fleeksoft/ksoup/internal/URLUtil.kt new file mode 100644 index 00000000..d6e4a019 --- /dev/null +++ b/ksoup/src/com/fleeksoft/ksoup/internal/URLUtil.kt @@ -0,0 +1,180 @@
+package com.fleeksoft.ksoup.internal
+
+import kotlin.math.min
+
+/**
+ * Dependency-free resolver for relative URL references.
+ *
+ * URLs are handled as plain strings: no percent-encoding, host validation or scheme whitelisting is
+ * performed, so unknown schemes (for example `wtf://host/`) resolve just like `http`.
+ */
+object URLUtil {
+    /**
+     * Resolves [relative] against [base].
+     *
+     * @param base the URL the reference is relative to; it must carry a scheme for anything to be resolved.
+     * @param relative the reference to resolve; returned unchanged when it already carries a scheme.
+     * @return the resolved URL, [base] when [relative] is empty, or `""` when neither input is absolute.
+     */
+    fun resolve(base: String, relative: String): String {
+        if (relative.isEmpty()) return base
+
+        // A reference that already has a scheme (http:, mailto:, tel:, ...) needs no base.
+        if (isAbsoluteUrl(relative)) return relative
+
+        // At least one absolute URL is required.
+        if (!isAbsoluteUrl(base)) return ""
+
+        val baseUrl = parseUrl(base)
+
+        // Protocol-relative reference ("//example.com/one"): only the scheme comes from the base.
+        if (relative.startsWith("//")) return baseUrl.scheme + ":" + relative
+
+        val origin = "${baseUrl.scheme}:${baseUrl.schemeSeparator}${baseUrl.authority}"
+
+        // Query-only reference: replaces the base query and fragment.
+        if (relative.startsWith("?")) return origin + baseUrl.path + relative
+
+        // Fragment-only reference: keeps the base path and query.
+        if (relative.startsWith("#")) return origin + baseUrl.path + (baseUrl.query ?: "") + relative
+
+        // A leading "/" replaces the whole base path; anything else is merged with the base directory.
+        val merged = if (relative.startsWith("/")) {
+            relative
+        } else {
+            mergePaths(stripQueryAndFragment(baseUrl.path), relative)
+        }
+
+        // Split off the reference's own query/fragment so dot segments inside it are left untouched.
+        val queryIndex = merged.indexOf('?')
+        val fragmentIndex = merged.indexOf('#')
+        val suffixStart = when {
+            queryIndex != -1 && fragmentIndex != -1 -> min(queryIndex, fragmentIndex)
+            fragmentIndex != -1 -> fragmentIndex
+            else -> queryIndex
+        }
+        val rawPath = if (suffixStart == -1) merged else merged.substring(0, suffixStart)
+        val suffix = if (suffixStart == -1) "" else merged.substring(suffixStart)
+
+        // The root slash is only added when there is an authority to separate the path from.
+        val normalizedPath = normalizePath(rawPath, addRoot = baseUrl.authority.isNotEmpty())
+
+        return origin + normalizedPath + suffix
+    }
+
+    /**
+     * Returns `true` when [url] carries a scheme, i.e. its first `:` comes before any `/`, `?` or `#`.
+     *
+     * A colon that only shows up later (a port in `//host:8080/x`, a value in `?t=10:30`) does not
+     * make a reference absolute.
+     */
+    private fun isAbsoluteUrl(url: String): Boolean {
+        if (url.length <= 2) return false
+        val colonIndex = url.indexOf(':')
+        if (colonIndex == -1) return false
+        val delimiterIndex = url.indexOfAny(charArrayOf('/', '?', '#'))
+        return delimiterIndex == -1 || colonIndex < delimiterIndex
+    }
+
+    /** Appends [relativePath] to the directory part of [basePath] (everything up to its last `/`). */
+    private fun mergePaths(basePath: String, relativePath: String): String {
+        val baseDir = if (basePath.endsWith("/")) basePath else basePath.substring(0, basePath.lastIndexOf('/') + 1)
+        return baseDir + relativePath
+    }
+
+    /**
+     * Removes `.` and empty segments and applies `..` segments; a `..` with nothing left to pop is dropped.
+     *
+     * @param addRoot whether the result is prefixed with `/`.
+     */
+    private fun normalizePath(path: String, addRoot: Boolean = true): String {
+        val segments = path.split("/")
+        val result = mutableListOf<String>()
+
+        segments.forEachIndexed { index, segment ->
+            when {
+                segment.isEmpty() || segment == "." -> {
+                    // A trailing "/" or "." keeps the trailing slash: /b/c/d + ./g/. = /b/c/g/
+                    if (index == segments.lastIndex) result.add("")
+                }
+
+                segment == ".." -> {
+                    // Go up one directory.
+                    if (result.isNotEmpty()) result.removeAt(result.lastIndex)
+                }
+
+                else -> result.add(segment)
+            }
+        }
+
+        return (if (addRoot) "/" else "") + result.joinToString("/")
+    }
+
+    /** Cuts [path] at its first `?` or `#`, whichever comes first. */
+    private fun stripQueryAndFragment(path: String): String {
+        val end = path.indexOfAny(charArrayOf('?', '#'))
+        return if (end == -1) path else path.substring(0, end)
+    }
+
+    /**
+     * Components of a URL as split by [parseUrl].
+     *
+     * [schemeSeparator] is the run of slashes after `scheme:` (`"//"`, `"/"` or `""`); [query] and
+     * [fragment] keep their leading `?` / `#`.
+     */
+    private data class ParsedUrl(
+        val scheme: String,
+        val schemeSeparator: String,
+        val authority: String,
+        val path: String,
+        val query: String? = null,
+        val fragment: String? = null
+    )
+
+    /** Splits [url] into scheme, separator, authority, path, query and fragment without validating any of them. */
+    private fun parseUrl(url: String): ParsedUrl {
+        val schemeEndIndex = url.indexOf(':')
+        val scheme: String
+        val schemeSeparator: String
+        val remainingUrl: String
+        if (schemeEndIndex != -1) {
+            // Only the slashes directly after "scheme:" count; "://" further along (e.g. in a query) must not.
+            schemeSeparator = when {
+                url.startsWith("://", startIndex = schemeEndIndex) -> "//"
+                url.startsWith(":/", startIndex = schemeEndIndex) -> "/"
+                else -> ""
+            }
+            scheme = url.substring(0, schemeEndIndex)
+            remainingUrl = url.substring(schemeEndIndex + 1 + schemeSeparator.length)
+        } else {
+            // No scheme: default to https.
+            scheme = "https"
+            schemeSeparator = "//"
+            remainingUrl = url
+        }
+
+        // With a single-slash separator (file:/etc/...) there is no authority; everything is path.
+        val authorityEndIndex = if (schemeSeparator != "/") {
+            remainingUrl.indexOfAny(charArrayOf('/', '?', '#')).takeIf { it != -1 } ?: remainingUrl.length
+        } else {
+            -1
+        }
+
+        val authority = if (authorityEndIndex != -1) remainingUrl.substring(0, authorityEndIndex) else ""
+        val pathAndMore = if (authorityEndIndex != -1) remainingUrl.substring(authorityEndIndex) else remainingUrl
+
+        // The fragment starts at the first '#'; a '?' after that belongs to the fragment, not to a query.
+        val fragmentStartIndex = pathAndMore.indexOf('#')
+        val beforeFragment = if (fragmentStartIndex == -1) pathAndMore else pathAndMore.substring(0, fragmentStartIndex)
+        val queryStartIndex = beforeFragment.indexOf('?')
+
+        return ParsedUrl(
+            scheme = scheme,
+            schemeSeparator = schemeSeparator,
+            authority = authority,
+            path = if (queryStartIndex == -1) beforeFragment else beforeFragment.substring(0, queryStartIndex),
+            query = if (queryStartIndex == -1) null else beforeFragment.substring(queryStartIndex),
+            fragment = if (fragmentStartIndex == -1) null else pathAndMore.substring(fragmentStartIndex)
+        )
+    }
+}
\ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt b/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt index df2f70c1..49b62169
100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt @@ -5,7 +5,10 @@ import com.fleeksoft.ksoup.internal.SharedConstants import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.io.FileSource import com.fleeksoft.ksoup.io.SourceReader -import com.fleeksoft.ksoup.ported.io.* +import com.fleeksoft.ksoup.ported.io.BufferedReader +import com.fleeksoft.ksoup.ported.io.Charsets +import com.fleeksoft.ksoup.ported.io.InputSourceReader +import com.fleeksoft.ksoup.ported.io.Reader fun String.openSourceReader(charset: Charset? = null): SourceReader = KsoupEngineInstance.ksoupEngine.openSourceReader(content = this, charset = charset) @@ -14,10 +17,6 @@ fun ByteArray.openSourceReader(): SourceReader = KsoupEngineInstance.ksoupEngine fun SourceReader.toReader(charset: Charset = Charsets.UTF8, chunkSize: Int = SharedConstants.DefaultBufferSize): Reader = BufferedReader(InputSourceReader(this, charset = charset), chunkSize) -fun String.toReader(): StringReader = StringReader(this) - -fun String.resolveOrNull(access: String): String? = KsoupEngineInstance.ksoupEngine.urlResolveOrNull(base = this, relUrl = access) - fun String.toByteArray(charset: Charset? = null): ByteArray = charset?.toByteArray(this) ?: this.encodeToByteArray() fun String.toSourceFile(): FileSource = KsoupEngineInstance.ksoupEngine.pathToFileSource(this) \ No newline at end of file