diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml index 9ff7ce63..4e6b1e50 100644 --- a/.github/workflows/deploy.yaml +++ b/.github/workflows/deploy.yaml @@ -9,6 +9,7 @@ jobs: matrix: buildType: - "common" + - "lite" - "kotlinx" - "korlibs" - "ktor2" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ffff27a6..f1ca35bd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: { target: windows, os: windows-latest, tasks: mingwX64Test, continueOnError: false }, { target: linux, os: ubuntu-latest, tasks: linuxX64Test, continueOnError: false }, ] - libBuildType: [ "korlibs", "kotlinx", "okio", "ktor2" ] + libBuildType: [ "lite", "korlibs", "kotlinx", "okio", "ktor2" ] runs-on: ${{ matrix.config.os }} name: Build ${{ matrix.config.target }} with libBuildType=${{ matrix.libBuildType }} steps: diff --git a/README.md b/README.md index 205e5eec..ecfae092 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,11 @@ Ksoup is adept at handling all varieties of HTML found in the wild. Include the dependency in `commonMain`. Latest version [![Maven Central](https://img.shields.io/maven-central/v/com.fleeksoft.ksoup/ksoup.svg)](https://central.sonatype.com/artifact/com.fleeksoft.ksoup/ksoup) Ksoup published in four variants. Pick the one that suits your needs and start building! -1. **This variant built with [kotlinx-io](https://github.com/Kotlin/kotlinx-io) and [Ktor 3.0.0-rc-1](https://github.com/ktorio/ktor)** +1. **This variant is built without any external IO or Network dependencies. Use this if you want to parse HTML from a string.** + ```kotlin + implementation("com.fleeksoft.ksoup:ksoup-lite:") + ``` +2. **This variant built with [kotlinx-io](https://github.com/Kotlin/kotlinx-io) and [Ktor 3.0.0-rc-1](https://github.com/ktorio/ktor)** ```kotlin implementation("com.fleeksoft.ksoup:ksoup:") @@ -41,7 +45,7 @@ Ksoup published in four variants. 
Pick the one that suits your needs and start b implementation("com.fleeksoft.ksoup:ksoup-network:") ``` -2. **This variant is built with [korlibs-io](https://github.com/korlibs/korlibs-io)** +3. **This variant is built with [korlibs-io](https://github.com/korlibs/korlibs-io)** ```kotlin implementation("com.fleeksoft.ksoup:ksoup-korlibs:") @@ -50,7 +54,7 @@ Ksoup published in four variants. Pick the one that suits your needs and start b implementation("com.fleeksoft.ksoup:ksoup-network-korlibs:") ``` -3. **This variant built with [kotlinx-io](https://github.com/Kotlin/kotlinx-io) and [Ktor 2.3.12](https://github.com/ktorio/ktor)** +4. **This variant built with [kotlinx-io](https://github.com/Kotlin/kotlinx-io) and [Ktor 2.3.12](https://github.com/ktorio/ktor)** ```kotlin implementation("com.fleeksoft.ksoup:ksoup-ktor2:") @@ -58,7 +62,7 @@ Ksoup published in four variants. Pick the one that suits your needs and start b // Ksoup.parseGetRequest, Ksoup.parseSubmitRequest, and Ksoup.parsePostRequest implementation("com.fleeksoft.ksoup:ksoup-network-ktor2:") ``` -4. **This variant built with [okio](https://github.com/square/okio) and [Ktor 2.3.12](https://github.com/ktorio/ktor)** +5. 
**This variant built with [okio](https://github.com/square/okio) and [Ktor 2.3.12](https://github.com/ktorio/ktor)** ```kotlin implementation("com.fleeksoft.ksoup:ksoup-okio:") diff --git a/gradle.properties b/gradle.properties index 16967811..093150cf 100644 --- a/gradle.properties +++ b/gradle.properties @@ -8,9 +8,9 @@ android.nonTransitiveRClass=true kotlin.native.ignoreIncorrectDependencies=true kotlin.mpp.enableCInteropCommonization=true kotlin.mpp.applyDefaultHierarchyTemplate=false -# dev, common, kotlinx, korlibs, okio, ktor2 +# dev, common, lite, kotlinx, korlibs, okio, ktor2 # dev will include all modules in settings.gradle.kts but use kotlinx dep for engine -libBuildType=okio +libBuildType=lite SONATYPE_HOST=CENTRAL_PORTAL diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ec73e43b..e228fc7a 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -3,14 +3,13 @@ agp = "8.5.2" kotlin = "2.0.20" compileSdk = "34" minSdk = "21" -libraryVersion = "0.1.7" +libraryVersion = "0.1.8" ktor = "3.0.0-rc-1" ktor2 = "2.3.12" coroutines = "1.8.1" kotlinxDatetime = "0.6.1" kotlinx-io = "0.5.3" okio = "3.9.0" -codepoints = "0.9.0" dokka = "1.9.20" #korlibs = "999.0.0.999" # 999.0.0.999 is local version @@ -43,7 +42,6 @@ ktor-client-js = { module = "io.ktor:ktor-client-js", version.ref = "ktor" } ktor-client-win = { module = "io.ktor:ktor-client-winhttp", version.ref = "ktor" } kotlinx-datetime = { module = "org.jetbrains.kotlinx:kotlinx-datetime", version.ref = "kotlinxDatetime" } kotlinx-io = { module = "org.jetbrains.kotlinx:kotlinx-io-core", version.ref = "kotlinx-io" } -codepoints = { module = "de.cketti.unicode:kotlin-codepoints-deluxe", version.ref = "codepoints" } korlibs-io = { module = "com.soywiz:korlibs-io", version.ref = "korlibs" } korlibs-io-network-core = { module = "com.soywiz:korlibs-io-network-core", version.ref = "korlibs" } stately-concurrency = { module = "co.touchlab:stately-concurrency", version.ref = 
"stately" } diff --git a/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt b/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt index 46887edb..6244c613 100644 --- a/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt +++ b/ksoup-engine-common/src/com/fleeksoft/ksoup/engine/KsoupEngine.kt @@ -2,14 +2,8 @@ package com.fleeksoft.ksoup.engine import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.io.FileSource -import com.fleeksoft.ksoup.io.SourceReader interface KsoupEngine { - fun urlResolveOrNull(base: String, relUrl: String): String? - - fun openSourceReader(content: String, charset: Charset? = null): SourceReader - - fun openSourceReader(byteArray: ByteArray): SourceReader fun getUtf8Charset(): Charset diff --git a/ksoup-engine-common/src/com/fleeksoft/ksoup/io/Charset.kt b/ksoup-engine-common/src/com/fleeksoft/ksoup/io/Charset.kt index b5e0c656..8b4e0a1e 100644 --- a/ksoup-engine-common/src/com/fleeksoft/ksoup/io/Charset.kt +++ b/ksoup-engine-common/src/com/fleeksoft/ksoup/io/Charset.kt @@ -22,4 +22,6 @@ interface Charset { fun decode(stringBuilder: StringBuilder, byteArray: ByteArray, start: Int, end: Int): Int fun toByteArray(value: String): ByteArray + + fun onlyUtf8(): Boolean = false } \ No newline at end of file
diff --git a/ksoup-engine-common/src/com/fleeksoft/ksoup/io/SourceReaderByteArray.kt b/ksoup-engine-common/src/com/fleeksoft/ksoup/io/SourceReaderByteArray.kt
new file mode 100644
index 00000000..ebaa89ca
--- /dev/null
+++ b/ksoup-engine-common/src/com/fleeksoft/ksoup/io/SourceReaderByteArray.kt
@@ -0,0 +1,79 @@
+package com.fleeksoft.ksoup.io
+
+/**
+ * [SourceReader] backed by an in-memory [ByteArray].
+ *
+ * The array passed in is used directly (it is not copied); reads advance an internal cursor.
+ */
+internal class SourceReaderByteArray(bytes: ByteArray) : SourceReader {
+    private var source: ByteArray = bytes
+    private var currentPosition: Int = 0
+    private var markedPosition: Int? = null
+    private var isClosed: Boolean = false
+
+    /** Number of bytes between the cursor and the end of [source]. */
+    private fun remaining(): Int = source.size - currentPosition
+
+    /** Remembers the current cursor position. [readLimit] is not used. */
+    override fun mark(readLimit: Long) {
+        markedPosition = currentPosition
+    }
+
+    /** Clears the closed flag and, if a mark is set, rewinds to it and clears the mark. */
+    override fun reset() {
+        isClosed = false
+        markedPosition?.let {
+            currentPosition = it
+            markedPosition = null
+        }
+    }
+
+    /**
+     * Reads up to [count] bytes from the cursor.
+     *
+     * @return a new array with the bytes read; shorter than [count] (possibly empty) when fewer bytes remain.
+     */
+    override fun readBytes(count: Int): ByteArray {
+        // Size the result by what is available so a large `count` does not allocate a large buffer.
+        val toRead = minOf(count, remaining())
+        if (toRead <= 0) return byteArrayOf()
+        val result = source.copyOfRange(currentPosition, currentPosition + toRead)
+        currentPosition += toRead
+        return result
+    }
+
+    /**
+     * Copies up to [length] bytes into [bytes], starting at index [offset] of [bytes].
+     *
+     * @return the number of bytes copied; 0 when the source is exhausted or [length] is not positive.
+     */
+    override fun read(bytes: ByteArray, offset: Int, length: Int): Int {
+        val toRead = minOf(length, remaining())
+        if (toRead <= 0) return 0
+        source.copyInto(bytes, offset, currentPosition, currentPosition + toRead)
+        currentPosition += toRead
+        return toRead
+    }
+
+    /** Reads every byte from the cursor to the end of the source. */
+    override fun readAllBytes(): ByteArray {
+        return readBytes(remaining())
+    }
+
+    /** Returns true once the cursor has reached the end of the source. */
+    override fun exhausted(): Boolean {
+        return currentPosition >= source.size
+    }
+
+    /** Only sets the closed flag; the bytes are kept because [reset] may rewind and re-read them. */
+    override fun close() {
+        isClosed = true
+    }
+
+    /** Reads up to [byteCount] bytes and writes them to [sink]; returns the number of bytes read. */
+    override fun readAtMostTo(sink: KByteBuffer, byteCount: Int): Int {
+        val bytes = readBytes(byteCount)
+        sink.writeBytes(bytes, bytes.size)
+        return bytes.size
+    }
+}
\ No newline at end of file
diff --git a/ksoup-engine-common/src/com/fleeksoft/ksoup/io/SourceReaderExt.kt b/ksoup-engine-common/src/com/fleeksoft/ksoup/io/SourceReaderExt.kt new file mode 100644 index 00000000..de583d6c --- /dev/null +++ b/ksoup-engine-common/src/com/fleeksoft/ksoup/io/SourceReaderExt.kt @@ -0,0 +1,3 @@ +package com.fleeksoft.ksoup.io + +fun SourceReader.Companion.from(byteArray: ByteArray): SourceReader = SourceReaderByteArray(byteArray) \ No newline at end of file diff --git a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/EngineExt.kt b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/EngineExt.kt index d01aa283..35db0214 100644 --- 
a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/EngineExt.kt +++ b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/EngineExt.kt @@ -1,7 +1,7 @@ package com.fleeksoft.ksoup -import com.fleeksoft.ksoup.engine.KsoupEngineImpl import com.fleeksoft.ksoup.io.SourceReader +import com.fleeksoft.ksoup.io.from import korlibs.io.compression.deflate.GZIP import korlibs.io.compression.uncompress import korlibs.io.file.VfsFile @@ -16,8 +16,8 @@ suspend fun VfsFile.openStream(): SourceReader { val zipped = (byteArray.size == 2 && byteArray[0].toInt() == 31 && byteArray[1].toInt() == -117) // gzip magic bytes 31(0x1f), -117(0x1f) if (zipped) { - return KsoupEngineImpl.openSourceReader(this.readAsSyncStream().readAll().uncompress(GZIP)) + return SourceReader.from(this.readAsSyncStream().readAll().uncompress(GZIP)) } } - return KsoupEngineImpl.openSourceReader(this.readAll()) + return SourceReader.from(this.readAll()) } \ No newline at end of file diff --git a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index d39476c1..0164f61c 100644 --- a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -2,20 +2,8 @@ package com.fleeksoft.ksoup.engine import com.fleeksoft.ksoup.io.* import korlibs.io.lang.Charsets -import korlibs.io.net.URL object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URL.resolveOrNull(base = base, access = relUrl) - } - - override fun openSourceReader(content: String, charset: Charset?): SourceReader { - return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) - } - - override fun openSourceReader(byteArray: ByteArray): SourceReader { - return SourceReader.from(byteArray) - } override fun getUtf8Charset(): Charset { return CharsetImpl(Charsets.UTF8) diff --git a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/io/SourceExt.kt b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/io/SourceExt.kt index 24a7fc5f..7275f56c 100644 --- a/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/io/SourceExt.kt +++ b/ksoup-engine-korlibs/src/com/fleeksoft/ksoup/io/SourceExt.kt @@ -4,7 +4,6 @@ import korlibs.io.file.VfsFile import korlibs.io.stream.* -fun SourceReader.Companion.from(byteArray: ByteArray): SourceReader = SourceReaderImpl(byteArray) fun SourceReader.Companion.from(syncStream: SyncStream): SourceReader = SourceReaderImpl(syncStream) suspend fun SourceReader.Companion.from(asyncInputStream: AsyncInputStream): SourceReader = SourceReaderImpl(asyncInputStream.toAsyncStream().toSyncOrNull() ?: asyncInputStream.readAll().openSync()) diff --git a/ksoup-engine-kotlinx/module.yaml b/ksoup-engine-kotlinx/module.yaml index 8ab31086..aac34660 100644 --- a/ksoup-engine-kotlinx/module.yaml +++ b/ksoup-engine-kotlinx/module.yaml @@ -10,5 +10,4 @@ aliases: dependencies: - ../ksoup-engine-common - $libs.kotlinx.io: exported - - $libs.ktor.io - - $libs.ktor.http \ No newline at end of file + - $libs.ktor.io \ No newline at end of file diff --git a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index 5b495d6c..999cfc66 100644 --- a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -6,18 +6,6 @@ import io.ktor.utils.io.charsets.* 
object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? { - return URLUtil.urlResolveOrNull(base = base, relUrl = relUrl) - } - - override fun openSourceReader(content: String, charset: Charset?): SourceReader { - return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) - } - - override fun openSourceReader(byteArray: ByteArray): SourceReader { - return SourceReader.from(byteArray) - } - override fun getUtf8Charset(): Charset { return CharsetImpl(Charsets.UTF_8) } diff --git a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/URLUtil.kt b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/URLUtil.kt deleted file mode 100644 index 7ba490bf..00000000 --- a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/engine/URLUtil.kt +++ /dev/null @@ -1,153 +0,0 @@ -package com.fleeksoft.ksoup.engine - -import io.ktor.http.* - -object URLUtil { - private fun String.isValidResourceUrl() = - this.startsWith("http", ignoreCase = true) || this.startsWith("ftp://", ignoreCase = true) || - this.startsWith("ftps://", ignoreCase = true) || - this.startsWith("file:/", ignoreCase = true) || - this.startsWith("//") - - private fun String.isAbsResource(): Boolean = Regex("\\w+:").containsMatchIn(this) - private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() - - private fun URLBuilder.appendRelativePath(relativePath: String): URLBuilder { - val segments = this.encodedPathSegments.toMutableList() - - val isLastSlash = segments.isNotEmpty() && segments.last() == "" - - // clear / its already joining with / - segments.removeAll { it.isEmpty() } - - val relativePathParts: MutableList = - if (relativePath.contains("?")) { - handleQueryParams(relativePath, "?") - } else if (relativePath.contains("#")) { - handleQueryParams(relativePath, "#") - } else { - relativePath.split("/").toMutableList() - } - - if (relativePathParts.size > 1 && relativePathParts.last() == "/") { - 
relativePathParts.removeLast() - } - - if (relativePathParts.isNotEmpty() && segments.isNotEmpty() && !isLastSlash && - relativePathParts.first().startsWith("?") - ) { - segments.add("${segments.removeLast()}${relativePathParts.removeFirst()}") - } - -// in files when file://etc/var/message + /var/message = file://var/message -// etc considered as host - - if (this.protocol == URLProtocol.createOrDefault("file")) { - if (relativePathParts.size > 1 && relativePathParts.firstOrNull() == "") { - segments.clear() - // remove first / space - relativePathParts.removeFirst() - this.host = relativePathParts.removeFirst() - } - } - - var isNewPathAdded = false - relativePathParts.forEachIndexed { index, path -> - when (path) { - "" -> { - if (index == 0) { - segments.clear() - } else { - segments.add("") - } - } - - "." -> { -// if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ - if (index == relativePathParts.size - 1 && segments[index] != "") { - segments.add("") - } else if (!isLastSlash && !isNewPathAdded) { -// isNewPathAdded use to avoid /b/c/d + g/./h here . will not remove last path because its already added new - segments.removeLastOrNull() - } - } - - ".." 
-> { - // Clean up last path if exist - if (index == 0 && !isLastSlash) { - segments.removeLastOrNull() - } - if (segments.isNotEmpty()) { - segments.removeLast() - } - } - - else -> { -// remove last trailing path if not query or fragment g.com/a/b to g.com/a - if (index == 0 && segments.isNotEmpty() && - !isLastSlash && !path.startsWith("?") && !path.startsWith("#") - ) { - segments.removeLast() - } - isNewPathAdded = true - segments.add(path) - } - } - } - this.encodedPathSegments = segments - - return this - } - - - private fun handleQueryParams( - relativePath: String, - separator: String, - ): MutableList { - val querySplit = relativePath.split(separator).toMutableList() - val firstQueryPath = querySplit.removeFirst() - val relativePathParts = firstQueryPath.split("/").toMutableList() - if (querySplit.isNotEmpty()) { - relativePathParts.add( - "${relativePathParts.removeLastOrNull() ?: ""}$separator${querySplit.joinToString(separator)}", - ) - } - return relativePathParts - } - - private fun resolve(base: Url, cleanedRelUrl: String): Url { - - if (cleanedRelUrl.isEmpty()) { - return base - } - - if (cleanedRelUrl.isValidResourceUrl()) { - return URLBuilder(cleanedRelUrl).apply { - if (cleanedRelUrl.startsWith("//")) { - protocol = base.protocol - } - }.build() - } - - return URLBuilder( - protocol = base.protocol, - host = base.host, - port = base.port, - pathSegments = base.pathSegments - ).appendRelativePath(cleanedRelUrl).build() - } - - fun urlResolveOrNull(base: String, relUrl: String): String? { - // mailto, tel, geo, about etc.. 
- if (relUrl.isAbsResource()) { - return relUrl - } - return if (base.isValidResourceUrl()) { - resolve(Url(base), relUrl).toString() - } else if (relUrl.isValidResourceUrl()) { - Url(relUrl).toString() - } else { - if (validUriScheme.matches(relUrl)) relUrl else null - } - } -} \ No newline at end of file diff --git a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/io/SourceExt.kt b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/io/SourceExt.kt index b370f16c..318546ad 100644 --- a/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/io/SourceExt.kt +++ b/ksoup-engine-kotlinx/src/com/fleeksoft/ksoup/io/SourceExt.kt @@ -7,7 +7,6 @@ import kotlinx.io.Source import kotlinx.io.files.Path -fun SourceReader.Companion.from(byteArray: ByteArray): SourceReader = SourceReaderImpl(byteArray) fun SourceReader.Companion.from(source: Source): SourceReader = SourceReaderImpl(source) fun SourceReader.Companion.from(bodyChannel: ByteReadChannel): SourceReader = SourceReaderImpl(bodyChannel.readBuffer) diff --git a/ksoup-engine-ktor2/module.yaml b/ksoup-engine-ktor2/module.yaml index 0cfbffb3..d81e7a74 100644 --- a/ksoup-engine-ktor2/module.yaml +++ b/ksoup-engine-ktor2/module.yaml @@ -10,5 +10,4 @@ aliases: dependencies: - ../ksoup-engine-common - $libs.kotlinx.io: exported - - $libs.ktor2.io - - $libs.ktor2.http \ No newline at end of file + - $libs.ktor2.io \ No newline at end of file diff --git a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index 5b495d6c..999cfc66 100644 --- a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -6,18 +6,6 @@ import io.ktor.utils.io.charsets.* object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URLUtil.urlResolveOrNull(base = base, relUrl = relUrl) - } - - override fun openSourceReader(content: String, charset: Charset?): SourceReader { - return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) - } - - override fun openSourceReader(byteArray: ByteArray): SourceReader { - return SourceReader.from(byteArray) - } - override fun getUtf8Charset(): Charset { return CharsetImpl(Charsets.UTF_8) } diff --git a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/URLUtil.kt b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/URLUtil.kt deleted file mode 100644 index 7ba490bf..00000000 --- a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/engine/URLUtil.kt +++ /dev/null @@ -1,153 +0,0 @@ -package com.fleeksoft.ksoup.engine - -import io.ktor.http.* - -object URLUtil { - private fun String.isValidResourceUrl() = - this.startsWith("http", ignoreCase = true) || this.startsWith("ftp://", ignoreCase = true) || - this.startsWith("ftps://", ignoreCase = true) || - this.startsWith("file:/", ignoreCase = true) || - this.startsWith("//") - - private fun String.isAbsResource(): Boolean = Regex("\\w+:").containsMatchIn(this) - private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() - - private fun URLBuilder.appendRelativePath(relativePath: String): URLBuilder { - val segments = this.encodedPathSegments.toMutableList() - - val isLastSlash = segments.isNotEmpty() && segments.last() == "" - - // clear / its already joining with / - segments.removeAll { it.isEmpty() } - - val relativePathParts: MutableList = - if (relativePath.contains("?")) { - handleQueryParams(relativePath, "?") - } else if (relativePath.contains("#")) { - handleQueryParams(relativePath, "#") - } else { - relativePath.split("/").toMutableList() - } - - if (relativePathParts.size > 1 && relativePathParts.last() == "/") { - relativePathParts.removeLast() - } - - if (relativePathParts.isNotEmpty() && segments.isNotEmpty() && !isLastSlash && - 
relativePathParts.first().startsWith("?") - ) { - segments.add("${segments.removeLast()}${relativePathParts.removeFirst()}") - } - -// in files when file://etc/var/message + /var/message = file://var/message -// etc considered as host - - if (this.protocol == URLProtocol.createOrDefault("file")) { - if (relativePathParts.size > 1 && relativePathParts.firstOrNull() == "") { - segments.clear() - // remove first / space - relativePathParts.removeFirst() - this.host = relativePathParts.removeFirst() - } - } - - var isNewPathAdded = false - relativePathParts.forEachIndexed { index, path -> - when (path) { - "" -> { - if (index == 0) { - segments.clear() - } else { - segments.add("") - } - } - - "." -> { -// if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ - if (index == relativePathParts.size - 1 && segments[index] != "") { - segments.add("") - } else if (!isLastSlash && !isNewPathAdded) { -// isNewPathAdded use to avoid /b/c/d + g/./h here . will not remove last path because its already added new - segments.removeLastOrNull() - } - } - - ".." 
-> { - // Clean up last path if exist - if (index == 0 && !isLastSlash) { - segments.removeLastOrNull() - } - if (segments.isNotEmpty()) { - segments.removeLast() - } - } - - else -> { -// remove last trailing path if not query or fragment g.com/a/b to g.com/a - if (index == 0 && segments.isNotEmpty() && - !isLastSlash && !path.startsWith("?") && !path.startsWith("#") - ) { - segments.removeLast() - } - isNewPathAdded = true - segments.add(path) - } - } - } - this.encodedPathSegments = segments - - return this - } - - - private fun handleQueryParams( - relativePath: String, - separator: String, - ): MutableList { - val querySplit = relativePath.split(separator).toMutableList() - val firstQueryPath = querySplit.removeFirst() - val relativePathParts = firstQueryPath.split("/").toMutableList() - if (querySplit.isNotEmpty()) { - relativePathParts.add( - "${relativePathParts.removeLastOrNull() ?: ""}$separator${querySplit.joinToString(separator)}", - ) - } - return relativePathParts - } - - private fun resolve(base: Url, cleanedRelUrl: String): Url { - - if (cleanedRelUrl.isEmpty()) { - return base - } - - if (cleanedRelUrl.isValidResourceUrl()) { - return URLBuilder(cleanedRelUrl).apply { - if (cleanedRelUrl.startsWith("//")) { - protocol = base.protocol - } - }.build() - } - - return URLBuilder( - protocol = base.protocol, - host = base.host, - port = base.port, - pathSegments = base.pathSegments - ).appendRelativePath(cleanedRelUrl).build() - } - - fun urlResolveOrNull(base: String, relUrl: String): String? { - // mailto, tel, geo, about etc.. 
- if (relUrl.isAbsResource()) { - return relUrl - } - return if (base.isValidResourceUrl()) { - resolve(Url(base), relUrl).toString() - } else if (relUrl.isValidResourceUrl()) { - Url(relUrl).toString() - } else { - if (validUriScheme.matches(relUrl)) relUrl else null - } - } -} \ No newline at end of file diff --git a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/io/SourceExt.kt b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/io/SourceExt.kt index 8888bf5a..351840fa 100644 --- a/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/io/SourceExt.kt +++ b/ksoup-engine-ktor2/src/com/fleeksoft/ksoup/io/SourceExt.kt @@ -4,9 +4,7 @@ import kotlinx.io.Source import kotlinx.io.files.Path -fun SourceReader.Companion.from(byteArray: ByteArray): SourceReader = SourceReaderImpl(byteArray) fun SourceReader.Companion.from(source: Source): SourceReader = SourceReaderImpl(source) - fun FileSource.Companion.from(file: Path): FileSource = FileSourceImpl(file) fun FileSource.Companion.from(filePath: String): FileSource = FileSourceImpl(filePath) \ No newline at end of file diff --git a/ksoup-engine-lite/build.gradle.kts b/ksoup-engine-lite/build.gradle.kts new file mode 100644 index 00000000..231b1827 --- /dev/null +++ b/ksoup-engine-lite/build.gradle.kts @@ -0,0 +1,37 @@ +plugins { + alias(libs.plugins.mavenPublish) +} + +group = "com.fleeksoft.ksoup" +version = libs.versions.libraryVersion.get() + +val artifactId = "ksoup-engine-lite" +mavenPublishing { + coordinates("com.fleeksoft.ksoup", artifactId, libs.versions.libraryVersion.get()) + pom { + name.set(artifactId) + description.set("Ksoup is a Kotlin Multiplatform library for working with HTML and XML, and offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM and CSS selectors.") + licenses { + license { + name.set("Apache-2.0") + url.set("https://opensource.org/licenses/Apache-2.0") + } + } + url.set("https://github.com/fleeksoft/ksoup") + issueManagement { + system.set("Github") + 
url.set("https://github.com/fleeksoft/ksoup/issues") + } + scm { + connection.set("https://github.com/fleeksoft/ksoup.git") + url.set("https://github.com/fleeksoft/ksoup") + } + developers { + developer { + name.set("Sabeeh Ul Hussnain Anjum") + email.set("fleeksoft@gmail.com") + organization.set("Fleek Soft") + } + } + } +} \ No newline at end of file diff --git a/ksoup-engine-lite/module.yaml b/ksoup-engine-lite/module.yaml new file mode 100644 index 00000000..38b368a6 --- /dev/null +++ b/ksoup-engine-lite/module.yaml @@ -0,0 +1,11 @@ +product: + type: lib + platforms: [ jvm, js, wasm, android, linuxX64, linuxArm64, tvosArm64, tvosX64, tvosSimulatorArm64, macosX64, macosArm64, iosArm64, iosSimulatorArm64, iosX64, mingwX64 ] + +apply: [ ../common.module-template.yaml ] + +aliases: + - jvmAndAndroid: [ jvm, android ] + +dependencies: + - ../ksoup-engine-common \ No newline at end of file diff --git a/ksoup-engine-lite/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-lite/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt new file mode 100644 index 00000000..6eeff62e --- /dev/null +++ b/ksoup-engine-lite/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -0,0 +1,18 @@ +package com.fleeksoft.ksoup.engine + +import com.fleeksoft.ksoup.io.* + +object KsoupEngineImpl : KsoupEngine { + + override fun getUtf8Charset(): Charset { + return CharsetImpl("UTF-8") + } + + override fun charsetForName(name: String): Charset { + return CharsetImpl(name) + } + + override fun pathToFileSource(path: String): FileSource { + TODO("File Source not supported in lite") + } +} \ No newline at end of file
diff --git a/ksoup-engine-lite/src/com/fleeksoft/ksoup/io/CharsetImpl.kt b/ksoup-engine-lite/src/com/fleeksoft/ksoup/io/CharsetImpl.kt
new file mode 100644
index 00000000..026107ca
--- /dev/null
+++ b/ksoup-engine-lite/src/com/fleeksoft/ksoup/io/CharsetImpl.kt
@@ -0,0 +1,77 @@
+package com.fleeksoft.ksoup.io
+
+import kotlin.math.max
+
+
+/**
+ * Dependency-free [Charset] for the lite engine.
+ *
+ * Accepts only the names UTF-8/UTF8, ISO-8859-1, ASCII and US-ASCII (case-insensitive).
+ * NOTE(review): every accepted charset is decoded and encoded as UTF-8 here, so ISO-8859-1 bytes
+ * above 0x7F are not decoded as Latin-1 - confirm this is intended.
+ */
+class CharsetImpl(override val name: String) : Charset {
+    private val normalizedName = name.lowercase()
+    private val isUtf8 = normalizedName == "utf-8" || normalizedName == "utf8"
+
+    init {
+        require(isUtf8 || normalizedName == "iso-8859-1" || normalizedName == "ascii" || normalizedName == "us-ascii") {
+            "Charset $name not supported"
+        }
+    }
+
+    override fun onlyUtf8(): Boolean = true
+
+    /**
+     * Decodes [byteArray] from index [start] (inclusive) to [end] (exclusive) and appends the text to
+     * [stringBuilder].
+     *
+     * For UTF-8, a multi-byte sequence that is cut off at [end] is left undecoded.
+     *
+     * @return the number of bytes decoded, which is less than `end - start` when a trailing sequence was left out.
+     */
+    override fun decode(stringBuilder: StringBuilder, byteArray: ByteArray, start: Int, end: Int): Int {
+        if (end <= 0) return 0
+        var incompleteByteIndex = -1
+
+        if (isUtf8) {
+//            TODO:// may be we can use this for other charsets
+            // A UTF-8 sequence is at most 4 bytes, so only the last 4 bytes can hold a cut-off one.
+            // Never scan before `start`: a lead byte found there would yield a negative return value.
+            var i = max(start, end - 4)
+            while (i < end) {
+                val byteLength = guessByteSequenceLength(byteArray[i])
+                if (byteLength > 1 && (i + byteLength) > end) {
+                    incompleteByteIndex = i
+                    break
+                } else {
+                    i += max(byteLength, 1)
+                }
+            }
+        }
+        // -1 means nothing was cut off; index 0 is a valid position for a cut-off sequence.
+        val toDecodeSize = if (incompleteByteIndex >= 0) incompleteByteIndex else end
+
+        stringBuilder.append(byteArray.sliceArray(start until toDecodeSize).decodeToString())
+        return toDecodeSize - start
+    }
+
+    /**
+     * Returns the UTF-8 sequence length announced by a lead byte (1 to 4), or 0 when the high nibble
+     * is 0b1000..0b1011 (a continuation byte).
+     */
+    private fun guessByteSequenceLength(byte: Byte): Int {
+        return when ((byte.toInt() and 0xFF) shr 4) {
+            in 0b0000..0b0111 -> 1
+            in 0b1100..0b1101 -> 2
+            0b1110 -> 3
+            0b1111 -> 4
+            else -> 0
+        }
+    }
+
+    /** Encodes [value] as UTF-8 bytes regardless of [name]. */
+    override fun toByteArray(value: String): ByteArray {
+        return value.encodeToByteArray()
+    }
+}
\ No newline at end of file
diff --git a/ksoup-engine-lite/src@jvmAndAndroid/com/fleeksoft/ksoup/io/SourceExtJvm.kt b/ksoup-engine-lite/src@jvmAndAndroid/com/fleeksoft/ksoup/io/SourceExtJvm.kt new file mode 100644 index 00000000..56931d94 --- /dev/null +++ b/ksoup-engine-lite/src@jvmAndAndroid/com/fleeksoft/ksoup/io/SourceExtJvm.kt @@ -0,0 +1,9 @@ +package com.fleeksoft.ksoup.io + +import java.io.File +import java.io.InputStream + +// todo for jvm we can use streaming api in lite module +fun SourceReader.Companion.from(inputStream: InputStream): SourceReader = 
SourceReader.from(inputStream.readBytes()) // kotlin.io readBytes(): java.io.InputStream.readAllBytes() needs Java 9+ / Android API 33+
+fun FileSource.Companion.from(file: File): FileSource = TODO("File Source not supported in lite")
+fun FileSource.Companion.from(file: String): FileSource = TODO("File Source not supported in lite")
\ No newline at end of file
diff --git a/ksoup-engine-okio/module.yaml b/ksoup-engine-okio/module.yaml index 1af9de96..dcecaa51 100644 --- a/ksoup-engine-okio/module.yaml +++ b/ksoup-engine-okio/module.yaml @@ -12,7 +12,6 @@ dependencies: - ../ksoup-engine-common - $libs.okio: exported - $libs.ktor2.io - - $libs.ktor2.http dependencies@js: - $libs.okio.nodefilesystem \ No newline at end of file diff --git a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt b/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt index 5b495d6c..999cfc66 100644 --- a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt +++ b/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/KsoupEngineImpl.kt @@ -6,18 +6,6 @@ import io.ktor.utils.io.charsets.* object KsoupEngineImpl : KsoupEngine { - override fun urlResolveOrNull(base: String, relUrl: String): String? 
{ - return URLUtil.urlResolveOrNull(base = base, relUrl = relUrl) - } - - override fun openSourceReader(content: String, charset: Charset?): SourceReader { - return SourceReader.from(charset?.toByteArray(content) ?: content.encodeToByteArray()) - } - - override fun openSourceReader(byteArray: ByteArray): SourceReader { - return SourceReader.from(byteArray) - } - override fun getUtf8Charset(): Charset { return CharsetImpl(Charsets.UTF_8) } diff --git a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/URLUtil.kt b/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/URLUtil.kt deleted file mode 100644 index 7ba490bf..00000000 --- a/ksoup-engine-okio/src/com/fleeksoft/ksoup/engine/URLUtil.kt +++ /dev/null @@ -1,153 +0,0 @@ -package com.fleeksoft.ksoup.engine - -import io.ktor.http.* - -object URLUtil { - private fun String.isValidResourceUrl() = - this.startsWith("http", ignoreCase = true) || this.startsWith("ftp://", ignoreCase = true) || - this.startsWith("ftps://", ignoreCase = true) || - this.startsWith("file:/", ignoreCase = true) || - this.startsWith("//") - - private fun String.isAbsResource(): Boolean = Regex("\\w+:").containsMatchIn(this) - private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() - - private fun URLBuilder.appendRelativePath(relativePath: String): URLBuilder { - val segments = this.encodedPathSegments.toMutableList() - - val isLastSlash = segments.isNotEmpty() && segments.last() == "" - - // clear / its already joining with / - segments.removeAll { it.isEmpty() } - - val relativePathParts: MutableList = - if (relativePath.contains("?")) { - handleQueryParams(relativePath, "?") - } else if (relativePath.contains("#")) { - handleQueryParams(relativePath, "#") - } else { - relativePath.split("/").toMutableList() - } - - if (relativePathParts.size > 1 && relativePathParts.last() == "/") { - relativePathParts.removeLast() - } - - if (relativePathParts.isNotEmpty() && segments.isNotEmpty() && !isLastSlash && - 
relativePathParts.first().startsWith("?") - ) { - segments.add("${segments.removeLast()}${relativePathParts.removeFirst()}") - } - -// in files when file://etc/var/message + /var/message = file://var/message -// etc considered as host - - if (this.protocol == URLProtocol.createOrDefault("file")) { - if (relativePathParts.size > 1 && relativePathParts.firstOrNull() == "") { - segments.clear() - // remove first / space - relativePathParts.removeFirst() - this.host = relativePathParts.removeFirst() - } - } - - var isNewPathAdded = false - relativePathParts.forEachIndexed { index, path -> - when (path) { - "" -> { - if (index == 0) { - segments.clear() - } else { - segments.add("") - } - } - - "." -> { -// if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ - if (index == relativePathParts.size - 1 && segments[index] != "") { - segments.add("") - } else if (!isLastSlash && !isNewPathAdded) { -// isNewPathAdded use to avoid /b/c/d + g/./h here . will not remove last path because its already added new - segments.removeLastOrNull() - } - } - - ".." 
-> { - // Clean up last path if exist - if (index == 0 && !isLastSlash) { - segments.removeLastOrNull() - } - if (segments.isNotEmpty()) { - segments.removeLast() - } - } - - else -> { -// remove last trailing path if not query or fragment g.com/a/b to g.com/a - if (index == 0 && segments.isNotEmpty() && - !isLastSlash && !path.startsWith("?") && !path.startsWith("#") - ) { - segments.removeLast() - } - isNewPathAdded = true - segments.add(path) - } - } - } - this.encodedPathSegments = segments - - return this - } - - - private fun handleQueryParams( - relativePath: String, - separator: String, - ): MutableList { - val querySplit = relativePath.split(separator).toMutableList() - val firstQueryPath = querySplit.removeFirst() - val relativePathParts = firstQueryPath.split("/").toMutableList() - if (querySplit.isNotEmpty()) { - relativePathParts.add( - "${relativePathParts.removeLastOrNull() ?: ""}$separator${querySplit.joinToString(separator)}", - ) - } - return relativePathParts - } - - private fun resolve(base: Url, cleanedRelUrl: String): Url { - - if (cleanedRelUrl.isEmpty()) { - return base - } - - if (cleanedRelUrl.isValidResourceUrl()) { - return URLBuilder(cleanedRelUrl).apply { - if (cleanedRelUrl.startsWith("//")) { - protocol = base.protocol - } - }.build() - } - - return URLBuilder( - protocol = base.protocol, - host = base.host, - port = base.port, - pathSegments = base.pathSegments - ).appendRelativePath(cleanedRelUrl).build() - } - - fun urlResolveOrNull(base: String, relUrl: String): String? { - // mailto, tel, geo, about etc.. 
- if (relUrl.isAbsResource()) { - return relUrl - } - return if (base.isValidResourceUrl()) { - resolve(Url(base), relUrl).toString() - } else if (relUrl.isValidResourceUrl()) { - Url(relUrl).toString() - } else { - if (validUriScheme.matches(relUrl)) relUrl else null - } - } -} \ No newline at end of file diff --git a/ksoup-engine-okio/src/com/fleeksoft/ksoup/io/SourceExt.kt b/ksoup-engine-okio/src/com/fleeksoft/ksoup/io/SourceExt.kt index afb8a59a..e886482a 100644 --- a/ksoup-engine-okio/src/com/fleeksoft/ksoup/io/SourceExt.kt +++ b/ksoup-engine-okio/src/com/fleeksoft/ksoup/io/SourceExt.kt @@ -4,7 +4,6 @@ import okio.Path import okio.Source -fun SourceReader.Companion.from(byteArray: ByteArray): SourceReader = SourceReaderImpl(byteArray) fun SourceReader.Companion.from(source: Source): SourceReader = SourceReaderImpl(source) diff --git a/ksoup-test/build.gradle.kts b/ksoup-test/build.gradle.kts index 451600b7..928080df 100644 --- a/ksoup-test/build.gradle.kts +++ b/ksoup-test/build.gradle.kts @@ -1,7 +1,3 @@ -plugins { - alias(libs.plugins.power.assert) -} - val rootPath = "generated/kotlin" val isGithubActions: Boolean = System.getenv("GITHUB_ACTIONS")?.toBoolean() == true @@ -32,6 +28,7 @@ val generateBuildConfigFile: Task by tasks.creating { const val isKorlibs: Boolean = ${libBuildType == "korlibs"} const val isOkio: Boolean = ${libBuildType == "okio"} const val isKtor2: Boolean = ${libBuildType == "ktor2"} + const val isLite: Boolean = ${libBuildType == "lite"} } """.trimIndent() file.get().asFile.writeText(content) diff --git a/ksoup-test/module.yaml b/ksoup-test/module.yaml index 3198a7f1..0da4a804 100644 --- a/ksoup-test/module.yaml +++ b/ksoup-test/module.yaml @@ -14,7 +14,6 @@ test-dependencies: - ../ksoup - $libs.korlibs.io - $libs.kotlinx.io - - $libs.codepoints - $libs.kotlinx.coroutines.test - $libs.kotlinx.datetime - $libs.stately.concurrency diff --git a/ksoup-test/test/com/fleeksoft/ksoup/System.kt b/ksoup-test/test/com/fleeksoft/ksoup/System.kt 
index e841f4be..2d973301 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/System.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/System.kt @@ -1,16 +1,11 @@ package com.fleeksoft.ksoup import kotlinx.datetime.Clock -import kotlin.time.Duration.Companion.seconds public class System { public companion object { - public fun nanoTime(): Long { - return Clock.System.now().epochSeconds.seconds.inWholeNanoseconds - } - public fun currentTimeMillis(): Long { - return Clock.System.now().epochSeconds.seconds.inWholeMilliseconds + return Clock.System.now().toEpochMilliseconds() } } } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt b/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt index 22e5556c..c325b9f1 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/TestHelper.kt @@ -1,7 +1,9 @@ package com.fleeksoft.ksoup import com.fleeksoft.ksoup.io.SourceReader +import com.fleeksoft.ksoup.ported.io.Charsets import com.fleeksoft.ksoup.ported.openSourceReader +import com.fleeksoft.ksoup.ported.toByteArray import korlibs.io.compression.deflate.GZIP import korlibs.io.compression.uncompress import korlibs.io.file.std.uniVfs @@ -69,10 +71,23 @@ object TestHelper { return bytes.uncompress(GZIP).openSourceReader() } + fun dataToStream( + data: String, + charset: String, + ): SourceReader { + if (BuildConfig.isLite) { + return data.encodeToByteArray().openSourceReader() + } + return data.toByteArray(Charsets.forName(charset)).openSourceReader() + } + fun isGzipSupported(): Boolean = BuildConfig.isKorlibs - fun isUtf16Supported(): Boolean = !((BuildConfig.isKotlinx || BuildConfig.isOkio || BuildConfig.isKtor2) && Platform.isJsOrWasm()) + fun isUtf16Supported(): Boolean = !(((BuildConfig.isKotlinx || BuildConfig.isOkio || BuildConfig.isKtor2) && Platform.isJsOrWasm()) || BuildConfig.isLite) fun isUtf32Supported(): Boolean = !(Platform.isJsOrWasm() || Platform.isWindows() || Platform.isLinux()) fun isEUCKRSupported(): Boolean = 
!(Platform.isJsOrWasm() || Platform.isApple() || Platform.isWindows() || (BuildConfig.isKorlibs && Platform.isLinux())) - fun isGB2312Supported(): Boolean = !(Platform.isApple() || Platform.isWindows() || ((BuildConfig.isKotlinx || BuildConfig.isOkio || BuildConfig.isKtor2) && Platform.isJsOrWasm()) || (BuildConfig.isKorlibs && Platform.isLinux())) - fun canReadResourceFile(): Boolean = !Platform.isWasmJs() || BuildConfig.isKorlibs + fun isGB2312Supported(): Boolean = !(BuildConfig.isLite || Platform.isApple() || Platform.isWindows() || ((BuildConfig.isKotlinx || BuildConfig.isOkio || BuildConfig.isKtor2) && Platform.isJsOrWasm()) || (BuildConfig.isKorlibs && Platform.isLinux())) + + fun canReadResourceFile(): Boolean = (!Platform.isWasmJs() || BuildConfig.isKorlibs) && !BuildConfig.isLite + + fun isFileSourceSupported(): Boolean = !BuildConfig.isLite } \ No newline at end of file diff --git a/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt index 0e98c27c..76a93eb1 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/helper/DataUtilTest.kt @@ -39,13 +39,6 @@ class DataUtilTest { assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'")) } - private fun dataToStream( - data: String, - charset: String, - ): SourceReader { - return data.toByteArray(Charsets.forName(charset)).openSourceReader() - } - @Test fun discardsSpuriousByteOrderMark() { val html = "\uFEFFOneTwo" @@ -138,7 +131,7 @@ class DataUtilTest { "" + "한국어" val doc: Document = DataUtil.parseInputSource( - sourceReader = dataToStream(data = html, charset = "euc-kr"), + sourceReader = TestHelper.dataToStream(data = html, charset = "euc-kr"), baseUri = "http://example.com", charsetName = null, parser = Parser.htmlParser(), @@ -152,15 +145,14 @@ class DataUtilTest { "" + "" + "Übergrößenträger" - val docByteArrayCharset: Document = - 
DataUtil.parseInputSource( - sourceReader = dataToStream(data = html, charset = "iso-8859-1"), - baseUri = "http://example.com", - charsetName = null, - parser = Parser.htmlParser(), - ) + val document = DataUtil.parseInputSource( + sourceReader = TestHelper.dataToStream(data = html, charset = "iso-8859-1"), + baseUri = "http://example.com", + charsetName = null, + parser = Parser.htmlParser(), + ) - assertEquals("Übergrößenträger", docByteArrayCharset.body().text()) + assertEquals("Übergrößenträger", document.body().text()) } @Test @@ -362,7 +354,7 @@ class DataUtilTest { @Test fun handlesUnlimitedRead() = runTest { val input: String = TestHelper.readResourceAsString("htmltests/large.html.gz") - val byteBuffer: ByteArray = DataUtil.readToByteBuffer(input.openSourceReader(), 0) + val byteBuffer: ByteArray = input.openSourceReader().readAllBytes() val read = byteBuffer.decodeToString() assertEquals(input, read) } diff --git a/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt index 18380c08..3595e7da 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/integration/ParseTest.kt @@ -1,8 +1,6 @@ package com.fleeksoft.ksoup.integration import com.fleeksoft.ksoup.Ksoup -import com.fleeksoft.ksoup.Ksoup.parse -import com.fleeksoft.ksoup.Ksoup.parseFile import com.fleeksoft.ksoup.TestHelper import com.fleeksoft.ksoup.nodes.Document import com.fleeksoft.ksoup.parser.Parser @@ -24,8 +22,9 @@ class ParseTest { return@runTest } // test that works - var input = TestHelper.getResourceAbsolutePath("htmltests/meta-charset-1.html") - var doc: Document = parseFile( + val resourceName = "htmltests/meta-charset-1.html" + var input = TestHelper.getResourceAbsolutePath(resourceName) + var doc = Ksoup.parseFile( filePath = input, baseUri = "http://example.com/", charsetName = null, @@ -38,7 +37,7 @@ class ParseTest { // double check, no charset, falls 
back to utf8 which is incorrect input = TestHelper.getResourceAbsolutePath("htmltests/meta-charset-2.html") // - doc = parseFile( + doc = Ksoup.parseFile( filePath = input, baseUri = "http://example.com", charsetName = null, @@ -48,12 +47,11 @@ class ParseTest { // confirm fallback to utf8 input = TestHelper.getResourceAbsolutePath("htmltests/meta-charset-3.html") - doc = - parseFile( - filePath = input, - baseUri = "http://example.com/", - charsetName = null, - ) // utf8, no charset + doc = Ksoup.parseFile( + filePath = input, + baseUri = "http://example.com/", + charsetName = null, + ) // utf8, no charset assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) assertEquals("新", doc.text()) } @@ -68,7 +66,7 @@ class ParseTest { """.trimIndent().openSourceReader() - val doc: Document = parse(sourceReader = input, baseUri = "http://example.com/", charsetName = null) + val doc: Document = Ksoup.parse(sourceReader = input, baseUri = "http://example.com/", charsetName = null) assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) } @@ -111,7 +109,7 @@ class ParseTest { // this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid refence, // and the parse tree is correct. 
val parser = Parser.htmlParser() - val doc = parse( + val doc = Ksoup.parse( sourceReader = TestHelper.resourceFilePathToStream("htmltests/xwiki-edit.html.gz"), baseUri = "https://localhost/", charsetName = "UTF-8", @@ -131,7 +129,7 @@ class ParseTest { @Test fun testWikiExpandedFromString() = runTest { val html = TestHelper.readResourceAsString("htmltests/xwiki-edit.html.gz") - val doc = parse(html) + val doc = Ksoup.parse(html) assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()) val wantHtml = "User Directory" diff --git a/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt index c5569d58..0120cba5 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/internal/StringUtilTest.kt @@ -129,7 +129,7 @@ class StringUtilTest { assertEquals("https://example.com/one", StringUtil.resolve("https://example.com/one", "")) assertEquals("https://example.com/one/two.c", StringUtil.resolve("https://example.com/one/two/", "../two.c")) assertEquals("https://example.com/two.c", StringUtil.resolve("https://example.com/one/two", "../two.c")) -// assertEquals("", StringUtil.resolve("wrong", "also wrong")) + assertEquals("", StringUtil.resolve("wrong", "also wrong")) assertEquals("ftp://example.com/one", StringUtil.resolve("ftp://example.com/two/", "../one")) assertEquals("ftp://example.com/one/two.c", StringUtil.resolve("ftp://example.com/one/", "./two.c")) assertEquals("ftp://example.com/one/two.c", StringUtil.resolve("ftp://example.com/one/", "two.c")) diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/AttributeTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/AttributeTest.kt index 33adb9f0..403da9ac 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/AttributeTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/AttributeTest.kt @@ -1,10 +1,9 @@ package com.fleeksoft.ksoup.nodes import 
com.fleeksoft.ksoup.Ksoup.parse -import com.fleeksoft.ksoup.TestHelper import com.fleeksoft.ksoup.parser.ParseSettings import com.fleeksoft.ksoup.parser.Parser -import de.cketti.codepoints.deluxe.toCodePoint +import com.fleeksoft.ksoup.ported.toCodePoint import kotlin.test.* class AttributeTest { diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/EntitiesTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/EntitiesTest.kt index 8b9c9367..62e59f0a 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/EntitiesTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/EntitiesTest.kt @@ -2,7 +2,7 @@ package com.fleeksoft.ksoup.nodes import com.fleeksoft.ksoup.Ksoup.parse import com.fleeksoft.ksoup.parser.Parser -import de.cketti.codepoints.deluxe.toCodePoint +import com.fleeksoft.ksoup.ported.toCodePoint import kotlin.test.Test import kotlin.test.assertEquals diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt index f8d8bd9d..fec42335 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/NodeTest.kt @@ -32,11 +32,7 @@ class NodeTest { assertEquals("", withBase.absUrl("noval")) val dodgyBase = Element(tag, "wtf://no-such-protocol/", attribs) assertEquals("http://bar/qux", dodgyBase.absUrl("absHref")) // base fails, but href good, so get that - if (BuildConfig.isKorlibs) { - assertEquals("wtf://no-such-protocol/foo", dodgyBase.absUrl("relHref")) // invalid protocol but still can be resolved - } else { - assertEquals("", dodgyBase.absUrl("relHref")) // base fails, only rel href, so return nothing - } + assertEquals("wtf://no-such-protocol/foo", dodgyBase.absUrl("relHref")) // invalid protocol but still can be resolved } @Test @@ -91,23 +87,18 @@ class NodeTest { @Test fun handleAbsOnFileUris() { - val doc = Ksoup.parse("One/a>Two", "file:///etc/") + val doc = Ksoup.parse("One/a>Two", "file:/etc/") val one = doc.select("a").first() - 
assertEquals("file:///etc/password", one!!.absUrl("href")) + assertEquals("file:/etc/password", one!!.absUrl("href")) val two = doc.select("a")[1] - if (BuildConfig.isKorlibs) { - assertEquals("file:///var/log/messages", two.absUrl("href")) - } else { - // fixme: in kotlinx its different behaviour - assertEquals("file://var/log/messages", two.absUrl("href")) - } + assertEquals("file:/var/log/messages", two.absUrl("href")) } @Test fun handleAbsOnLocalhostFileUris() { val doc = Ksoup.parse("One/a>Two", "file:///localhost/etc/") val one = doc.select("a").first()!! - assertEquals("file:///localhost/etc/password", one.absUrl("href")) + assertEquals("file://localhost/etc/password", one.absUrl("href")) } @Test diff --git a/ksoup-test/test/com/fleeksoft/ksoup/nodes/TextNodeTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/nodes/TextNodeTest.kt index 1b0d43df..adcbb273 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/nodes/TextNodeTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/nodes/TextNodeTest.kt @@ -1,11 +1,10 @@ package com.fleeksoft.ksoup.nodes import com.fleeksoft.ksoup.Ksoup -import com.fleeksoft.ksoup.TestHelper import com.fleeksoft.ksoup.TextUtil import com.fleeksoft.ksoup.internal.StringUtil import com.fleeksoft.ksoup.ported.exception.ValidationException -import de.cketti.codepoints.deluxe.toCodePoint +import com.fleeksoft.ksoup.ported.toCodePoint import kotlin.test.* /** diff --git a/ksoup-test/test/com/fleeksoft/ksoup/select/SelectorTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/select/SelectorTest.kt index 21b68c87..c702c056 100644 --- a/ksoup-test/test/com/fleeksoft/ksoup/select/SelectorTest.kt +++ b/ksoup-test/test/com/fleeksoft/ksoup/select/SelectorTest.kt @@ -5,7 +5,7 @@ import com.fleeksoft.ksoup.nodes.Document import com.fleeksoft.ksoup.nodes.Element import com.fleeksoft.ksoup.parser.Parser import com.fleeksoft.ksoup.ported.IdentityHashMap -import de.cketti.codepoints.deluxe.toCodePoint +import com.fleeksoft.ksoup.ported.toCodePoint import kotlin.test.* /** 
diff --git a/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/DataUtilTestJvm.kt b/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/DataUtilTestJvm.kt index 2c1bddac..94f28103 100644 --- a/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/DataUtilTestJvm.kt +++ b/ksoup-test/test@jvmAndAndroid/com/fleeksoft/ksoup/DataUtilTestJvm.kt @@ -156,7 +156,7 @@ class DataUtilTestJvm { val bytes: ByteArray = if (file.getName().endsWith(".gz")) { val stream: InputStream = GZIPInputStream(FileInputStream(file)) - val byteBuffer: ByteArray = DataUtil.readToByteBuffer(stream.toSourceReader(), 0) + val byteBuffer: ByteArray = stream.toSourceReader().readAllBytes() byteBuffer } else { file.readBytes() diff --git a/ksoup/build.gradle.kts b/ksoup/build.gradle.kts index bba959a8..e0b83f68 100644 --- a/ksoup/build.gradle.kts +++ b/ksoup/build.gradle.kts @@ -14,6 +14,10 @@ kotlin { commonMain { dependencies { when (libBuildType) { + "lite" -> { + api(project(":ksoup-engine-lite")) + } + "korlibs" -> { api(project(":ksoup-engine-korlibs")) } @@ -39,6 +43,7 @@ val artifactId = when (libBuildType) { "korlibs" -> "ksoup-korlibs" "okio" -> "ksoup-okio" "ktor2" -> "ksoup-ktor2" + "lite" -> "ksoup-lite" else -> "ksoup" } diff --git a/ksoup/module.yaml b/ksoup/module.yaml index a134e7f0..633380b1 100644 --- a/ksoup/module.yaml +++ b/ksoup/module.yaml @@ -12,7 +12,6 @@ repositories: dependencies: - ../ksoup-engine-common: exported - - $libs.codepoints - $libs.stately.concurrency test-dependencies: diff --git a/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt b/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt index 2ef4f5e0..e62af645 100644 --- a/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt +++ b/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt @@ -142,7 +142,12 @@ public object DataUtil { // need to re-decode. 
(case insensitive check here to match how validate works) foundCharset = foundCharset.trim { it <= ' ' }.replace("[\"']".toRegex(), "") effectiveCharsetName = foundCharset - doc = null +// if can't change charset don't try other + if (Charsets.isOnlyUtf8 && inputSource.exhausted()) { + inputSource.close() + } else { + doc = null + } } else if (inputSource.exhausted()) { // if we have read fully, and the charset was correct, keep that current parse inputSource.close() } else { @@ -184,27 +189,6 @@ public object DataUtil { return doc } - /** - * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this - * method is executing on. The data read until being interrupted will be available. - * @param sourceReader the input stream to read from - * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited. - * @return the filled byte buffer - */ - public fun readToByteBuffer( - sourceReader: SourceReader, - maxSize: Long, - ): ByteArray { - return if (maxSize == 0L) { - sourceReader.readAllBytes() - } else { -// todo:// check this sources may don't have any stream size -// val size = if (!bufferReader.exhausted()) minOf(maxSize, bufferReader.availableRead()) else maxSize - val size = maxSize - sourceReader.readBytes(size.toInt()) - } - } - /** * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default * will kick in.) 
diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt b/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt index 753d0e3b..df294190 100644 --- a/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt +++ b/ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt @@ -1,10 +1,6 @@ package com.fleeksoft.ksoup.internal -import com.fleeksoft.ksoup.ported.Character -import com.fleeksoft.ksoup.ported.resolveOrNull -import de.cketti.codepoints.deluxe.CodePoint -import de.cketti.codepoints.deluxe.appendCodePoint -import de.cketti.codepoints.deluxe.codePointAt +import com.fleeksoft.ksoup.ported.* import kotlin.math.min /** @@ -95,7 +91,7 @@ public object StringUtil { if (string.isNullOrEmpty()) return true val l = string.length for (i in 0 until l) { - if (!isWhitespace(string.codePointAt(i).value)) return false + if (!isWhitespace(string.codePointValueAt(i))) return false } return true } @@ -118,7 +114,7 @@ public object StringUtil { if (string.isNullOrEmpty()) return false val l = string.length for (i in 0 until l) { - if (!Character.isDigit(string.codePointAt(i))) return false + if (!Character.isDigit(string.codePointValueAt(i))) return false } return true } @@ -186,7 +182,7 @@ public object StringUtil { accum.append(' ') lastWasWhite = true } else if (!isInvisibleChar(c.value)) { - accum.appendCodePoint(c) + accum.appendCodePoint(c.value) lastWasWhite = false reachedNonWhite = true } @@ -231,7 +227,7 @@ public object StringUtil { // if access url is relative protocol then copy it val cleanedBaseUrl = stripControlChars(baseUrl) val cleanedRelUrl = stripControlChars(relUrl) - return cleanedBaseUrl.resolveOrNull(cleanedRelUrl) ?: "" + return URLUtil.resolve(base = cleanedBaseUrl, relative = cleanedRelUrl) } private val controlChars: Regex = Regex("[\\x00-\\x1f]*") // matches ascii 0 - 31, to strip from url diff --git a/ksoup/src/com/fleeksoft/ksoup/internal/URLUtil.kt b/ksoup/src/com/fleeksoft/ksoup/internal/URLUtil.kt new file mode 100644 index 00000000..d6e4a019 
--- /dev/null +++ b/ksoup/src/com/fleeksoft/ksoup/internal/URLUtil.kt @@ -0,0 +1,181 @@ +package com.fleeksoft.ksoup.internal + +import kotlin.math.min + +object URLUtil { + fun resolve(base: String, relative: String): String { + if (relative.isEmpty()) return base + + // If the relative URL is already absolute (has a scheme), return it + if (isAbsoluteUrl(relative)) { + return relative + } + + if (!isAbsoluteUrl(base)) { + // At least one absolute link required + return "" + } + + // Parse the base URL into components (scheme, authority, path, query, fragment) + val baseUrl = parseUrl(base) + + // Handle protocol-relative URLs (e.g. "//example.com/one") + if (relative.startsWith("//")) { + return baseUrl.scheme + ":" + relative + } + + // Handle fragment or query-relative URLs + if (relative.startsWith("?")) { + return "${baseUrl.scheme}:${baseUrl.schemeSeparator}${baseUrl.authority}${baseUrl.path}$relative" + } + if (relative.startsWith("#")) { + return "${baseUrl.scheme}:${baseUrl.schemeSeparator}${baseUrl.authority}${baseUrl.path}${baseUrl.query ?: ""}$relative" + } + + // If the relative URL starts with "/", it's an absolute path on the current authority + var resolvedPath = if (relative.startsWith("/")) { + relative + } else { + // If the base URL has a query or fragment, we need to strip it before merging paths + val cleanedBasePath = stripQueryAndFragment(baseUrl.path) + mergePaths(cleanedBasePath, relative) + } + + val relQueryIndex = resolvedPath.indexOf("?") + val relFragmentIndex = resolvedPath.indexOf("#") + + val queryOrFragmentIndex = if (relQueryIndex != -1 && relFragmentIndex != -1) { + min(relQueryIndex, relFragmentIndex) + } else if (relFragmentIndex != -1) { + relFragmentIndex + } else { + relQueryIndex + } + + val queryOrFragment = if (queryOrFragmentIndex != -1) { + val result = resolvedPath.substring(queryOrFragmentIndex) + resolvedPath = resolvedPath.substring(0, queryOrFragmentIndex) + result + } else null + + // Normalize the path to 
resolve ".." and "." + // add root slash to path only if authority is not empty + val normalizedPath = normalizePath(resolvedPath, addRoot = baseUrl.authority.isNotEmpty()).let { if (queryOrFragment != null) it + queryOrFragment else it } + +// val relativeFragment = relative.substringAfter('#', "") + + // Form the final URL with scheme, authority, path, query, and fragment + val finalUrl = StringBuilder() + finalUrl.append("${baseUrl.scheme}:${baseUrl.schemeSeparator}${baseUrl.authority}$normalizedPath") + + return finalUrl.toString() + } + + private fun isAbsoluteUrl(url: String): Boolean { + return url.length > 2 && url.contains(":") + } + + private fun mergePaths(basePath: String, relativePath: String): String { + val baseDir = if (basePath.endsWith("/")) basePath else basePath.substring(0, basePath.lastIndexOf('/') + 1) + return baseDir + relativePath + } + + private fun normalizePath(path: String, addRoot: Boolean = true): String { + val segments = path.split("/").toMutableList() + val result = mutableListOf() + + segments.forEachIndexed { index, segment -> + when { + segment.isEmpty() || segment == "." -> { + // if its last part and . then append / example: .com/b/c/d + ./g/. = .com/b/c/d/g/ + if (index == segments.size - 1) { + result.add("") + } + } + + segment == ".." -> { + // Go up a directory (pop last segment) + if (result.isNotEmpty()) { + result.removeAt(result.size - 1) + } + } + + else -> { + result.add(segment) + } + } + } + + return (if (addRoot) "/" else "") + result.joinToString("/") + } + + private fun stripQueryAndFragment(path: String): String { + val queryIndex = path.indexOf('?') + val fragmentIndex = path.indexOf('#') + return when { + queryIndex != -1 -> path.substring(0, queryIndex) + fragmentIndex != -1 -> path.substring(0, fragmentIndex) + else -> path + } + } + + private data class ParsedUrl( + val scheme: String, + val schemeSeparator: String, + val authority: String, + val path: String, + val query: String? 
= null, + val fragment: String? = null + ) + + private fun parseUrl(url: String): ParsedUrl { + var remainingUrl = url + val scheme: String + val schemeSeparator: String + val schemeEndIndex = url.indexOf(":") + if (schemeEndIndex != -1) { + schemeSeparator = if (url.indexOf("://") != -1) { + "//" + } else if (url.indexOf(":/") != -1) { + "/" + } else { + "" + } + scheme = url.substring(0, schemeEndIndex) + remainingUrl = url.substring(schemeEndIndex + schemeSeparator.length + 1) + } else { + // If no scheme, default to "http" or you can adjust it to defaultScheme + scheme = "https" + schemeSeparator = "//" + } + + val authorityEndIndex = if (schemeSeparator != "/") { + remainingUrl.indexOf('/').takeIf { it != -1 } ?: remainingUrl.indexOf('?').takeIf { it != -1 } ?: remainingUrl.indexOf('#') + .takeIf { it != -1 } ?: remainingUrl.length + } else { + // file paths + -1 + } + + val authority = if (authorityEndIndex != -1) remainingUrl.substring(0, authorityEndIndex) else null + val pathAndMore = if (authorityEndIndex == -1) remainingUrl else remainingUrl.substring(authorityEndIndex) + val pathEndIndex = pathAndMore.indexOfAny(charArrayOf('?', '#')).takeIf { it != -1 } ?: pathAndMore.length + val path = pathAndMore.substring(0, pathEndIndex) + + val queryStartIndex = pathAndMore.indexOf('?').takeIf { it != -1 } ?: pathAndMore.length + val fragmentStartIndex = pathAndMore.indexOf('#').takeIf { it != -1 } ?: pathAndMore.length + + val query = if (queryStartIndex != pathAndMore.length) pathAndMore.substring(queryStartIndex, fragmentStartIndex) else null + val fragment = if (fragmentStartIndex != pathAndMore.length) pathAndMore.substring(fragmentStartIndex) else null + + return ParsedUrl( + scheme = scheme, + schemeSeparator = schemeSeparator, + authority = authority ?: "", + path = path, + query = query, + fragment = fragment + ) + } + +} \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt 
b/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt index 72a6ab68..51bb3230 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt @@ -11,13 +11,11 @@ import com.fleeksoft.ksoup.nodes.Entities.EscapeMode.base import com.fleeksoft.ksoup.nodes.Entities.EscapeMode.extended import com.fleeksoft.ksoup.parser.CharacterReader import com.fleeksoft.ksoup.parser.Parser +import com.fleeksoft.ksoup.ported.* import com.fleeksoft.ksoup.ported.Character -import com.fleeksoft.ksoup.ported.ThreadLocal import com.fleeksoft.ksoup.ported.exception.IOException import com.fleeksoft.ksoup.ported.exception.SerializationException import com.fleeksoft.ksoup.ported.io.Charsets -import de.cketti.codepoints.deluxe.CodePoint -import de.cketti.codepoints.deluxe.codePointAt /** @@ -83,8 +81,8 @@ public object Entities { ): Int { val value: String? = multipoints[name] if (value != null) { - codepoints[0] = value.codePointAt(0).value - codepoints[1] = value.codePointAt(1).value + codepoints[0] = value.codePointValueAt(0) + codepoints[1] = value.codePointValueAt(1) return 2 } val codepoint = extended.codepointForName(name) diff --git a/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt b/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt index 67f4b17b..4ff4f16e 100644 --- a/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt +++ b/ksoup/src/com/fleeksoft/ksoup/nodes/Node.kt @@ -767,8 +767,7 @@ public abstract class Node protected constructor() : KCloneable { depth: Int, out: Document.OutputSettings, ) { - accum.append('\n') - .append(StringUtil.padding(depth * out.indentAmount(), out.maxPaddingWidth())) + accum.append('\n').append(StringUtil.padding(depth * out.indentAmount(), out.maxPaddingWidth())) } /** diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt b/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt index b15702dd..9d6854b0 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt +++ 
b/ksoup/src/com/fleeksoft/ksoup/parser/CharacterReader.kt @@ -623,7 +623,7 @@ public class CharacterReader { } override fun toString(): String { - return if (bufLength - bufPos < 0) { + return if (charBuf == null || bufLength - bufPos < 0) { "" } else { String.buildString( diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt b/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt index 1c23b651..06d15b02 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/Token.kt @@ -6,8 +6,8 @@ import com.fleeksoft.ksoup.internal.SharedConstants import com.fleeksoft.ksoup.nodes.Attributes import com.fleeksoft.ksoup.nodes.Range import com.fleeksoft.ksoup.ported.KCloneable +import com.fleeksoft.ksoup.ported.appendCodePoint import com.fleeksoft.ksoup.ported.assert -import de.cketti.codepoints.appendCodePoint /** * Parse tokens for the Tokeniser. diff --git a/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt b/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt index b4f5ba32..942e0398 100644 --- a/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt +++ b/ksoup/src/com/fleeksoft/ksoup/parser/Tokeniser.kt @@ -3,8 +3,8 @@ package com.fleeksoft.ksoup.parser import com.fleeksoft.ksoup.helper.Validate import com.fleeksoft.ksoup.internal.StringUtil import com.fleeksoft.ksoup.nodes.Entities +import com.fleeksoft.ksoup.ported.appendCodePoint import com.fleeksoft.ksoup.ported.codePointsToString -import de.cketti.codepoints.appendCodePoint /** * Readers the input stream into tokens. 
diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/Character.kt b/ksoup/src/com/fleeksoft/ksoup/ported/Character.kt index 4ca59f4a..a50f4e8b 100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/Character.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/Character.kt @@ -1,18 +1,111 @@ package com.fleeksoft.ksoup.ported -import de.cketti.codepoints.deluxe.CodePoint +internal object Character { + val MIN_SUPPLEMENTARY_CODE_POINT: Int = 0x010000 + const val MIN_HIGH_SURROGATE: Char = '\uD800' + const val MIN_LOW_SURROGATE: Char = '\uDC00' + const val MAX_LOW_SURROGATE: Char = '\uDFFF' + const val MIN_SURROGATE: Char = MIN_HIGH_SURROGATE + val MAX_SURROGATE: Char = MAX_LOW_SURROGATE + const val MAX_CODE_POINT: Int = 0X10FFFF -internal class Character { - companion object { - fun isDigit(codePoint: CodePoint): Boolean { - return codePoint.value.toChar().isDigit() + + fun toCodePoint(high: Char, low: Char): Int { + return ((high.code shl 10) + low.code) + (MIN_SUPPLEMENTARY_CODE_POINT - (MIN_HIGH_SURROGATE.code shl 10) - MIN_LOW_SURROGATE.code) + } + + fun isDigit(codePoint: CodePoint): Boolean { + return isDigit(codePoint.value) // delegate so both overloads share the BMP guard in isDigit(Int) + } + + fun isDigit(codePointValue: Int): Boolean { + return isBmpCodePoint(codePointValue) && codePointValue.toChar().isDigit() // toChar() keeps only the low 16 bits, so a non-BMP value (e.g. 0x10030) must be rejected before it is truncated to '0' + } + + fun isValidCodePoint(codePoint: Int): Boolean { + val plane = codePoint ushr 16 + return plane < ((MAX_CODE_POINT + 1) ushr 16) + } + + fun isBmpCodePoint(codePoint: Int): Boolean { + return codePoint ushr 16 == 0 + } + + fun highSurrogate(codePoint: Int): Char { + return ((codePoint ushr 10) + (MIN_HIGH_SURROGATE.code - (MIN_SUPPLEMENTARY_CODE_POINT ushr 10))).toChar() + } + + fun lowSurrogate(codePoint: Int): Char { + return ((codePoint and 0x3ff) + MIN_LOW_SURROGATE.code).toChar() + } + + fun toSurrogates(codePoint: Int, dst: CharArray, index: Int) { + // We write elements "backwards" to guarantee all-or-nothing + dst[index + 1] = lowSurrogate(codePoint) + dst[index] = highSurrogate(codePoint) + } + + fun toChars(codePoint: Int): CharArray
{ + return when { + isBmpCodePoint(codePoint) -> { + charArrayOf(codePoint.toChar()) + } + + isValidCodePoint(codePoint) -> { + val result = CharArray(2) + toSurrogates(codePoint, result, 0) + result + } + + else -> throw IllegalArgumentException( + "Not a valid Unicode code point: 0x${codePoint.toString(16).uppercase()}" + ) + } + } + + fun toChars(codePoint: Int, dst: CharArray, dstIndex: Int): Int { + return when { + isBmpCodePoint(codePoint) -> { + dst[dstIndex] = codePoint.toChar() + 1 + } + + isValidCodePoint(codePoint) -> { + toSurrogates(codePoint, dst, dstIndex) + 2 + } + + else -> throw IllegalArgumentException( + "Not a valid Unicode code point: 0x${codePoint.toString(16).uppercase()}" + ) } + } - val MIN_SUPPLEMENTARY_CODE_POINT: Int = 0x010000 - const val MIN_HIGH_SURROGATE: Char = '\uD800' - const val MAX_LOW_SURROGATE: Char = '\uDFFF' - const val MIN_SURROGATE: Char = MIN_HIGH_SURROGATE - val MAX_SURROGATE: Char = MAX_LOW_SURROGATE +} + +fun CharSequence.codePointValueAt(index: Int): Int { + if (index !in indices) throw IndexOutOfBoundsException() + val firstChar = this[index] + if (firstChar.isHighSurrogate() && index + 1 < length) { + val nextChar = this[index + 1] + if (nextChar.isLowSurrogate()) { + return Character.toCodePoint(firstChar, nextChar) + } } + + return firstChar.code // BMP char, or an unpaired surrogate returned as its own char code } + +fun CharSequence.codePointAt(index: Int): CodePoint { + return this.codePointValueAt(index).toCodePoint() +} + +fun <T : Appendable> T.appendCodePoint(codePoint: Int): T = apply { + if (Character.isBmpCodePoint(codePoint)) { + append(codePoint.toChar()) + } else { + append(Character.highSurrogate(codePoint)) + append(Character.lowSurrogate(codePoint)) + } +} \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/CodePoint.kt b/ksoup/src/com/fleeksoft/ksoup/ported/CodePoint.kt new file mode 100644 index 00000000..2a97b5af --- /dev/null +++ b/ksoup/src/com/fleeksoft/ksoup/ported/CodePoint.kt @@ -0,0 +1,78 @@ +package com.fleeksoft.ksoup.ported + +import
kotlin.jvm.JvmInline + +/** + * Represents a Unicode code point. + * + * You can create/retrieve instances of this class by using the following functions: + * - [Int.toCodePoint] + * - [Char.toCodePoint] + */ +@JvmInline +value class CodePoint internal constructor(val value: Int) { + val charCount: Int // number of UTF-16 chars needed for this code point: 2 at or above MIN_SUPPLEMENTARY_CODE_POINT, otherwise 1 + get() = if (value >= Character.MIN_SUPPLEMENTARY_CODE_POINT) 2 else 1 + + init { // validates every instance; require() throws IllegalArgumentException when the check fails + require(Character.isValidCodePoint(value)) { "Not a valid code point" } + } + + /** + * Converts this Unicode code point to its UTF-16 representation stored in a char array. + * + * If this code point is a BMP (Basic Multilingual Plane or Plane 0) value, the resulting char array has the same + * value as [value]. If the specified code point is a supplementary code point, the resulting char array has the + * corresponding surrogate pair. + */ + fun toChars(): CharArray { + return Character.toChars(value) + } + + /** + * Converts this Unicode code point to its UTF-16 representation. + * + * If this code point is a BMP (Basic Multilingual Plane or Plane 0) value, the same value is stored in + * `destination[offset]`, and 1 is returned. If this code point is a supplementary character, its surrogate values + * are stored in `destination[offset]` (high-surrogate) and `destination[offset+1]` (low-surrogate), and 2 is + * returned. + */ + fun toChars(destination: CharArray, offset: Int): Int { + return Character.toChars(value, destination, offset) + } + + /** + * Returns the standard Unicode notation of this code point. + * + * "U+" followed by the code point value in hexadecimal (using upper case letters), which is prepended with leading + * zeros to a minimum of four digits. + */ + fun toUnicodeNotation(): String { + return "U+${value.toString(16).uppercase().padStart(4, '0')}" + } + + /** + * Returns the string representation of this code point. + * + * The returned string consists of the sequence of characters returned by [toChars].
+ */ + override fun toString(): String { + return toChars().concatToString() + } +} + +/** + * Returns a [CodePoint] with this value. + * + * Throws [IllegalArgumentException] if this value falls outside the range of valid code points. + */ +fun Int.toCodePoint(): CodePoint { + return CodePoint(this) +} + +/** + * Returns a [CodePoint] with the same value as this `Char`; a surrogate `Char` is not combined with a neighbour and keeps its own value. + */ +fun Char.toCodePoint(): CodePoint { + return CodePoint(this.code) +} \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/CoreExtensions.kt b/ksoup/src/com/fleeksoft/ksoup/ported/CoreExtensions.kt index efabf4d2..ef911f59 100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/CoreExtensions.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/CoreExtensions.kt @@ -1,7 +1,6 @@ package com.fleeksoft.ksoup.ported import com.fleeksoft.ksoup.ported.io.Charsets -import de.cketti.codepoints.appendCodePoint internal fun String.isCharsetSupported(): Boolean { val result = runCatching { Charsets.forName(this) }.getOrNull() diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt b/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt index df2f70c1..a06ff458 100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/KsoupExt.kt @@ -5,19 +5,18 @@ import com.fleeksoft.ksoup.internal.SharedConstants import com.fleeksoft.ksoup.io.Charset import com.fleeksoft.ksoup.io.FileSource import com.fleeksoft.ksoup.io.SourceReader -import com.fleeksoft.ksoup.ported.io.* +import com.fleeksoft.ksoup.io.from +import com.fleeksoft.ksoup.ported.io.BufferedReader +import com.fleeksoft.ksoup.ported.io.Charsets +import com.fleeksoft.ksoup.ported.io.InputSourceReader +import com.fleeksoft.ksoup.ported.io.Reader -fun String.openSourceReader(charset: Charset? = null): SourceReader = - KsoupEngineInstance.ksoupEngine.openSourceReader(content = this, charset = charset) +fun String.openSourceReader(charset: Charset?
= null): SourceReader = SourceReader.from(charset?.toByteArray(this) ?: this.encodeToByteArray()) -fun ByteArray.openSourceReader(): SourceReader = KsoupEngineInstance.ksoupEngine.openSourceReader(byteArray = this) +fun ByteArray.openSourceReader(): SourceReader = SourceReader.from(this) fun SourceReader.toReader(charset: Charset = Charsets.UTF8, chunkSize: Int = SharedConstants.DefaultBufferSize): Reader = BufferedReader(InputSourceReader(this, charset = charset), chunkSize) -fun String.toReader(): StringReader = StringReader(this) - -fun String.resolveOrNull(access: String): String? = KsoupEngineInstance.ksoupEngine.urlResolveOrNull(base = this, relUrl = access) - fun String.toByteArray(charset: Charset? = null): ByteArray = charset?.toByteArray(this) ?: this.encodeToByteArray() fun String.toSourceFile(): FileSource = KsoupEngineInstance.ksoupEngine.pathToFileSource(this) \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/io/Charsets.kt b/ksoup/src/com/fleeksoft/ksoup/ported/io/Charsets.kt index a2dc3d5d..1c5b0753 100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/io/Charsets.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/io/Charsets.kt @@ -7,4 +7,6 @@ object Charsets { val UTF8: Charset = KsoupEngineInstance.ksoupEngine.getUtf8Charset() fun forName(name: String): Charset = KsoupEngineInstance.ksoupEngine.charsetForName(name) + + val isOnlyUtf8 = UTF8.onlyUtf8() } \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/ported/io/StreamDecoder.kt b/ksoup/src/com/fleeksoft/ksoup/ported/io/StreamDecoder.kt index 2f6326ba..f00573b9 100644 --- a/ksoup/src/com/fleeksoft/ksoup/ported/io/StreamDecoder.kt +++ b/ksoup/src/com/fleeksoft/ksoup/ported/io/StreamDecoder.kt @@ -148,43 +148,6 @@ class StreamDecoder(source: SourceReader, charset: Charset) : Reader() { text.toCharArray().copyInto(cbuf, off) return text.length - - /*var eof = false - while (true) { - val read = decoder!!.decode(bb!!, cb, length) - if (read <= length) { -// 
underflow - if (eof) break - if (!cb.hasRemaining()) break - if ((cb.position() > 0) && !inReady()) break // Block at most once - - val n = readBytes() - if (n < 0) { - eof = true - if ((cb.position() == 0) && (bb!!.exhausted())) break - } - continue - } - if (bb!!.remaining > 0 && read == length) { -// overflow - require(cb.position() > 0) - break - } - throw Exception("error decoding stream") - } - - *//*if (eof) { - // ## Need to flush decoder - decoder.reset() - }*//* - - if (cb.position() == 0) { - if (eof) { - return -1 - } - require(false) - } - return cb.position()*/ } fun encodingName(): String { diff --git a/publishToMaven.sh b/publishToMaven.sh index 35fe2a1c..6d3bce85 100755 --- a/publishToMaven.sh +++ b/publishToMaven.sh @@ -16,7 +16,7 @@ if [ "$1" == "--remote" ]; then fi # Default build types if none are passed -default_build_types=("common" "kotlinx" "korlibs" "ktor2" "okio") +default_build_types=("common" "lite" "korlibs" "kotlinx" "okio" "ktor2") # If build types are passed, use them; otherwise, use the default list if [ "$#" -ge 1 ]; then @@ -32,6 +32,9 @@ add_projects_based_on_key() { "common") projects=("ksoup-engine-common") ;; + "lite") + projects=("ksoup-engine-lite" "ksoup") + ;; "kotlinx") projects=("ksoup-engine-kotlinx" "ksoup" "ksoup-network") ;; @@ -106,7 +109,7 @@ for buildType in "${build_types[@]}"; do safe_remove_dir ".gradle" safe_remove_dir "kotlin-js-store" - if [ "$ADD_WASM" = true ] && [[ "$buildType" == "kotlinx" || "$buildType" == "korlibs" ]]; then + if [ "$ADD_WASM" = true ] && [[ "$buildType" != "ktor2" && "$buildType" != "okio" ]]; then echo "check and add wasm to projects" for projectName in "${projects[@]}"; do add_wasm_platform "$projectName" diff --git a/runTests.sh b/runTests.sh index 279b388e..9ab078d8 100755 --- a/runTests.sh +++ b/runTests.sh @@ -70,8 +70,8 @@ run_tests() { echo "Running tests with libBuildType=$libBuildType and tasks=${tasks[*]}..." 
- # Only add/remove wasm for kotlinx and korlibs - if [[ "$libBuildType" == "kotlinx" || "$libBuildType" == "korlibs" ]]; then + # Add wasm for every build type except ktor2 and okio + if [[ "$libBuildType" != "ktor2" && "$libBuildType" != "okio" ]]; then add_wasm_platform fi @@ -98,7 +98,7 @@ run_tests() { } # Supported parameters -SUPPORTED_PARAMS=("korlibs" "okio" "kotlinx" "ktor2") +SUPPORTED_PARAMS=("lite" "korlibs" "okio" "kotlinx" "ktor2") # Function to check if the provided parameter is supported is_supported_param() { diff --git a/settings.gradle.kts b/settings.gradle.kts index 4aa2e4f6..361ee904 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -24,6 +24,11 @@ dependencyResolutionManagement { val libBuildType = settings.providers.gradleProperty("libBuildType").get() include("ksoup-engine-common") + +if (libBuildType == "lite" || libBuildType == "dev") { + include("ksoup-engine-lite") +} + if (libBuildType == "korlibs" || libBuildType == "dev") { include("ksoup-engine-korlibs", "ksoup-network-korlibs") }