Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance #83

Merged
merged 6 commits into from
Sep 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ In this example, `Ksoup.parseGetRequest` fetches and parses HTML content from Wi
#### For further documentation, please check here: [Jsoup](https://jsoup.org/)

### Ksoup vs. Jsoup Performance: Parsing & Selecting 448KB HTML File [test.txt](https://github.com/fleeksoft/ksoup/blob/develop/ksoup-test/testResources/test.txt)
![Ksoup vs Jsoup](performance1.png)

![Ksoup vs Jsoup](performance.png)

## Open source
Expand Down
6 changes: 5 additions & 1 deletion gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ ktor = "3.0.0-rc-1"
ktor2 = "2.3.12"
coroutines = "1.8.1"
kotlinxDatetime = "0.6.1"
kotlinx-io = "0.5.3"
kotlinx-io = "0.5.4"
okio = "3.9.0"
dokka = "1.9.20"
kotlinx-benchmark = "0.4.12"

#korlibs = "999.0.0.999" # 999.0.0.999 is local version
korlibs = "6.0.1"
Expand Down Expand Up @@ -48,10 +49,13 @@ stately-concurrency = { module = "co.touchlab:stately-concurrency", version.ref
jsoup = { module = "org.jsoup:jsoup", version.ref = "jsoup" }
okio = { module = "com.squareup.okio:okio", version.ref = "okio" }
okio-nodefilesystem = { module = "com.squareup.okio:okio-nodefilesystem", version.ref = "okio" }
kotlinx-benchmark-runtime = { module = "org.jetbrains.kotlinx:kotlinx-benchmark-runtime", version.ref = "kotlinx-benchmark" }

[plugins]
androidLibrary = { id = "com.android.library", version.ref = "agp" }
kmp = { id = "org.jetbrains.kotlin.multiplatform", version.ref = "kotlin" }
dokka = { id = "org.jetbrains.dokka", version.ref = "dokka" }
power-assert = { id = "org.jetbrains.kotlin.plugin.power-assert", version.ref = "kotlin" }
mavenPublish = { id = "com.vanniktech.maven.publish", version.ref = "mavenPublish" }
kotlinx-benchmark = { id = "org.jetbrains.kotlinx.benchmark", version.ref = "kotlinx-benchmark" }
allopen = { id = "org.jetbrains.kotlin.plugin.allopen", version.ref = "kotlin" }
23 changes: 23 additions & 0 deletions ksoup-benchmark/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Build script for the ksoup-benchmark module: applies the kotlinx-benchmark and all-open
// plugins (both resolved from the version catalog) and registers the JVM benchmark target.
plugins {
alias(libs.plugins.kotlinx.benchmark)
alias(libs.plugins.allopen)
}

// Classes annotated with JMH's @State are compiled as open; Kotlin classes are final by default.
// NOTE(review): presumably needed because JMH's generated code subclasses state classes — confirm
// against the kotlinx-benchmark setup guide.
allOpen {
annotation("org.openjdk.jmh.annotations.State")
}


benchmark {
// Only the "jvm" target is registered for benchmarking in this script.
targets {
register("jvm")
}

configurations {
named("main") {
// Uncomment one of the excludes below to leave that benchmark class out of the "main" run.
// exclude("org.jsoup.parser.JsoupBenchmark")
// exclude("com.fleeksoft.ksoup.benchmark.KsoupBenchmark")
}
}

}
24 changes: 24 additions & 0 deletions ksoup-benchmark/module.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Module definition for ksoup-benchmark: a library product declared for the platforms listed below.
product:
type: lib
platforms: [ jvm, js, android, linuxX64, linuxArm64, tvosArm64, tvosX64, tvosSimulatorArm64, macosX64, macosArm64, iosArm64, iosSimulatorArm64, iosX64, mingwX64 ]

# Settings shared across modules come from the common template one directory up.
apply: [ ../common.module-template.yaml ]

# Platform group name usable as a qualifier for the jvm and android platforms together.
aliases:
- jvmAndAndroid: [ jvm, android ]

repositories:
- mavenLocal

# Dependencies for every platform: kotlinx-io, the benchmark runtime, and the sibling ksoup module.
dependencies:
- $libs.kotlinx.io
- $libs.kotlinx.benchmark.runtime
# Published ksoup-lite artifact, left disabled in favour of the local ../ksoup module below.
# - com.fleeksoft.ksoup:ksoup-lite:0.1.8
- ../ksoup

# jsoup is added for the jvm platform only.
dependencies@jvm:
- $libs.jsoup

settings:
kotlin:
optIns: [ kotlinx.cinterop.BetaInteropApi, kotlinx.cinterop.UnsafeNumber, kotlinx.cinterop.ExperimentalForeignApi, kotlin.experimental.ExperimentalNativeApi ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.fleeksoft.ksoup.benchmark

import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.nodes.Element
import com.fleeksoft.ksoup.select.Elements
import com.fleeksoft.ksoup.select.Evaluator
import kotlinx.benchmark.*
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import kotlinx.io.readString


/**
 * Benchmarks Ksoup against the `test.txt` HTML fixture.
 *
 * [parse] measures parsing alone; [select] measures parsing followed by a class lookup and
 * extraction of link ids. Both return their result so the benchmark harness can consume it;
 * a result that is computed and then dropped may be optimized away, which would distort the
 * measurement.
 */
@State(Scope.Benchmark)
@Warmup(iterations = 5)
@Measurement(iterations = 5, time = 1, timeUnit = BenchmarkTimeUnit.SECONDS)
class KsoupBenchmark {
    private lateinit var fileData: String

    // Parsed once during setup; not read by any benchmark in this class.
    private lateinit var doc1: Document

    /**
     * Loads the HTML fixture into memory once, before any measurement, so file I/O is never
     * part of the measured time.
     */
    @Setup
    fun setUp() {
        // NOTE(review): this is an absolute path into one developer's checkout, so the benchmark
        // only runs on that machine. Replace it with a project-relative or configurable location.
        fileData = SystemFileSystem
            .source(Path("/Users/sabeeh/IdeaProjects/ksoup-benchmark/ksoup-test/testResources/test.txt"))
            .buffered()
            // `use` closes the underlying file source even if reading throws.
            .use { source -> source.readString() }
        doc1 = parseHtml()
    }

    /**
     * Parses the whole fixture.
     *
     * @return the parsed document, returned so the parse cannot be treated as dead code.
     */
    @Benchmark
    fun parse(): Document = parseHtml()

    /**
     * Parses the fixture, then collects the id that follows `/Home/Bangumi/` in the `href` of
     * the parent of every element with class `an-info`.
     *
     * Elements without a parent, with an empty `href`, or whose `href` lacks the marker (or has
     * only blank text after it) contribute nothing.
     *
     * @return the extracted ids in document order.
     */
    @Benchmark
    fun select(): List<String> =
        parseHtml().getElementsByClass("an-info").mapNotNull { anInfo ->
            anInfo.parent()
                ?.attr("href")
                // With "" as the missing-delimiter value, an empty or non-matching href becomes ""
                // and is dropped by the blank check below.
                ?.substringAfter("/Home/Bangumi/", "")
                ?.takeIf { it.isNotBlank() }
        }

    private fun parseHtml(): Document = Ksoup.parse(fileData)
}
45 changes: 45 additions & 0 deletions ksoup-benchmark/src@jvm/org/jsoup/parser/JsoupBenchmark.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.jsoup.parser

import kotlinx.benchmark.*
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import kotlinx.io.readString
import org.jsoup.Jsoup
import org.jsoup.nodes.Document

/**
 * Benchmarks Jsoup against the `test.txt` HTML fixture; the JVM-only counterpart of the Ksoup
 * benchmark, running the same workload so the two libraries can be compared.
 *
 * [parse] measures parsing alone; [select] measures parsing followed by a class lookup and
 * extraction of link ids. Both return their result so the benchmark harness can consume it;
 * a result that is computed and then dropped may be optimized away, which would distort the
 * measurement.
 */
@State(Scope.Benchmark)
@Warmup(iterations = 5)
@Measurement(iterations = 5, time = 1, timeUnit = BenchmarkTimeUnit.SECONDS)
class JsoupBenchmark {
    private lateinit var fileData: String

    // Parsed once during setup; not read by any benchmark in this class.
    private lateinit var doc1: Document

    /**
     * Loads the HTML fixture into memory once, before any measurement, so file I/O is never
     * part of the measured time.
     */
    @Setup
    fun setUp() {
        // NOTE(review): this is an absolute path into one developer's checkout, so the benchmark
        // only runs on that machine. Replace it with a project-relative or configurable location.
        fileData = SystemFileSystem
            .source(Path("/Users/sabeeh/IdeaProjects/ksoup-benchmark/ksoup-test/testResources/test.txt"))
            .buffered()
            // `use` closes the underlying file source even if reading throws.
            .use { source -> source.readString() }
        doc1 = parseHtml()
    }

    /**
     * Parses the whole fixture.
     *
     * @return the parsed document, returned so the parse cannot be treated as dead code.
     */
    @Benchmark
    fun parse(): Document = parseHtml()

    /**
     * Parses the fixture, then collects the id that follows `/Home/Bangumi/` in the `href` of
     * the parent of every element with class `an-info`.
     *
     * Elements without a parent, with an empty `href`, or whose `href` lacks the marker (or has
     * only blank text after it) contribute nothing.
     *
     * @return the extracted ids in document order.
     */
    @Benchmark
    fun select(): List<String> =
        parseHtml().getElementsByClass("an-info").mapNotNull { anInfo ->
            anInfo.parent()
                ?.attr("href")
                // With "" as the missing-delimiter value, an empty or non-matching href becomes ""
                // and is dropped by the blank check below.
                ?.substringAfter("/Home/Bangumi/", "")
                ?.takeIf { it.isNotBlank() }
        }

    private fun parseHtml(): Document = Jsoup.parse(fileData)
}
2 changes: 1 addition & 1 deletion ksoup-engine-common/src/com/fleeksoft/ksoup/io/Charset.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ interface Charset {
}.isSuccess*/
}

fun decode(stringBuilder: StringBuilder, byteArray: ByteArray, start: Int, end: Int): Int
fun decode(stringBuilder: StringBuilder, byteArray: ByteArray, start: Int, end: Int = byteArray.size): Int
fun toByteArray(value: String): ByteArray

fun onlyUtf8(): Boolean = false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ class PerformanceComparisonTest {
val jsoupParseTimes = mutableListOf<Long>()
val jsoupSelectTimes = mutableListOf<Long>()

// warmup
repeat(10) {
ksoupTest(ksoupParseTimes, ksoupSelectTimes)
jsoupTest(jsoupParseTimes, jsoupSelectTimes)
}

ksoupParseTimes.clear()
ksoupSelectTimes.clear()
jsoupParseTimes.clear()
jsoupSelectTimes.clear()

// Perform multiple tests
repeat(30) {
ksoupTest(ksoupParseTimes, ksoupSelectTimes)
Expand Down
2 changes: 1 addition & 1 deletion ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ public object StringUtil {
}

public fun inSorted(needle: String, haystack: Array<out String>): Boolean {
return haystack.toList().binarySearch(needle) >= 0
return haystack.binarySearch(needle) >= 0
}

/**
Expand Down
3 changes: 2 additions & 1 deletion ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import com.fleeksoft.ksoup.helper.Validate
import com.fleeksoft.ksoup.internal.StringUtil
import com.fleeksoft.ksoup.nodes.Document.OutputSettings.Syntax
import com.fleeksoft.ksoup.ported.KCloneable
import com.fleeksoft.ksoup.ported.binarySearchBy
import com.fleeksoft.ksoup.ported.exception.IOException
import com.fleeksoft.ksoup.ported.exception.SerializationException

Expand Down Expand Up @@ -351,7 +352,7 @@ public open class Attribute : Map.Entry<String, String?>, KCloneable<Attribute>
* Checks if this attribute name is defined as a boolean attribute in HTML5
*/
public fun isBooleanAttribute(key: String): Boolean {
return booleanAttributes.toList().binarySearch { it.compareTo(key.lowercase()) } >= 0
return booleanAttributes.binarySearchBy { it.compareTo(key.lowercase()) } >= 0
}
}
}
3 changes: 0 additions & 3 deletions ksoup/src/com/fleeksoft/ksoup/nodes/Element.kt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import com.fleeksoft.ksoup.select.*
import kotlin.js.JsName
import kotlin.jvm.JvmOverloads
import kotlin.reflect.KClass
import kotlin.reflect.cast

/**
* An HTML Element consists of a tag name, attributes, and child nodes (including text nodes and other elements).
Expand Down Expand Up @@ -398,8 +397,6 @@ public open class Element : Node {

private inline fun <reified T : Any> filterNodes(clazz: KClass<T>): List<T> {
return _childNodes.filterIsInstance<T>()
.map { clazz.cast(it) }
.toList()
}

/**
Expand Down
6 changes: 3 additions & 3 deletions ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public object Entities {
if (value != null) return value
val codepoint = extended.codepointForName(name)
return if (codepoint != empty) {
charArrayOf(codepoint.toChar()).concatToString()
codepoint.toChar().toString()
} else {
emptyName
}
Expand Down Expand Up @@ -420,12 +420,12 @@ public object Entities {
}

public fun codepointForName(name: String): Int {
val index: Int = nameKeys.toList().binarySearch(name)
val index: Int = nameKeys.binarySearch(name)
return if (index >= 0) codeVals[index] else empty
}

public fun nameForCodepoint(codepoint: Int): String {
val index: Int = codeKeys.toList().binarySearch(codepoint)
val index: Int = codeKeys.binarySearch(codepoint)
return if (index >= 0) {
// the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
// (and binary search for same item with multi results is undefined
Expand Down
2 changes: 1 addition & 1 deletion ksoup/src/com/fleeksoft/ksoup/nodes/FormElement.kt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class FormElement(tag: Tag, baseUri: String?, attributes: Attributes?) :
private val linkedEls: Elements = Elements()

// contains form submittable elements that were linked during the parse (and due to parse rules, may no longer be a child of this form)
private val submittable = QueryParser.parse(StringUtil.join(SharedConstants.FormSubmitTags.toList(), ", "))
private val submittable = QueryParser.parse(SharedConstants.FormSubmitTags.joinToString(", "))

/**
* Get the list of form control elements associated with this form.
Expand Down
Loading
Loading