Skip to content

Commit

Permalink
Improve performance (#83) (#84)
Browse files Browse the repository at this point in the history
* kotlinx benchmark added

* performance improved

* performance doc updated

* warmup added

* minor tweak

* bump kotlinx-io version
  • Loading branch information
itboy87 authored Sep 21, 2024
1 parent b4307aa commit 00c1149
Show file tree
Hide file tree
Showing 24 changed files with 264 additions and 54 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ In this example, `Ksoup.parseGetRequest` fetches and parses HTML content from Wi
#### For further documentation, please check here: [Jsoup](https://jsoup.org/)

### Ksoup vs. Jsoup Performance: Parsing & Selecting 448KB HTML File [test.txt](https://github.com/fleeksoft/ksoup/blob/develop/ksoup-test/testResources/test.txt)
![Ksoup vs Jsoup](performance1.png)

![Ksoup vs Jsoup](performance.png)

## Open source
Expand Down
6 changes: 5 additions & 1 deletion gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@ ktor = "3.0.0-rc-1"
ktor2 = "2.3.12"
coroutines = "1.8.1"
kotlinxDatetime = "0.6.1"
kotlinx-io = "0.5.3"
kotlinx-io = "0.5.4"
okio = "3.9.0"
dokka = "1.9.20"
kotlinx-benchmark = "0.4.12"

#korlibs = "999.0.0.999" # 999.0.0.999 is local version
korlibs = "6.0.1"
Expand Down Expand Up @@ -48,10 +49,13 @@ stately-concurrency = { module = "co.touchlab:stately-concurrency", version.ref
jsoup = { module = "org.jsoup:jsoup", version.ref = "jsoup" }
okio = { module = "com.squareup.okio:okio", version.ref = "okio" }
okio-nodefilesystem = { module = "com.squareup.okio:okio-nodefilesystem", version.ref = "okio" }
kotlinx-benchmark-runtime = { module = "org.jetbrains.kotlinx:kotlinx-benchmark-runtime", version.ref = "kotlinx-benchmark" }

[plugins]
androidLibrary = { id = "com.android.library", version.ref = "agp" }
kmp = { id = "org.jetbrains.kotlin.multiplatform", version.ref = "kotlin" }
dokka = { id = "org.jetbrains.dokka", version.ref = "dokka" }
power-assert = { id = "org.jetbrains.kotlin.plugin.power-assert", version.ref = "kotlin" }
mavenPublish = { id = "com.vanniktech.maven.publish", version.ref = "mavenPublish" }
kotlinx-benchmark = { id = "org.jetbrains.kotlinx.benchmark", version.ref = "kotlinx-benchmark" }
allopen = { id = "org.jetbrains.kotlin.plugin.allopen", version.ref = "kotlin" }
23 changes: 23 additions & 0 deletions ksoup-benchmark/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Build script for the benchmark module: applies the kotlinx-benchmark plugin and the
// all-open compiler plugin, both resolved through the version catalog aliases.
plugins {
alias(libs.plugins.kotlinx.benchmark)
alias(libs.plugins.allopen)
}

// Kotlin classes are final by default; this opens every class annotated with the JMH
// @State annotation, which the JVM benchmark harness requires for benchmark state classes.
allOpen {
annotation("org.openjdk.jmh.annotations.State")
}


benchmark {
// Only the JVM target is registered for benchmarking here.
targets {
register("jvm")
}

configurations {
// The "main" configuration currently runs every benchmark; un-comment one of the
// excludes below to leave out the Jsoup or the Ksoup benchmark class.
named("main") {
// exclude("org.jsoup.parser.JsoupBenchmark")
// exclude("com.fleeksoft.ksoup.benchmark.KsoupBenchmark")
}
}

}
24 changes: 24 additions & 0 deletions ksoup-benchmark/module.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Module descriptor for ksoup-benchmark: a multiplatform library module that hosts the
# Ksoup benchmarks (all platforms) and the Jsoup comparison benchmark (JVM only).
product:
type: lib
platforms: [ jvm, js, android, linuxX64, linuxArm64, tvosArm64, tvosX64, tvosSimulatorArm64, macosX64, macosArm64, iosArm64, iosSimulatorArm64, iosX64, mingwX64 ]

# Shared settings are pulled in from the common template one directory up.
apply: [ ../common.module-template.yaml ]

aliases:
- jvmAndAndroid: [ jvm, android ]

# NOTE(review): mavenLocal is presumably only needed when benchmarking a locally published
# artifact (see the commented-out ksoup-lite dependency below) - confirm it is still required.
repositories:
- mavenLocal

# Dependencies shared by every platform. Ksoup is consumed as the sibling project module.
dependencies:
- $libs.kotlinx.io
- $libs.kotlinx.benchmark.runtime
# - com.fleeksoft.ksoup:ksoup-lite:0.1.8
- ../ksoup

# Jsoup is a JVM library, so it is added for the JVM platform only.
dependencies@jvm:
- $libs.jsoup

settings:
kotlin:
optIns: [ kotlinx.cinterop.BetaInteropApi, kotlinx.cinterop.UnsafeNumber, kotlinx.cinterop.ExperimentalForeignApi, kotlin.experimental.ExperimentalNativeApi ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.fleeksoft.ksoup.benchmark

import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.nodes.Element
import com.fleeksoft.ksoup.select.Elements
import com.fleeksoft.ksoup.select.Evaluator
import kotlinx.benchmark.*
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import kotlinx.io.readString


/**
 * Benchmarks Ksoup HTML parsing and class-based element selection on a fixed HTML fixture.
 *
 * The fixture is read into memory in [setUp]; the benchmark methods then work on that string
 * only. Each benchmark returns its result so the harness can consume it, which keeps the
 * measured work from being optimized away as dead code.
 */
@State(Scope.Benchmark)
@Warmup(iterations = 5)
@Measurement(iterations = 5, time = 1, timeUnit = BenchmarkTimeUnit.SECONDS)
class KsoupBenchmark {
    private lateinit var fileData: String

    /** Reads the HTML fixture into [fileData] and closes the file source afterwards. */
    @Setup
    fun setUp() {
        // NOTE(review): absolute, machine-specific path - the benchmark only runs where this
        // exact file exists. Consider making it configurable or project-relative.
        fileData = SystemFileSystem
            .source(Path("/Users/sabeeh/IdeaProjects/ksoup-benchmark/ksoup-test/testResources/test.txt"))
            .buffered()
            .use { it.readString() } // `use` closes the source even if reading throws
    }

    /**
     * Parses the fixture into a document.
     *
     * @return the parsed document, returned so the parse cannot be eliminated as unused work.
     */
    @Benchmark
    fun parse(): Document = parseHtml()

    /**
     * Parses the fixture, then collects the non-blank `href` remainder after `/Home/Bangumi/`
     * from the parent of every element with class `an-info`.
     *
     * NOTE(review): the document is re-parsed on every invocation, so this measures
     * parse + select rather than selection alone - confirm that is intended.
     *
     * @return the extracted path suffixes, returned so the selection work is observable.
     */
    @Benchmark
    fun select(): List<String> {
        val doc = parseHtml()
        return doc.getElementsByClass("an-info").mapNotNull { anInfo ->
            anInfo.parent()?.let { a ->
                val attr = a.attr("href")
                if (attr.isEmpty()) return@let null

                attr.substringAfter("/Home/Bangumi/", "")
                    .takeIf { it.isNotBlank() }
            }
        }
    }

    /** Parses the in-memory fixture with Ksoup. */
    private fun parseHtml() = Ksoup.parse(fileData)
}
45 changes: 45 additions & 0 deletions ksoup-benchmark/src@jvm/org/jsoup/parser/JsoupBenchmark.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.jsoup.parser

import kotlinx.benchmark.*
import kotlinx.io.buffered
import kotlinx.io.files.Path
import kotlinx.io.files.SystemFileSystem
import kotlinx.io.readString
import org.jsoup.Jsoup
import org.jsoup.nodes.Document

/**
 * Benchmarks Jsoup HTML parsing and class-based element selection on a fixed HTML fixture.
 *
 * The fixture is read into memory in [setUp]; the benchmark methods then work on that string
 * only. Each benchmark returns its result so the harness can consume it, which keeps the
 * measured work from being optimized away as dead code.
 */
@State(Scope.Benchmark)
@Warmup(iterations = 5)
@Measurement(iterations = 5, time = 1, timeUnit = BenchmarkTimeUnit.SECONDS)
class JsoupBenchmark {
    private lateinit var fileData: String

    /** Reads the HTML fixture into [fileData] and closes the file source afterwards. */
    @Setup
    fun setUp() {
        // NOTE(review): absolute, machine-specific path - the benchmark only runs where this
        // exact file exists. Consider making it configurable or project-relative.
        fileData = SystemFileSystem
            .source(Path("/Users/sabeeh/IdeaProjects/ksoup-benchmark/ksoup-test/testResources/test.txt"))
            .buffered()
            .use { it.readString() } // `use` closes the source even if reading throws
    }

    /**
     * Parses the fixture into a document.
     *
     * @return the parsed document, returned so the parse cannot be eliminated as unused work.
     */
    @Benchmark
    fun parse(): Document = parseHtml()

    /**
     * Parses the fixture, then collects the non-blank `href` remainder after `/Home/Bangumi/`
     * from the parent of every element with class `an-info`.
     *
     * NOTE(review): the document is re-parsed on every invocation, so this measures
     * parse + select rather than selection alone - confirm that is intended.
     *
     * @return the extracted path suffixes, returned so the selection work is observable.
     */
    @Benchmark
    fun select(): List<String> {
        val doc = parseHtml()
        return doc.getElementsByClass("an-info").mapNotNull { anInfo ->
            anInfo.parent()?.let { a ->
                val attr = a.attr("href")
                if (attr.isEmpty()) return@let null

                attr.substringAfter("/Home/Bangumi/", "")
                    .takeIf { it.isNotBlank() }
            }
        }
    }

    /** Parses the in-memory fixture with Jsoup. */
    private fun parseHtml() = Jsoup.parse(fileData)
}
2 changes: 1 addition & 1 deletion ksoup-engine-common/src/com/fleeksoft/ksoup/io/Charset.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ interface Charset {
}.isSuccess*/
}

fun decode(stringBuilder: StringBuilder, byteArray: ByteArray, start: Int, end: Int): Int
fun decode(stringBuilder: StringBuilder, byteArray: ByteArray, start: Int, end: Int = byteArray.size): Int
fun toByteArray(value: String): ByteArray

fun onlyUtf8(): Boolean = false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,17 @@ class PerformanceComparisonTest {
val jsoupParseTimes = mutableListOf<Long>()
val jsoupSelectTimes = mutableListOf<Long>()

// warmup
repeat(10) {
ksoupTest(ksoupParseTimes, ksoupSelectTimes)
jsoupTest(jsoupParseTimes, jsoupSelectTimes)
}

ksoupParseTimes.clear()
ksoupSelectTimes.clear()
jsoupParseTimes.clear()
jsoupSelectTimes.clear()

// Perform multiple tests
repeat(30) {
ksoupTest(ksoupParseTimes, ksoupSelectTimes)
Expand Down
2 changes: 1 addition & 1 deletion ksoup/src/com/fleeksoft/ksoup/internal/StringUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ public object StringUtil {
}

public fun inSorted(needle: String, haystack: Array<out String>): Boolean {
return haystack.toList().binarySearch(needle) >= 0
return haystack.binarySearch(needle) >= 0
}

/**
Expand Down
3 changes: 2 additions & 1 deletion ksoup/src/com/fleeksoft/ksoup/nodes/Attribute.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import com.fleeksoft.ksoup.helper.Validate
import com.fleeksoft.ksoup.internal.StringUtil
import com.fleeksoft.ksoup.nodes.Document.OutputSettings.Syntax
import com.fleeksoft.ksoup.ported.KCloneable
import com.fleeksoft.ksoup.ported.binarySearchBy
import com.fleeksoft.ksoup.ported.exception.IOException
import com.fleeksoft.ksoup.ported.exception.SerializationException

Expand Down Expand Up @@ -351,7 +352,7 @@ public open class Attribute : Map.Entry<String, String?>, KCloneable<Attribute>
* Checks if this attribute name is defined as a boolean attribute in HTML5
*/
public fun isBooleanAttribute(key: String): Boolean {
return booleanAttributes.toList().binarySearch { it.compareTo(key.lowercase()) } >= 0
return booleanAttributes.binarySearchBy { it.compareTo(key.lowercase()) } >= 0
}
}
}
3 changes: 0 additions & 3 deletions ksoup/src/com/fleeksoft/ksoup/nodes/Element.kt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ import com.fleeksoft.ksoup.select.*
import kotlin.js.JsName
import kotlin.jvm.JvmOverloads
import kotlin.reflect.KClass
import kotlin.reflect.cast

/**
* An HTML Element consists of a tag name, attributes, and child nodes (including text nodes and other elements).
Expand Down Expand Up @@ -398,8 +397,6 @@ public open class Element : Node {

private inline fun <reified T : Any> filterNodes(clazz: KClass<T>): List<T> {
return _childNodes.filterIsInstance<T>()
.map { clazz.cast(it) }
.toList()
}

/**
Expand Down
6 changes: 3 additions & 3 deletions ksoup/src/com/fleeksoft/ksoup/nodes/Entities.kt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public object Entities {
if (value != null) return value
val codepoint = extended.codepointForName(name)
return if (codepoint != empty) {
charArrayOf(codepoint.toChar()).concatToString()
codepoint.toChar().toString()
} else {
emptyName
}
Expand Down Expand Up @@ -420,12 +420,12 @@ public object Entities {
}

public fun codepointForName(name: String): Int {
val index: Int = nameKeys.toList().binarySearch(name)
val index: Int = nameKeys.binarySearch(name)
return if (index >= 0) codeVals[index] else empty
}

public fun nameForCodepoint(codepoint: Int): String {
val index: Int = codeKeys.toList().binarySearch(codepoint)
val index: Int = codeKeys.binarySearch(codepoint)
return if (index >= 0) {
// the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
// (and binary search for same item with multi results is undefined
Expand Down
2 changes: 1 addition & 1 deletion ksoup/src/com/fleeksoft/ksoup/nodes/FormElement.kt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ public class FormElement(tag: Tag, baseUri: String?, attributes: Attributes?) :
private val linkedEls: Elements = Elements()

// contains form submittable elements that were linked during the parse (and due to parse rules, may no longer be a child of this form)
private val submittable = QueryParser.parse(StringUtil.join(SharedConstants.FormSubmitTags.toList(), ", "))
private val submittable = QueryParser.parse(SharedConstants.FormSubmitTags.joinToString(", "))

/**
* Get the list of form control elements associated with this form.
Expand Down
Loading

0 comments on commit 00c1149

Please sign in to comment.