Skip to content

Commit

Permalink
Merge pull request #7 from fleeksoft/develop
Browse files Browse the repository at this point in the history
Release 0.0.6, fixing issue #2
  • Loading branch information
itboy87 authored Nov 25, 2023
2 parents 1087e6b + 6b4a82d commit 1e3e056
Show file tree
Hide file tree
Showing 12 changed files with 272 additions and 183 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ Ksoup is adept at handling all varieties of HTML found in the wild.
### Ksoup is published on Maven Central
```Kotlin
commonMain.dependencies {
implementation("com.fleeksoft.ksoup:ksoup:0.0.5")
implementation("com.fleeksoft.ksoup:ksoup:0.0.6")

// Optional: Include only if you need to use network request functions such as
// Ksoup.parseGetRequest, Ksoup.parseSubmitRequest, and Ksoup.parsePostRequest
implementation("com.fleeksoft.ksoup:ksoup-network:0.0.5")
implementation("com.fleeksoft.ksoup:ksoup-network:0.0.6")
}
```

Expand Down
1 change: 1 addition & 0 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ plugins {
//trick: for the same plugin versions in all sub-modules
alias(libs.plugins.androidLibrary).apply(false)
alias(libs.plugins.kotlinMultiplatform).apply(false)
alias(libs.plugins.dokka)
}
2 changes: 1 addition & 1 deletion gradle/libs.versions.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ agp = "8.1.3"
kotlin = "1.9.20"
compileSdk = "34"
minSdk = "21"
libraryVersion = "0.0.5"
libraryVersion = "0.0.6"
junitJupiter = "5.9.3"
compose = "1.5.4"
compose-compiler = "1.5.4"
Expand Down
36 changes: 7 additions & 29 deletions ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/DataUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ import com.fleeksoft.ksoup.readFile
import com.fleeksoft.ksoup.readGzipFile
import com.fleeksoft.ksoup.select.Elements
import io.ktor.utils.io.charsets.*
import io.ktor.utils.io.core.*
import okio.*
import okio.Buffer
import kotlin.random.Random
Expand Down Expand Up @@ -89,11 +88,7 @@ internal object DataUtil {
}.getOrNull() ?: false

if (zipped) {
BufferReader(readGzipFile(filePath).readByteArray())
/*BufferReader(
GzipSource(Buffer().apply { write(bufferedSource.readByteArray()) }).buffer()
.readByteArray()
)*/
BufferReader(readGzipFile(filePath))
} else {
BufferReader(bufferedSource)
}
Expand Down Expand Up @@ -178,7 +173,7 @@ internal object DataUtil {

// read the start of the stream and look for a BOM or meta charset

val peekedBuffer = bufferReader.getPeek()
val peekedBuffer = bufferReader.peek()
// -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
val firstBytes: ByteArray = readToByteBuffer(peekedBuffer, firstReadBufferSize - 1)
val fullyRead = peekedBuffer.exhausted()
Expand All @@ -189,19 +184,6 @@ internal object DataUtil {
if (bomCharset != null) charsetName = bomCharset.charset
if (charsetName == null) { // determine from meta. safe first parse as UTF-8
doc = try {
/*val defaultDecoded: java.nio.CharBuffer = UTF_8.decode(firstBytes)
if (defaultDecoded.hasArray()) {
parser.parseInput(
java.io.CharArrayReader(
defaultDecoded.array(),
defaultDecoded.arrayOffset(),
defaultDecoded.limit(),
),
baseUri,
)
} else {
parser.parseInput(defaultDecoded.toString(), baseUri)
}*/
parser.parseInput(firstBytes, baseUri)
} catch (e: UncheckedIOException) {
throw e
Expand Down Expand Up @@ -261,18 +243,14 @@ internal object DataUtil {
if (doc == null) {
if (charsetName == null) charsetName = defaultCharsetName
// TODO: bufferSize not used here because not supported yet
val reader = BufferReader(
String(
bufferReader.readByteArray(),
charset = Charset.forName(charsetName)
)
)
bufferReader.setCharSet(charsetName)

if (bomCharset != null && bomCharset.offset) { // creating the buffered inputReader ignores the input pos, so must skip here
// skip first char which can be 2-4
reader.skipFirstUnicodeChar(1)
bufferReader.skipFirstUnicodeChar(1)
}
doc = try {
parser.parseInput(reader, baseUri)
parser.parseInput(bufferReader, baseUri)
} catch (e: UncheckedIOException) {
// io exception when parsing (not seen before because reading the stream as we go)
throw e
Expand Down Expand Up @@ -304,7 +282,7 @@ internal object DataUtil {
* @throws IOException if an exception occurs whilst reading from the input stream.
*/
@Throws(IOException::class)
fun readToByteBuffer(bufferReader: BufferedSource, maxSize: Long): ByteArray {
fun readToByteBuffer(bufferReader: BufferReader, maxSize: Long): ByteArray {
require(maxSize >= 0) {
"maxSize must be 0 (unlimited) or larger"
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ internal class ConstrainableSource(

return try {
val calculatedByteCount: Int = if (this.size() > 0) this.size().toInt() else toRead
val read = getActiveSource().read(
val read = getBuffer().read(
sink = sink,
offset = 0,
byteCount = calculatedByteCount
Expand All @@ -59,7 +59,7 @@ internal class ConstrainableSource(
}
read
} catch (e: Exception) {
0
throw e
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import okio.IOException
import com.fleeksoft.ksoup.UncheckedIOException
import com.fleeksoft.ksoup.ported.BufferReader
import com.fleeksoft.ksoup.ported.buildString
import io.ktor.utils.io.core.*
import kotlin.math.abs
import kotlin.math.min

Expand All @@ -12,7 +13,7 @@ import kotlin.math.min
*/
internal class CharacterReader {
private var charBuf: CharArray?
private var source: CharArray?
private var source: BufferReader?
private var bufLength = 0
private var bufSplitPoint = 0
private var bufPos = 0
Expand All @@ -26,11 +27,9 @@ internal class CharacterReader {
private var newlinePositions: ArrayList<Int>? = null
private var lineNumberOffset = 1 // line numbers start at 1; += newlinePosition[indexof(pos)]

constructor(input: BufferReader) : this(input.readCharArray(), maxBufferLen)
constructor(input: String) : this(BufferReader(input), input.toByteArray().size)

constructor(input: String) : this(input.toCharArray(), input.length)

constructor(input: CharArray, sz: Int = maxBufferLen) {
constructor(input: BufferReader, sz: Int = maxBufferLen) {
source = input
charBuf = CharArray(min(sz, maxBufferLen))
bufferUp()
Expand All @@ -52,7 +51,6 @@ internal class CharacterReader {
private var readFully =
false // if the underlying stream has been completely read, no value in further buffering

private var skipPos: Int = 0
private fun bufferUp() {
// println("pre => bufSize: ${charBuf?.size} bufLength: $bufLength, readerPos: $readerPos, bufPos: $bufPos, bufSplitPoint: $bufSplitPoint")
if (readFully || bufPos < bufSplitPoint) return
Expand All @@ -62,112 +60,49 @@ internal class CharacterReader {
} else {
Pair(bufPos.toLong(), 0)
}
// val markSource = SourceMarker(source, minReadAheadLen.toLong())

try {
skipPos += pos.toInt()
var read: Int = 0
while (read <= minReadAheadLen && !readFully) {
var thisRead = 0
// val readData: ByteArray = ByteArray(charBuf!!.size)
val toIndex = min(source!!.size, skipPos + charBuf!!.size) - read
val toReadDataSize = toIndex - skipPos
if (toReadDataSize > 0 && toIndex > skipPos) {
source!!.copyInto(charBuf!!, startIndex = skipPos, endIndex = toIndex)
// val readData: CharArray = source!!.copyOfRange(skipPos, toIndex)
thisRead = toReadDataSize
if (toIndex >= source!!.size) readFully = true
}

// thisRead = peekSource.read(readData, read, charBuf!!.size - pos.toInt()) // always reading 8126 bytes only
/*charBuf = peekSource.readByteArray(charBuf!!.size - read).also { thisRead = it.size }.decodeToString()
.toCharArray()*/
// if (thisRead == -1) readFully = true
if (thisRead <= 0) break
read += thisRead
source!!.skip(pos)
val reader: BufferReader = source!!.peek()
var read: Int = 0
while (read <= minReadAheadLen) {
val toReadSize = charBuf!!.size - read
val str = if (toReadSize > 0) {
reader.readString(toReadSize.toLong())
} else {
""
}

if (read > 0) {
bufLength = read
readerPos += pos.toInt()
bufPos = offset
if (bufMark != -1) bufMark = 0
bufSplitPoint = minOf(bufLength, readAheadLimit)
}
val thisRead = if (str.isEmpty() && reader.exhausted()) -1 else str.length

// println("post => bufSize: ${charBuf?.size} bufLength: $bufLength, readerPos: $readerPos, bufPos: $bufPos, bufSplitPoint: $bufSplitPoint")
if (thisRead > 0) {
str.toCharArray().copyInto(charBuf!!, destinationOffset = read)
}

/*if (source.buffer.size > 0) {
markSource.source().skip(pos)
val userOffset = markSource.mark(maxBufferLen.toLong())
var read = 0
val byteArray = ByteArray(charBuf!!.size)
while (read <= minReadAheadLen) {
val length = charBuf!!.size - read
val endIndex = read + length
if (endIndex == -1) readFully = true
if (endIndex <= 0) break
val thisRead = source.buffer.readAtMostTo(byteArray, read, endIndex)
if (thisRead == -1) readFully = true
if (thisRead <= 0) break
read += thisRead
}
charBuf = byteArray.decodeToString().toCharArray()
markSource.reset(userOffset)
if (read > 0) {
bufLength = read
readerPos += pos.toInt()
bufPos = offset
if (bufMark != -1) bufMark = 0
bufSplitPoint = minOf(bufLength, readAheadLimit)
}
} else {
readFully = true
/*val readData = ByteArray(toReadSize)
val thisRead = reader.read(readData, 0, toReadSize) //read max 8192
if (thisRead > 0) {
readData.copyOfRange(0, thisRead).decodeToString()
.toCharArray().copyInto(charBuf!!, destinationOffset = read)
}*/

} catch (e: IOException) {
throw UncheckedIOException(e)
if (thisRead == -1) readFully = true
if (thisRead <= 0) break
read += thisRead
}
scanBufferForNewlines() // if enabled, we index newline positions for line number tracking
lastIcSeq = null // cache for last containsIgnoreCase(seq)
}

/*private fun bufferUp() {
if (readFully || bufPos < bufSplitPoint) return
val pos: Int
val offset: Int
if (bufMark != -1) {
pos = bufMark
offset = bufPos - bufMark
} else {
pos = bufPos
offset = 0
}
try {
val skipped: Long = reader.skip(pos.toLong())
reader.mark(maxBufferLen)
var read = 0
while (read <= minReadAheadLen) {
val thisRead: Int = reader.read(charBuf, read, charBuf!!.size - read)
if (thisRead == -1) readFully = true
if (thisRead <= 0) break
read += thisRead
}
reader.reset()
if (read > 0) {
Validate.isTrue(skipped == pos.toLong()) // Previously asserted that there is room in buf to skip, so this will be a WTF
bufLength = read
readerPos += pos
bufPos = offset
if (bufMark != -1) bufMark = 0
bufSplitPoint = min(bufLength, readAheadLimit)
}
} catch (e: IOException) {
throw UncheckedIOException(e)
if (read > 0) {
bufLength = read
readerPos += pos.toInt()
bufPos = offset
if (bufMark != -1) bufMark = 0
bufSplitPoint = minOf(bufLength, readAheadLimit)
}

// println("post => bufSize: ${charBuf?.size} bufLength: $bufLength, readerPos: $readerPos, bufPos: $bufPos, bufSplitPoint: $bufSplitPoint")

scanBufferForNewlines() // if enabled, we index newline positions for line number tracking
lastIcSeq = null // cache for last containsIgnoreCase(seq)
}*/
}

/**
* Gets the position currently read to in the content. Starts at 0.
Expand Down Expand Up @@ -676,7 +611,7 @@ internal class CharacterReader {
// we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
// that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
// looking for the </title>. Resets in bufferUp()

private var lastIcSeq: String? = null // scan cache
private var lastIcIndex = 0 // nearest found indexOf

Expand Down Expand Up @@ -723,8 +658,9 @@ internal class CharacterReader {
private const val maxStringCacheLen = 12
const val maxBufferLen = 1024 * 32 // visible for testing
const val readAheadLimit = (maxBufferLen * 0.75).toInt() // visible for testing
private const val minReadAheadLen =
1024 // the minimum mark length supported. No HTML entities can be larger than this.

// the minimum mark length supported. No HTML entities can be larger than this.
private const val minReadAheadLen = 1024
private const val stringCacheSize = 512

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ public class Parser {
}

public fun parseInput(html: String, baseUri: String): Document {
return treeBuilder.parse(BufferReader(html.toByteArray()), baseUri, this)
return treeBuilder.parse(BufferReader(html), baseUri, this)
}

public fun parseInput(inputHtml: BufferReader, baseUri: String): Document {
Expand Down
Loading

0 comments on commit 1e3e056

Please sign in to comment.