From b8830379cd0051e63d7a6e9bc9b2364f06634fb5 Mon Sep 17 00:00:00 2001 From: Sabeeh Ul Hussnain Date: Mon, 23 Sep 2024 13:51:30 +0500 Subject: [PATCH 1/3] Remove unnecessary suspend from streamParser --- ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt b/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt index e62af64..bb1508c 100644 --- a/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt +++ b/ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt @@ -59,7 +59,7 @@ public object DataUtil { * @throws IOException on IO error * @see Connection.Response.streamParser */ - suspend fun streamParser(sourceReader: SourceReader, baseUri: String, charset: Charset?, parser: Parser): StreamParser { + fun streamParser(sourceReader: SourceReader, baseUri: String, charset: Charset?, parser: Parser): StreamParser { val streamer = StreamParser(parser) val charsetName: String? = charset?.name val charsetDoc: CharsetDoc = detectCharset(sourceReader, baseUri, charsetName, parser) From 801976b23881385295a42f933af9db0b56ceb3b8 Mon Sep 17 00:00:00 2001 From: Sabeeh Ul Hussnain Date: Mon, 23 Sep 2024 13:54:21 +0500 Subject: [PATCH 2/3] Add Ksoup.parseMetaData function to parse website metadata --- .../com/fleeksoft/ksoup/meta/MetadataTest.kt | 66 ++++++++++++++ ksoup/src/com/fleeksoft/ksoup/Ksoup.kt | 91 +++++++++++++++++++ .../src/com/fleeksoft/ksoup/model/MetaData.kt | 21 +++++ 3 files changed, 178 insertions(+) create mode 100644 ksoup-test/test/com/fleeksoft/ksoup/meta/MetadataTest.kt create mode 100644 ksoup/src/com/fleeksoft/ksoup/model/MetaData.kt diff --git a/ksoup-test/test/com/fleeksoft/ksoup/meta/MetadataTest.kt b/ksoup-test/test/com/fleeksoft/ksoup/meta/MetadataTest.kt new file mode 100644 index 0000000..84e8bb6 --- /dev/null +++ b/ksoup-test/test/com/fleeksoft/ksoup/meta/MetadataTest.kt @@ -0,0 +1,66 @@ +package com.fleeksoft.ksoup.meta + +import com.fleeksoft.ksoup.Ksoup 
+import com.fleeksoft.ksoup.model.MetaData +import com.fleeksoft.ksoup.ported.openSourceReader +import kotlin.test.Test +import kotlin.test.assertEquals + +class MetadataTest { + val html = """ + + + Test Page + + + + + + + + + + + + + """.trimIndent() + + @Test + fun testParseMetaDataFromString() { + val metaData = Ksoup.parseMetaData(html, "https://example.com") + + assertMetaData(metaData) + } + + @Test + fun testParseMetaDataFromSourceReader() { + val sourceReader = html.openSourceReader() + val metaData = Ksoup.parseMetaData(sourceReader, "https://example.com") + + assertMetaData(metaData) + } + + + @Test + fun testParseMetaDataFromElement() { + val doc = Ksoup.parse(html, "https://example.com") + val metaData = Ksoup.parseMetaData(doc) + + assertMetaData(metaData) + } + + private fun assertMetaData(metaData: MetaData) { + assertEquals("Test Page", metaData.htmlTitle) + assertEquals("Test OG Title", metaData.ogTitle) + assertEquals("Test OG Description", metaData.ogDescription) + assertEquals("https://example.com/image.png", metaData.ogImage) + assertEquals("https://example.com", metaData.ogUrl) + assertEquals("Test Twitter Title", metaData.twitterTitle) + assertEquals("Test Twitter Description", metaData.twitterDescription) + assertEquals("https://example.com/twitter_image.png", metaData.twitterImage) + assertEquals("Test Description", metaData.description) + assertEquals("https://example.com", metaData.canonical) + assertEquals("https://example.com/favicon.ico", metaData.favicon) + } + +} \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/Ksoup.kt b/ksoup/src/com/fleeksoft/ksoup/Ksoup.kt index 98966dc..ffad98c 100644 --- a/ksoup/src/com/fleeksoft/ksoup/Ksoup.kt +++ b/ksoup/src/com/fleeksoft/ksoup/Ksoup.kt @@ -3,8 +3,11 @@ package com.fleeksoft.ksoup import com.fleeksoft.ksoup.helper.DataUtil import com.fleeksoft.ksoup.io.FileSource import com.fleeksoft.ksoup.io.SourceReader +import com.fleeksoft.ksoup.model.MetaData import 
com.fleeksoft.ksoup.nodes.Document +import com.fleeksoft.ksoup.nodes.Element import com.fleeksoft.ksoup.parser.Parser +import com.fleeksoft.ksoup.parser.StreamParser import com.fleeksoft.ksoup.ported.toSourceFile import com.fleeksoft.ksoup.safety.Cleaner import com.fleeksoft.ksoup.safety.Safelist @@ -185,4 +188,92 @@ public object Ksoup { ): Boolean { return Cleaner(safelist).isValidBodyHtml(bodyHtml) } + + fun parseMetaData(element: Element): MetaData { + val title = element.selectFirst("title")?.text() + return parseMetaDataInternal(baseUri = element.baseUri(), title = title) { query -> + element.selectFirst(query) + } + } + + fun parseMetaData( + html: String, + baseUri: String = "", + interceptor: ((head: Element, metaData: MetaData) -> Unit)? = null + ): MetaData { + val head = parse(html = html, baseUri = baseUri).head() + + val title = head.selectFirst("title")?.text() + return parseMetaDataInternal(baseUri = baseUri, title = title) { query -> + head.selectFirst(query) + }.also { + interceptor?.invoke(head, it) + } + } + + fun parseMetaData( + sourceReader: SourceReader, + baseUri: String = "", + interceptor: ((headStream: StreamParser, metaData: MetaData) -> Unit)? 
= null + ): MetaData { + val head = DataUtil.streamParser(sourceReader = sourceReader, baseUri = baseUri, null, Parser.htmlParser()) + val title = head.selectFirst("title")?.text() + return parseMetaDataInternal(baseUri = baseUri, title = title) { query -> + head.selectFirst(query) + }.also { + interceptor?.invoke(head, it) + } + } + + private fun parseMetaDataInternal(baseUri: String, title: String?, selectFirst: (query: String) -> Element?): MetaData { + // Extract Open Graph metadata + val ogTitle = selectFirst("meta[property=og:title]")?.attr("content") + val ogSiteName = selectFirst("meta[property=og:site_name]")?.attr("content") + val ogType = selectFirst("meta[property=og:type]")?.attr("content") + val ogLocale = selectFirst("meta[property=og:locale]")?.attr("content") + val ogDescription = selectFirst("meta[property=og:description]")?.attr("content") + val ogImage = selectFirst("meta[property=og:image]")?.attr("content") + val ogUrl = selectFirst("meta[property=og:url]")?.attr("content") + + // Extract Twitter metadata + val twitterTitle = selectFirst("meta[name=twitter:title]")?.attr("content") + val twitterCard = selectFirst("meta[name=twitter:card]")?.attr("content") + val twitterDescription = selectFirst("meta[name=twitter:description]")?.attr("content") + val twitterImage = selectFirst("meta[name=twitter:image]")?.attr("content") + + // Extract standard metadata + val titleTag = selectFirst("meta[name=title]")?.attr("content") + val descriptionTag = selectFirst("meta[name=description]")?.attr("content") + val author = selectFirst("meta[name=author]")?.attr("content") + + // Extract canonical URL + val canonicalTag = selectFirst("link[rel=canonical]")?.attr("href") + + // Fetch favicon + var faviconTag = selectFirst("link[rel~=icon]")?.attr("href") + if (faviconTag != null && !faviconTag.startsWith("http") && baseUri.isNotEmpty()) { + faviconTag = baseUri + faviconTag + } + + // Create a MetaData object + return MetaData( + ogTitle = ogTitle, + 
ogSiteName = ogSiteName, + ogType = ogType, + ogLocale = ogLocale, + ogDescription = ogDescription, + ogImage = ogImage, + ogUrl = ogUrl, + twitterCard = twitterCard, + twitterTitle = twitterTitle, + twitterDescription = twitterDescription, + twitterImage = twitterImage, + title = titleTag, + description = descriptionTag, + canonical = canonicalTag, + htmlTitle = title, + author = author, + favicon = faviconTag + ) + } } \ No newline at end of file diff --git a/ksoup/src/com/fleeksoft/ksoup/model/MetaData.kt b/ksoup/src/com/fleeksoft/ksoup/model/MetaData.kt new file mode 100644 index 0000000..8350f24 --- /dev/null +++ b/ksoup/src/com/fleeksoft/ksoup/model/MetaData.kt @@ -0,0 +1,21 @@ +package com.fleeksoft.ksoup.model + +data class MetaData( + val ogTitle: String? = null, + val ogSiteName: String? = null, + val ogDescription: String? = null, + val ogImage: String? = null, + val ogUrl: String? = null, + val ogType: String? = null, + val ogLocale: String? = null, + val twitterCard: String? = null, + val twitterTitle: String? = null, + val twitterDescription: String? = null, + val twitterImage: String? = null, + val title: String? = null, + val description: String? = null, + val canonical: String? = null, + val htmlTitle: String? = null, + val author: String? = null, + val favicon: String? = null +) \ No newline at end of file From 3d1d02e7d5c2ac859e5bc0f9cdfac90961740ffe Mon Sep 17 00:00:00 2001 From: Sabeeh Ul Hussnain Date: Mon, 23 Sep 2024 14:01:10 +0500 Subject: [PATCH 3/3] Add Ksoup.parseMetaData function to parse website metadata --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index aa07603..3f855cf 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,24 @@ headlines.forEach { headline: Element -> println("$headlineTitle => $headlineLink") } ``` + +### Parsing Metadata from Website +```kotlin +//Please note that the com.fleeksoft.ksoup:ksoup-network library is required for Ksoup.parseGetRequest. 
+val doc: Document = Ksoup.parseGetRequest(url = "https://en.wikipedia.org/") // suspend function +val metadata: MetaData = Ksoup.parseMetaData(element = doc) // regular (non-suspend) function +// or +val metadata: MetaData = Ksoup.parseMetaData(html = HTML) + +println("title: ${metadata.title}") +println("description: ${metadata.description}") +println("ogTitle: ${metadata.ogTitle}") +println("ogDescription: ${metadata.ogDescription}") +println("twitterTitle: ${metadata.twitterTitle}") +println("twitterDescription: ${metadata.twitterDescription}") +// Check com.fleeksoft.ksoup.model.MetaData for more fields +``` + In this example, `Ksoup.parseGetRequest` fetches and parses HTML content from Wikipedia, extracting and printing news headlines and their corresponding links. ### Ksoup Public functions - Ksoup.parse