Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Ksoup.parseMetaData function to parse website metadata #85

Merged
merged 3 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,24 @@ headlines.forEach { headline: Element ->
println("$headlineTitle => $headlineLink")
}
```

### Parsing Metadata from Website
```kotlin
//Please note that the com.fleeksoft.ksoup:ksoup-network library is required for Ksoup.parseGetRequest.
val doc: Document = Ksoup.parseGetRequest(url = "https://en.wikipedia.org/") // suspend function
val metadata: MetaData = Ksoup.parseMetaData(element = doc)
// or
val metadata: MetaData = Ksoup.parseMetaData(html = HTML)

println("title: ${metadata.title}")
println("description: ${metadata.description}")
println("ogTitle: ${metadata.ogTitle}")
println("ogDescription: ${metadata.ogDescription}")
println("twitterTitle: ${metadata.twitterTitle}")
println("twitterDescription: ${metadata.twitterDescription}")
// Check com.fleeksoft.ksoup.model.MetaData for more fields
```

In this example, `Ksoup.parseGetRequest` fetches and parses HTML content from Wikipedia, extracting and printing news headlines and their corresponding links.
### Ksoup Public functions
- Ksoup.parse
Expand Down
66 changes: 66 additions & 0 deletions ksoup-test/test/com/fleeksoft/ksoup/meta/MetadataTest.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package com.fleeksoft.ksoup.meta

import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.model.MetaData
import com.fleeksoft.ksoup.ported.openSourceReader
import kotlin.test.Test
import kotlin.test.assertEquals

/**
 * Checks that each `Ksoup.parseMetaData` overload (HTML string, source reader, parsed
 * element) extracts the same metadata from one fixture page.
 */
class MetadataTest {
    // Fixture page carrying one tag for every field asserted in assertMetaData.
    val html = """
        <html>
        <head>
        <title>Test Page</title>
        <meta property="og:title" content="Test OG Title">
        <meta property="og:description" content="Test OG Description">
        <meta property="og:image" content="https://example.com/image.png">
        <meta property="og:url" content="https://example.com">
        <meta name="twitter:title" content="Test Twitter Title">
        <meta name="twitter:description" content="Test Twitter Description">
        <meta name="twitter:image" content="https://example.com/twitter_image.png">
        <meta name="description" content="Test Description">
        <link rel="canonical" href="https://example.com">
        <link rel="icon" href="/favicon.ico">
        </head>
        </html>
    """.trimIndent()

    /** Metadata parsed straight from the HTML string. */
    @Test
    fun testParseMetaDataFromString() {
        assertMetaData(Ksoup.parseMetaData(html, "https://example.com"))
    }

    /** Metadata parsed from a source reader opened over the same HTML. */
    @Test
    fun testParseMetaDataFromSourceReader() {
        val parsed = Ksoup.parseMetaData(html.openSourceReader(), "https://example.com")

        assertMetaData(parsed)
    }

    /** Metadata parsed from an already built document. */
    @Test
    fun testParseMetaDataFromElement() {
        val document = Ksoup.parse(html, "https://example.com")

        assertMetaData(Ksoup.parseMetaData(document))
    }

    // Shared expectations for the fixture page; the favicon is expected as an absolute URL.
    private fun assertMetaData(metaData: MetaData) {
        with(metaData) {
            assertEquals("Test Page", htmlTitle)
            assertEquals("Test OG Title", ogTitle)
            assertEquals("Test OG Description", ogDescription)
            assertEquals("https://example.com/image.png", ogImage)
            assertEquals("https://example.com", ogUrl)
            assertEquals("Test Twitter Title", twitterTitle)
            assertEquals("Test Twitter Description", twitterDescription)
            assertEquals("https://example.com/twitter_image.png", twitterImage)
            assertEquals("Test Description", description)
            assertEquals("https://example.com", canonical)
            assertEquals("https://example.com/favicon.ico", favicon)
        }
    }
}
91 changes: 91 additions & 0 deletions ksoup/src/com/fleeksoft/ksoup/Ksoup.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ package com.fleeksoft.ksoup
import com.fleeksoft.ksoup.helper.DataUtil
import com.fleeksoft.ksoup.io.FileSource
import com.fleeksoft.ksoup.io.SourceReader
import com.fleeksoft.ksoup.model.MetaData
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.nodes.Element
import com.fleeksoft.ksoup.parser.Parser
import com.fleeksoft.ksoup.parser.StreamParser
import com.fleeksoft.ksoup.ported.toSourceFile
import com.fleeksoft.ksoup.safety.Cleaner
import com.fleeksoft.ksoup.safety.Safelist
Expand Down Expand Up @@ -185,4 +188,92 @@ public object Ksoup {
): Boolean {
return Cleaner(safelist).isValidBodyHtml(bodyHtml)
}

/**
 * Extracts page metadata from an already parsed [element].
 *
 * The `<title>` text and every metadata tag are looked up with `selectFirst` on [element];
 * the element's own base URI is passed on for favicon handling.
 *
 * @param element the element (for example a parsed document) to search.
 * @return the collected [MetaData].
 */
fun parseMetaData(element: Element): MetaData {
    val htmlTitle = element.selectFirst("title")?.text()
    val lookup: (String) -> Element? = { cssQuery -> element.selectFirst(cssQuery) }
    return parseMetaDataInternal(element.baseUri(), htmlTitle, lookup)
}

/**
 * Parses [html] and extracts metadata from the resulting document's `<head>`.
 *
 * @param html the HTML text to parse.
 * @param baseUri base URI handed to the parser and used for favicon handling; defaults to "".
 * @param interceptor optional callback invoked once with the parsed head element and the
 * finished [MetaData], just before the result is returned.
 * @return the collected [MetaData].
 */
fun parseMetaData(
    html: String,
    baseUri: String = "",
    interceptor: ((head: Element, metaData: MetaData) -> Unit)? = null
): MetaData {
    val headElement = parse(html = html, baseUri = baseUri).head()
    val htmlTitle = headElement.selectFirst("title")?.text()

    val result = parseMetaDataInternal(baseUri = baseUri, title = htmlTitle) { cssQuery ->
        headElement.selectFirst(cssQuery)
    }
    if (interceptor != null) {
        interceptor(headElement, result)
    }
    return result
}

/**
 * Extracts metadata from HTML supplied through [sourceReader], using a [StreamParser].
 *
 * NOTE(review): the stream parser created here is not closed by this function; confirm
 * whether callers (or the interceptor) are expected to release it.
 *
 * @param sourceReader the reader providing the HTML input.
 * @param baseUri base URI handed to the parser and used for favicon handling; defaults to "".
 * @param interceptor optional callback invoked once with the stream parser and the finished
 * [MetaData], just before the result is returned.
 * @return the collected [MetaData].
 */
fun parseMetaData(
    sourceReader: SourceReader,
    baseUri: String = "",
    interceptor: ((headStream: StreamParser, metaData: MetaData) -> Unit)? = null
): MetaData {
    val streamParser = DataUtil.streamParser(
        sourceReader = sourceReader,
        baseUri = baseUri,
        charset = null,
        parser = Parser.htmlParser()
    )
    val htmlTitle = streamParser.selectFirst("title")?.text()

    val result = parseMetaDataInternal(baseUri = baseUri, title = htmlTitle) { cssQuery ->
        streamParser.selectFirst(cssQuery)
    }
    if (interceptor != null) {
        interceptor(streamParser, result)
    }
    return result
}

/**
 * Collects Open Graph, Twitter, standard `<meta>`, canonical and favicon information into a
 * [MetaData], using the supplied element lookup.
 *
 * @param baseUri base URI of the page; when it is empty the favicon `href` is returned exactly
 * as written in the markup.
 * @param title text of the `<title>` element, stored as [MetaData.htmlTitle].
 * @param selectFirst returns the first element matching a CSS query, or `null` when nothing
 * matches.
 * @return the populated [MetaData]; a field is `null` when its tag is absent.
 */
private fun parseMetaDataInternal(baseUri: String, title: String?, selectFirst: (query: String) -> Element?): MetaData {
    // Reads the `content` attribute of the first element matching the query, if there is one.
    fun metaContent(cssQuery: String): String? = selectFirst(cssQuery)?.attr("content")

    // Extract Open Graph metadata
    val ogTitle = metaContent("meta[property=og:title]")
    val ogSiteName = metaContent("meta[property=og:site_name]")
    val ogType = metaContent("meta[property=og:type]")
    val ogLocale = metaContent("meta[property=og:locale]")
    val ogDescription = metaContent("meta[property=og:description]")
    val ogImage = metaContent("meta[property=og:image]")
    val ogUrl = metaContent("meta[property=og:url]")

    // Extract Twitter metadata
    val twitterTitle = metaContent("meta[name=twitter:title]")
    val twitterCard = metaContent("meta[name=twitter:card]")
    val twitterDescription = metaContent("meta[name=twitter:description]")
    val twitterImage = metaContent("meta[name=twitter:image]")

    // Extract standard metadata
    val titleTag = metaContent("meta[name=title]")
    val descriptionTag = metaContent("meta[name=description]")
    val author = metaContent("meta[name=author]")

    // Extract canonical URL
    val canonicalTag = selectFirst("link[rel=canonical]")?.attr("href")

    // Fetch favicon. The href is resolved with the element's URL resolution instead of plain
    // string concatenation, so base URIs with a trailing slash or a path, protocol-relative
    // hrefs ("//cdn...") and non-http absolute hrefs (e.g. "data:") no longer yield malformed
    // URLs. An empty href, or an empty base URI, leaves the href untouched.
    // NOTE(review): resolution uses the element's own base URI, which all callers in this
    // file derive from the same value as [baseUri] — confirm for any new caller.
    val faviconLink = selectFirst("link[rel~=icon]")
    val faviconHref = faviconLink?.attr("href")
    val favicon = if (faviconLink != null && !faviconHref.isNullOrEmpty() && baseUri.isNotEmpty()) {
        // absUrl yields "" when it cannot build an absolute URL; fall back to the raw href.
        faviconLink.absUrl("href").ifEmpty { faviconHref }
    } else {
        faviconHref
    }

    // Create a MetaData object
    return MetaData(
        ogTitle = ogTitle,
        ogSiteName = ogSiteName,
        ogType = ogType,
        ogLocale = ogLocale,
        ogDescription = ogDescription,
        ogImage = ogImage,
        ogUrl = ogUrl,
        twitterCard = twitterCard,
        twitterTitle = twitterTitle,
        twitterDescription = twitterDescription,
        twitterImage = twitterImage,
        title = titleTag,
        description = descriptionTag,
        canonical = canonicalTag,
        htmlTitle = title,
        author = author,
        favicon = favicon
    )
}
}
2 changes: 1 addition & 1 deletion ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public object DataUtil {
* @throws IOException on IO error
* @see Connection.Response.streamParser
*/
suspend fun streamParser(sourceReader: SourceReader, baseUri: String, charset: Charset?, parser: Parser): StreamParser {
fun streamParser(sourceReader: SourceReader, baseUri: String, charset: Charset?, parser: Parser): StreamParser {
val streamer = StreamParser(parser)
val charsetName: String? = charset?.name
val charsetDoc: CharsetDoc = detectCharset(sourceReader, baseUri, charsetName, parser)
Expand Down
21 changes: 21 additions & 0 deletions ksoup/src/com/fleeksoft/ksoup/model/MetaData.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.fleeksoft.ksoup.model

/**
 * Metadata extracted from an HTML page. Every field is optional and defaults to `null`.
 *
 * @property ogTitle content of `meta[property=og:title]`.
 * @property ogSiteName content of `meta[property=og:site_name]`.
 * @property ogDescription content of `meta[property=og:description]`.
 * @property ogImage content of `meta[property=og:image]`.
 * @property ogUrl content of `meta[property=og:url]`.
 * @property ogType content of `meta[property=og:type]`.
 * @property ogLocale content of `meta[property=og:locale]`.
 * @property twitterCard content of `meta[name=twitter:card]`.
 * @property twitterTitle content of `meta[name=twitter:title]`.
 * @property twitterDescription content of `meta[name=twitter:description]`.
 * @property twitterImage content of `meta[name=twitter:image]`.
 * @property title content of `meta[name=title]` (not the `<title>` element, see [htmlTitle]).
 * @property description content of `meta[name=description]`.
 * @property canonical `href` of `link[rel=canonical]`.
 * @property htmlTitle text of the `<title>` element.
 * @property author content of `meta[name=author]`.
 * @property favicon URL taken from the first `link[rel~=icon]` element, made absolute against
 * the base URI when one is supplied.
 */
data class MetaData(
    // Open Graph
    val ogTitle: String? = null,
    val ogSiteName: String? = null,
    val ogDescription: String? = null,
    val ogImage: String? = null,
    val ogUrl: String? = null,
    val ogType: String? = null,
    val ogLocale: String? = null,

    // Twitter card
    val twitterCard: String? = null,
    val twitterTitle: String? = null,
    val twitterDescription: String? = null,
    val twitterImage: String? = null,

    // Standard page metadata
    val title: String? = null,
    val description: String? = null,
    val canonical: String? = null,
    val htmlTitle: String? = null,
    val author: String? = null,
    val favicon: String? = null,
)
Loading