Skip to content

Commit

Permalink
Add Ksoup.parseMetaData function to parse website metadata (#85)
Browse files Browse the repository at this point in the history
* Remove unnecessary suspend from streamParser

* Add Ksoup.parseMetaData function to parse website metadata

* Add Ksoup.parseMetaData function to parse website metadata
  • Loading branch information
itboy87 authored Sep 23, 2024
1 parent 00c1149 commit 54a7d28
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 1 deletion.
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,24 @@ headlines.forEach { headline: Element ->
println("$headlineTitle => $headlineLink")
}
```

### Parsing Metadata from Website
```kotlin
// Please note that the com.fleeksoft.ksoup:ksoup-network library is required for Ksoup.parseGetRequest.
val doc: Document = Ksoup.parseGetRequest(url = "https://en.wikipedia.org/") // suspend function
val metadata: MetaData = Ksoup.parseMetaData(element = doc)
// or
val metadata: MetaData = Ksoup.parseMetaData(html = HTML)

println("title: ${metadata.title}")
println("description: ${metadata.description}")
println("ogTitle: ${metadata.ogTitle}")
println("ogDescription: ${metadata.ogDescription}")
println("twitterTitle: ${metadata.twitterTitle}")
println("twitterDescription: ${metadata.twitterDescription}")
// Check com.fleeksoft.ksoup.model.MetaData for more fields
```

In this example, `Ksoup.parseGetRequest` fetches and parses HTML content from Wikipedia, extracting and printing news headlines and their corresponding links.
### Ksoup Public functions
- Ksoup.parse
Expand Down
66 changes: 66 additions & 0 deletions ksoup-test/test/com/fleeksoft/ksoup/meta/MetadataTest.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package com.fleeksoft.ksoup.meta

import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.model.MetaData
import com.fleeksoft.ksoup.ported.openSourceReader
import kotlin.test.Test
import kotlin.test.assertEquals

/**
 * Checks that each `Ksoup.parseMetaData` overload (HTML string, source reader, parsed element)
 * extracts the same metadata from a single HTML fixture.
 */
class MetadataTest {
    /** HTML fixture carrying a title, Open Graph tags, Twitter tags, a description, a canonical link and an icon link. */
    val html = """
        <html>
        <head>
        <title>Test Page</title>
        <meta property="og:title" content="Test OG Title">
        <meta property="og:description" content="Test OG Description">
        <meta property="og:image" content="https://example.com/image.png">
        <meta property="og:url" content="https://example.com">
        <meta name="twitter:title" content="Test Twitter Title">
        <meta name="twitter:description" content="Test Twitter Description">
        <meta name="twitter:image" content="https://example.com/twitter_image.png">
        <meta name="description" content="Test Description">
        <link rel="canonical" href="https://example.com">
        <link rel="icon" href="/favicon.ico">
        </head>
        </html>
    """.trimIndent()

    // Base URI handed to every overload under test.
    private val baseUri = "https://example.com"

    /** Metadata parsed straight from the HTML string. */
    @Test
    fun testParseMetaDataFromString() {
        assertMetaData(Ksoup.parseMetaData(html, baseUri))
    }

    /** Metadata parsed from a source reader opened over the same HTML. */
    @Test
    fun testParseMetaDataFromSourceReader() {
        val parsed = Ksoup.parseMetaData(html.openSourceReader(), baseUri)

        assertMetaData(parsed)
    }

    /** Metadata extracted from an already parsed document. */
    @Test
    fun testParseMetaDataFromElement() {
        val document = Ksoup.parse(html, baseUri)

        assertMetaData(Ksoup.parseMetaData(document))
    }

    /** Asserts that [metaData] holds the values declared in the [html] fixture. */
    private fun assertMetaData(metaData: MetaData) {
        // Expected value on the left, parsed value on the right; checked in declaration order.
        val expectations: List<Pair<String, String?>> = listOf(
            "Test Page" to metaData.htmlTitle,
            "Test OG Title" to metaData.ogTitle,
            "Test OG Description" to metaData.ogDescription,
            "https://example.com/image.png" to metaData.ogImage,
            "https://example.com" to metaData.ogUrl,
            "Test Twitter Title" to metaData.twitterTitle,
            "Test Twitter Description" to metaData.twitterDescription,
            "https://example.com/twitter_image.png" to metaData.twitterImage,
            "Test Description" to metaData.description,
            "https://example.com" to metaData.canonical,
            "https://example.com/favicon.ico" to metaData.favicon,
        )

        expectations.forEach { (expected, actual) -> assertEquals(expected, actual) }
    }
}
91 changes: 91 additions & 0 deletions ksoup/src/com/fleeksoft/ksoup/Ksoup.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ package com.fleeksoft.ksoup
import com.fleeksoft.ksoup.helper.DataUtil
import com.fleeksoft.ksoup.io.FileSource
import com.fleeksoft.ksoup.io.SourceReader
import com.fleeksoft.ksoup.model.MetaData
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.nodes.Element
import com.fleeksoft.ksoup.parser.Parser
import com.fleeksoft.ksoup.parser.StreamParser
import com.fleeksoft.ksoup.ported.toSourceFile
import com.fleeksoft.ksoup.safety.Cleaner
import com.fleeksoft.ksoup.safety.Safelist
Expand Down Expand Up @@ -185,4 +188,92 @@ public object Ksoup {
): Boolean {
return Cleaner(safelist).isValidBodyHtml(bodyHtml)
}

/**
 * Extracts website metadata from an already parsed [element].
 *
 * The `<title>` text and every metadata element are looked up with `selectFirst` on [element],
 * and the element's own base URI is forwarded to the extraction step.
 *
 * @param element the element (typically a document) to search.
 * @return the metadata found in [element].
 */
fun parseMetaData(element: Element): MetaData {
    val htmlTitle = element.selectFirst("title")?.text()
    return parseMetaDataInternal(
        baseUri = element.baseUri(),
        title = htmlTitle,
        selectFirst = { cssQuery -> element.selectFirst(cssQuery) },
    )
}

/**
 * Parses [html] and extracts website metadata from the resulting document's `<head>`.
 *
 * @param html the HTML text to parse.
 * @param baseUri base URI used for parsing and passed on to the extraction step.
 * @param interceptor optional callback invoked once with the head element and the extracted
 * metadata, after extraction and before this function returns.
 * @return the metadata found in the head of the parsed document.
 */
fun parseMetaData(
    html: String,
    baseUri: String = "",
    interceptor: ((head: Element, metaData: MetaData) -> Unit)? = null
): MetaData {
    val headElement = parse(html = html, baseUri = baseUri).head()
    val htmlTitle = headElement.selectFirst("title")?.text()

    val metaData = parseMetaDataInternal(
        baseUri = baseUri,
        title = htmlTitle,
        selectFirst = { cssQuery -> headElement.selectFirst(cssQuery) },
    )

    // Give the caller a chance to inspect the head alongside the finished result.
    interceptor?.invoke(headElement, metaData)
    return metaData
}

/**
 * Extracts website metadata from [sourceReader] through a [StreamParser] using the HTML parser.
 *
 * @param sourceReader the source of the HTML to read.
 * @param baseUri base URI given to the stream parser and passed on to the extraction step.
 * @param interceptor optional callback invoked once with the stream parser and the extracted
 * metadata, after extraction and before this function returns.
 * @return the metadata found through the stream parser.
 */
fun parseMetaData(
    sourceReader: SourceReader,
    baseUri: String = "",
    interceptor: ((headStream: StreamParser, metaData: MetaData) -> Unit)? = null
): MetaData {
    // No explicit charset is supplied (null is passed through to DataUtil.streamParser).
    val streamParser = DataUtil.streamParser(
        sourceReader = sourceReader,
        baseUri = baseUri,
        charset = null,
        parser = Parser.htmlParser(),
    )
    val htmlTitle = streamParser.selectFirst("title")?.text()

    val metaData = parseMetaDataInternal(
        baseUri = baseUri,
        title = htmlTitle,
        selectFirst = { cssQuery -> streamParser.selectFirst(cssQuery) },
    )

    interceptor?.invoke(streamParser, metaData)
    return metaData
}

/**
 * Collects Open Graph, Twitter, standard `<meta>`, canonical and favicon values into a [MetaData].
 *
 * @param baseUri base URI of the document; only used as a fallback when the favicon link cannot be
 * resolved through the link element itself.
 * @param title text of the document's `<title>` element, stored as [MetaData.htmlTitle].
 * @param selectFirst returns the first element matching a CSS query, or `null` when nothing matches.
 * @return the extracted metadata; a field is `null` when [selectFirst] found no element for it.
 */
private fun parseMetaDataInternal(baseUri: String, title: String?, selectFirst: (query: String) -> Element?): MetaData {
    // Reads the `content` attribute of the first element matching the query, if there is one.
    fun content(query: String): String? = selectFirst(query)?.attr("content")

    // Extract Open Graph metadata
    val ogTitle = content("meta[property=og:title]")
    val ogSiteName = content("meta[property=og:site_name]")
    val ogType = content("meta[property=og:type]")
    val ogLocale = content("meta[property=og:locale]")
    val ogDescription = content("meta[property=og:description]")
    val ogImage = content("meta[property=og:image]")
    val ogUrl = content("meta[property=og:url]")

    // Extract Twitter metadata
    val twitterTitle = content("meta[name=twitter:title]")
    val twitterCard = content("meta[name=twitter:card]")
    val twitterDescription = content("meta[name=twitter:description]")
    val twitterImage = content("meta[name=twitter:image]")

    // Extract standard metadata
    val titleTag = content("meta[name=title]")
    val descriptionTag = content("meta[name=description]")
    val author = content("meta[name=author]")

    // Extract canonical URL
    val canonicalTag = selectFirst("link[rel=canonical]")?.attr("href")

    // Fetch favicon. The href is resolved as a URL reference against the element's base URI rather
    // than by string concatenation, so hrefs such as "favicon.ico", "../img/icon.png" or
    // "//cdn.example.com/icon.png" and base URIs that contain a path produce a valid absolute URL.
    val faviconTag = selectFirst("link[rel~=icon]")?.let { iconLink ->
        val href = iconLink.attr("href")
        iconLink.absUrl("href").ifEmpty {
            // Nothing could be resolved through the element: keep the previous behaviour.
            if (href.startsWith("http") || baseUri.isEmpty()) href else baseUri + href
        }
    }

    // Create a MetaData object
    return MetaData(
        ogTitle = ogTitle,
        ogSiteName = ogSiteName,
        ogType = ogType,
        ogLocale = ogLocale,
        ogDescription = ogDescription,
        ogImage = ogImage,
        ogUrl = ogUrl,
        twitterCard = twitterCard,
        twitterTitle = twitterTitle,
        twitterDescription = twitterDescription,
        twitterImage = twitterImage,
        title = titleTag,
        description = descriptionTag,
        canonical = canonicalTag,
        htmlTitle = title,
        author = author,
        favicon = faviconTag,
    )
}
}
2 changes: 1 addition & 1 deletion ksoup/src/com/fleeksoft/ksoup/helper/DataUtil.kt
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public object DataUtil {
* @throws IOException on IO error
* @see Connection.Response.streamParser
*/
suspend fun streamParser(sourceReader: SourceReader, baseUri: String, charset: Charset?, parser: Parser): StreamParser {
fun streamParser(sourceReader: SourceReader, baseUri: String, charset: Charset?, parser: Parser): StreamParser {
val streamer = StreamParser(parser)
val charsetName: String? = charset?.name
val charsetDoc: CharsetDoc = detectCharset(sourceReader, baseUri, charsetName, parser)
Expand Down
21 changes: 21 additions & 0 deletions ksoup/src/com/fleeksoft/ksoup/model/MetaData.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.fleeksoft.ksoup.model

/**
 * Metadata extracted from a web page by `Ksoup.parseMetaData`.
 *
 * Every property defaults to `null`, which is also the value used when the corresponding
 * element was not found in the page.
 *
 * @property ogTitle content of `meta[property=og:title]`.
 * @property ogSiteName content of `meta[property=og:site_name]`.
 * @property ogDescription content of `meta[property=og:description]`.
 * @property ogImage content of `meta[property=og:image]`.
 * @property ogUrl content of `meta[property=og:url]`.
 * @property ogType content of `meta[property=og:type]`.
 * @property ogLocale content of `meta[property=og:locale]`.
 * @property twitterCard content of `meta[name=twitter:card]`.
 * @property twitterTitle content of `meta[name=twitter:title]`.
 * @property twitterDescription content of `meta[name=twitter:description]`.
 * @property twitterImage content of `meta[name=twitter:image]`.
 * @property title content of `meta[name=title]`; see [htmlTitle] for the `<title>` element.
 * @property description content of `meta[name=description]`.
 * @property canonical href of `link[rel=canonical]`.
 * @property htmlTitle text of the page's `<title>` element.
 * @property author content of `meta[name=author]`.
 * @property favicon href of the first `link[rel~=icon]`, combined with the base URI when the
 * href is not already absolute and a base URI is known.
 */
data class MetaData(
    // Open Graph
    val ogTitle: String? = null,
    val ogSiteName: String? = null,
    val ogDescription: String? = null,
    val ogImage: String? = null,
    val ogUrl: String? = null,
    val ogType: String? = null,
    val ogLocale: String? = null,

    // Twitter
    val twitterCard: String? = null,
    val twitterTitle: String? = null,
    val twitterDescription: String? = null,
    val twitterImage: String? = null,

    // Standard tags and links
    val title: String? = null,
    val description: String? = null,
    val canonical: String? = null,
    val htmlTitle: String? = null,
    val author: String? = null,
    val favicon: String? = null,
)

0 comments on commit 54a7d28

Please sign in to comment.