Skip to content

Commit

Permalink
feat: external XML sitemaps as sources
Browse files Browse the repository at this point in the history
  • Loading branch information
harlan-zw committed Jan 17, 2025
1 parent c7b1ebd commit add57d3
Show file tree
Hide file tree
Showing 6 changed files with 345 additions and 9 deletions.
3 changes: 2 additions & 1 deletion docs/content/2.guides/0.data-sources.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,8 @@ export default defineNuxtConfig({

If you need your sitemap data to always be up-to-date at runtime, you will need to provide your own sources explicitly.

A source is a URL that will be fetched and is expected to return an array of Sitemap URL entries.
A source is a URL that will be fetched and is expected to return either JSON with an array of Sitemap URL entries or
an XML sitemap.

::code-group

Expand Down
2 changes: 1 addition & 1 deletion docs/content/2.guides/0.multi-sitemaps.md
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ export default defineSitemapEventHandler(() => {
If you need to fetch the URLs from an endpoint for a sitemap, then you will need to use either the `urls` or `sources` option.

- `urls` - Array of static URLs to include in the sitemap. You should avoid using this option if you have a lot of URLs
- `sources` - Custom endpoint to fetch [dynamic URLs](/docs/sitemap/guides/dynamic-urls) from.
- `sources` - Custom endpoint to fetch [dynamic URLs](/docs/sitemap/guides/dynamic-urls) from as JSON or XML.

```ts
export default defineNuxtConfig({
Expand Down
17 changes: 16 additions & 1 deletion docs/content/2.guides/2.dynamic-urls.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,22 @@ description: Use runtime API endpoints to generate dynamic URLs for your sitemap
In some instances, like using a CMS, you may need to implement an endpoint to make
all of your site URLs visible to the module.

To do this, you can provide [user sources](/docs/sitemap/getting-started/data-sources) to the module.
To do this, you can provide [user sources](/docs/sitemap/getting-started/data-sources) to the module. These can either be
a JSON response or an XML sitemap.

## XML Sitemap

If you're providing an XML sitemap, you can use the `sources` option to provide the URL to the sitemap.

```ts [nuxt.config.ts]
export default defineNuxtConfig({
sitemap: {
sources: [
'https://example.com/sitemap.xml',
]
}
})
```

## Dynamic URLs from an external API

Expand Down
23 changes: 17 additions & 6 deletions src/runtime/server/sitemap/urlset/sources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@ import { getRequestHost } from 'h3'
import type { H3Event } from 'h3'
import type { FetchError } from 'ofetch'
import { defu } from 'defu'
import { parseURL } from 'ufo'
import type {
ModuleRuntimeConfig,
SitemapSourceBase,
SitemapSourceResolved,
SitemapUrlInput,
} from '../../../types'
import { extractSitemapXML } from '#sitemap/server/sitemap/utils/extractSitemapXML'

export async function fetchDataSource(input: SitemapSourceBase | SitemapSourceResolved, event?: H3Event): Promise<SitemapSourceResolved> {
const context = typeof input.context === 'string' ? { name: input.context } : input.context || { name: 'fetch' }
Expand All @@ -21,24 +23,25 @@ export async function fetchDataSource(input: SitemapSourceBase | SitemapSourceRe
const timeoutController = new AbortController()
const abortRequestTimeout = setTimeout(() => timeoutController.abort(), timeout)

let isHtmlResponse = false
let isMaybeErrorResponse = false
const isXmlRequest = parseURL(url).pathname.endsWith('.xml')
const fetchContainer = (url.startsWith('/') && event) ? event : globalThis
try {
const fetchContainer = (url.startsWith('/') && event) ? event : globalThis
const urls = await fetchContainer.$fetch(url, {
const res = await fetchContainer.$fetch(url, {
...options,
responseType: 'json',
responseType: isXmlRequest ? 'json' : 'text',
signal: timeoutController.signal,
headers: defu(options?.headers, {
Accept: 'application/json',
}, event ? { Host: getRequestHost(event, { xForwardedHost: true }) } : {}),
// @ts-expect-error untyped
onResponse({ response }) {
if (typeof response._data === 'string' && response._data.startsWith('<!DOCTYPE html>'))
isHtmlResponse = true
isMaybeErrorResponse = true
},
})
const timeTakenMs = Date.now() - start
if (isHtmlResponse) {
if (isMaybeErrorResponse) {
context.tips.push('This is usually because the URL isn\'t correct or is throwing an error. Please check the URL')
return {
...input,
Expand All @@ -48,6 +51,14 @@ export async function fetchDataSource(input: SitemapSourceBase | SitemapSourceRe
error: 'Received HTML response instead of JSON',
}
}
let urls = []
if (typeof res === 'object') {
urls = res.urls || res
}
else if (typeof res === 'string' && parseURL(url).pathname.endsWith('.xml')) {
// fast pass XML extract all loc data, let's use
urls = extractSitemapXML(res)
}
return {
...input,
context,
Expand Down
101 changes: 101 additions & 0 deletions src/runtime/server/sitemap/utils/extractSitemapXML.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import type { SitemapUrlInput } from '../../../types'

/** The five entities predefined by the XML specification. */
const XML_ENTITIES: Record<string, string> = { amp: '&', lt: '<', gt: '>', quot: '"', apos: '\'' }

// Container elements that are sliced out of the document before their children are read.
const URL_BLOCK = /<url(?:\s[^>]*)?>[\s\S]*?<\/url>/g
const IMAGE_BLOCK = /<image:image(?:\s[^>]*)?>[\s\S]*?<\/image:image>/g
const VIDEO_BLOCK = /<video:video(?:\s[^>]*)?>[\s\S]*?<\/video:video>/g
const NEWS_BLOCK = /<news:news(?:\s[^>]*)?>[\s\S]*?<\/news:news>/
const ALTERNATE_LINK = /<xhtml:link\b[\s\S]*?\/>/g

/** Compiled leaf-element patterns, keyed by tag name, so large sitemaps don't recompile per `<url>`. */
const leafPatternCache = new Map<string, RegExp>()

interface XmlLeaf {
  /** Raw attribute string of the opening tag (may be empty). */
  attrs: string
  /** Trimmed, entity-decoded text content (may be empty). */
  text: string
}

/**
 * Decodes predefined XML entities and numeric character references in a single pass.
 * A single pass avoids double-decoding input such as `&amp;lt;`.
 */
function decodeXmlEntities(value: string): string {
  return value.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (match: string, ref: string) => {
    if (!ref.startsWith('#'))
      return XML_ENTITIES[ref] ?? match
    const isHex = ref.charAt(1) === 'x'
    const code = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10)
    // String.fromCodePoint throws on out-of-range values, so leave those references untouched
    return Number.isNaN(code) || code > 0x10FFFF ? match : String.fromCodePoint(code)
  })
}

/** Returns the (cached) pattern matching `<tag ...>text</tag>`, where text is plain or a CDATA section. */
function leafPattern(tag: string): RegExp {
  let pattern = leafPatternCache.get(tag)
  if (!pattern) {
    // tag names are internal constants made of word characters and ':', so no regex escaping is required
    pattern = new RegExp(`<${tag}(\\s[^>]*)?>(?:\\s*<!\\[CDATA\\[([\\s\\S]*?)\\]\\]>\\s*|([^<]*))</${tag}>`, 'g')
    leafPatternCache.set(tag, pattern)
  }
  return pattern
}

/** Finds every `<tag>` leaf element in `xml`, in document order. */
function findLeaves(xml: string, tag: string): XmlLeaf[] {
  const leaves: XmlLeaf[] = []
  // matchAll works on a clone of the cached regex, so the shared `lastIndex` is never mutated
  for (const m of xml.matchAll(leafPattern(tag))) {
    const cdata = m[2]
    // CDATA content is literal; plain text is trimmed first so an encoded space (&#32;) survives
    const text = cdata !== undefined ? cdata.trim() : decodeXmlEntities((m[3] ?? '').trim())
    leaves.push({ attrs: m[1] ?? '', text })
  }
  return leaves
}

/** Text of the first non-empty `<tag>` element, or `undefined` when there is none. */
function firstText(xml: string, tag: string): string | undefined {
  return findLeaves(xml, tag).find(leaf => leaf.text !== '')?.text
}

/** Like {@link firstText} but parsed as a number; unparsable values yield `undefined` instead of `NaN`. */
function firstNumber(xml: string, tag: string, parse: (raw: string) => number): number | undefined {
  const raw = firstText(xml, tag)
  if (raw === undefined)
    return undefined
  const parsed = parse(raw)
  return Number.isFinite(parsed) ? parsed : undefined
}

/** Reads a non-empty attribute value (double or single quoted) from a raw attribute string. */
function attrValue(attrs: string, name: string): string | undefined {
  const m = attrs.match(new RegExp(`(?:^|\\s)${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`))
  const raw = m?.[1] ?? m?.[2]
  return raw ? decodeXmlEntities(raw) : undefined
}

const parseInteger = (raw: string): number => Number.parseInt(raw, 10)

/**
 * Converts one `<video:video>` block into a video entry.
 * Returns `undefined` when any of title, thumbnail_loc, description or content_loc is missing.
 */
function parseVideo(video: string): Record<string, unknown> | undefined {
  const title = firstText(video, 'video:title')
  const thumbnail_loc = firstText(video, 'video:thumbnail_loc')
  const description = firstText(video, 'video:description')
  const content_loc = firstText(video, 'video:content_loc')
  if (!title || !thumbnail_loc || !description || !content_loc)
    return undefined

  const entry: Record<string, unknown> = { title, thumbnail_loc, description, content_loc }
  const setIfDefined = (key: string, value: unknown): void => {
    // explicit undefined check: 0 is a legitimate rating / view_count and must not be dropped
    if (value !== undefined)
      entry[key] = value
  }

  setIfDefined('player_loc', firstText(video, 'video:player_loc'))
  setIfDefined('duration', firstNumber(video, 'video:duration', parseInteger))
  setIfDefined('expiration_date', firstText(video, 'video:expiration_date'))
  setIfDefined('rating', firstNumber(video, 'video:rating', Number.parseFloat))
  setIfDefined('view_count', firstNumber(video, 'video:view_count', parseInteger))
  setIfDefined('publication_date', firstText(video, 'video:publication_date'))
  setIfDefined('family_friendly', firstText(video, 'video:family_friendly'))

  // restriction / platform are only emitted when both the relationship attribute and a value are present
  for (const key of ['restriction', 'platform']) {
    const leaf = findLeaves(video, `video:${key}`).find(l => l.text !== '')
    const relationship = leaf ? attrValue(leaf.attrs, 'relationship') : undefined
    if (leaf && relationship)
      entry[key] = { relationship, [key]: leaf.text }
  }

  const price = findLeaves(video, 'video:price')
    .filter(leaf => leaf.text !== '')
    .map(leaf => ({
      price: leaf.text,
      currency: attrValue(leaf.attrs, 'currency'),
      type: attrValue(leaf.attrs, 'type'),
    }))
  if (price.length)
    entry.price = price

  setIfDefined('requires_subscription', firstText(video, 'video:requires_subscription'))

  const uploader = findLeaves(video, 'video:uploader').find(l => l.text !== '')
  const info = uploader ? attrValue(uploader.attrs, 'info') : undefined
  if (uploader && info)
    entry.uploader = { uploader: uploader.text, info }

  setIfDefined('live', firstText(video, 'video:live'))

  const tag = findLeaves(video, 'video:tag').map(leaf => leaf.text).filter(text => text !== '')
  if (tag.length)
    entry.tag = tag

  return entry
}

/**
 * Extracts sitemap URL entries from a raw XML sitemap using a lightweight regex scan
 * (no XML parser dependency).
 *
 * For each `<url>` element it reads `loc`, `lastmod`, `changefreq`, `priority`, plus the
 * image, video, `xhtml:link` alternate and news extensions. Entries without a `<loc>` are
 * skipped, and undefined fields / empty collections are omitted from the result.
 *
 * Values are whitespace-trimmed and XML entities are decoded (e.g. `&amp;` becomes `&`);
 * CDATA sections are supported and taken literally.
 *
 * @param xml - Raw XML sitemap document.
 * @returns One entry per `<url>` element that has a non-empty `<loc>`.
 */
export function extractSitemapXML(xml: string): SitemapUrlInput[] {
  const entries: Record<string, unknown>[] = []

  for (const block of xml.match(URL_BLOCK) ?? []) {
    const loc = firstText(block, 'loc')
    if (!loc)
      continue

    const images = (block.match(IMAGE_BLOCK) ?? [])
      .map(image => firstText(image, 'image:loc'))
      .filter((imageLoc): imageLoc is string => imageLoc !== undefined)
      .map(imageLoc => ({ loc: imageLoc }))

    const videos = (block.match(VIDEO_BLOCK) ?? [])
      .map(parseVideo)
      .filter((video): video is Record<string, unknown> => video !== undefined)

    const alternatives = (block.match(ALTERNATE_LINK) ?? [])
      .map((link) => {
        const hreflang = attrValue(link, 'hreflang')
        const href = attrValue(link, 'href')
        return hreflang && href ? { hreflang, href } : undefined
      })
      .filter((alt): alt is { hreflang: string, href: string } => alt !== undefined)

    // news fields are looked up inside the <news:news> block only, not the whole <url>
    const newsBlock = block.match(NEWS_BLOCK)?.[0]
    const news = newsBlock === undefined
      ? undefined
      : {
          title: firstText(newsBlock, 'news:title'),
          publication_date: firstText(newsBlock, 'news:publication_date'),
          publication: {
            name: firstText(newsBlock, 'news:name'),
            language: firstText(newsBlock, 'news:language'),
          },
        }

    const entry: Record<string, unknown> = {
      loc,
      lastmod: firstText(block, 'lastmod'),
      changefreq: firstText(block, 'changefreq'),
      priority: firstNumber(block, 'priority', Number.parseFloat),
      images,
      videos,
      alternatives,
      news,
    }
    // omit missing fields and empty collections so the output only carries what the XML declared
    entries.push(Object.fromEntries(
      Object.entries(entry).filter(([, value]) => value !== undefined && !(Array.isArray(value) && value.length === 0)),
    ))
  }

  // entries are assembled dynamically from XML; the compiler cannot verify them against SitemapUrlInput
  return entries as unknown as SitemapUrlInput[]
}
Loading

0 comments on commit add57d3

Please sign in to comment.