diff --git a/docs/content/2.guides/0.data-sources.md b/docs/content/2.guides/0.data-sources.md index 8f86aa5..8fdf827 100644 --- a/docs/content/2.guides/0.data-sources.md +++ b/docs/content/2.guides/0.data-sources.md @@ -72,7 +72,8 @@ export default defineNuxtConfig({ If you need your sitemap data to always be up-to-date at runtime, you will need to provide your own sources explicitly. -A source is a URL that will be fetched and is expected to return an array of Sitemap URL entries. +A source is a URL that will be fetched and is expected to return either JSON with an array of Sitemap URL entries or +an XML sitemap. ::code-group diff --git a/docs/content/2.guides/0.multi-sitemaps.md b/docs/content/2.guides/0.multi-sitemaps.md index 84e1d82..11305d6 100644 --- a/docs/content/2.guides/0.multi-sitemaps.md +++ b/docs/content/2.guides/0.multi-sitemaps.md @@ -166,7 +166,7 @@ export default defineSitemapEventHandler(() => { If you need to fetch the URLs from an endpoint for a sitemap, then you will need to use either the `urls` or `sources` option. - `urls` - Array of static URLs to include in the sitemap. You should avoid using this option if you have a lot of URLs -- `sources` - Custom endpoint to fetch [dynamic URLs](/docs/sitemap/guides/dynamic-urls) from. +- `sources` - Custom endpoint to fetch [dynamic URLs](/docs/sitemap/guides/dynamic-urls) from, as JSON or XML. ```ts export default defineNuxtConfig({ diff --git a/docs/content/2.guides/2.dynamic-urls.md b/docs/content/2.guides/2.dynamic-urls.md index 8d61946..69d0e01 100644 --- a/docs/content/2.guides/2.dynamic-urls.md +++ b/docs/content/2.guides/2.dynamic-urls.md @@ -8,7 +8,22 @@ description: Use runtime API endpoints to generate dynamic URLs for your sitemap In some instances, like using a CMS, you may need to implement an endpoint to make all of your site URLs visible to the module. -To do this, you can provide [user sources](/docs/sitemap/getting-started/data-sources) to the module. 
+To do this, you can provide [user sources](/docs/sitemap/getting-started/data-sources) to the module. These can either be +a JSON response or an XML sitemap. + +## XML Sitemap + +If you're providing an XML sitemap, you can use the `sources` option to provide the URL to the sitemap. + +```ts [nuxt.config.ts] +export default defineNuxtConfig({ + sitemap: { + sources: [ + 'https://example.com/sitemap.xml', + ] + } +}) +``` ## Dynamic URLs from an external API diff --git a/src/runtime/server/sitemap/urlset/sources.ts b/src/runtime/server/sitemap/urlset/sources.ts index c206795..6146b76 100644 --- a/src/runtime/server/sitemap/urlset/sources.ts +++ b/src/runtime/server/sitemap/urlset/sources.ts @@ -2,12 +2,14 @@ import { getRequestHost } from 'h3' import type { H3Event } from 'h3' import type { FetchError } from 'ofetch' import { defu } from 'defu' +import { parseURL } from 'ufo' import type { ModuleRuntimeConfig, SitemapSourceBase, SitemapSourceResolved, SitemapUrlInput, } from '../../../types' +import { extractSitemapXML } from '../utils/extractSitemapXML' export async function fetchDataSource(input: SitemapSourceBase | SitemapSourceResolved, event?: H3Event): Promise { const context = typeof input.context === 'string' ? { name: input.context } : input.context || { name: 'fetch' } @@ -21,24 +23,25 @@ export async function fetchDataSource(input: SitemapSourceBase | SitemapSourceRe const timeoutController = new AbortController() const abortRequestTimeout = setTimeout(() => timeoutController.abort(), timeout) - let isHtmlResponse = false + let isMaybeErrorResponse = false + const isXmlRequest = parseURL(url).pathname.endsWith('.xml') + const fetchContainer = (url.startsWith('/') && event) ? event : globalThis try { - const fetchContainer = (url.startsWith('/') && event) ? event : globalThis - const urls = await fetchContainer.$fetch(url, { + const res = await fetchContainer.$fetch(url, { ...options, - responseType: 'json', + responseType: isXmlRequest ? 
'text' : 'json', signal: timeoutController.signal, headers: defu(options?.headers, { - Accept: 'application/json', + Accept: isXmlRequest ? 'text/xml' : 'application/json', }, event ? { Host: getRequestHost(event, { xForwardedHost: true }) } : {}), // @ts-expect-error untyped onResponse({ response }) { if (typeof response._data === 'string' && response._data.startsWith('')) - isHtmlResponse = true + isMaybeErrorResponse = true }, }) const timeTakenMs = Date.now() - start - if (isHtmlResponse) { + if (isMaybeErrorResponse) { context.tips.push('This is usually because the URL isn\'t correct or is throwing an error. Please check the URL') return { ...input, @@ -48,6 +51,14 @@ export async function fetchDataSource(input: SitemapSourceBase | SitemapSourceRe error: 'Received HTML response instead of JSON', } } + let urls = [] + if (typeof res === 'object') { + urls = res.urls || res + } + else if (typeof res === 'string' && parseURL(url).pathname.endsWith('.xml')) { + // fast pass XML extract all loc data, let's use + urls = extractSitemapXML(res) + } return { ...input, context, diff --git a/src/runtime/server/sitemap/utils/extractSitemapXML.ts b/src/runtime/server/sitemap/utils/extractSitemapXML.ts new file mode 100644 index 0000000..2576403 --- /dev/null +++ b/src/runtime/server/sitemap/utils/extractSitemapXML.ts @@ -0,0 +1,101 @@ +import type { SitemapUrlInput } from '../../../types' + +export function extractSitemapXML(xml: string): SitemapUrlInput[] { + const urls = xml.match(/[\s\S]*?<\/url>/g) || [] + return urls.map((url) => { + const loc = url.match(/([^<]+)<\/loc>/)?.[1] + if (!loc) return null + + const lastmod = url.match(/([^<]+)<\/lastmod>/)?.[1] + const changefreq = url.match(/([^<]+)<\/changefreq>/)?.[1] + const priority = url.match(/([^<]+)<\/priority>/) ? 
Number.parseFloat(url.match(/([^<]+)<\/priority>/)[1]) : undefined + + const images = (url.match(/[\s\S]*?<\/image:image>/g) || []).map((image) => { + const imageLoc = image.match(/([^<]+)<\/image:loc>/)?.[1] + return imageLoc ? { loc: imageLoc } : null + }).filter(Boolean) + + const videos = (url.match(/[\s\S]*?<\/video:video>/g) || []).map((video) => { + const videoObj: any = {} + const title = video.match(/([^<]+)<\/video:title>/)?.[1] + const thumbnail_loc = video.match(/([^<]+)<\/video:thumbnail_loc>/)?.[1] + const description = video.match(/([^<]+)<\/video:description>/)?.[1] + const content_loc = video.match(/([^<]+)<\/video:content_loc>/)?.[1] + if (!title || !thumbnail_loc || !description || !content_loc) return null + + videoObj.title = title + videoObj.thumbnail_loc = thumbnail_loc + videoObj.description = description + videoObj.content_loc = content_loc + + const player_loc = video.match(/([^<]+)<\/video:player_loc>/)?.[1] + if (player_loc) videoObj.player_loc = player_loc + + const duration = video.match(/([^<]+)<\/video:duration>/) ? Number.parseInt(video.match(/([^<]+)<\/video:duration>/)[1], 10) : undefined + if (duration) videoObj.duration = duration + + const expiration_date = video.match(/([^<]+)<\/video:expiration_date>/)?.[1] + if (expiration_date) videoObj.expiration_date = expiration_date + + const rating = video.match(/([^<]+)<\/video:rating>/) ? Number.parseFloat(video.match(/([^<]+)<\/video:rating>/)[1]) : undefined + if (rating) videoObj.rating = rating + + const view_count = video.match(/([^<]+)<\/video:view_count>/) ? 
Number.parseInt(video.match(/([^<]+)<\/video:view_count>/)[1], 10) : undefined + if (view_count) videoObj.view_count = view_count + + const publication_date = video.match(/([^<]+)<\/video:publication_date>/)?.[1] + if (publication_date) videoObj.publication_date = publication_date + + const family_friendly = video.match(/([^<]+)<\/video:family_friendly>/)?.[1] + if (family_friendly) videoObj.family_friendly = family_friendly + + const restriction = video.match(/([^<]+)<\/video:restriction>/) + if (restriction) videoObj.restriction = { relationship: restriction[1], restriction: restriction[2] } + + const platform = video.match(/([^<]+)<\/video:platform>/) + if (platform) videoObj.platform = { relationship: platform[1], platform: platform[2] } + + const price = (video.match(/]+>([^<]+)<\/video:price>/g) || []).map((price) => { + const priceValue = price.match(/]+>([^<]+)<\/video:price>/)?.[1] + const currency = price.match(/currency="([^"]+)"/)?.[1] + const type = price.match(/type="([^"]+)"/)?.[1] + return priceValue ? { price: priceValue, currency, type } : null + }).filter(Boolean) + if (price.length) videoObj.price = price + + const requires_subscription = video.match(/([^<]+)<\/video:requires_subscription>/)?.[1] + if (requires_subscription) videoObj.requires_subscription = requires_subscription + + const uploader = video.match(/([^<]+)<\/video:uploader>/) + if (uploader) videoObj.uploader = { uploader: uploader[2], info: uploader[1] } + + const live = video.match(/([^<]+)<\/video:live>/)?.[1] + if (live) videoObj.live = live + + const tag = (video.match(/([^<]+)<\/video:tag>/g) || []).map(tag => tag.match(/([^<]+)<\/video:tag>/)?.[1]).filter(Boolean) + if (tag.length) videoObj.tag = tag + + return videoObj + }).filter(Boolean) + + const alternatives = (url.match(//g) || []).map((link) => { + const hreflang = link.match(/hreflang="([^"]+)"/)?.[1] + const href = link.match(/href="([^"]+)"/)?.[1] + return hreflang && href ? 
{ hreflang, href } : null + }).filter(Boolean) + + const news = url.match(/[\s\S]*?<\/news:news>/) + ? { + title: url.match(/([^<]+)<\/news:title>/)?.[1], + publication_date: url.match(/([^<]+)<\/news:publication_date>/)?.[1], + publication: { + name: url.match(/([^<]+)<\/news:name>/)?.[1], + language: url.match(/([^<]+)<\/news:language>/)?.[1], + }, + } + : undefined + + const urlObj: any = { loc, lastmod, changefreq, priority, images, videos, alternatives, news } + return Object.fromEntries(Object.entries(urlObj).filter(([_, v]) => v != null && v.length !== 0)) + }).filter(Boolean) as any as SitemapUrlInput[] +} diff --git a/test/unit/extractSitemapXML.ts b/test/unit/extractSitemapXML.ts new file mode 100644 index 0000000..939b338 --- /dev/null +++ b/test/unit/extractSitemapXML.ts @@ -0,0 +1,208 @@ +import { describe, it, expect } from 'vitest' +import { extractSitemapXML } from '../../src/runtime/server/sitemap/utils/extractSitemapXML' + +describe('extractSitemapXML', () => { + it('should extract loc, lastmod, changefreq, priority, images, videos, alternatives, and news from XML', () => { + const xml = ` + + + http://example.com/ + 2023-01-01 + daily + 0.8 + + http://example.com/image1.jpg + + + Example Video + http://example.com/thumbnail.jpg + Example Description + http://example.com/video1.mp4 + 600 + + + + Example News + 2023-01-01 + + Example Publication + en + + + + + ` + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(` + [ + { + "alternatives": [ + { + "href": "http://example.com/en", + "hreflang": "en", + }, + ], + "changefreq": "daily", + "images": [ + { + "loc": "http://example.com/image1.jpg", + }, + ], + "lastmod": "2023-01-01", + "loc": "http://example.com/", + "news": { + "publication": { + "language": "en", + "name": "Example Publication", + }, + "publication_date": "2023-01-01", + "title": "Example News", + }, + "priority": 0.8, + "videos": [ + { + "content_loc": "http://example.com/video1.mp4", + "description": 
"Example Description", + "duration": 600, + "thumbnail_loc": "http://example.com/thumbnail.jpg", + "title": "Example Video", + }, + ], + }, + ] + `) + }) + + it('should handle missing optional fields', () => { + const xml = ` + + + http://example.com/ + 2023-01-01 + daily + + + ` + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(` + [ + { + "changefreq": "daily", + "lastmod": "2023-01-01", + "loc": "http://example.com/", + }, + ] + `) + }) + + it('should handle multiple images and videos', () => { + const xml = ` + + + http://example.com/ + + http://example.com/image1.jpg + + + http://example.com/image2.jpg + + + Example Video 1 + http://example.com/thumbnail1.jpg + Example Description 1 + http://example.com/video1.mp4 + + + Example Video 2 + http://example.com/thumbnail2.jpg + Example Description 2 + http://example.com/video2.mp4 + + + + ` + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(` + [ + { + "images": [ + { + "loc": "http://example.com/image1.jpg", + }, + { + "loc": "http://example.com/image2.jpg", + }, + ], + "loc": "http://example.com/", + "videos": [ + { + "content_loc": "http://example.com/video1.mp4", + "description": "Example Description 1", + "thumbnail_loc": "http://example.com/thumbnail1.jpg", + "title": "Example Video 1", + }, + { + "content_loc": "http://example.com/video2.mp4", + "description": "Example Description 2", + "thumbnail_loc": "http://example.com/thumbnail2.jpg", + "title": "Example Video 2", + }, + ], + }, + ] + `) + }) + + it('should handle missing loc, lastmod, and changefreq', () => { + const xml = ` + + + + http://example.com/image1.jpg + + + + ` + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(`[]`) + }) + + it('should return an empty array if no URLs are found', () => { + const xml = '' + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(`[]`) + }) + + it('should handle malformed XML', () => { + const xml = ` + + 
+ http://example.com/ + 2023-01-01 + daily + + + ` + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(`[]`) + }) + + it('should handle XML with unexpected tags', () => { + const xml = ` + + + http://example.com/ + unexpectedValue + + + ` + const result = extractSitemapXML(xml) + expect(result).toMatchInlineSnapshot(` + [ + { + "loc": "http://example.com/", + }, + ] + `) + }) +})