Skip to content

Commit

Permalink
allow custom cmark options and extensions when converting Markdown
Browse files Browse the repository at this point in the history
updated from: swiftlang#23
  • Loading branch information
CreatureSurvive committed Dec 31, 2023
1 parent c211079 commit 5df006a
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 30 deletions.
34 changes: 26 additions & 8 deletions Sources/Markdown/Base/Document.swift
Original file line number Diff line number Diff line change
Expand Up @@ -38,31 +38,49 @@ public extension Document {
/// Parse a string into a `Document`.
///
/// - parameter string: the input Markdown text to parse.
/// - parameter options: options for parsing Markdown text.
/// - parameter source: an explicit source URL from which the input `string` came for marking source locations.
/// - parameter options: options for parsing Markdown text, including
/// Commonmark-specific options and extensions.
/// This need not be a file URL.
init(parsing string: String, source: URL? = nil, options: ParseOptions = []) {
if options.contains(.parseBlockDirectives) {
init(parsing string: String, source: URL? = nil, convertOptions options: ConvertOptions) {
if options.parseOptions.contains(.parseBlockDirectives) {
self = BlockDirectiveParser.parse(string, source: source,
options: options)
} else {
self = MarkupParser.parseString(string, source: source, options: options)
}
}

/// Parse a string into a `Document` using only ``ParseOptions``.
///
/// The given parse options are wrapped with `ConvertOptions(fromParseOptions:)`
/// and parsing is delegated to `init(parsing:source:convertOptions:)`.
///
/// - parameter string: the input Markdown text to parse.
/// - parameter source: an explicit source URL from which the input `string` came for marking source locations.
///   This need not be a file URL.
/// - parameter options: options for parsing Markdown text.
init(parsing string: String, source: URL? = nil, options: ParseOptions = []) {
    let convertOptions = ConvertOptions(fromParseOptions: options)
    self.init(parsing: string, source: source, convertOptions: convertOptions)
}

/// Parse a file's contents into a `Document`.
///
/// - parameter file: a file URL from which to load Markdown text to parse.
/// - parameter options: options for parsing Markdown text.
init(parsing file: URL, options: ParseOptions = []) throws {
/// - parameter options: options for parsing Markdown text, including
/// Commonmark-specific options and extensions.
init(parsing file: URL, convertOptions options: ConvertOptions) throws {
let string = try String(contentsOf: file)
if options.contains(.parseBlockDirectives) {
if options.parseOptions.contains(.parseBlockDirectives) {
self = BlockDirectiveParser.parse(string, source: file,
options: options)
} else {
self = MarkupParser.parseString(string, source: file, options: options)
}
}

/// Parse a file's contents into a `Document` using only ``ParseOptions``.
///
/// The given parse options are wrapped with `ConvertOptions(fromParseOptions:)`
/// and parsing is delegated to `init(parsing:convertOptions:)`.
///
/// - parameter file: a file URL from which to load Markdown text to parse.
/// - parameter options: options for parsing Markdown text.
/// - throws: whatever `init(parsing:convertOptions:)` throws.
init(parsing file: URL, options: ParseOptions = []) throws {
    let convertOptions = ConvertOptions(fromParseOptions: options)
    try self.init(parsing: file, convertOptions: convertOptions)
}

/// Create a document from a sequence of block markup elements.
init<Children: Sequence>(_ children: Children) where Children.Element == BlockMarkup {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@
### Options

- ``MarkupDumpOptions``
- ``ParseOptions``
- ``ConvertOptions``

<!-- Copyright (c) 2021-2022 Apple Inc and the Swift Project authors. All Rights Reserved. -->
16 changes: 8 additions & 8 deletions Sources/Markdown/Parser/BlockDirectiveParser.swift
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ private enum ParseContainer: CustomStringConvertible {
/// A Doxygen command, which can contain arbitrary markup (but not block directives).
case doxygenCommand(PendingDoxygenCommand, [TrimmedLine])

init<TrimmedLines: Sequence>(parsingHierarchyFrom trimmedLines: TrimmedLines, options: ParseOptions) where TrimmedLines.Element == TrimmedLine {
init<TrimmedLines: Sequence>(parsingHierarchyFrom trimmedLines: TrimmedLines, options: ConvertOptions) where TrimmedLines.Element == TrimmedLine {
self = ParseContainerStack(parsingHierarchyFrom: trimmedLines, options: options).top
}

Expand Down Expand Up @@ -663,7 +663,7 @@ private enum ParseContainer: CustomStringConvertible {
/// Convert this container to the corresponding ``RawMarkup`` node.
func convertToRawMarkup(ranges: inout RangeTracker,
parent: ParseContainer?,
options: ParseOptions) -> [RawMarkup] {
options: ConvertOptions) -> [RawMarkup] {
switch self {
case let .root(children):
let rawChildren = children.flatMap {
Expand Down Expand Up @@ -749,9 +749,9 @@ struct ParseContainerStack {
/// The stack of containers to be incrementally folded into a hierarchy.
private var stack: [ParseContainer]

private let options: ParseOptions
private let options: ConvertOptions

init<TrimmedLines: Sequence>(parsingHierarchyFrom trimmedLines: TrimmedLines, options: ParseOptions) where TrimmedLines.Element == TrimmedLine {
init<TrimmedLines: Sequence>(parsingHierarchyFrom trimmedLines: TrimmedLines, options: ConvertOptions) where TrimmedLines.Element == TrimmedLine {
self.stack = [.root([])]
self.options = options
for line in trimmedLines {
Expand All @@ -772,7 +772,7 @@ struct ParseContainerStack {
}

private var canParseDoxygenCommand: Bool {
guard options.contains(.parseMinimalDoxygen) else { return false }
guard options.parseOptions.contains(.parseMinimalDoxygen) else { return false }

guard !isInBlockDirective else { return false }

Expand Down Expand Up @@ -1105,7 +1105,7 @@ extension Document {
///
/// - Precondition: The `rootContainer` must be the `.root` case.
fileprivate init(converting rootContainer: ParseContainer, from source: URL?,
options: ParseOptions) {
options: ConvertOptions) {
guard case .root = rootContainer else {
fatalError("Tried to convert a non-root container to a `Document`")
}
Expand All @@ -1128,14 +1128,14 @@ extension Document {
}

struct BlockDirectiveParser {
static func parse(_ input: URL, options: ParseOptions = []) throws -> Document {
static func parse(_ input: URL, options: ConvertOptions = .init()) throws -> Document {
let string = try String(contentsOf: input, encoding: .utf8)
return parse(string, source: input, options: options)
}

/// Parse the input.
static func parse(_ input: String, source: URL?,
options: ParseOptions = []) -> Document {
options: ConvertOptions = .init()) -> Document {
// Phase 0: Split the input into lines lazily, keeping track of
// line numbers, consecutive blank lines, and start positions on each line where indentation ends.
// These trim points may be used to adjust the indentation seen by the CommonMark parser when
Expand Down
21 changes: 7 additions & 14 deletions Sources/Markdown/Parser/CommonMarkConverter.swift
Original file line number Diff line number Diff line change
Expand Up @@ -608,25 +608,18 @@ struct MarkupParser {
return MarkupConversion(state: childConversion.state.next(), result: .inlineAttributes(attributes: attributes, parsedRange: parsedRange, childConversion.result))
}

static func parseString(_ string: String, source: URL?, options: ParseOptions) -> Document {
static func parseString(_ string: String, source: URL?, options: ConvertOptions) -> Document {
cmark_gfm_core_extensions_ensure_registered()

var cmarkOptions = CMARK_OPT_TABLE_SPANS
if !options.contains(.disableSmartOpts) {
cmarkOptions |= CMARK_OPT_SMART
}
if !options.contains(.disableSourcePosOpts) {
cmarkOptions |= CMARK_OPT_SOURCEPOS
}

let parser = cmark_parser_new(cmarkOptions)
let parser = cmark_parser_new(options.commonmarkOptions.rawValue)

for ext in options.commonmarkExtensions {
cmark_parser_attach_syntax_extension(parser, cmark_find_syntax_extension(ext))
}

cmark_parser_attach_syntax_extension(parser, cmark_find_syntax_extension("table"))
cmark_parser_attach_syntax_extension(parser, cmark_find_syntax_extension("strikethrough"))
cmark_parser_attach_syntax_extension(parser, cmark_find_syntax_extension("tasklist"))
cmark_parser_feed(parser, string, string.utf8.count)
let rawDocument = cmark_parser_finish(parser)
let initialState = MarkupConverterState(source: source, iterator: cmark_iter_new(rawDocument), event: CMARK_EVENT_NONE, node: nil, options: options, headerSeen: false, pendingTableBody: nil).next()
let initialState = MarkupConverterState(source: source, iterator: cmark_iter_new(rawDocument), event: CMARK_EVENT_NONE, node: nil, options: options.parseOptions, headerSeen: false, pendingTableBody: nil).next()
precondition(initialState.event == CMARK_EVENT_ENTER)
precondition(initialState.nodeType == .document)
let conversion = convertAnyElement(initialState)
Expand Down
132 changes: 132 additions & 0 deletions Sources/Markdown/Parser/ConvertOptions.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
This source file is part of the Swift.org open source project

Copyright (c) 2021 Apple Inc. and the Swift project authors
Licensed under Apache License v2.0 with Runtime Library Exception

See https://swift.org/LICENSE.txt for license information
See https://swift.org/CONTRIBUTORS.txt for Swift project authors
*/

import cmark_gfm

/// Options to use when converting Markdown.
///
/// Bundles this library's ``ParseOptions`` together with the option flags and
/// the syntax-extension names that are passed to the Commonmark parser.
public struct ConvertOptions {
    /// The parse options used by `init()`: the empty set.
    public static let defaultParseOptions: ParseOptions = []

    /// The Commonmark options that `init(fromParseOptions:)` starts from.
    public static let defaultCommonmarkOptions: CommonmarkOptions = [
        .smart,
        .tableSpans,
        .sourcepos
    ]

    /// The Commonmark extension names that `init(fromParseOptions:)` enables.
    public static let defaultCommonmarkExtensions: [String] = [
        "table",
        "strikethrough",
        "tasklist",
    ]

    /// Options interpreted by this library's own parsing stages.
    public let parseOptions: ParseOptions

    /// Option flags passed to the Commonmark parser.
    public let commonmarkOptions: CommonmarkOptions

    /// Names of the Commonmark syntax extensions to attach to the parser.
    ///
    /// Each name is looked up with `cmark_find_syntax_extension` at parse time.
    /// NOTE(review): a name that is not registered presumably produces a null
    /// extension — confirm how the parser handles that before accepting
    /// arbitrary caller-supplied names.
    public let commonmarkExtensions: [String]

    /// Creates options from explicitly chosen values; nothing is derived or defaulted.
    ///
    /// - parameter parseOptions: options for this library's parsing stages.
    /// - parameter commonmarkOptions: option flags for the Commonmark parser.
    /// - parameter extensions: names of the Commonmark syntax extensions to attach.
    public init(parseOptions: ParseOptions, commonmarkOptions: CommonmarkOptions, extensions: [String]) {
        self.parseOptions = parseOptions
        self.commonmarkOptions = commonmarkOptions
        self.commonmarkExtensions = extensions
    }

    /// Creates options equivalent to parsing with `options` alone.
    ///
    /// Starts from `defaultCommonmarkOptions` and `defaultCommonmarkExtensions`,
    /// dropping `.smart` when `options` contains `.disableSmartOpts` and
    /// `.sourcepos` when it contains `.disableSourcePosOpts`.
    ///
    /// - parameter options: the parse options to derive the Commonmark options from.
    public init(fromParseOptions options: ParseOptions) {
        // Collect the flags the caller opted out of, then subtract them in one step.
        var disabled: CommonmarkOptions = []
        if options.contains(.disableSmartOpts) {
            disabled.insert(.smart)
        }
        if options.contains(.disableSourcePosOpts) {
            disabled.insert(.sourcepos)
        }

        self.init(
            parseOptions: options,
            commonmarkOptions: ConvertOptions.defaultCommonmarkOptions.subtracting(disabled),
            extensions: ConvertOptions.defaultCommonmarkExtensions
        )
    }

    /// Creates options from `defaultParseOptions`.
    public init() {
        self.init(fromParseOptions: ConvertOptions.defaultParseOptions)
    }
}

/// Options given to the Commonmark converter.
///
/// Each member wraps the `CMARK_OPT_*` constant of the corresponding name.
public struct CommonmarkOptions: OptionSet {
    public var rawValue: Int32

    public init(rawValue: Int32) {
        self.rawValue = rawValue
    }

    /// The default Commonmark behavior, no special options.
    public static let `default`: CommonmarkOptions = .init(rawValue: CMARK_OPT_DEFAULT)

    /// Include a `data-sourcepos` attribute on all block elements.
    public static let sourcepos: CommonmarkOptions = .init(rawValue: CMARK_OPT_SOURCEPOS)

    /// Render `softbreak` elements as hard line breaks.
    public static let hardBreaks: CommonmarkOptions = .init(rawValue: CMARK_OPT_HARDBREAKS)

    /// Render raw HTML and unsafe links.
    ///
    /// Unsafe links are `javascript:`, `vbscript:`, `file:`, and
    /// `data:`, except for `image/png`, `image/gif`, `image/jpeg`
    /// or `image/webp` MIME types. Without this option, raw HTML
    /// is replaced by a placeholder HTML comment. Unsafe links
    /// are replaced by empty strings.
    public static let unsafe: CommonmarkOptions = .init(rawValue: CMARK_OPT_UNSAFE)

    /// Render `softbreak` elements as spaces.
    public static let noBreaks: CommonmarkOptions = .init(rawValue: CMARK_OPT_NOBREAKS)

    /// Validate UTF-8 in the input before parsing, replacing illegal
    /// sequences with the replacement character `U+FFFD`.
    public static let validateUtf8: CommonmarkOptions = .init(rawValue: CMARK_OPT_VALIDATE_UTF8)

    /// Convert straight quotes to curly, `---` to em dashes, `--` to en dashes.
    public static let smart: CommonmarkOptions = .init(rawValue: CMARK_OPT_SMART)

    /// Use GitHub-style `<pre lang="x">` tags for code blocks instead of
    /// `<pre><code class="language-x">`.
    public static let githubPreLang: CommonmarkOptions = .init(rawValue: CMARK_OPT_GITHUB_PRE_LANG)

    /// Be liberal in interpreting inline HTML tags.
    public static let liberalHtmlTag: CommonmarkOptions = .init(rawValue: CMARK_OPT_LIBERAL_HTML_TAG)

    /// Parse footnotes.
    public static let footnotes: CommonmarkOptions = .init(rawValue: CMARK_OPT_FOOTNOTES)

    /// Only parse strikethroughs if surrounded by exactly 2 tildes.
    ///
    /// Strikethroughs are still only parsed when the `"strikethrough"`
    /// extension is enabled.
    public static let strikethroughDoubleTilde: CommonmarkOptions = .init(rawValue: CMARK_OPT_STRIKETHROUGH_DOUBLE_TILDE)

    /// Use style attributes to align table cells instead of align attributes.
    public static let tablePreferStyleAttributes: CommonmarkOptions = .init(rawValue: CMARK_OPT_TABLE_PREFER_STYLE_ATTRIBUTES)

    /// Include the remainder of the info string in code blocks in
    /// a separate attribute.
    public static let fullInfoString: CommonmarkOptions = .init(rawValue: CMARK_OPT_FULL_INFO_STRING)

    /// Parse only inline markdown directives. Block directives will not be
    /// parsed (their literal representations will remain in the output).
    public static let inlineOnly: CommonmarkOptions = .init(rawValue: CMARK_OPT_INLINE_ONLY)

    // FIXME: the original `CMARK_OPT_PRESERVE_WHITESPACE` isn't available to the swift compiler?
    // Until it is, the bit that is combined with `CMARK_OPT_INLINE_ONLY` is spelled out by hand.
    private static let preserveWhitespaceBit: Int32 = 1 << 19

    /// Parse the markdown input without removing preceding/trailing whitespace and
    /// without converting newline characters to breaks.
    ///
    /// Using this option also enables the `CMARK_OPT_INLINE_ONLY` option.
    public static let preserveWhitespace: CommonmarkOptions = .init(rawValue: preserveWhitespaceBit | CMARK_OPT_INLINE_ONLY)

    /// Enable the row- and column-span syntax in the tables extension.
    public static let tableSpans: CommonmarkOptions = .init(rawValue: CMARK_OPT_TABLE_SPANS)

    /// Use a "ditto mark" (`"`) instead of a caret (`^`) to indicate row-spans in the tables extension.
    public static let tableRowspanDitto: CommonmarkOptions = .init(rawValue: CMARK_OPT_TABLE_ROWSPAN_DITTO)
}
27 changes: 27 additions & 0 deletions Tests/MarkdownTests/Parsing/CommonMarkConverterTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,31 @@ class CommonMarkConverterTests: XCTestCase {
let document = Document(parsing: text, source: nil, options: [.parseBlockDirectives, .parseSymbolLinks])
XCTAssertEqual(expectedDump, document.debugDescription(options: .printSourceLocations))
}

/// Test using a custom set of Commonmark options to convert Markdown.
///
/// Parses a single line with `.strikethroughDoubleTilde` as the only Commonmark
/// option and compares the document's debug dump, including source locations,
/// against the expected tree.
func testCustomOpts() {
let text = "~This is not strikethrough~ -- but ~~this is strikethrough~~."

// Because the "smart" option is not set, the `--` should not be converted
// to an en-dash.
// The single-tilde span is expected to stay plain text, while the
// double-tilde span becomes a `Strikethrough` node.
// NOTE(review): the tree lines below carry no nesting indentation — confirm
// the literal matches the exact whitespace `debugDescription` produces.
let expectedDump = """
Document @1:1-1:62
└─ Paragraph @1:1-1:62
├─ Text @1:1-1:36 "~This is not strikethrough~ -- but "
├─ Strikethrough @1:36-1:61
│ └─ Text @1:38-1:59 "this is strikethrough"
└─ Text @1:61-1:62 "."
"""

// Default parse options and default extensions; only the Commonmark option
// set is customized, so `.smart` is absent.
let document = Document(
parsing: text,
source: nil,
convertOptions: .init(
parseOptions: ConvertOptions.defaultParseOptions,
commonmarkOptions: .strikethroughDoubleTilde,
extensions: ConvertOptions.defaultCommonmarkExtensions
)
)
XCTAssertEqual(expectedDump, document.debugDescription(options: .printSourceLocations))
}
}

0 comments on commit 5df006a

Please sign in to comment.