diff --git a/README.md b/README.md index 787d92e..440a1f9 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,8 @@ [![Package Version](https://img.shields.io/hexpm/v/glubs)](https://hex.pm/packages/glubs) [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/glubs/) -glubs (gleam subtitles) is a WebVTT (and in the future maybe SRT) parser written in Gleam, designed to parse WebVTT files and provide a structured representation of the content. +glubs is a WebVTT and SRT parser and serializer written in Gleam. +It also has a tokenizer for formatted WebVTT payloads. ## Installation @@ -21,9 +22,10 @@ and its documentation can be found at . * [x] Handles both comments and cues with start and end times * [x] Tokenizes WebVTT cue payload into individual tokens * [x] Converts a WebVTT type back to a string +* [x] Parse SRT +* [x] Convert SRT to string * [ ] Parse WebVTT metadata * [ ] Converts a list of tokens type back to a string -* [ ] Add SRT ## Example diff --git a/gleam.toml b/gleam.toml index f99d7ad..596d06e 100644 --- a/gleam.toml +++ b/gleam.toml @@ -1,10 +1,15 @@ name = "glubs" -version = "0.1.0" +version = "0.2.0" + +description = "WebVTT and SRT parser and serializer." -description = "WebVTT parser and payload tokenizer" licences = ["Apache-2.0"] + repository = { type = "github", user = "philipgiuliani", repo = "glubs" } -links = [{ title = "Website", href = "https://github.com/philipgiuliani/glubs" }] + +internal_modules = [ + "glubs/timestamp" +] [dependencies] gleam_stdlib = "~> 0.32" diff --git a/src/glubs/srt.gleam b/src/glubs/srt.gleam new file mode 100644 index 0000000..8643d0d --- /dev/null +++ b/src/glubs/srt.gleam @@ -0,0 +1,67 @@ +import gleam/string +import gleam/list +import gleam/result +import gleam/int +import gleam/string_builder.{StringBuilder} +import glubs/timestamp + +pub type Srt { + Srt(cues: List(Cue)) +} + +/// Cue represents a single cue in an SRT file.
+pub type Cue {
+  Cue(id: Int, start_time: Int, end_time: Int, payload: String)
+}
+
+/// Parses an SRT string and returns a Result containing the parsed Srt structure or a parsing error.
+pub fn parse(input: String) -> Result(Srt, String) {
+  input
+  |> string.replace("\r\n", "\n")
+  |> string.trim_right()
+  |> string.split("\n\n")
+  |> list.try_map(parse_cue)
+  |> result.map(Srt(cues: _))
+}
+
+/// Converts a Srt type to a string.
+pub fn to_string(srt: Srt) -> String {
+  srt.cues
+  |> list.map(cue_to_string)
+  |> string_builder.join("\n\n")
+  |> string_builder.append("\n")
+  |> string_builder.to_string()
+}
+
+/// Renders one cue as its id, timing line and payload, joined by newlines.
+fn cue_to_string(cue: Cue) -> StringBuilder {
+  let start_time = timestamp.to_string(cue.start_time, ",")
+  let end_time = timestamp.to_string(cue.end_time, ",")
+  [
+    string_builder.from_string(int.to_string(cue.id)),
+    start_time
+    |> string_builder.append(" --> ")
+    |> string_builder.append_builder(end_time),
+    string_builder.from_string(cue.payload),
+  ]
+  |> string_builder.join("\n")
+}
+
+/// Parses one cue block: an integer id line, a timing line, then the payload.
+/// A block with fewer than two lines yields `Error("Invalid cue")`.
+fn parse_cue(input: String) -> Result(Cue, String) {
+  case string.split(input, "\n") {
+    [id, ts, ..lines] -> {
+      use id <- result.try(
+        id
+        |> int.parse()
+        |> result.replace_error("Cannot parse identifier"),
+      )
+      use #(start, end) <- result.try(timestamp.parse_range(ts, ","))
+      let payload = string.join(lines, "\n")
+      Ok(Cue(id: id, start_time: start, end_time: end, payload: payload))
+    }
+    // Not enough lines for an id and a timing line: report it, don't crash.
+    _other -> Error("Invalid cue")
+  }
+}
diff --git a/src/glubs/timestamp.gleam b/src/glubs/timestamp.gleam
new file mode 100644
index 0000000..5fca688
--- /dev/null
+++ b/src/glubs/timestamp.gleam
@@ -0,0 +1,88 @@
+import gleam/result
+import gleam/string
+import gleam/string_builder.{StringBuilder}
+import gleam/int
+
+// Parses the given string to a timestamp.
+/// Accepts `mm:ss` or `hh:mm:ss`, where the seconds field may carry a
+/// fraction after `fraction_sep`, read as whole milliseconds. Returns ms.
+pub fn parse(input: String, fraction_sep: String) -> Result(Int, Nil) {
+  // A two-part value has no hours field, so hours default to "0".
+  let fields = case string.split(input, on: ":") {
+    [minutes, rest] -> Ok(#("0", minutes, rest))
+    [hours, minutes, rest] -> Ok(#(hours, minutes, rest))
+    _ -> Error(Nil)
+  }
+
+  use #(hours, minutes, rest) <- result.try(fields)
+  use hours <- result.try(int.parse(hours))
+  use minutes <- result.try(int.parse(minutes))
+  use #(seconds, millis) <- result.try(split_seconds(rest, fraction_sep))
+
+  Ok({ hours * 3600 + minutes * 60 + seconds } * 1000 + millis)
+}
+
+/// Parses a `start --> end` line into a pair of millisecond offsets.
+/// Fails with "Invalid timestamp" unless the line splits into exactly two
+/// parts on " --> "; a side that does not parse is named in the error.
+pub fn parse_range(
+  line: String,
+  fraction_sep: String,
+) -> Result(#(Int, Int), String) {
+  case string.split(line, " --> ") {
+    [lhs, rhs] -> {
+      let lhs =
+        parse(lhs, fraction_sep)
+        |> result.replace_error("Invalid start timestamp")
+      let rhs =
+        parse(rhs, fraction_sep)
+        |> result.replace_error("Invalid end timestamp")
+
+      use lhs <- result.try(lhs)
+      use rhs <- result.try(rhs)
+      Ok(#(lhs, rhs))
+    }
+    _other -> Error("Invalid timestamp")
+  }
+}
+
+/// Formats a millisecond count as `hh:mm:ss<fraction_sep>mmm`.
+/// Every field is left-padded with zeros to its minimum width.
+pub fn to_string(ms: Int, fraction_sep: String) -> StringBuilder {
+  let hours = ms / 3_600_000
+  let minutes = { ms % 3_600_000 } / 60_000
+  let seconds = { ms % 60_000 } / 1000
+  let millis = ms % 1000
+
+  string_builder.from_strings([
+    pad(hours, 2),
+    ":",
+    pad(minutes, 2),
+    ":",
+    pad(seconds, 2),
+    fraction_sep,
+    pad(millis, 3),
+  ])
+}
+
+/// Splits a seconds field into whole seconds and the fraction after
+/// `fraction_sep`, both parsed as integers; a missing fraction is 0.
+fn split_seconds(
+  input: String,
+  fraction_sep: String,
+) -> Result(#(Int, Int), Nil) {
+  case string.split(input, on: fraction_sep) {
+    [whole] -> result.map(int.parse(whole), fn(s) { #(s, 0) })
+    [whole, fraction] -> {
+      use s <- result.try(int.parse(whole))
+      use ms <- result.try(int.parse(fraction))
+      Ok(#(s, ms))
+    }
+    _other -> Error(Nil)
+  }
+}
+
+/// Renders `number` in decimal, left-padded with "0" to `count` characters.
+fn pad(number: Int, count: Int) -> String {
+  string.pad_left(int.to_string(number), count, "0")
+}
diff --git a/src/glubs/webvtt.gleam b/src/glubs/webvtt.gleam
index 048fd7b..e23e64c 100644
--- a/src/glubs/webvtt.gleam
+++ b/src/glubs/webvtt.gleam
@@ -2,8 +2,8 @@ import gleam/option.{None, Option, Some}
 import
gleam/string import gleam/result import gleam/list -import gleam/int import gleam/string_builder.{StringBuilder} +import glubs/timestamp /// Item represents an individual item in a WebVTT file, which can be either a Note or a Cue. pub type Item { @@ -67,8 +67,8 @@ fn item_to_string(item: Item) -> StringBuilder { False -> string_builder.from_strings(["NOTE ", content]) } Cue(id: id, start_time: start_time, end_time: end_time, payload: payload) -> { - let start_time = timestamp_to_string(start_time) - let end_time = timestamp_to_string(end_time) + let start_time = timestamp.to_string(start_time, ".") + let end_time = timestamp.to_string(end_time, ".") let timestamp = start_time |> string_builder.append(" --> ") @@ -117,7 +117,7 @@ fn parse_cue(cue: String) -> Result(Item, String) { case string.split_once(rest, "\n") { Ok(#(line, payload)) -> { - use #(start, end) <- result.try(parse_timestamps(line)) + use #(start, end) <- result.try(timestamp.parse_range(line, ".")) Ok(Cue(id: id, payload: payload, start_time: start, end_time: end)) } Error(Nil) -> Error("Invalid cue") @@ -136,27 +136,6 @@ fn parse_cue_id(cue: String) -> Result(#(Option(String), String), String) { } } -fn parse_timestamps(line: String) -> Result(#(Int, Int), String) { - case string.split(line, " --> ") { - [start, end] -> { - use start <- result.try( - start - |> parse_timestamp() - |> result.replace_error("Invalid start timestamp"), - ) - - use end <- result.try( - end - |> parse_timestamp() - |> result.replace_error("Invalid end timestamp"), - ) - - Ok(#(start, end)) - } - _other -> Error("Invalid timestamp") - } -} - /// Token represents individual tokens that can be generated during the tokenization of WebVTT cue payload. 
pub type Token { StartTag(tag: String, classes: List(String), annotation: Option(String)) @@ -197,7 +176,7 @@ fn do_tokenize( "<" <> rest -> { case string.split_once(rest, on: ">") { Ok(#(tag, rest)) -> { - case parse_timestamp(tag) { + case timestamp.parse(tag, ".") { Ok(ts) -> do_tokenize(rest, [Timestamp(ts), ..acc]) Error(_) -> do_tokenize(rest, [parse_start_tag(tag), ..acc]) } @@ -235,49 +214,3 @@ fn parse_tag_and_classes(input: String) -> #(String, List(String)) { let [tag, ..classes] = string.split(input, on: ".") #(tag, classes) } - -fn parse_timestamp(input: String) -> Result(Int, Nil) { - use #(h, m, s_ms) <- result.try({ - case string.split(input, on: ":") { - [m, s_ms] -> Ok(#("0", m, s_ms)) - [h, m, s_ms] -> Ok(#(h, m, s_ms)) - _ -> Error(Nil) - } - }) - - use h <- result.try(int.parse(h)) - use m <- result.try(int.parse(m)) - use #(s, ms) <- result.try(split_seconds(s_ms)) - - Ok({ s + m * 60 + h * 60 * 60 } * 1000 + ms) -} - -fn split_seconds(input: String) -> Result(#(Int, Int), Nil) { - case string.split(input, on: ".") { - [_s] -> { - use s <- result.try(int.parse(input)) - Ok(#(s, 0)) - } - [s, ms] -> { - use s <- result.try(int.parse(s)) - use ms <- result.try(int.parse(ms)) - Ok(#(s, ms)) - } - _other -> Error(Nil) - } -} - -fn timestamp_to_string(ms: Int) -> StringBuilder { - let hours = pad({ ms / 3_600_000 }, 2) - let minutes = pad({ { ms % 3_600_000 } / 60_000 }, 2) - let seconds = pad({ ms % 60_000 } / 1000, 2) - let ms = pad(ms % 1000, 3) - - string_builder.from_strings([hours, ":", minutes, ":", seconds, ".", ms]) -} - -fn pad(number: Int, count: Int) -> String { - number - |> int.to_string() - |> string.pad_left(count, "0") -} diff --git a/test/fixtures/example.srt b/test/fixtures/example.srt new file mode 100644 index 0000000..e16f344 --- /dev/null +++ b/test/fixtures/example.srt @@ -0,0 +1,20 @@ +1 +00:02:16,612 --> 00:02:19,376 +Senator, we're making +our final approach into Coruscant. 
+
+2
+00:02:19,482 --> 00:02:21,609
+Very good, Lieutenant.
+
+3
+00:03:13,336 --> 00:03:15,167
+We made it.
+
+4
+00:03:18,608 --> 00:03:20,371
+I guess I was wrong.
+
+5
+00:03:20,476 --> 00:03:22,671
+There was no danger at all.
diff --git a/test/glubs/srt_test.gleam b/test/glubs/srt_test.gleam
new file mode 100644
index 0000000..bb9d992
--- /dev/null
+++ b/test/glubs/srt_test.gleam
@@ -0,0 +1,34 @@
+import simplifile
+import glubs/srt.{Cue, Srt}
+import gleeunit/should
+
+// Parsing the fixture must yield exactly the cues returned by `example`.
+pub fn parse_example_test() {
+  let assert Ok(content) = simplifile.read("test/fixtures/example.srt")
+  let parsed = srt.parse(content)
+  should.equal(parsed, Ok(example()))
+}
+
+// Serializing `example` must reproduce the fixture file's exact contents.
+pub fn to_string_example_test() {
+  let assert Ok(expected) = simplifile.read("test/fixtures/example.srt")
+  let rendered = srt.to_string(example())
+  should.equal(rendered, expected)
+}
+
+// The five cues of test/fixtures/example.srt; times are the fixture's
+// timestamps converted to milliseconds.
+fn example() -> Srt {
+  Srt(cues: [
+    Cue(
+      1,
+      136_612,
+      139_376,
+      "Senator, we're making\nour final approach into Coruscant.",
+    ),
+    Cue(2, 139_482, 141_609, "Very good, Lieutenant."),
+    Cue(3, 193_336, 195_167, "We made it."),
+    Cue(4, 198_608, 200_371, "I guess I was wrong."),
+    Cue(5, 200_476, 202_671, "There was no danger at all."),
+  ])
+}