Skip to content

Commit

Permalink
Add SRT parser and serializer
Browse files Browse the repository at this point in the history
  • Loading branch information
philipgiuliani committed Nov 18, 2023
1 parent eaa4a24 commit c644cf4
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 77 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
[![Package Version](https://img.shields.io/hexpm/v/glubs)](https://hex.pm/packages/glubs)
[![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/glubs/)

glubs (gleam subtitles) is a WebVTT (and in the future maybe SRT) parser written in Gleam, designed to parse WebVTT files and provide a structured representation of the content.
glubs is a WebVTT and SRT parser and serializer written in Gleam.
It also has a tokenizer for formatted WebVTT payloads.

## Installation

Expand All @@ -21,9 +22,10 @@ and its documentation can be found at <https://hexdocs.pm/glubs>.
* [x] Handles both comments and cues with start and end times
* [x] Tokenizes WebVTT cue payload into individual tokens
* [x] Converts a WebVTT type back to a string
* [x] Parses SRT
* [x] Converts an SRT type back to a string
* [ ] Parse WebVTT metadata
* [ ] Converts a list of tokens back to a string
* [ ] Add SRT

## Example

Expand Down
11 changes: 8 additions & 3 deletions gleam.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
name = "glubs"
version = "0.1.0"
version = "0.2.0"

description = "WebVTT and SRT parser and serializer."

description = "WebVTT parser and payload tokenizer"
licences = ["Apache-2.0"]

repository = { type = "github", user = "philipgiuliani", repo = "glubs" }
links = [{ title = "Website", href = "https://github.com/philipgiuliani/glubs" }]

internal_modules = [
"glubs/timestamp"
]

[dependencies]
gleam_stdlib = "~> 0.32"
Expand Down
67 changes: 67 additions & 0 deletions src/glubs/srt.gleam
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import gleam/string
import gleam/list
import gleam/result
import gleam/int
import gleam/string_builder.{StringBuilder}
import glubs/timestamp

/// Srt represents an SRT document as the list of cues it contains.
pub type Srt {
Srt(cues: List(Cue))
}

/// Cue represents a single cue in an SRT file.
///
/// `id` is the integer identifier parsed from the first line of the cue.
/// `start_time` and `end_time` are in milliseconds, as produced by
/// `timestamp.parse_range`. `payload` is the remaining lines of the cue,
/// joined with "\n".
pub type Cue {
Cue(id: Int, start_time: Int, end_time: Int, payload: String)
}

/// Parses an SRT string and returns a Result containing the parsed Srt
/// structure or a parsing error.
///
/// "\r\n" line endings are normalised to "\n", trailing whitespace is removed
/// and the input is split into cues on blank lines. Leading whitespace in
/// front of a cue and empty chunks are discarded, so input with extra blank
/// lines between cues, or with no cues at all (for example the "\n" that
/// `to_string` produces for an `Srt` without cues), parses successfully
/// instead of failing on an empty cue.
pub fn parse(input: String) -> Result(Srt, String) {
  input
  |> string.replace("\r\n", "\n")
  |> string.trim_right()
  |> string.split("\n\n")
  // A run of three or more newlines leaves a stray "\n" at the start of the
  // next chunk, or an entirely empty chunk; neither belongs to a cue.
  |> list.map(string.trim_left)
  |> list.filter(fn(chunk) { chunk != "" })
  |> list.try_map(parse_cue)
  |> result.map(Srt(cues: _))
}

/// Serialises an Srt value to a string.
///
/// Every cue is rendered with `cue_to_string`, cues are separated by a blank
/// line, and the output ends with a single trailing newline.
pub fn to_string(srt: Srt) -> String {
  let rendered_cues = list.map(srt.cues, cue_to_string)
  let body = string_builder.join(rendered_cues, "\n\n")

  string_builder.to_string(string_builder.append(body, "\n"))
}

// Renders a single cue as three "\n"-separated parts: the identifier, the
// "start --> end" timing line (with "," as the fraction separator) and the
// payload.
fn cue_to_string(cue: Cue) -> StringBuilder {
  let Cue(id: id, start_time: from, end_time: until, payload: text) = cue

  let timing_line =
    string_builder.join(
      [timestamp.to_string(from, ","), timestamp.to_string(until, ",")],
      " --> ",
    )
  let id_line = string_builder.from_string(int.to_string(id))
  let payload_lines = string_builder.from_string(text)

  string_builder.join([id_line, timing_line, payload_lines], "\n")
}

// Parses one cue chunk (the text between two blank lines) into a Cue.
//
// The first line must be an integer identifier, the second a
// "start --> end" timing line using "," as the fraction separator, and all
// remaining lines form the payload (re-joined with "\n").
//
// Returns:
// - Error("Invalid cue") when the chunk has fewer than two lines,
// - Error("Cannot parse identifier") when the first line is not an integer,
// - the error from `timestamp.parse_range` when the timing line is invalid.
fn parse_cue(input: String) -> Result(Cue, String) {
  // Match with `case` rather than a partial `let` so that a malformed chunk
  // becomes an Error value instead of a runtime crash.
  case string.split(input, "\n") {
    [id, ts, ..lines] -> {
      use id <- result.try(
        id
        |> int.parse()
        |> result.replace_error("Cannot parse identifier"),
      )

      use #(start_time, end_time) <- result.try(timestamp.parse_range(ts, ","))

      Ok(Cue(
        id: id,
        start_time: start_time,
        end_time: end_time,
        payload: string.join(lines, "\n"),
      ))
    }
    _ -> Error("Invalid cue")
  }
}
88 changes: 88 additions & 0 deletions src/glubs/timestamp.gleam
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import gleam/result
import gleam/string
import gleam/string_builder.{StringBuilder}
import gleam/int

// Parses a timestamp string into a number of milliseconds.
//
// Accepts "mm:ss" and "hh:mm:ss" shapes, where the seconds part may carry a
// fraction separated by `fraction_sep`. Any other shape, or a component that
// is not an integer, yields Error(Nil).
pub fn parse(input: String, fraction_sep: String) -> Result(Int, Nil) {
  // Without an hours component the hours default to "0".
  let fields = case string.split(input, on: ":") {
    [minutes, seconds] -> Ok(#("0", minutes, seconds))
    [hours, minutes, seconds] -> Ok(#(hours, minutes, seconds))
    _ -> Error(Nil)
  }

  case fields {
    Error(Nil) -> Error(Nil)
    Ok(#(hours, minutes, seconds)) ->
      case
        int.parse(hours),
        int.parse(minutes),
        split_seconds(seconds, fraction_sep)
      {
        Ok(h), Ok(m), Ok(#(s, ms)) ->
          Ok({ s + m * 60 + h * 60 * 60 } * 1000 + ms)
        _, _, _ -> Error(Nil)
      }
  }
}

// Parses a timing line of the form "<start> --> <end>" into a
// #(start, end) tuple of milliseconds.
//
// Errors:
// - "Invalid timestamp" when the line does not split into exactly two parts
//   on " --> ",
// - "Invalid start timestamp" when the first part cannot be parsed,
// - "Invalid end timestamp" when the second part cannot be parsed.
pub fn parse_range(
  line: String,
  fraction_sep: String,
) -> Result(#(Int, Int), String) {
  case string.split(line, " --> ") {
    [raw_start, raw_end] -> {
      let start = parse(raw_start, fraction_sep)
      let end = parse(raw_end, fraction_sep)

      // A bad start timestamp is reported before a bad end timestamp.
      case start, end {
        Error(Nil), _ -> Error("Invalid start timestamp")
        _, Error(Nil) -> Error("Invalid end timestamp")
        Ok(start), Ok(end) -> Ok(#(start, end))
      }
    }
    _ -> Error("Invalid timestamp")
  }
}

// Formats a number of milliseconds as "hh:mm:ss<fraction_sep>mmm".
//
// Hours, minutes and seconds are left-padded with "0" to two digits and the
// millisecond part to three digits.
pub fn to_string(ms: Int, fraction_sep: String) -> StringBuilder {
  let hh = pad({ ms / 3_600_000 }, 2)
  let mm = pad({ { ms % 3_600_000 } / 60_000 }, 2)
  let ss = pad({ ms % 60_000 } / 1000, 2)
  let millis = pad(ms % 1000, 3)

  [hh, ":", mm, ":", ss, fraction_sep, millis]
  |> string_builder.from_strings()
}

// Splits the seconds component of a timestamp into #(seconds, milliseconds).
//
// Without a fraction separator the milliseconds are 0. More than one
// separator, or a part that is not an integer, yields Error(Nil).
fn split_seconds(
  input: String,
  fraction_sep: String,
) -> Result(#(Int, Int), Nil) {
  case string.split(input, on: fraction_sep) {
    [whole] ->
      whole
      |> int.parse()
      |> result.map(fn(seconds) { #(seconds, 0) })
    [whole, fraction] ->
      case int.parse(whole), int.parse(fraction) {
        Ok(seconds), Ok(millis) -> Ok(#(seconds, millis))
        _, _ -> Error(Nil)
      }
    _ -> Error(Nil)
  }
}

// Renders `number` as a decimal string, left-padded with "0" up to `count`
// characters.
fn pad(number: Int, count: Int) -> String {
  string.pad_left(int.to_string(number), to: count, with: "0")
}
77 changes: 5 additions & 72 deletions src/glubs/webvtt.gleam
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ import gleam/option.{None, Option, Some}
import gleam/string
import gleam/result
import gleam/list
import gleam/int
import gleam/string_builder.{StringBuilder}
import glubs/timestamp

/// Item represents an individual item in a WebVTT file, which can be either a Note or a Cue.
pub type Item {
Expand Down Expand Up @@ -67,8 +67,8 @@ fn item_to_string(item: Item) -> StringBuilder {
False -> string_builder.from_strings(["NOTE ", content])
}
Cue(id: id, start_time: start_time, end_time: end_time, payload: payload) -> {
let start_time = timestamp_to_string(start_time)
let end_time = timestamp_to_string(end_time)
let start_time = timestamp.to_string(start_time, ".")
let end_time = timestamp.to_string(end_time, ".")
let timestamp =
start_time
|> string_builder.append(" --> ")
Expand Down Expand Up @@ -117,7 +117,7 @@ fn parse_cue(cue: String) -> Result(Item, String) {

case string.split_once(rest, "\n") {
Ok(#(line, payload)) -> {
use #(start, end) <- result.try(parse_timestamps(line))
use #(start, end) <- result.try(timestamp.parse_range(line, "."))
Ok(Cue(id: id, payload: payload, start_time: start, end_time: end))
}
Error(Nil) -> Error("Invalid cue")
Expand All @@ -136,27 +136,6 @@ fn parse_cue_id(cue: String) -> Result(#(Option(String), String), String) {
}
}

// Parses a cue timing line of the form "<start> --> <end>" into a
// #(start, end) tuple, parsing each side with parse_timestamp.
//
// Returns "Invalid start timestamp" or "Invalid end timestamp" when the
// respective side cannot be parsed, and "Invalid timestamp" when the line
// does not split into exactly two parts on " --> ".
fn parse_timestamps(line: String) -> Result(#(Int, Int), String) {
case string.split(line, " --> ") {
[start, end] -> {
use start <- result.try(
start
|> parse_timestamp()
|> result.replace_error("Invalid start timestamp"),
)

use end <- result.try(
end
|> parse_timestamp()
|> result.replace_error("Invalid end timestamp"),
)

Ok(#(start, end))
}
_other -> Error("Invalid timestamp")
}
}

/// Token represents individual tokens that can be generated during the tokenization of WebVTT cue payload.
pub type Token {
StartTag(tag: String, classes: List(String), annotation: Option(String))
Expand Down Expand Up @@ -197,7 +176,7 @@ fn do_tokenize(
"<" <> rest -> {
case string.split_once(rest, on: ">") {
Ok(#(tag, rest)) -> {
case parse_timestamp(tag) {
case timestamp.parse(tag, ".") {
Ok(ts) -> do_tokenize(rest, [Timestamp(ts), ..acc])
Error(_) -> do_tokenize(rest, [parse_start_tag(tag), ..acc])
}
Expand Down Expand Up @@ -235,49 +214,3 @@ fn parse_tag_and_classes(input: String) -> #(String, List(String)) {
let [tag, ..classes] = string.split(input, on: ".")
#(tag, classes)
}

// Parses a timestamp string into a number of milliseconds.
//
// Accepts "mm:ss" and "hh:mm:ss" shapes; the seconds part is handed to
// split_seconds. Any other shape, or a component that is not an integer,
// yields Error(Nil).
fn parse_timestamp(input: String) -> Result(Int, Nil) {
use #(h, m, s_ms) <- result.try({
case string.split(input, on: ":") {
// Without an hours component the hours default to "0".
[m, s_ms] -> Ok(#("0", m, s_ms))
[h, m, s_ms] -> Ok(#(h, m, s_ms))
_ -> Error(Nil)
}
})

use h <- result.try(int.parse(h))
use m <- result.try(int.parse(m))
use #(s, ms) <- result.try(split_seconds(s_ms))

// Total seconds converted to milliseconds, plus the fractional part.
Ok({ s + m * 60 + h * 60 * 60 } * 1000 + ms)
}

// Splits the seconds component of a timestamp on "." into
// #(seconds, milliseconds).
//
// Without a "." the milliseconds are 0. More than one ".", or a part that is
// not an integer, yields Error(Nil).
fn split_seconds(input: String) -> Result(#(Int, Int), Nil) {
case string.split(input, on: ".") {
[_s] -> {
use s <- result.try(int.parse(input))
Ok(#(s, 0))
}
[s, ms] -> {
use s <- result.try(int.parse(s))
use ms <- result.try(int.parse(ms))
Ok(#(s, ms))
}
_other -> Error(Nil)
}
}

// Formats a number of milliseconds as "hh:mm:ss.mmm".
//
// Hours, minutes and seconds are left-padded with "0" to two digits and the
// millisecond part to three digits.
fn timestamp_to_string(ms: Int) -> StringBuilder {
let hours = pad({ ms / 3_600_000 }, 2)
let minutes = pad({ { ms % 3_600_000 } / 60_000 }, 2)
let seconds = pad({ ms % 60_000 } / 1000, 2)
let ms = pad(ms % 1000, 3)

string_builder.from_strings([hours, ":", minutes, ":", seconds, ".", ms])
}

// Renders `number` as a decimal string, left-padded with "0" up to `count`
// characters.
fn pad(number: Int, count: Int) -> String {
number
|> int.to_string()
|> string.pad_left(count, "0")
}
20 changes: 20 additions & 0 deletions test/fixtures/example.srt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
1
00:02:16,612 --> 00:02:19,376
Senator, we're making
our final approach into Coruscant.

2
00:02:19,482 --> 00:02:21,609
Very good, Lieutenant.

3
00:03:13,336 --> 00:03:15,167
We made it.

4
00:03:18,608 --> 00:03:20,371
I guess I was wrong.

5
00:03:20,476 --> 00:03:22,671
There was no danger at all.
34 changes: 34 additions & 0 deletions test/glubs/srt_test.gleam
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import simplifile
import glubs/srt.{Cue, Srt}
import gleeunit/should

// Parsing the example fixture must yield the expected Srt structure.
pub fn parse_example_test() {
  let assert Ok(fixture) = simplifile.read("test/fixtures/example.srt")
  let parsed = srt.parse(fixture)

  should.equal(parsed, Ok(example()))
}

// Serialising the expected Srt structure must reproduce the example fixture.
pub fn to_string_example_test() {
  let assert Ok(fixture) = simplifile.read("test/fixtures/example.srt")
  let rendered = srt.to_string(example())

  should.equal(rendered, fixture)
}

// The Srt structure corresponding to test/fixtures/example.srt.
fn example() -> Srt {
  let approach =
    Cue(
      id: 1,
      start_time: 136_612,
      end_time: 139_376,
      payload: "Senator, we're making\nour final approach into Coruscant.",
    )
  let very_good =
    Cue(
      id: 2,
      start_time: 139_482,
      end_time: 141_609,
      payload: "Very good, Lieutenant.",
    )
  let made_it =
    Cue(id: 3, start_time: 193_336, end_time: 195_167, payload: "We made it.")
  let was_wrong =
    Cue(
      id: 4,
      start_time: 198_608,
      end_time: 200_371,
      payload: "I guess I was wrong.",
    )
  let no_danger =
    Cue(
      id: 5,
      start_time: 200_476,
      end_time: 202_671,
      payload: "There was no danger at all.",
    )

  Srt(cues: [approach, very_good, made_it, was_wrong, no_danger])
}

0 comments on commit c644cf4

Please sign in to comment.