Add SRT parser and serializer

philipgiuliani · Nov 18, 2023 · c644cf4 · c644cf4
1 parent eaa4a24
commit c644cf4
Show file tree

Hide file tree

Showing 7 changed files with 226 additions and 77 deletions.
diff --git a/README.md b/README.md
@@ -3,7 +3,8 @@
 [![Package Version](https://img.shields.io/hexpm/v/glubs)](https://hex.pm/packages/glubs)
 [![Hex Docs](https://img.shields.io/badge/hex-docs-ffaff3)](https://hexdocs.pm/glubs/)
 
-glubs (gleam subtitles) is a WebVTT (and in the future maybe SRT) parser written in Gleam, designed to parse WebVTT files and provide a structured representation of the content.
+glubs is a WebVTT and SRT parser and serializer written in Gleam.
+It also has a tokenizer for formatted WebVTT payloads.
 
 ## Installation
 
@@ -21,9 +22,10 @@ and its documentation can be found at <https://hexdocs.pm/glubs>.
 * [x] Handles both comments and cues with start and end times
 * [x] Tokenizes WebVTT cue payload into individual tokens
 * [x] Converts a WebVTT type back to a string
+* [x] Parses SRT files
+* [x] Converts an SRT type back to a string
 * [ ] Parse WebVTT metadata
 * [ ] Converts a list of tokens type back to a string
-* [ ] Add SRT
 
 ## Example
 

diff --git a/gleam.toml b/gleam.toml
@@ -1,10 +1,15 @@
 name = "glubs"
-version = "0.1.0"
+version = "0.2.0"
+
+description = "WebVTT and SRT parser and serializer."
 
-description = "WebVTT parser and payload tokenizer"
 licences = ["Apache-2.0"]
+
 repository = { type = "github", user = "philipgiuliani", repo = "glubs" }
-links = [{ title = "Website", href = "https://github.com/philipgiuliani/glubs" }]
+
+internal_modules = [
+  "glubs/timestamp"
+]
 
 [dependencies]
 gleam_stdlib = "~> 0.32"

diff --git a/src/glubs/srt.gleam b/src/glubs/srt.gleam
@@ -0,0 +1,67 @@
+import gleam/string
+import gleam/list
+import gleam/result
+import gleam/int
+import gleam/string_builder.{StringBuilder}
+import glubs/timestamp
+
+/// Srt is the parsed representation of an SRT file: the list of its cues.
+pub type Srt {
+  Srt(cues: List(Cue))
+}
+
+/// Cue represents a single cue in an SRT file.
+///
+/// `id` is the numeric identifier line, `start_time` and `end_time` are
+/// expressed in milliseconds, and `payload` is the cue text with its lines
+/// joined by "\n".
+pub type Cue {
+  Cue(id: Int, start_time: Int, end_time: Int, payload: String)
+}
+
+/// Parses the contents of an SRT file.
+///
+/// Windows line endings are normalised to "\n", trailing whitespace is
+/// dropped, and the remaining text is split into cue blocks on blank lines.
+/// Returns the parsed `Srt`, or the error message of the first cue that
+/// fails to parse.
+pub fn parse(input: String) -> Result(Srt, String) {
+  let normalized = string.replace(input, "\r\n", "\n")
+  let blocks =
+    normalized
+    |> string.trim_right()
+    |> string.split("\n\n")
+
+  case list.try_map(blocks, parse_cue) {
+    Ok(cues) -> Ok(Srt(cues: cues))
+    Error(reason) -> Error(reason)
+  }
+}
+
+/// Serializes an `Srt` value back to SRT text.
+///
+/// Cues are separated by a blank line and the output ends with a single "\n".
+pub fn to_string(srt: Srt) -> String {
+  let rendered_cues = list.map(srt.cues, cue_to_string)
+
+  string_builder.join(rendered_cues, "\n\n")
+  |> string_builder.append("\n")
+  |> string_builder.to_string()
+}
+
+// Renders one cue as three "\n"-separated parts: the identifier, the
+// "start --> end" timestamp range (using "," as the fraction separator) and
+// the payload.
+fn cue_to_string(cue: Cue) -> StringBuilder {
+  let id_line =
+    cue.id
+    |> int.to_string()
+    |> string_builder.from_string()
+
+  let timing_line =
+    timestamp.to_string(cue.start_time, ",")
+    |> string_builder.append(" --> ")
+    |> string_builder.append_builder(timestamp.to_string(cue.end_time, ","))
+
+  let payload = string_builder.from_string(cue.payload)
+
+  string_builder.join([id_line, timing_line, payload], "\n")
+}
+
+// Parses a single cue block: an integer identifier line, a timestamp range
+// line and zero or more payload lines (re-joined with "\n").
+//
+// Returns `Error("Invalid cue")` when the block has fewer than two lines
+// (e.g. empty input or a truncated cue) instead of crashing on a failed
+// pattern match, `Error("Cannot parse identifier")` when the first line is
+// not an integer, and the error from `timestamp.parse_range` when the second
+// line is not a valid range.
+fn parse_cue(input: String) -> Result(Cue, String) {
+  case string.split(input, "\n") {
+    [id, ts, ..lines] -> {
+      use id <- result.try(
+        id
+        |> int.parse()
+        |> result.replace_error("Cannot parse identifier"),
+      )
+
+      use #(start_time, end_time) <- result.try(timestamp.parse_range(ts, ","))
+
+      Ok(Cue(
+        id: id,
+        start_time: start_time,
+        end_time: end_time,
+        payload: string.join(lines, "\n"),
+      ))
+    }
+    // Fewer than two lines: there is no identifier/timestamp pair to read.
+    _other -> Error("Invalid cue")
+  }
+}
diff --git a/src/glubs/timestamp.gleam b/src/glubs/timestamp.gleam
@@ -0,0 +1,88 @@
+import gleam/result
+import gleam/string
+import gleam/string_builder.{StringBuilder}
+import gleam/int
+
+/// Converts a `[hours:]minutes:seconds[<sep>fraction]` timestamp into
+/// milliseconds.
+///
+/// `fraction_sep` is the separator between the seconds and the fractional
+/// part ("," for SRT, "." for WebVTT). The fractional part is parsed as an
+/// integer and added as milliseconds. Returns `Error(Nil)` when the input does
+/// not split into two or three ":"-separated fields or a field is not an
+/// integer.
+pub fn parse(input: String, fraction_sep: String) -> Result(Int, Nil) {
+  let fields = case string.split(input, on: ":") {
+    [minutes, rest] -> Ok(#("0", minutes, rest))
+    [hours, minutes, rest] -> Ok(#(hours, minutes, rest))
+    _ -> Error(Nil)
+  }
+
+  use #(raw_hours, raw_minutes, raw_seconds) <- result.try(fields)
+  use hours <- result.try(int.parse(raw_hours))
+  use minutes <- result.try(int.parse(raw_minutes))
+  use #(seconds, millis) <- result.try(split_seconds(raw_seconds, fraction_sep))
+
+  let total_seconds = hours * 60 * 60 + minutes * 60 + seconds
+  Ok(total_seconds * 1000 + millis)
+}
+
+/// Parses a "start --> end" timestamp range into a `#(start, end)` tuple of
+/// milliseconds.
+///
+/// Returns "Invalid timestamp" when the line does not consist of exactly two
+/// parts separated by " --> ", and "Invalid start timestamp" or
+/// "Invalid end timestamp" when the respective side cannot be parsed.
+pub fn parse_range(
+  line: String,
+  fraction_sep: String,
+) -> Result(#(Int, Int), String) {
+  // Parses one side of the range, swapping the Nil error for a message.
+  let parse_side = fn(raw, message) {
+    raw
+    |> parse(fraction_sep)
+    |> result.replace_error(message)
+  }
+
+  case string.split(line, " --> ") {
+    [raw_start, raw_end] -> {
+      use start <- result.try(parse_side(raw_start, "Invalid start timestamp"))
+      use end <- result.try(parse_side(raw_end, "Invalid end timestamp"))
+      Ok(#(start, end))
+    }
+    _ -> Error("Invalid timestamp")
+  }
+}
+
+/// Formats a millisecond count as `hh:mm:ss<sep>mmm`.
+///
+/// Hours, minutes and seconds are zero-padded to two digits and the
+/// millisecond remainder to three; `fraction_sep` is placed between the
+/// seconds and the milliseconds.
+pub fn to_string(ms: Int, fraction_sep: String) -> StringBuilder {
+  let hours = ms / 3_600_000
+  let minutes = { ms % 3_600_000 } / 60_000
+  let seconds = { ms % 60_000 } / 1000
+  let millis = ms % 1000
+
+  [
+    pad(hours, 2),
+    ":",
+    pad(minutes, 2),
+    ":",
+    pad(seconds, 2),
+    fraction_sep,
+    pad(millis, 3),
+  ]
+  |> string_builder.from_strings()
+}
+
+// Splits a "seconds[<sep>fraction]" string into `#(seconds, fraction)`.
+//
+// Without a separator the fraction is 0. Both parts are parsed as plain
+// integers; more than one separator, or a non-integer part, gives
+// `Error(Nil)`.
+fn split_seconds(
+  input: String,
+  fraction_sep: String,
+) -> Result(#(Int, Int), Nil) {
+  case string.split(input, on: fraction_sep) {
+    [_whole] ->
+      input
+      |> int.parse()
+      |> result.map(fn(seconds) { #(seconds, 0) })
+    [whole, fraction] -> {
+      use seconds <- result.try(int.parse(whole))
+      use millis <- result.try(int.parse(fraction))
+      Ok(#(seconds, millis))
+    }
+    _ -> Error(Nil)
+  }
+}
+
+// Renders `number` in decimal, left-padded with "0" up to `count` characters.
+fn pad(number: Int, count: Int) -> String {
+  let digits = int.to_string(number)
+  string.pad_left(digits, count, "0")
+}
diff --git a/src/glubs/webvtt.gleam b/src/glubs/webvtt.gleam
@@ -2,8 +2,8 @@ import gleam/option.{None, Option, Some}
 import gleam/string
 import gleam/result
 import gleam/list
-import gleam/int
 import gleam/string_builder.{StringBuilder}
+import glubs/timestamp
 
 /// Item represents an individual item in a WebVTT file, which can be either a Note or a Cue.
 pub type Item {
@@ -67,8 +67,8 @@ fn item_to_string(item: Item) -> StringBuilder {
         False -> string_builder.from_strings(["NOTE ", content])
       }
     Cue(id: id, start_time: start_time, end_time: end_time, payload: payload) -> {
-      let start_time = timestamp_to_string(start_time)
-      let end_time = timestamp_to_string(end_time)
+      let start_time = timestamp.to_string(start_time, ".")
+      let end_time = timestamp.to_string(end_time, ".")
       let timestamp =
         start_time
         |> string_builder.append(" --> ")
@@ -117,7 +117,7 @@ fn parse_cue(cue: String) -> Result(Item, String) {
 
   case string.split_once(rest, "\n") {
     Ok(#(line, payload)) -> {
-      use #(start, end) <- result.try(parse_timestamps(line))
+      use #(start, end) <- result.try(timestamp.parse_range(line, "."))
       Ok(Cue(id: id, payload: payload, start_time: start, end_time: end))
     }
     Error(Nil) -> Error("Invalid cue")
@@ -136,27 +136,6 @@ fn parse_cue_id(cue: String) -> Result(#(Option(String), String), String) {
   }
 }
 
-fn parse_timestamps(line: String) -> Result(#(Int, Int), String) {
-  case string.split(line, " --> ") {
-    [start, end] -> {
-      use start <- result.try(
-        start
-        |> parse_timestamp()
-        |> result.replace_error("Invalid start timestamp"),
-      )
-
-      use end <- result.try(
-        end
-        |> parse_timestamp()
-        |> result.replace_error("Invalid end timestamp"),
-      )
-
-      Ok(#(start, end))
-    }
-    _other -> Error("Invalid timestamp")
-  }
-}
-
 /// Token represents individual tokens that can be generated during the tokenization of WebVTT cue payload.
 pub type Token {
   StartTag(tag: String, classes: List(String), annotation: Option(String))
@@ -197,7 +176,7 @@ fn do_tokenize(
     "<" <> rest -> {
       case string.split_once(rest, on: ">") {
         Ok(#(tag, rest)) -> {
-          case parse_timestamp(tag) {
+          case timestamp.parse(tag, ".") {
             Ok(ts) -> do_tokenize(rest, [Timestamp(ts), ..acc])
             Error(_) -> do_tokenize(rest, [parse_start_tag(tag), ..acc])
           }
@@ -235,49 +214,3 @@ fn parse_tag_and_classes(input: String) -> #(String, List(String)) {
   let [tag, ..classes] = string.split(input, on: ".")
   #(tag, classes)
 }
-
-fn parse_timestamp(input: String) -> Result(Int, Nil) {
-  use #(h, m, s_ms) <- result.try({
-    case string.split(input, on: ":") {
-      [m, s_ms] -> Ok(#("0", m, s_ms))
-      [h, m, s_ms] -> Ok(#(h, m, s_ms))
-      _ -> Error(Nil)
-    }
-  })
-
-  use h <- result.try(int.parse(h))
-  use m <- result.try(int.parse(m))
-  use #(s, ms) <- result.try(split_seconds(s_ms))
-
-  Ok({ s + m * 60 + h * 60 * 60 } * 1000 + ms)
-}
-
-fn split_seconds(input: String) -> Result(#(Int, Int), Nil) {
-  case string.split(input, on: ".") {
-    [_s] -> {
-      use s <- result.try(int.parse(input))
-      Ok(#(s, 0))
-    }
-    [s, ms] -> {
-      use s <- result.try(int.parse(s))
-      use ms <- result.try(int.parse(ms))
-      Ok(#(s, ms))
-    }
-    _other -> Error(Nil)
-  }
-}
-
-fn timestamp_to_string(ms: Int) -> StringBuilder {
-  let hours = pad({ ms / 3_600_000 }, 2)
-  let minutes = pad({ { ms % 3_600_000 } / 60_000 }, 2)
-  let seconds = pad({ ms % 60_000 } / 1000, 2)
-  let ms = pad(ms % 1000, 3)
-
-  string_builder.from_strings([hours, ":", minutes, ":", seconds, ".", ms])
-}
-
-fn pad(number: Int, count: Int) -> String {
-  number
-  |> int.to_string()
-  |> string.pad_left(count, "0")
-}
diff --git a/test/fixtures/example.srt b/test/fixtures/example.srt
@@ -0,0 +1,20 @@
+1
+00:02:16,612 --> 00:02:19,376
+Senator, we're making
+our final approach into Coruscant.
+
+2
+00:02:19,482 --> 00:02:21,609
+Very good, Lieutenant.
+
+3
+00:03:13,336 --> 00:03:15,167
+We made it.
+
+4
+00:03:18,608 --> 00:03:20,371
+I guess I was wrong.
+
+5
+00:03:20,476 --> 00:03:22,671
+There was no danger at all.
diff --git a/test/glubs/srt_test.gleam b/test/glubs/srt_test.gleam
@@ -0,0 +1,34 @@
+import simplifile
+import glubs/srt.{Cue, Srt}
+import gleeunit/should
+
+// Parsing the fixture file must yield the expected `Srt` structure.
+pub fn parse_example_test() {
+  let assert Ok(content) = simplifile.read("test/fixtures/example.srt")
+
+  let parsed = srt.parse(content)
+  should.equal(parsed, Ok(example()))
+}
+
+// Serializing the expected structure must reproduce the fixture file exactly.
+pub fn to_string_example_test() {
+  let assert Ok(expected) = simplifile.read("test/fixtures/example.srt")
+
+  let rendered = srt.to_string(example())
+  should.equal(rendered, expected)
+}
+
+// The in-memory counterpart of test/fixtures/example.srt (times in ms).
+fn example() -> Srt {
+  Srt(cues: [
+    Cue(
+      id: 1,
+      start_time: 136_612,
+      end_time: 139_376,
+      payload: "Senator, we're making\nour final approach into Coruscant.",
+    ),
+    Cue(
+      id: 2,
+      start_time: 139_482,
+      end_time: 141_609,
+      payload: "Very good, Lieutenant.",
+    ),
+    Cue(id: 3, start_time: 193_336, end_time: 195_167, payload: "We made it."),
+    Cue(
+      id: 4,
+      start_time: 198_608,
+      end_time: 200_371,
+      payload: "I guess I was wrong.",
+    ),
+    Cue(
+      id: 5,
+      start_time: 200_476,
+      end_time: 202_671,
+      payload: "There was no danger at all.",
+    ),
+  ])
+}