From 80e6dcf52c72ae8ce7449cb906bc33974dc5c5ec Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Tue, 8 Oct 2024 09:52:55 -0400 Subject: [PATCH] Enforce UTF-8 encoding for input bytes (#115) * Detect UTF-8 for vtt/srt, error if encoded otherwise * Switch to isValidUTF8Reader * Remove isValidUTF8Reader function * Enforce valid UTF8 on each SRT and WebVTT line * Add benchmark tests for webvtt and srt * Update benchmark test * Add non-utf-8 tests and files * Remove benchmarks --- srt.go | 5 +++++ srt_test.go | 5 +++++ testdata/example-in-non-utf8.srt | Bin 0 -> 734 bytes testdata/example-in-non-utf8.vtt | Bin 0 -> 3584 bytes webvtt.go | 9 +++++++++ webvtt_test.go | 5 +++++ 6 files changed, 24 insertions(+) create mode 100644 testdata/example-in-non-utf8.srt create mode 100644 testdata/example-in-non-utf8.vtt diff --git a/srt.go b/srt.go index e7f016c..58f77ce 100644 --- a/srt.go +++ b/srt.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" ) // Constants @@ -43,6 +44,10 @@ func ReadFromSRT(i io.Reader) (o *Subtitles, err error) { // Fetch line line = strings.TrimSpace(scanner.Text()) lineNum++ + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: line %d is not valid utf-8", lineNum) + return + } // Remove BOM header if lineNum == 1 { diff --git a/srt_test.go b/srt_test.go index f44fbbd..611c3fc 100644 --- a/srt_test.go +++ b/srt_test.go @@ -46,3 +46,8 @@ func TestSRTMissingSequence(t *testing.T) { assert.NoError(t, err) assert.Equal(t, string(c), w.String()) } + +func TestNonUTF8SRT(t *testing.T) { + _, err := astisub.OpenFile("./testdata/example-in-non-utf8.srt") + assert.Error(t, err) +} diff --git a/testdata/example-in-non-utf8.srt b/testdata/example-in-non-utf8.srt new file mode 100644 index 0000000000000000000000000000000000000000..41fea028837824531c7e57077e9bc2edf280926b GIT binary patch literal 734 zcmZ{iJ5R$v5QL}puW$!Q0pUlCp#ag*Km!t_OkyjJ$V*AAD1RRKb`OgPf=;&YZf<90 z=luDRDAFb4h9}{j=~}VQ8Mk~7ksQ3oPEN(CD%Fe?XVz*p*GMx>wa`Ro;E~b;-qhK1 zuP%?U2mA^=p)S^&JIT?qxuJzSi8XK}b!6a3pLwK5-Rp^Sksh?vj=Bmr72h2$W1fYt zT6f&AUh~{>KBD8F1We03T<|Wa`<+9;`{3pw(u*2&Hk@41|4?IB3p))?){LoVQ&VGV zmR^^tu^ErC<^QOq?j}`9-7`7Y=*(Qv@_u){5z1jVX!QSRlw_O1biAXCZVfM}dv(SN z?+cx1OU1}FZ9M_EJC51!`ri1=)=NBeT^iG2mq8)i=&?*Q)O%zQ{;e@wa}sUQv774B b-MhNEuoP<2-h!I5Df9R{oh3KD_rLH9l)7He literal 0 HcmV?d00001 diff --git a/testdata/example-in-non-utf8.vtt b/testdata/example-in-non-utf8.vtt new file mode 100644 index 0000000000000000000000000000000000000000..1f230b15b0b761f5c4a688b0fec0f14b34b43b26 GIT binary patch literal 3584 zcmeH}TW=Ck5Xa|vKgGT@(xh$5rHU9dnl$mjOJW-vAGvIi6j%a_h{msO{r%^#utgwE zd@>;?%bqi5=0Eox?tkCdrMyV^+sf&{CVg5rdqdp@aT0+_$V3@cr6A z=gEAEx7tnMr5MIlfu>JH}n0_PB@EWxa3|qML0dY&sNmqmHo=^ClGt^q~J<#AB-ju z4xYv~cZzb~w3?mUQNgw;v?^!u<`^w-lquz|>HH^F<{c!zftG+d_WKNPUD^Sw!szGzuKEy`gogzt@~|`h(P2qXz*Jm&Q*Mj$?K`T?vfrlKm6u=Q@Di~xbljX{z}2$iu+b`uj-&}dL8bcabHGG-CwMyMxho}HL^D2 z3H6#+%oJJ+xeV!+$D)T-4R#6gqDt-uC&m7PDvRvBz2@uIVD}cZIXvnf`8)S0C`LM| z?qZLO?0NC0tIT&a`BgrIy$N#ZP!utDy`ykWH*l3^^836 zShu0*>#cf>&^ZD{H52@t`c7wccJ8N;@;*>hg9d$Zc3Dux!kR~?5y I|9$@b0X^^;D*ylh literal 0 HcmV?d00001 diff --git a/webvtt.go b/webvtt.go index ec0e7e3..89672fe 100644 --- a/webvtt.go +++ b/webvtt.go @@ -11,6 +11,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" "golang.org/x/net/html" ) @@ -128,6 +129,10 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { lineNum++ line = scanner.Text() line = strings.TrimPrefix(line, string(BytesBOM)) + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: line %d is not valid utf-8", lineNum) + return + } if fs := strings.Fields(line); len(fs) > 0 && fs[0] == "WEBVTT" { break } @@ -144,6 +149,10 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { // Fetch line line = strings.TrimSpace(scanner.Text()) lineNum++ + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: line %d is not valid utf-8", lineNum) + return + } switch { // Comment diff --git a/webvtt_test.go b/webvtt_test.go index 009d6b4..8ab1fd9 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -48,6 +48,11 @@ func TestBroken1WebVTT(t *testing.T) { assert.Nil(t, err) } +func TestNonUTF8WebVTT(t *testing.T) { + _, err := astisub.OpenFile("./testdata/example-in-non-utf8.vtt") + assert.Error(t, err) +} + func TestWebVTTWithVoiceName(t *testing.T) { testData := `WEBVTT