Skip to content

Commit

Permalink
Improve parsing logic more
Browse files Browse the repository at this point in the history
  • Loading branch information
jakobnissen committed Aug 10, 2022
1 parent a895c8b commit 4befa1f
Show file tree
Hide file tree
Showing 7 changed files with 55 additions and 36 deletions.
9 changes: 9 additions & 0 deletions src/FASTX.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ const UTF8 = Union{AbstractVector{UInt8}, String, SubString{String}}
error(String(take!(buf)))
end

"""
    truncate(s::AbstractString, len::Integer)

Shorten `s` for display so that it is at most `len` characters long.

If `s` has more than `len` characters, return a new `String` made of its first
`len - 1` characters followed by an ellipsis (`'…'`). Otherwise return `s`
itself, unchanged. Lengths are counted in characters, not bytes, so multi-byte
UTF-8 text is never cut in the middle of a character.

Throws an `ArgumentError` when truncation is required and `len - 1` is negative.
"""
function truncate(s::AbstractString, len::Integer)
    if length(s) > len
        # Keep len - 1 characters so the result, including the ellipsis,
        # is exactly `len` characters long.
        return string(first(s, len - 1), '…')
    else
        return s
    end
end

include("fasta/fasta.jl")
include("fastq/fastq.jl")

Expand Down Expand Up @@ -132,6 +140,7 @@ function FASTA.Record!(record::FASTQ.Record)
end

# Generic fallback: convert any other string type to a `String` and call
# `parse` again, which then reaches the method below.
function Base.parse(::Type{T}, s::AbstractString) where {T <: Record}
    return parse(T, String(s))
end

# `String` and `SubString{String}` are handed on as their code units, so the
# actual parsing happens in the byte-vector methods.
function Base.parse(::Type{T}, s::Union{String, SubString{String}}) where {T <: Record}
    return parse(T, codeunits(s))
end

"Get the indices of `data` that correspond to sequence indices `part`"
function seq_data_part(record::Record, part::AbstractUnitRange)
Expand Down
2 changes: 1 addition & 1 deletion src/fasta/fasta.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
# Trivial use, I only use it here because it's a dep of Automa anyway.
# Can be removed with no big problems
using ScanByte: memchr, ByteSet
import ..FASTX: identifier, description, sequence, UTF8, seqlen, throw_parser_error
import ..FASTX: identifier, description, sequence, UTF8, seqlen, throw_parser_error, truncate

include("record.jl")
include("readrecord.jl")
Expand Down
40 changes: 20 additions & 20 deletions src/fasta/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,19 +68,27 @@ function Base.empty!(record::Record)
return record
end

function Base.parse(::Type{Record}, data::UTF8)
function Base.parse(::Type{Record}, data::AbstractVector{UInt8})
# Error early on empty data to not construct buffers
isempty(data) && throw(ArgumentError("Cannot parse empty string as FASTA record"))

record = Record(Vector{UInt8}(undef, sizeof(data)), 0, 0, 0)
stream = NoopStream(IOBuffer(data))
_, _, found = readrecord!(stream, record, (1, 1))
found || throw(ArgumentError("invalid FASTA record"))
next = false
try
_, _, found = readrecord!(stream, Record(), (1, 1))
next = found
catch e
next = true
stream = NoopStream(IOBuffer(data), bufsize=sizeof(data))
cs, _, found = readrecord!(stream, record, (1, 1))

# If found is not set, then the data terminated early
found || throw(ArgumentError("Incomplete FASTA record"))

# In this case, the machine ran out of data exactly after one record
p = stream.state.buffer1.bufferpos
p > sizeof(data) && iszero(cs) && return record

# Else, we check all trailing data to see it contains only \r\n
for i in p-1:sizeof(data)
if !in(data[i], (UInt8('\r'), UInt8('\n')))
throw(ArgumentError("Invalid trailing data after FASTA record"))
end
end
next && throw(ArgumentError("Multiple FASTA records"))
return record
end

Expand Down Expand Up @@ -137,15 +145,7 @@ function Base.show(io::IO, record::Record)
print(io, summary(record), ':')
println(io)
println(io, "description: \"", description(record), '"')
print(io, " sequence: \"", truncate(sequence(String, record), 40), '"')
end

# Shorten `s` for display so that it is at most `len` characters long.
# When `s` has more than `len` characters, the first `len - 1` characters are
# kept and an ellipsis ('…') is appended; otherwise `s` is returned unchanged.
# Lengths are counted in characters, not bytes.
function truncate(s::String, len::Integer)
    if length(s) > len
        # len - 1 characters plus the ellipsis gives exactly `len` characters.
        return string(first(s, len - 1), '…')
    else
        return s
    end
end
print(io, " sequence: \"", truncate(sequence(record), 40), '"')
end

function memcmp(p1::Ptr, p2::Ptr, n::Integer)
Expand Down
2 changes: 1 addition & 1 deletion src/fastq/fastq.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import BioGenerics: BioGenerics
import BioGenerics.Automa: State
import StringViews: StringView
import TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
import ..FASTX: identifier, description, sequence, UTF8, seqlen, throw_parser_error
import ..FASTX: identifier, description, sequence, UTF8, seqlen, throw_parser_error, truncate

include("quality.jl")
include("record.jl")
Expand Down
36 changes: 22 additions & 14 deletions src/fastq/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -96,19 +96,27 @@ function Base.empty!(record::Record)
return record
end

function Base.parse(::Type{Record}, data::UTF8)
function Base.parse(::Type{Record}, data::AbstractVector{UInt8})
# Error early on empty data to not construct buffers
isempty(data) && throw(ArgumentError("Cannot parse empty string as FASTQ record"))

record = Record(Vector{UInt8}(undef, sizeof(data)), 0, 0, 0)
stream = NoopStream(IOBuffer(data))
_, _, found = readrecord!(stream, record, (1, 1))
found || throw(ArgumentError("invalid FASTQ record"))
next = false
try
_, _, found = readrecord!(stream, Record(), (1, 1))
next = found
catch e
next = true
stream = NoopStream(IOBuffer(data), bufsize=sizeof(data))
cs, _, found = readrecord!(stream, record, (1, 1))

# If found is not set, then the data terminated early
found || throw(ArgumentError("Incomplete FASTQ record"))

# In this case, the machine ran out of data exactly after one record
p = stream.state.buffer1.bufferpos
p > sizeof(data) && iszero(cs) && return record

# Else, we check all trailing data to see it contains only \r\n
for i in p-1:sizeof(data)
if !in(data[i], (UInt8('\r'), UInt8('\n')))
throw(ArgumentError("Invalid trailing data after FASTQ record"))
end
end
next && throw(ArgumentError("Multiple FASTQ records"))
return record
end

Expand Down Expand Up @@ -199,9 +207,9 @@ end

# Print a multi-line summary of a FASTQ record to `io`.
#
# A "FASTQ.Record:" header line is followed by the description, sequence and
# quality, each wrapped in double quotes. The sequence and quality are passed
# through `truncate(…, 40)` so long records do not flood the output. No newline
# is written after the final (quality) line.
function Base.show(io::IO, record::Record)
    println(io, "FASTQ.Record:")
    # The closing quote here matches the opening `\"`, like the lines below.
    println(io, " description: \"", description(record), '"')
    println(io, " sequence: \"", truncate(sequence(record), 40), '"')
    print(io, " quality: \"", truncate(quality(record), 40), '"')
    return nothing
end

function memcmp(p1::Ptr, p2::Ptr, n::Integer)
Expand Down
1 change: 1 addition & 0 deletions test/fasta/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ end

# Trailing extra record
@test_throws Exception parse(Record, ">A\nT\n>G\nA\n")
@test_throws Exception parse(Record, ">A\nT\n>")
@test parse(Record, ">A\nT\n\r\n\r\n") isa Record
end

Expand Down
1 change: 1 addition & 0 deletions test/fastq/record.jl
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ end

# Trailing extra record
@test_throws Exception parse(Record, "@A\nT\n+\nJ\n@B\nU\n+\nK")
@test_throws Exception parse(Record, "@A\nT\n+\nJ\n@B\nU\n")
@test parse(Record, "@A\nT\n+\nJ\n\r\n") isa Record
end

Expand Down

0 comments on commit 4befa1f

Please sign in to comment.