diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..7c2215d --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,74 @@ +name: CI +# Run on master, tags, or any pull request +on: + schedule: + - cron: '0 2 * * *' # Daily at 2 AM UTC (8 PM CST) + push: + branches: [master] + tags: ["*"] + pull_request: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + version: + - "1.6" # Earliest supported release + - "1" # Latest release + os: + - ubuntu-latest + - macOS-latest + - windows-latest + arch: + - x64 + - x86 + exclude: + # Test 32-bit only on Linux + - os: macOS-latest + arch: x86 + - os: windows-latest + arch: x86 + include: + # Add specific version used to run the reference tests. + # Must be kept in sync with version check in `test/runtests.jl`, + # and with the branch protection rules on the repository which + # require this specific job to pass on all PRs + # (see Settings > Branches > Branch protection rules). 
+ - os: ubuntu-latest + version: 1.10.0 + arch: x64 + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: actions/cache@v4 + env: + cache-name: cache-artifacts + with: + path: ~/.julia/artifacts + key: ${{ runner.os }}-${{ matrix.arch }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }} + restore-keys: | + ${{ runner.os }}-${{ matrix.arch }}-test-${{ env.cache-name }}- + ${{ runner.os }}-${{ matrix.arch }}-test- + ${{ runner.os }}-${{ matrix.arch }}- + ${{ runner.os }}- + - uses: julia-actions/julia-buildpkg@latest + - run: | + git config --global user.name Tester + git config --global user.email te@st.er + - uses: julia-actions/julia-runtest@latest + - uses: julia-actions/julia-processcoverage@v1 + - uses: codecov/codecov-action@v4 + with: + files: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} + fail_ci_if_error: false diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..c988a3e --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,18 @@ +name: CompatHelper +on: + schedule: + - cron: '0 0 * * *' # Everyday at midnight + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..f49313b --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,15 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: 
ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/Manifest.toml b/Manifest.toml deleted file mode 100644 index 0b5f5cf..0000000 --- a/Manifest.toml +++ /dev/null @@ -1,65 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.8.2" -manifest_format = "2.0" -project_hash = "3fb881779e865ac1c3d2194226bcb20d5e2599cc" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.CompilerSupportLibraries_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" -version = "0.5.2+0" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.LinearAlgebra]] -deps = ["Libdl", "libblastrampoline_jll"] -uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" - -[[deps.OpenBLAS_jll]] -deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] -uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" -version = "0.3.20+0" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.SparseArrays]] -deps = ["LinearAlgebra", "Random"] -uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - -[[deps.StaticArrays]] -deps = ["LinearAlgebra", "Random", "StaticArraysCore", "Statistics"] -git-tree-sha1 = "6954a456979f23d05085727adb17c4551c19ecd1" -uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "1.5.12" - -[[deps.StaticArraysCore]] -git-tree-sha1 = "6b7ba252635a5eff6a0b0664a41ee140a1c9e72a" -uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" -version = "1.4.0" - -[[deps.Statistics]] -deps = ["LinearAlgebra", "SparseArrays"] -uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" - -[[deps.StringViews]] -git-tree-sha1 = "609585ed628a4cd46f4c142762be37f5ced5dc7d" -uuid = "354b36f9-a18e-4713-926e-db85100087ba" 
-version = "1.0.3" - -[[deps.libblastrampoline_jll]] -deps = ["Artifacts", "Libdl", "OpenBLAS_jll"] -uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" -version = "5.1.1+0" diff --git a/Project.toml b/Project.toml index 637264f..66db84c 100644 --- a/Project.toml +++ b/Project.toml @@ -4,5 +4,14 @@ authors = ["codegodz "] version = "0.1.0" [deps] -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StringViews = "354b36f9-a18e-4713-926e-db85100087ba" + +[compat] +julia = "1.6" +StringViews = "1.3" + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] diff --git a/README.md b/README.md index 23b0716..8d32e58 100644 --- a/README.md +++ b/README.md @@ -22,13 +22,13 @@ To install use: ### Features Currently we only have some basic features like reading a line and splitting it. -For examples on how to generate test data and run the codes below see [`src/test.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/src/test.jl) +For examples on how to generate test data and run the codes below see [`test/runtests.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/test/runtests.jl) #### 1. eachlineV -**`eachlineV(file_path::String; buffer_size::Int64=10_000)`** +**`eachlineV(file_path::String; buffer_size::Int=10_000)`** -This function can be used just like the base[ `eachline` ](https://docs.julialang.org/en/v1/base/io-network/#Base.eachline " `eachline` ") in Julia. The argument `buffer_size` determines the size of the underlaying UInt8 vector. The `buffer_size` should be bigger than the longest line in a file. If this is uknown just use a big number like 1M. This function will throw a warning if no new line is found when the eof is not reached yet - giving a clue to increase the `buffer_size`. +This function can be used just like the base[ `eachline` ](https://docs.julialang.org/en/v1/base/io-network/#Base.eachline " `eachline` ") in Julia. The argument `buffer_size` determines the size of the underlying UInt8 vector.
The `buffer_size` should be bigger than the longest line in a file. If this is unknown just use a big number like 1M. This function will throw a warning if no new line is found when the eof is not reached yet - giving a clue to increase the `buffer_size`. **Example** @@ -48,16 +48,16 @@ Similar to the base [`split`](https://docs.julialang.org/en/v1/base/strings/#Bas **Example** -For example to check how often we see the string "TARGET" at column 3 in a given file +For example to check how often we see the string "TARGET" at column 3 in a given file ```Julia c = 0 for line in eachlineV("../data/test.txt") - data = splitV(line, '\t') + data = splitV(line, '\t') if data[1] == "TARGET" - c +=1 + c +=1 end -end +end println(c) ``` @@ -82,13 +82,13 @@ c = 0 for line in eachlineV("../data/numbs.txt") for item in splitV(line, '\t') c += parseV(UInt32, item) - end + end end println(c) ``` ### Benchmark -We added a simple benchmark in [`src/test.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/src/test.jl), for my computer with: +We added a simple benchmark in [`test/runtests.jl`](https://github.com/JuliaStrings/ViewReader.jl/blob/master/test/runtests.jl), for my computer with: - `gen_string_data(10_000)` - `gen_numb_data(10_000)` - and a buffer_size of `10_000` @@ -113,7 +113,5 @@ so the best is just to try some buffer sizes and see where it works optimally To make this a bit more visual, we compared the base reader to the view reader.
On the: -- **x-axis** is the nubmer of lines in a file and +- **x-axis** is the nubmer of lines in a file and - **y-axis** the time in seconds to iterate over them - -![BenchmarkImage](https://www.linkpicture.com/q/reader_benchmark.png) diff --git a/src/FileReader.jl b/src/FileReader.jl index ee67979..2d3f2b2 100644 --- a/src/FileReader.jl +++ b/src/FileReader.jl @@ -1,20 +1,19 @@ using StringViews -using StaticArrays ########################################################################### -# Code to read from a file +# Code to read from a file ########################################################################### struct BufferedReader{IOT <: IO} io::IOT - buffer::Int64 - tot_alloc::Int64 + buffer::Int + tot_alloc::Int arr::Vector{UInt8} end # Function to flip elements in an array to a specified offset(buffer size here) -function flip!(arr::Vector{UInt8}, buffer::Int64) +function flip!(arr::Vector{UInt8}, buffer::Int) @inbounds @simd for i in 1:buffer arr[i] = arr[i+buffer] end @@ -24,50 +23,50 @@ function read_next_chunk!(reader::BufferedReader) # Move last read chunk to front of the array # (except in first iter) flip!(reader.arr, reader.buffer) - + # Store new chunk in second part of the array - bytes_read::Int = readbytes!(reader.io, view(reader.arr, reader.buffer+1:reader.tot_alloc), reader.buffer) + bytes_read::Int = readbytes!(reader.io, view(reader.arr, reader.buffer+1:reader.tot_alloc), reader.buffer) # If we read less than the buffer size we have to reset the array # values after "bytes_read" as this is old data (previous read) if bytes_read < reader.buffer @inbounds for i in reader.buffer+bytes_read+1:reader.tot_alloc reader.arr[i] = 0x00 - end - end + end + end end -function find_newline(reader::BufferedReader, state::Int64) +function find_newline(reader::BufferedReader, state::Int) cur_stop = copy(state) + 1 - - @inbounds for i in (state + 1):reader.tot_alloc - if reader.arr[i] == 0x0a - return cur_stop:i-1, i - end - end - + + @inbounds for i 
in (state + 1):reader.tot_alloc + if reader.arr[i] == 0x0a + return cur_stop:(i > 1 && reader.arr[i-1] == 0x0d ? i-2 : i-1), i + end + end + return 0:0, cur_stop end -function eachlineV(io::IO; buffer_size::Int64=10_000) +function eachlineV(io::IO; buffer_size::Int=10_000) # Allocate buffer array tot_alloc = buffer_size * 2 - buffer_arr = zeros(UInt8, tot_alloc) - - # We will set up a buffered reader through which we + buffer_arr = zeros(UInt8, tot_alloc) + + # We will set up a buffered reader through which we # stream the file bytes, >4x as fast as a regular reader reader = BufferedReader(io, buffer_size, buffer_size*2, buffer_arr) - # Also populate the reader with the first chunk already + # Also populate the reader with the first chunk already read_next_chunk!(reader) return reader end -function eachlineV(file_path::String; buffer_size::Int64=10_000) +function eachlineV(file_path::String; buffer_size::Int=10_000) io = open(file_path, "r") return eachlineV(io, buffer_size=buffer_size) end - + # Override in case we want to reuse buffers and handles function eachlineV(io::IO, buffer_arr::Vector{UInt8}) @@ -79,13 +78,13 @@ function eachlineV(io::IO, buffer_arr::Vector{UInt8}) end @inline function Base.iterate(reader::BufferedReader) - # This is the first iter so only the last half of the array is filled now + # This is the first iter so only the last half of the array is filled now # hence start reading from buffer + 1 r, state = find_newline(reader, reader.buffer) return StringView(view(reader.arr, r)), state end -@inline function Base.iterate(reader::BufferedReader, state::Int64) +@inline function Base.iterate(reader::BufferedReader, state::Int) r, state = find_newline(reader, state) if r.start == 0 if !eof(reader.io) @@ -93,14 +92,11 @@ end r, state = find_newline(reader, state - reader.buffer - 1) else close(reader.io) - return nothing - end + return nothing + end end - # I twould be odd to not reach EOF but still not find + # I twould be odd to not reach EOF 
but still not find # a full line, throw warning r.stop == 0 && @warn ("Buffer probably too small") return StringView(view(reader.arr, r)), state end - - - diff --git a/src/LineReader.jl b/src/LineReader.jl index fa03d55..c761040 100644 --- a/src/LineReader.jl +++ b/src/LineReader.jl @@ -4,29 +4,29 @@ # - and split the line on a specified delimiter ########################################################################### -const Sview = StringView{SubArray{UInt8, 1, Vector{UInt8}, Tuple{UnitRange{Int64}}, true}} -const Bview = SubArray{UInt8, 1, Vector{UInt8}, Tuple{UnitRange{Int64}}, true} +const Sview = StringView{SubArray{UInt8, 1, Vector{UInt8}, Tuple{UnitRange{Int}}, true}} +const Bview = SubArray{UInt8, 1, Vector{UInt8}, Tuple{UnitRange{Int}}, true} struct Line arr::Bview delimiter::UInt8 end -function Base.getindex(l::Line, index::Int64) +function Base.getindex(l::Line, index::Int) for (i, item) in enumerate(l) if i == index - return item - end - end + return item + end + end error("Index out of range") end -function find_delimiter(line::Bview, delimiter::UInt8, state::Int) +function find_delimiter(line::Bview, delimiter::UInt8, state::Int) # State refers to the last location we scanned @inbounds for i in state+1:length(line) if line[i] == delimiter - return i + return i elseif i == length(line) # i.e. last cut of this line return i + 1 # For other del. 
we do -1 later to exclude it hence + 1 here end @@ -42,17 +42,17 @@ end return StringView(view(line.arr, 1:loc-1)), loc end -@inline function Base.iterate(line::Line, state::Int64) +@inline function Base.iterate(line::Line, state::Int) loc = find_delimiter(line.arr, line.delimiter, state) if loc == 0 - return nothing + return nothing end return StringView(view(line.arr, state+1:loc-1)), loc end -# For now this only support a single Char, but technically +# For now this only support a single Char, but technically # we can just expand this to an arbitrary String @inline function splitV(line::Sview, delimiter::Char) length(line) > 0 || error("Empty line given") return Line(line.data, UInt8(delimiter)) -end \ No newline at end of file +end diff --git a/src/Utils.jl b/src/Utils.jl index 1254769..fe14b33 100644 --- a/src/Utils.jl +++ b/src/Utils.jl @@ -1,7 +1,7 @@ ########################################################################### -# Some basic non-alloc helpers +# Some basic non-alloc helpers ######################################eachlineV##################################### -# just for illustration now +# just for illustration now function parseV(t::Type, lineSub::Sview) parse(t, StringView(lineSub)) @@ -12,7 +12,7 @@ end # end # function Int64V(lineSub::Sview) -# parse(Int64, StringView(lineSub)) +# parse(Int, StringView(lineSub)) # end # function UInt64V(lineSub::Sview) diff --git a/data/numbs.txt b/test/data/numbs.txt similarity index 100% rename from data/numbs.txt rename to test/data/numbs.txt diff --git a/data/test.txt b/test/data/test.txt similarity index 100% rename from data/test.txt rename to test/data/test.txt diff --git a/src/test.jl b/test/runtests.jl similarity index 62% rename from src/test.jl rename to test/runtests.jl index 1c2f045..473ae7f 100644 --- a/src/test.jl +++ b/test/runtests.jl @@ -1,28 +1,28 @@ +using ViewReader, Test -include("./FileReader.jl") -include("./LineReader.jl") -include("./Utils.jl") -using BenchmarkTools - -const 
stringFile = "../data/test.txt" -const numbFile = "../data/numbs.txt" +const stringFile = "data/test.txt" +const numbFile = "data/numbs.txt" # To create some random line data -function gen_string_data(copies::Int64) +function gen_string_data(copies::Int) open(stringFile, "w") do handle txt = "Text\twithout\tletter\nbla\tbla\tTARGET\tbla\tbla\nblablabla\nTEST\n" corpus = txt^copies write(handle, corpus) - end + end end # To create some random number data -function gen_numb_data(copies::Int64) +function gen_numb_data(copies::Int) open(numbFile, "w") do handle write(handle, "1\n13\t15\t18\n11\t10\t15\n"^copies) end end +# generate test data (commented out since test files are saved in repo): +# gen_string_data(10_000) +# gen_numb_data(10_000) + ############################################################# # File reading test ############################################################# @@ -30,9 +30,9 @@ function normalRead() c = 0 for line in eachline(stringFile) if line == "TEST" - c += 1 + c += 1 end - end + end return c end @@ -40,9 +40,9 @@ function viewRead() c = 0 for line in eachlineV(stringFile, buffer_size=10_000) if line == "TEST" - c +=1 + c +=1 end - end + end return c end @@ -55,9 +55,9 @@ function normalSplit() for item in split(line, "\t") if item == "bla" c +=1 - end - end - end + end + end + end return c end @@ -66,8 +66,8 @@ function viewSplit() for line in eachlineV(stringFile) for item in splitV(line, '\t') if item == "bla" - c +=1 - end + c +=1 + end end end return c @@ -82,12 +82,12 @@ function normalParse() for line in eachline(numbFile) for item in split(line, '\t') c += parse(UInt32, item) - end - end + end + end return c end -function viewParse() +function viewParse() c = 0 for line in eachlineV(numbFile) for item in splitV(line, '\t') @@ -98,46 +98,28 @@ function viewParse() end ############################################################# -# get index test +# get index test ############################################################# 
function viewIndex() c = 0 for line in eachlineV("../data/test.txt") - data = splitV(line, '\t') + data = splitV(line, '\t') if data[1] == "TARGET" - c +=1 + c +=1 end - end + end return c end +@testset "Reading lines" begin + @test normalRead() == viewRead() +end +@testset "Splitting lines" begin + @test normalSplit() == viewSplit() +end +@testset "Number parse" begin + @test normalParse() == viewParse() +end -function run_test() - - # println("Reading lines") - # @assert normalRead() == viewRead() - # print("Base eachline: ") - # @btime normalRead() - # print("View eachline: ") - # @btime viewRead() - - # println("\nSplitting lines") - # @assert normalSplit() == viewSplit() - # print("Base split: ") - # @btime normalSplit() - # print("View split: ") - # @btime viewSplit() - - # println("\nNumber parse") - # @assert normalParse() == viewParse() - # print("Base parse: ") - # @btime normalParse() - # print("View parse: ") - # @btime viewParse() - @btime viewIndex() - -end - -gen_string_data(10_000) -gen_numb_data(10_000) -run_test() \ No newline at end of file +# using BenchmarkTools +# @btime viewIndex()