Skip to content

Commit

Permalink
Merge pull request #3 from JuliaText/ox/0.6
Browse files Browse the repository at this point in the history
Ox/0.6
  • Loading branch information
oxinabox authored Nov 22, 2017
2 parents 7a7bf56 + cc67bab commit 22f6692
Show file tree
Hide file tree
Showing 11 changed files with 133 additions and 120 deletions.
11 changes: 7 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,25 @@
language: julia
os:
- linux
- osx
# - osx
julia:
# - release
- 0.6
- nightly
matrix:
allow_failures:
- julia: nightly
notifications:
email: false

# uncomment the following lines to override the default test script
#script:
# - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
# - julia -e 'Pkg.clone(pwd()); Pkg.build("CorpusLoaders"); Pkg.test("CorpusLoaders"; coverage=true)'
after_success:
# Push Documentation
# Push Documentation
- julia -e 'Pkg.add("Documenter")'
- julia -e 'cd(Pkg.dir("CorpusLoaders")); include(joinpath("docs", "make.jl"))'
# push coverage results to Coveralls
- julia -e 'cd(Pkg.dir("CorpusLoaders")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
# push coverage results to Codecov
- julia -e 'cd(Pkg.dir("CorpusLoaders")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'

2 changes: 1 addition & 1 deletion REQUIRE
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
julia 0.5-
julia 0.6
LightXML
16 changes: 10 additions & 6 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
environment:
matrix:
- JULIAVERSION: "julialang/bin/winnt/x86/0.5/julia-0.5-latest-win32.exe"
- JULIAVERSION: "julialang/bin/winnt/x64/0.5/julia-0.5-latest-win64.exe"
- JULIAVERSION: "julianightlies/bin/winnt/x86/julia-latest-win32.exe"
- JULIAVERSION: "julianightlies/bin/winnt/x64/julia-latest-win64.exe"

- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
matrix:
allow_failures:
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
- JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
branches:
only:
- master
Expand All @@ -17,9 +20,10 @@ notifications:
on_build_status_changed: false

install:
- ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12"
# Download most recent Julia Windows binary
- ps: (new-object net.webclient).DownloadFile(
$("http://s3.amazonaws.com/"+$env:JULIAVERSION),
$env:JULIA_URL,
"C:\projects\julia-binary.exe")
# Run installer silently, output to C:\projects\julia
- C:\projects\julia-binary.exe /S /D=C:\projects\julia
Expand Down
5 changes: 3 additions & 2 deletions src/CorpusLoaders.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
module CorpusLoaders

typealias AbstractStringVector AbstractVector
#typealias AbstractStringVector{S<:AbstractString} AbstractVector{S}
using LightXML

const AbstractStringVector = AbstractVector{<:AbstractString}

include("util.jl")

Expand Down
7 changes: 4 additions & 3 deletions src/semcor.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ export TaggedWord, SenseAnnotatedWord, PosTaggedWord, TaggedSentence,

@enum SegmentBy NoSegmenting ByDocument ByParagraph BySentence

abstract TaggedWord
abstract type TaggedWord end

immutable SenseAnnotatedWord{S<:AbstractString} <: TaggedWord
pos::S
lemma::S
Expand All @@ -22,7 +23,7 @@ immutable PosTaggedWord{S<:AbstractString} <: TaggedWord
word::S
end

typealias TaggedSentence Vector{TaggedWord}
const TaggedSentence = Vector{TaggedWord}



Expand Down Expand Up @@ -142,7 +143,7 @@ end
"""Load up a semcor corpus. Eg `load_semcor("corpora/semcor2.1/brown1/tagfiles/")`"""
load_semcor(tagdir_path::AbstractString) = collect(lazyload_semcor(tagdir_path))

typealias SemcorIndex Dict{String, Vector{Tuple{TaggedSentence, Int}}}
const SemcorIndex = Dict{String, Vector{Tuple{TaggedSentence, Int}}}

"""
Index a semcor stream, by word (sense-key),
Expand Down
84 changes: 34 additions & 50 deletions src/semeval2007t7.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
using LightXML

export load_challenges_semeval2007t7, lazyload_challenges_semeval2007t7, load_solutions_semeval2007t7, lazyload_solutions_semeval2007t7

Expand Down Expand Up @@ -40,79 +39,66 @@ function breakdown(textnode, stopwords::Function)
end
end
end

instances, words
end


"""
Lazily load the SemEval 2007 Task 7 corpus of challenges.
The context of each challenge is the sentence the word occurs in.
Eg `lazyload_challenges_semeval2007t7("semeval2007_t7/test/eng-coarse-all-words.xml")`
"""
function lazyload_challenges_semeval2007t7(xml_file::AbstractString)
xdoc = parse_file(xml_file)
function lazyload_challenges_semeval2007t7(xdoc::XMLDocument)
xroot = root(xdoc)
Task() do
Channel(ctype=WsdChallenge, csize=Inf) do ch
for text_node in child_elements(xroot)
for sentence_node in child_elements(text_node)
sentence = punctuation_space_tokenize(content(sentence_node))

for lemma_node in child_elements(sentence_node)
for instance in child_elements(sentence_node) # elements (always instances) NOT nodes (can be just text)
context = sentence
produce(WsdChallenge(parse_instance(instance)..., context))
end
put!(ch, WsdChallenge(parse_instance(instance)..., context))
end
end
end
end
end


"""
Lazily load the SemEval 2007 Task 7 corpus of challenges,
giving every word a context window, sized by `window_size`, drawn from its document.
Tokens for which `stopwords` returns true are filtered out.
Eg `lazyload_challenges_semeval2007t7("semeval2007_t7/test/eng-coarse-all-words.xml", 10, ispunct)`
"""
function lazyload_challenges_semeval2007t7(xdoc::XMLDocument,
window_size::Int,
stopwords::Function=x->false)
xroot = root(xdoc)
Task() do
for text_node in child_elements(xroot)
instances, words = breakdown(text_node, stopwords)
for (ii, instance) in instances
context = window_excluding_center(ii, words, window_size)
produce(WsdChallenge(parse_instance(instance)..., context))
end
end
end
window_size::Int,
stopwords::Function=x->false)
xroot = root(xdoc)
Channel(ctype=WsdChallenge, csize=Inf) do ch
for text_node in child_elements(xroot)
instances, words = breakdown(text_node, stopwords)
for (ii, instance) in instances
context = window_excluding_center(ii, words, window_size)
put!(ch, WsdChallenge(parse_instance(instance)..., context))
end
end
end
end


"""
Load SemEval 2007 Task 7 corpus of challenges
Eg `load_challenges_semeval2007t7("semeval2007_t7/test/eng-coarse-all-words.xml")`
"""
function load_challenges_semeval2007t7(xml_file::AbstractString)
collect(lazyload_challenges_semeval2007t7(xml_file))
function load_challenges_semeval2007t7(args...)
collect(lazyload_challenges_semeval2007t7(args...))
end



function lazyload_challenges_semeval2007t7(
stream::IO,
window_size::Int,
stopwords::Function=x->false)
function lazyload_challenges_semeval2007t7(stream::IO, args...)
xdoc = parse_string(readstring(stream))
lazyload_challenges_semeval2007t7(xdoc, window_size, stopwords)
lazyload_challenges_semeval2007t7(xdoc, args...)
end

function lazyload_challenges_semeval2007t7(
filename::AbstractString,
window_size::Int,
stopwords::Function=x->false)
function lazyload_challenges_semeval2007t7(filename::AbstractString, args...)
xdoc = parse_file(filename)
lazyload_challenges_semeval2007t7(xdoc, window_size, stopwords)
lazyload_challenges_semeval2007t7(xdoc, args...)
end


Expand All @@ -133,17 +119,15 @@ Lazy Load SemEval 2007 Task 7 corpus of solutions
Eg `lazyload_solutions_semeval2007t7("semeval2007_t7/key/dataset21.test.key")`
"""
function lazyload_solutions_semeval2007t7(key_file="data/corpora/wsd/semeval2007_t7/key/dataset21.test.key")
Task() do
for line in eachline(key_file)
line_data, comment = split(line,"!!")
fields = split(line_data)
doc_id = fields[1]
instance_id = fields[2]
solutions = fields[3:end]

lemma,pos = match(r"lemma=(.*)#(.)", comment).captures
produce(WsdSolution(instance_id, lemma, pos[1], solutions))
end
Base.Generator(eachline(key_file)) do line
line_data, comment = split(line,"!!")
fields = split(line_data)
doc_id = fields[1]
instance_id = fields[2]
solutions = fields[3:end]

lemma,pos = match(r"lemma=(.*)#(.)", comment).captures
WsdSolution(instance_id, lemma, pos[1], solutions)
end
end

Expand All @@ -152,7 +136,7 @@ Load SemEval 2007 Task 7 corpus of solutions
Eg `load_solutions_semeval2007t7("semeval2007_t7/key/dataset21.test.key")`
"""
function load_solutions_semeval2007t7(keyfile::AbstractString)
collect(lazyload_solutions_semeval2007t7(keyfile))
collect(lazyload_solutions_semeval2007t7(keyfile))
end


109 changes: 62 additions & 47 deletions test/semeval2007t7.jl
Original file line number Diff line number Diff line change
@@ -1,61 +1,76 @@
using Base.Test
using CorpusLoaders


# Fixture: a corpus with one <text> holding a single <sentence>. The sentence
# mixes plain tokens (A, B, C, D), punctuation (`` and .) and three <instance>
# elements (editorial, Ill, Homeless).
eg1 = """
<?xml version="1.0" encoding="iso-8859-1" ?>
<!DOCTYPE corpus SYSTEM "coarse-all-words.dtd">
<corpus lang="en">
<text id="d001">
<sentence id="d001.s001">
A
B
<instance id="d001.s001.t001" lemma="editorial" pos="n">editorial</instance>
``
C
<instance id="d001.s001.t002" lemma="Ill" pos="a">Ill</instance>
<instance id="d001.s001.t003" lemma="Homeless" pos="n">Homeless</instance>
.
D
</sentence>
</text>
</corpus>
""";

# Fixture: a corpus with two <text> elements, each holding one <sentence> with
# a single <instance> and four plain tokens. The second <text> has no id
# attribute and reuses the sentence/instance ids of the first.
eg2 = """
<?xml version="1.0" encoding="iso-8859-1" ?>
<!DOCTYPE corpus SYSTEM "coarse-all-words.dtd">
<corpus lang="en">
<text id="d001">
<sentence id="d001.s001">
A
B
<instance id="d001.s001.t001" lemma="editorial" pos="n">1</instance>
C
D
</sentence>
</text>
<text>
<sentence id="d001.s001">
<instance id="d001.s001.t001" lemma="editorial" pos="n">2</instance>
A1
B2
C2
D3
</sentence>
</text>
</corpus>
""";

@testset "sentence" begin
    # eg1 contains three <instance> elements, so three challenges are expected.
    single_sentence = load_challenges_semeval2007t7(IOBuffer(eg1))
    @test length(single_sentence) == 3

    # eg2 contains one <instance> in each of its two <text> elements.
    two_texts = load_challenges_semeval2007t7(IOBuffer(eg2))
    @test length(two_texts) == 2

    # Each challenge is expected to carry a five-token context, and the two
    # challenges must not compare equal to each other.
    head = first(two_texts)
    tail = last(two_texts)
    @test length(head.context) == 5
    @test length(tail.context) == 5
    @test head != tail
end

@testset "windowing" begin
eg1 = """
<?xml version="1.0" encoding="iso-8859-1" ?>
<!DOCTYPE corpus SYSTEM "coarse-all-words.dtd">
<corpus lang="en">
<text id="d001">
<sentence id="d001.s001">
A
B
<instance id="d001.s001.t001" lemma="editorial" pos="n">editorial</instance>
``
C
<instance id="d001.s001.t002" lemma="Ill" pos="a">Ill</instance>
<instance id="d001.s001.t003" lemma="Homeless" pos="n">Homeless</instance>
.
D
</sentence>
</text>
</corpus>
""";
r1 = lazyload_challenges_semeval2007t7(IOBuffer(eg1), 10, x->!isalnum(x)) |> collect
r1 = lazyload_challenges_semeval2007t7(IOBuffer(eg1), 10, x->!all(isalnum, x)) |> collect


@test length(r1) == 3
@test r1[1].context ==["A","B","C", "Ill","Homeless","D"]
@test r1[1].word == "editorial"



eg2 = """
<?xml version="1.0" encoding="iso-8859-1" ?>
<!DOCTYPE corpus SYSTEM "coarse-all-words.dtd">
<corpus lang="en">
<text id="d001">
<sentence id="d001.s001">
A
B
<instance id="d001.s001.t001" lemma="editorial" pos="n">1</instance>
C
D
</sentence>
</text>
<text>
<sentence id="d001.s001">
<instance id="d001.s001.t001" lemma="editorial" pos="n">2</instance>
A1
B2
C2
D3
</sentence>
</text>
</corpus>
""";
r2 = lazyload_challenges_semeval2007t7(IOBuffer(eg2), 10, x->!isalnum(x)) |> collect
r2 = lazyload_challenges_semeval2007t7(IOBuffer(eg2), 10) |> collect


@test r2[1].context ==["A","B","C","D"]
@test r2[2].context ==["A1","B2","C2","D3"]

end

3 changes: 3 additions & 0 deletions test/similarity.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
using Base.Test
using CorpusLoaders


@testset "wordsim353" begin
egs = """Word 1,Word 2,Human (mean)
Expand Down
11 changes: 7 additions & 4 deletions test/tags.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
@test brownPOStoWordNetPOS("NP") == 'n'
@test brownPOStoWordNetPOS("JJ") == 'a'
using Base.Test
using CorpusLoaders

@test brownPOStoWordNetPOS("NP") == 'n'
@test brownPOStoWordNetPOS("JJ") == 'a'

@test pennPOStoWordNetPOS("NNP") == 'n'
@test pennPOStoWordNetPOS("JJ") == 'a'

@test pennPOStoWordNetPOS("NNP") == 'n'
@test pennPOStoWordNetPOS("JJ") == 'a'
Loading

0 comments on commit 22f6692

Please sign in to comment.