From 12ab1228a340e01aa0fb0f3de2b0f49a44d3e293 Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Sat, 29 Jul 2023 12:28:04 -0400 Subject: [PATCH 1/3] updated CI scripts --- .github/workflows/CompatHelper.yml | 44 ++++++++++++++++++++++++------ .github/workflows/TagBot.yml | 14 ++++++++++ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index cfee098..0918161 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -1,19 +1,45 @@ name: CompatHelper - on: schedule: - - cron: '55 00 * * *' - + - cron: 0 0 * * * + workflow_dispatch: +permissions: + contents: write + pull-requests: write jobs: CompatHelper: runs-on: ubuntu-latest steps: - - uses: julia-actions/setup-julia@latest + - name: Check if Julia is already available in the PATH + id: julia_in_path + run: which julia + continue-on-error: true + - name: Install Julia, but only if it is not already available in the PATH + uses: julia-actions/setup-julia@v1 with: - version: 1.3 - - name: Pkg.add("CompatHelper") - run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - - name: CompatHelper.main() + version: '1' + arch: ${{ runner.arch }} + if: steps.julia_in_path.outcome != 'success' + - name: "Add the General registry via Git" + run: | + import Pkg + ENV["JULIA_PKG_SERVER"] = "" + Pkg.Registry.add("General") + shell: julia --color=yes {0} + - name: "Install CompatHelper" + run: | + import Pkg + name = "CompatHelper" + uuid = "aa819f21-2bde-4658-8897-bab36330d9b7" + version = "3" + Pkg.add(; name, uuid, version) + shell: julia --color=yes {0} + - name: "Run CompatHelper" + run: | + import CompatHelper + CompatHelper.main() + shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: julia -e 'using CompatHelper; CompatHelper.main()' + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} diff --git 
a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index eb6c861..90dc100 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -8,7 +8,18 @@ on: lookback: default: 3 permissions: + actions: read + checks: read contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read jobs: TagBot: if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' @@ -17,3 +28,6 @@ jobs: - uses: JuliaRegistries/TagBot@v1 with: token: ${{ secrets.GITHUB_TOKEN }} + # Edit the following line to reflect the actual name of the GitHub Secret containing your private key + ssh: ${{ secrets.DOCUMENTER_KEY }} + # ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }} From c928e9ddbf674be79d59b463c2595902af3292ac Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Sat, 29 Jul 2023 12:29:25 -0400 Subject: [PATCH 2/3] style fix for isnothing --- src/Languages.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Languages.jl b/src/Languages.jl index 00fdb39..468912b 100644 --- a/src/Languages.jl +++ b/src/Languages.jl @@ -26,7 +26,7 @@ using RelocatableFolders global trigram_models = Dict{String, Dict}() for (script, langs) in trigram_models_json for (lang, trigrams) in langs #store only supported langs - if from_code(lang) != nothing + if !isnothing(from_code(lang)) get!(trigram_models, script, Dict{String, Vector{String}}())[lang] = split(trigrams, '|') end end From 99446b5b6a958756f9780fcfe0901907b07810d6 Mon Sep 17 00:00:00 2001 From: Roman S Samarev Date: Sat, 29 Jul 2023 12:32:23 -0400 Subject: [PATCH 3/3] added initial docs --- README.md | 3 +++ docs/.gitignore | 2 ++ docs/Project.toml | 2 ++ docs/make.jl | 16 +++++++++++++ docs/src/api.md | 4 ++++ docs/src/index.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++ src/types.jl | 22 ++++++++++++++++++ src/whatlang.jl | 11 +++++++++ 8 
files changed, 117 insertions(+) create mode 100644 docs/.gitignore create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/api.md create mode 100644 docs/src/index.md diff --git a/README.md b/README.md index 61ba019..b66a809 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,9 @@ Languages.jl [![pkgeval](https://juliahub.com/docs/Languages/pkgeval.svg)](https://juliahub.com/ui/Packages/Languages/w1H1r) +[![](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliatext.github.io/Languages.jl) [![](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliatext.github.io/Languages.jl/dev) + + ## Introduction Languages.jl is a Julia package for working with human languages. It provides: diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..a303fff --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +build/ +site/ diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..dfa65cd --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,2 @@ +[deps] +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..a226afe --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,16 @@ +using Documenter +using Languages + +makedocs( + sitename = "Languages", + format = Documenter.HTML(), + modules = [Languages], + pages = [ + "Home" => "index.md", + "API" => "api.md" + ] +) + +deploydocs( + repo = "github.com/JuliaText/Languages.jl.git" +) diff --git a/docs/src/api.md b/docs/src/api.md new file mode 100644 index 0000000..a421c2a --- /dev/null +++ b/docs/src/api.md @@ -0,0 +1,4 @@ +```@autodocs +Modules = [Languages] +Private = false +``` diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..5f073f0 --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,57 @@ +# Languages.jl + +Languages.jl is a Julia package for working with human languages. 
It provides: + +* Lists of words from each language for basic categories: + * Articles + * Indefinite Articles + * Definite Articles + * Prepositions + * Pronouns + * Stopwords + + These methods are supported only for English and German currently. + + This package also detects the script and language for written text in a wide variety of languages. + +## Usage + + using Languages + + articles(Languages.English()) + stopwords(Languages.English()) + +All word lists are returned as vectors of UTF-8 strings. + +## Script Detection + +The script detection model works by checking the Unicode character ranges present within +the input text. + + Languages.detect_script("To be or not to be") # => Languages.LatinScript() + +## Language Detection + +A trigram-based model is used to detect the language for the text. The model is +filtered based on the detected script. + +We detect 84 of the most common languages spoken around the world. This usually +covers most languages with more than 10 million native speakers. + + detector = LanguageDetector() + detector("To be or not to be") # => (Languages.English(), Languages.LatinScript(), 1.0) + +## List All Supported Languages +You can use `list_languages()` to get all supported languages. + +The `LanguageDetector` model returns the language, the script, and the confidence when applied to a string. + +The language and script detection code in this package is heavily inspired by the Rust package [whatlang-rs](https://github.com/greyblake/whatlang-rs). That package is in turn derived from [franc](https://github.com/wooorm/franc). See `LICENSE.whatlang-rs` for details. + +## Deprecations + +The API of this package has been refurbished recently. If you have used this package earlier, +please be aware of these changes. + + * The language names have been shortened. So `English` instead of `EnglishLanguage`. However, the language names are no longer exported.
So they should be referred to with the package name: `Languages.English` + * Every language is a type. However, all functions now accept and return instances of these types, rather than the types themselves. diff --git a/src/types.jl b/src/types.jl index cc38013..692c30f 100644 --- a/src/types.jl +++ b/src/types.jl @@ -5,8 +5,25 @@ abstract type Language; end # Portuguese, Romanian, Russian, Spanish, Swedish, Turkish # These are ISO 639-2T alpha-3 and ISO 639-3 codes +""" + isocode(lang::T) where {T<:Language} + +Returns the ISO code of the language `lang`. +""" isocode(lang::T) where {T<:Language} = isocode(T) + +""" + name(lang::T) where {T<:Language} + +Returns the self-name of the language `lang`. +""" name(lang::T) where {T<:Language} = name(T) + +""" + english_name(lang::T) where {T<:Language} + +Returns the name of the language `lang` in English. +""" english_name(lang::T) where {T<:Language} = english_name(T) struct Esperanto <: Language; end; english_name(::Type{Esperanto}) = "Esperanto"; name(::Type{Esperanto}) = "Esperanto"; isocode(::Type{Esperanto}) = "epo"; @@ -182,6 +199,11 @@ global const code_to_lang = Dict{String, Language}( "uig" => Uyghur(), ) +""" + from_code(code::String) + +Returns the language object for the ISO `code` (matched case-insensitively), or `nothing` if the code is unknown. +""" function from_code(code::String) return get(code_to_lang, lowercase(code), nothing) end diff --git a/src/whatlang.jl b/src/whatlang.jl index cfa14a2..29f91c2 100644 --- a/src/whatlang.jl +++ b/src/whatlang.jl @@ -9,6 +9,12 @@ const RELIABLE_CONFIDENCE_THRESHOLD = 0.8; +""" + detect_script(text::AbstractString) + +Detect a script for the given `text`. +Returns either `Script` or a tuple `(Script, probability)`.
+""" function detect_script(text::AbstractString) script_counters = [ [LatinScript() , 0], @@ -384,6 +390,11 @@ Base.@deprecate detect(text::AbstractString, options=default_options()) Language mutable struct LanguageDetector end +""" + (detector::LanguageDetector)(text::AbstractString, options=default_options()) + +Returns a tuple `(Language, Script, confidence)` for the given `text`. Throws an `ArgumentError` if `text` is empty. +""" function(m::LanguageDetector)(text::AbstractString, options=default_options()) if text==""; throw(ArgumentError("Cannot detect language for empty text")); end script = detect_script(text)