From 58e246ffdb703235a9d9367b8d677792b7c72fa4 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Fri, 9 Aug 2024 20:10:42 +0000 Subject: [PATCH 1/4] add "pymarian" CLI, a proxy to "marian" binary --- CHANGELOG.md | 1 + src/command/marian_main.cpp | 48 +++++++++++++++++++++++++-------- src/python/binding/bind.cpp | 27 ++++++++++++++++--- src/python/pymarian/__init__.py | 27 +++++++++++++++---- src/python/pyproject.toml | 1 + 5 files changed, 85 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afa4465ce..6b7c1278d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Fixed compilation with clang 16.0.6 - Added Threads::Threads to `EXT_LIBS` - Updates to pymarian: building for multiple python versions; disabling tcmalloc; hosting gated COMETs on HuggingFace +- Added "pymarian" CLI, a proxy to the "marian" binary that is available in PATH after "pip install pymarian" ### Added - Added `--normalize-gradient-by-ratio` to mildly adapt gradient magnitude if effective batch size diverges from running average effective batch size. 
diff --git a/src/command/marian_main.cpp b/src/command/marian_main.cpp index e838fe808..3897ac6bb 100644 --- a/src/command/marian_main.cpp +++ b/src/command/marian_main.cpp @@ -38,25 +38,51 @@ #include "marian_conv.cpp" #undef main +#include <map> +#include <string> +#include <tuple> #include "3rd_party/ExceptionWithCallStack.h" +#include "3rd_party/spdlog/details/format.h" int main(int argc, char** argv) { using namespace marian; + using MainFunc = int(*)(int, char**); + std::map<std::string, std::tuple<MainFunc, std::string>> subcmds = { + {"train", {&mainTrainer, "Train a model (default)"}}, + {"decode", {&mainDecoder, "Decode or translate text"}}, + {"score", {&mainScorer, "Score translations"}}, + {"embed", {&mainEmbedder, "Embed text"}}, + {"evaluate", {&mainEvaluator, "Run Evaluator metric"}}, + {"vocab", {&mainVocab, "Create vocabulary"}}, + {"convert", {&mainConv, "Convert model file format"}} + }; + // no arguments, or the first arg is "?", print help message + if (argc == 1 || (argc == 2 && (std::string(argv[1]) == "?") )) { + std::cout << "Usage: " << argv[0] << " COMMAND [ARGS]" << std::endl; + std::cout << "Commands:" << std::endl; + for (auto&& [name, val] : subcmds) { + std::cerr << fmt::format("{:10} : {}\n", name, std::get<1>(val)); + } + return 0; + } - if(argc > 1 && argv[1][0] != '-') { + if (argc > 1 && argv[1][0] != '-') { std::string cmd = argv[1]; argc--; argv[1] = argv[0]; argv++; - if(cmd == "train") return mainTrainer(argc, argv); - else if(cmd == "decode") return mainDecoder(argc, argv); - else if (cmd == "score") return mainScorer(argc, argv); - else if (cmd == "embed") return mainEmbedder(argc, argv); - else if (cmd == "evaluate") return mainEvaluator(argc, argv); - else if (cmd == "vocab") return mainVocab(argc, argv); - else if (cmd == "convert") return mainConv(argc, argv); - std::cerr << "Command must be train, decode, score, embed, vocab, or convert." 
<< std::endl; - exit(1); - } else + if (subcmds.count(cmd) > 0) { + auto [func, desc] = subcmds[cmd]; + return func(argc, argv); + } + else { + std::cerr << "Unknown command: " << cmd << ". Known commands are:" << std::endl; + for (auto&& [name, val] : subcmds) { + std::cerr << fmt::format("{:10} : {}\n", name, std::get<1>(val)); + } + return 1; + } + } + else return mainTrainer(argc, argv); } diff --git a/src/python/binding/bind.cpp b/src/python/binding/bind.cpp index 38a1e3429..e42fd4ff5 100644 --- a/src/python/binding/bind.cpp +++ b/src/python/binding/bind.cpp @@ -1,3 +1,5 @@ +#define PYBIND11_DETAILED_ERROR_MESSAGES + #include "pybind11/pybind11.h" #include "pybind11/stl.h" // if your IDE/vscode complains about missing paths @@ -6,13 +8,30 @@ #include "evaluator.hpp" #include "trainer.hpp" #include "translator.hpp" - - -#define PYBIND11_DETAILED_ERROR_MESSAGES +#include "command/marian_main.cpp" namespace py = pybind11; using namespace pymarian; +/** + * @brief Wrapper function to call Marian main entry point from Python + * + * Calls Marian main entry point from Python. + * It converts args from a vector of strings (Python-ic API) to char* (C API) + * before passing on to the main function. 
+ * @param args vector of strings + * @return int return code + */ +int main_wrap(std::vector<std::string> args) { + // Convert vector of strings to vector of char* + std::vector<char*> argv; + argv.push_back(const_cast<char*>("pymarian")); + for (auto& arg : args) { + argv.push_back(const_cast<char*>(arg.c_str())); + } + argv.push_back(nullptr); + return main(argv.size() - 1, argv.data()); +} PYBIND11_MODULE(_pymarian, m) { m.doc() = "Marian C++ API bindings via pybind11"; @@ -44,5 +63,7 @@ PYBIND11_MODULE(_pymarian, m) { .def("embed", py::overload_cast<>(&PyEmbedder::embed)) ; + m.def("main", &main_wrap, "Marian main entry point"); + } diff --git a/src/python/pymarian/__init__.py b/src/python/pymarian/__init__.py index 36011c203..7816acc89 100644 --- a/src/python/pymarian/__init__.py +++ b/src/python/pymarian/__init__.py @@ -1,6 +1,7 @@ import logging from itertools import islice from pathlib import Path +import sys from typing import Iterator, List, Optional, Tuple, Union import _pymarian @@ -46,8 +47,8 @@ def model_type(self) -> str: @classmethod def new( cls, - model_file: Path, - vocab_file: Path = None, + model_file: Union[Path, str], + vocab_file: Union[Path, str] = None, devices: Optional[List[int]] = None, width=Defaults.FLOAT_PRECISION, mini_batch=Defaults.MINI_BATCH, @@ -76,8 +77,8 @@ def new( :return: iterator of scores """ - assert model_file.exists(), f'Model file {model_file} does not exist' - assert vocab_file.exists(), f'Vocab file {vocab_file} does not exist' + assert Path(model_file).exists(), f'Model file {model_file} does not exist' + assert Path(vocab_file).exists(), f'Vocab file {vocab_file} does not exist' assert like in Defaults.MODEL_TYPES, f'Unknown model type: {like}' n_inputs = len(Defaults.MODEL_TYPES[like]) vocabs = [vocab_file] * n_inputs @@ -97,7 +98,7 @@ def new( cpu_threads=cpu_threads, average=average, ) - if kwargs.pop('fp16'): + if kwargs.pop('fp16', False): kwargs['fp16'] = '' # empty string for flag; i.e, "--fp16" and not "--fp16=true" # TODO: remove this 
when c++ bindings supports iterator @@ -171,3 +172,19 @@ def __init__(self, cli_string='', **kwargs): """ cli_string += ' ' + kwargs_to_cli(**kwargs) super().__init__(cli_string.stip()) + +def main(): + """proxy to marian main function""" + code = _pymarian.main(sys.argv[1:]) + sys.exit(code) + +def help(*vargs): + """print help text""" + args = [] + args += vargs + if '--help' not in args and '-h' not in args: + args.append('--help') + # note: this will print to stdout + _pymarian.main(args) + # do not exit, as this is a library function + diff --git a/src/python/pyproject.toml b/src/python/pyproject.toml index 30eb16f36..34445648e 100644 --- a/src/python/pyproject.toml +++ b/src/python/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ ] [project.scripts] +pymarian = "pymarian:main" pymarian-eval = "pymarian.eval:main" pymarian-qtdemo = "pymarian.qtdemo:main" pymarian-mtapi = "pymarian.mtapi_server:main" From 0a625a916958fb4903a7639238b824182f095fc9 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Mon, 12 Aug 2024 17:52:37 +0000 Subject: [PATCH 2/4] Fix github ubuntu workflow for pymarian build --- .github/workflows/ubuntu.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 3a4c65b31..0df834766 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -21,6 +21,7 @@ jobs: gpu: false unit_tests: true examples: false + pymarian: true # Using Clang compiler - name: "Ubuntu CPU-only clang-14" os: ubuntu-22.04 @@ -31,6 +32,7 @@ jobs: gpu: false unit_tests: true examples: false + pymarian: true # Ubuntu GPU-only build - name: "Ubuntu GPU-only" os: ubuntu-20.04 @@ -41,6 +43,7 @@ jobs: gpu: true unit_tests: false examples: true + pymarian: true # Ubuntu 22.04 supports CUDA 11.7 # Unit tests and examples are not compiled to save disk space - name: "Ubuntu 22.04 CUDA 11.7 gcc-11" @@ -52,6 +55,7 @@ jobs: gpu: true unit_tests: false examples: false + 
pymarian: true # Ubuntu 20.04 supports CUDA 11+ # Unit tests and examples are not compiled to save disk space - name: "Ubuntu 20.04 CUDA 11.1 gcc-9" @@ -63,6 +67,7 @@ jobs: gpu: true unit_tests: false examples: false + pymarian: true # Ubuntu 18.04 supports CUDA 10.1+ # But it will soon be removed from GitHub workflows # Ubuntu 16.04 supports CUDA 8+ @@ -123,6 +128,7 @@ jobs: -DUSE_FBGEMM=${{ matrix.cpu }} \ -DUSE_SENTENCEPIECE=on \ -DUSE_STATIC_LIBS=on \ + -DPYMARIAN=${{ matrix.pymarian }} \ - name: Compile working-directory: build @@ -146,11 +152,18 @@ jobs: ls -hlv $(find . -maxdepth 1 -type f -executable \( -name "marian*" -o -name "spm*" \)) - name: Install PyMarian + if: matrix.pymarian == true working-directory: build env: CUDA_VERSION: ${{ matrix.cuda }} run: | - python3 -m pip install --upgrade pip setuptools wheel pytest - CMAKE_ARGS="" python3 -m pip install -v . + ls -lh pymarian*.whl + pytag=$(python3 -c 'import sys; x,y=sys.version_info[:2]; print(f"cp{x}{y}-{sys.platform}")') + whl=$(echo pymarian*${pytag}*.whl) + echo "Chosen wheel: $pytag :: $whl" + ls -lh $whl + python3 -m pip install --upgrade pip pytest + python3 -m pip install -v $whl python3 -m pymarian -v - MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests + pymarian-eval --version + pymarian --version From 315e8aeb9b1e4da9da9cab143ce58ad4ff28f0f7 Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Mon, 12 Aug 2024 18:41:49 +0000 Subject: [PATCH 3/4] github workflows: pymarian matrix for Mac, disabled for Windows --- .github/workflows/macos.yml | 20 ++++++++++++++++---- .github/workflows/ubuntu.yml | 5 ++--- .github/workflows/windows.yml | 22 +++++++++++++++++----- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index abff1d712..587a37bff 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -8,7 +8,14 @@ on: jobs: build-macos: - name: MacOS CPU-only + strategy: + matrix: + include: + - name: 
"Pymarian=YES" + pymarian: true + - name: "Pymarian=NO" + pymarian: false + name: "MacOS CPU-only ${{ matrix.name }}" runs-on: macos-12 steps: @@ -33,7 +40,8 @@ jobs: -DCOMPILE_SERVER=off \ -DCOMPILE_TESTS=on \ -DUSE_FBGEMM=on \ - -DUSE_SENTENCEPIECE=on + -DUSE_SENTENCEPIECE=on \ + -DPYMARIAN=${{matrix.pymarian}} - name: Compile working-directory: build @@ -52,8 +60,12 @@ jobs: ls -hlv $(find . -maxdepth 1 -type f -perm +ugo+x \( -name "marian*" -o -name "spm*" \)) - name: Install PyMarian + working-directory: build + if: matrix.pymarian == true run: | + echo "Wheels built: " && ls -lh pymarian*.whl python3 -m pip install --upgrade pip setuptools wheel pytest - CMAKE_ARGS="" python3 -m pip install -v . + python3 -m pip install -v pymarian*.whl python3 -m pymarian -v - MARIAN_QUIET=YES python3 -m pytest -vs src/python/tests \ No newline at end of file + pymarian-eval --version + pymarian --version \ No newline at end of file diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index 0df834766..0de2024f3 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -157,11 +157,10 @@ jobs: env: CUDA_VERSION: ${{ matrix.cuda }} run: | - ls -lh pymarian*.whl + echo "Built wheels:" && ls -lh pymarian*.whl pytag=$(python3 -c 'import sys; x,y=sys.version_info[:2]; print(f"cp{x}{y}-{sys.platform}")') whl=$(echo pymarian*${pytag}*.whl) - echo "Chosen wheel: $pytag :: $whl" - ls -lh $whl + echo "Chosen wheel: $pytag :: $whl" && ls -lh $whl python3 -m pip install --upgrade pip pytest python3 -m pip install -v $whl python3 -m pymarian -v diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 55ff0d688..3be7c6361 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -20,10 +20,12 @@ jobs: - name: "Windows CPU-only" cuda: "" gpu: false + pymarian: false # Windows CPU+GPU build - name: "Windows CPU+CUDA" cuda: "10.2" gpu: true + pymarian: false runs-on: windows-2019 name: ${{ matrix.name 
}} @@ -86,6 +88,7 @@ jobs: -DUSE_MPI="FALSE" -DUSE_NCCL="FALSE" -DUSE_SENTENCEPIECE="TRUE" + -DPYMARIAN="${{ matrix.pymarian }}" -DUSE_STATIC_LIBS="TRUE"' cmakeListsOrSettingsJson: CMakeListsTxtAdvanced cmakeListsTxtPath: ${{ github.workspace }}/CMakeLists.txt @@ -116,6 +119,7 @@ jobs: -DUSE_MPI="FALSE" -DUSE_NCCL="FALSE" -DUSE_SENTENCEPIECE="TRUE" + -DPYMARIAN="${{ matrix.pymarian }}" -DUSE_STATIC_LIBS="TRUE"' cmakeListsOrSettingsJson: CMakeListsTxtAdvanced cmakeListsTxtPath: ${{ github.workspace }}/CMakeLists.txt @@ -138,12 +142,20 @@ jobs: shell: cmd - name: Install PyMarian - working-directory: src/python + if: matrix.pymarian == true + working-directory: build/ run: | - python3 -m pip install --upgrade pip setuptools wheel pytest - python3 -m pip install -v . + echo "Built wheels:" + ls pymarian*.whl + $pytag = python3 -c 'import sys; x,y=sys.version_info[:2]; print(f"cp{x}{y}-{sys.platform}")' + $whl = ls pymarian*$pytag*.whl + echo "Chosen wheel: $pytag :: $whl" + ls $whl + python3 -m pip install --upgrade pip pytest + python3 -m pip install -v $whl python3 -m pymarian -v - python3 -m pytest -vs src/python/tests + pymarian-eval --version + pymarian --version env: CUDA_VERSION: ${{ matrix.cuda }} - shell: cmd + shell: powershell From c3233df1c3ef747c9aee1d9c75ca5fa2bc9a60ac Mon Sep 17 00:00:00 2001 From: Thamme Gowda Date: Mon, 12 Aug 2024 19:08:37 +0000 Subject: [PATCH 4/4] disable macos pymarian build --- .github/workflows/macos.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 587a37bff..34c9c1db6 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -11,9 +11,7 @@ jobs: strategy: matrix: include: - - name: "Pymarian=YES" - pymarian: true - - name: "Pymarian=NO" + - name: "pymarian=false" pymarian: false name: "MacOS CPU-only ${{ matrix.name }}" runs-on: macos-12