diff --git a/.github/workflows/validate-translation-files.yml b/.github/workflows/validate-translation-files.yml index 79eaa78ab42..177040657c6 100644 --- a/.github/workflows/validate-translation-files.yml +++ b/.github/workflows/validate-translation-files.yml @@ -44,6 +44,10 @@ jobs: message: | :white_check_mark: All translation files are valid. + ``` + ${{ steps.validate_translation_files.outputs.VALIDATION_ERRORS }} + ``` + This comment has been posted by the `validate-translation-files.yml` GitHub Actions workflow. message-failure: | diff --git a/Makefile b/Makefile index 9cb4142edd2..4373b93c99d 100644 --- a/Makefile +++ b/Makefile @@ -37,9 +37,14 @@ test_requirements: ## Installs test.txt requirements test: ## Run scripts tests pytest -v -s --cov=. --cov-report=term-missing --cov-report=html scripts/tests -validate_translation_files: ## Run basic validation to ensure files are compilable +validate_translation_files: ## Run basic validation to ensure translation files are valid python scripts/validate_translation_files.py + +validate_translation_files_mark_fuzzy: ## Run basic validation and mark invalid entries as fuzzy + python scripts/validate_translation_files.py --mark-fuzzy + + sync_translations: ## Syncs from the old projects to the new openedx-translations project python scripts/sync_translations.py $(SYNC_ARGS) diff --git a/docs/decisions/0002-mark-fuzzy-entries.rst b/docs/decisions/0002-mark-fuzzy-entries.rst new file mode 100644 index 00000000000..3520f2fcba4 --- /dev/null +++ b/docs/decisions/0002-mark-fuzzy-entries.rst @@ -0,0 +1,99 @@ +Mark Transifex invalid entries as fuzzy +####################################### + +Context +******* +As of the `OEP-58`_, the Transifex GitHub App is used to sync Translations. +These translations are validated by `LexiQA`_, built-in Transifex quality +checks, and human reviewers. During this synchronization process, the +`Transifex GitHub App`_ creates pull requests to this repository. The +translations in the pull requests are then validated by ``msgfmt`` in CI. + +Problem +******* +There are times when the translations being synchronized fail ``msgfmt`` +validation. This prevents the pull requests from being merged. + + +Design Decision +*************** + +A GitHub Actions workflow will be implemented to mark invalid entries in +synchronized ``.po`` files as ``fuzzy``. This will update pull requests +created by the `Transifex GitHub App`_. + +To ensure a safe and reliable workflow, the following workflows will be +combined into one single workflow with multiple jobs: + +#. Run ``python-tests.yml`` to validate the python code +#. Then, run ``validate-translation-files.yml`` which performs the following: + + #. Validate the po-files using ``msgfmt`` + #. Notify the translators about the invalid entries via the preferred + communication channel (Slack, Transifex, GitHub) + #. Edit the po files to mark invalid entries as ``fuzzy``, so it's + excluded from ``.mo`` files + #. Revalidate the files + +#. Commit the fixes +#. Run the lint commits +#. Push to the branch +#. Trigger auto merge + + +Informing the Translators +************************* +Translators can be notified about invalid translations via Slack, Transifex +or GitHub issues depending on the community's preference. + +Pros and Cons +************* + +**Pros** + +* New valid strings would make it into the ``.mo`` files +* There's no need for manual intervention, therefore it's fast and won't + create a backlog of pull requests. +* Rejected strings are easily identifiable by looking in the code, so it's + easy to fix them. +* Translators can be notified about invalid translations via Slack, Transifex, + GitHub depending on the community's preference. + + +**Cons** + +* The workflow script runs and edits the pull request, which can be + confusing for the reviewers. +* Previously valid entries are going to be overwritten with new ``fuzzy`` + strings which will make those entries to be shown in source language + (English). + +Rejected Alternative: Keep pull requests open +********************************************* + +- **Create a Transifex issue only:** It would be possible to identify the + faulty entries and create a Transifex issue via API to the faulty entries. + However, this would require attention from the translators + team, which can take a rather long time therefore creating a backlog of + invalid entries. Additionally, Transifex issues are not understood or used + by most of the Open edX community. + +- **Post errors on Slack only:** Post the errors on Slack and ask the + translators + to fix them. Same as the previous option, not all translators are on Slack. + Additionally, this option would have a slow feedback loop causing the pull + requests backlog to build-up. + +- **Mark faulty entries as unreviewed only:** Use the Transifex API to mark + the the invalid entries as unreviewed, then pull only + reviewed entries into this repository. + This option would require an extensive use the of the Transifex API, + and pull again which can be complex to implement. Additionally, this option + would have a slow feedback loop causing the pull requests backlog to + build-up. + + +.. _OEP-58: https://open-edx-proposals.readthedocs.io/en/latest/architectural-decisions/oep-0058-arch-translations-management.html +.. _LexiQA: https://help.transifex.com/en/articles/6219179-lexiqa +.. _Transifex GitHub App: https://github.com/apps/transifex-integration + diff --git a/scripts/tests/mock_translations_dir/demo-xblock/conf/locale/hi/LC_MESSAGES/django.po b/scripts/tests/mock_translations_dir/demo-xblock/conf/locale/hi/LC_MESSAGES/django.po index 87b04909901..651ba802a34 100644 --- a/scripts/tests/mock_translations_dir/demo-xblock/conf/locale/hi/LC_MESSAGES/django.po +++ b/scripts/tests/mock_translations_dir/demo-xblock/conf/locale/hi/LC_MESSAGES/django.po @@ -13,6 +13,51 @@ msgstr "" "Plural-Forms: nplurals=2; plural=(n != 1);\n" "Generated-By: Babel 2.8.0\n" + +#: default_data.py:5 +msgid "" +"An isosceles triangle with three layers of similar height. It is shown " +"upright, so the widest layer is located at the bottom, and the narrowest " +"layer is located at the top." +msgstr "" + +#, python-brace-format +msgid "{action} invalid actions." +msgstr "{something_else} an invalid action." + +#: default_data.py:20 +msgid "" +"Use this zone to associate an item with the bottom layer of the triangle." +msgstr "" + +#: default_data.py:22 +msgid "Correct! This one belongs to The Top Zone." +msgstr "" + #, python-brace-format -msgid "{action} is not a valid action." -msgstr "{कार्रवाई} एक वैध कार्रवाई नहीं है।" # Invalid brace-format with encoding errors +msgid "" +"{action2} another invalid action." +msgstr "{कार्रवाई2} एक वैध कार्रवाई नहीं है।" + +#: default_data.py:23 +msgid "Correct! This one belongs to The Middle Zone." +msgstr "" + +#: default_data.py:24 +msgid "Correct! This one belongs to The Bottom Zone." +msgstr "" + +#: default_data.py:25 +msgid "No, this item does not belong here. Try again." +msgstr "नहीं, यह आइटम यहाँ नहीं है। पुनः प्रयास करें।" + +#: default_data.py:26 +msgid "You silly, there are no zones for this one." +msgstr "आप मूर्खतापूर्ण हैं, इसके लिए कोई क्षेत्र नहीं है।" + +#, python-brace-format +msgid "{action3} one more time!" +msgstr "" +"{कार्रवाई3} एक वैध कार्रवाई नहीं है।" +"" + diff --git a/scripts/tests/test_validate_translation_files.py b/scripts/tests/test_validate_translation_files.py index 821701d5231..d425fedb1f2 100644 --- a/scripts/tests/test_validate_translation_files.py +++ b/scripts/tests/test_validate_translation_files.py @@ -3,10 +3,12 @@ """ import os.path +import re +import shutil from ..validate_translation_files import ( get_translation_files, - main, + validate_translation_files, ) SCRIPT_DIR = os.path.dirname(__file__) @@ -35,7 +37,7 @@ def test_main_on_invalid_files(capsys): Integration test for the `main` function on some invalid files. """ mock_translations_dir = os.path.join(SCRIPT_DIR, 'mock_translations_dir') - exit_code = main(mock_translations_dir) + exit_code = validate_translation_files(mock_translations_dir) out, err = capsys.readouterr() assert 'VALID:' in out, 'Valid files should be printed in stdout' @@ -44,8 +46,8 @@ def test_main_on_invalid_files(capsys): assert 'hi/LC_MESSAGES/django.po' not in out, 'Invalid file should be printed in stderr' assert 'en/LC_MESSAGES/django.po' not in out, 'Source file should not be validated' - assert 'INVALID:' in err - assert 'hi/LC_MESSAGES/django.po' in err + assert re.match(r'INVALID: .*hi/LC_MESSAGES/django.po', err) + assert 'Errors:' in err assert '\'msgstr\' is not a valid Python brace format string, unlike \'msgid\'' in err assert 'FAILURE: Some translations are invalid.' in err @@ -57,11 +59,57 @@ def test_main_on_valid_files(capsys): Integration test for the `main` function but only for the Arabic translations which is valid. """ mock_translations_dir = os.path.join(SCRIPT_DIR, 'mock_translations_dir/demo-xblock/conf/locale/ar') - exit_code = main(mock_translations_dir) + exit_code = validate_translation_files(mock_translations_dir) out, err = capsys.readouterr() assert 'VALID:' in out, 'Valid files should be printed in stdout' assert 'ar/LC_MESSAGES/django.po' in out, 'Arabic translation file is valid' assert 'SUCCESS: All translation files are valid.' in out + assert 'Errors:' not in out, 'There should be no errors' assert not err.strip(), 'The stderr should be empty' assert exit_code == 0, 'Should succeed due in validating the ar/LC_MESSAGES/django.po file' + + +def test_main_on_valid_files_with_mark_fuzzy(capsys, tmp_path): + """ + Test the `main` function with the --mark-fuzzy option. + """ + original_mock_translations_dir = os.path.join(SCRIPT_DIR, 'mock_translations_dir') + mock_translations_dir = tmp_path / 'translations' + shutil.copytree(original_mock_translations_dir, mock_translations_dir) + + exit_code = validate_translation_files(translations_dir=mock_translations_dir, mark_fuzzy=True) + out, err = capsys.readouterr() + + assert 'VALID:' in out, 'Valid files should be printed in stdout' + assert re.match(r'FIXED: .*hi/LC_MESSAGES/django.po', err), 'Should print the FIXED files in stderr' + assert 'ar/LC_MESSAGES/django.po' in out, 'Arabic translation file is valid regardless' + assert 'NOTICE: Some translations were fixed.' in err, 'Should print NOTICE in stderr' + assert 'SUCCESS: All translation files are now valid.' in err, 'Should print SUCCESS in stderr' + assert 'Previous errors:' in err, 'Original errors should be printed' + assert 'New output:' in err, 'New output should be printed after the fix' + assert exit_code == 0, ( + 'Should succeed even though the in/LC_MESSAGES/django.po is invalid, because it was marked as fuzzy' + ) + + +def test_main_on_valid_files_with_mark_fuzzy_unfixable_files(capsys, tmp_path): + """ + Test the `main` function with the --mark-fuzzy option but the file is broken in an unknown way. + """ + original_mock_translations_dir = os.path.join(SCRIPT_DIR, 'mock_translations_dir') + mock_translations_dir = tmp_path / 'translations' + shutil.copytree(original_mock_translations_dir, mock_translations_dir) + hi_language_file = mock_translations_dir / 'demo-xblock/conf/locale/hi/LC_MESSAGES/django.po' + + with open(hi_language_file, 'a') as f: + f.write('<<<>>>') + + exit_code = validate_translation_files(translations_dir=mock_translations_dir, mark_fuzzy=True) + out, err = capsys.readouterr() + + assert 'VALID:' in out, 'Valid files should be printed in stdout regardless' + assert re.match(r'INVALID: .*hi/LC_MESSAGES/django.po', err), 'Should print the Error files in stderr' + assert 'Previous errors:' in err, 'Original errors should be printed' + assert 'New errors:' in err, 'New errors should be printed after the fix' + assert exit_code == 1, 'Should fail due to unfixable hi/LC_MESSAGES/django.po file' diff --git a/scripts/validate_translation_files.py b/scripts/validate_translation_files.py index 8d334e26656..7082b2738ee 100644 --- a/scripts/validate_translation_files.py +++ b/scripts/validate_translation_files.py @@ -1,7 +1,15 @@ -import sys +""" +Validate translation files using GNU gettext `msgfmt` command. + +This script is used to validate translation files in the Open edX platform and mark +invalid entries as fuzzy. +""" + +import argparse import os import os.path import subprocess +import sys def get_translation_files(translation_directory): @@ -34,12 +42,70 @@ def validate_translation_file(po_file): stderr = completed_process.stderr.decode(encoding='utf-8', errors='replace') return { + 'po_file': po_file, 'valid': completed_process.returncode == 0, - 'output': f'{stdout}\n{stderr}', + 'output': f'{stdout}\n{stderr}'.strip(), + 'stdout': stdout, + 'stderr': stderr, } -def main(translations_dir='translations'): +def get_invalid_msgstr_lines(validation_stderr): + """ + Parse the error output of `msgfmt` and return line numbers of invalid msgstr lines. + """ + invalid_msgstr_lines = [] + for line in validation_stderr.splitlines(): + if '.po:' in line: + _file_name, line_number, _rest = line.split(':', 2) + invalid_msgstr_lines.append(int(line_number)) + return invalid_msgstr_lines + + +def get_invalid_msgid_lines(po_file, invalid_msgstr_lines): + """ + Get the line numbers of invalid msgid lines by their msgstr line numbers. + """ + with open(po_file, mode='r', encoding='utf-8') as f: + pofile_lines = f.readlines() + + invalid_msgid_lines = [] + last_msgid_line = None + for line_number, line_text in enumerate(pofile_lines, start=1): + if line_text.startswith('msgid'): + last_msgid_line = line_number + + if line_text.startswith('msgstr') and line_number in invalid_msgstr_lines: + invalid_msgid_lines.append(last_msgid_line) + + return invalid_msgid_lines + + +def mark_invalid_entries_as_fuzzy(validation_result): + """ + Mark invalid entries as fuzzy. + """ + # Get line numbers of invalid msgstr lines + invalid_msgstr_lines = get_invalid_msgstr_lines(validation_result['stderr']) + invalid_msgid_lines = get_invalid_msgid_lines( + validation_result['po_file'], + invalid_msgstr_lines, + ) + + with open(validation_result['po_file'], mode='r', encoding='utf-8') as f: + pofile_lines = f.readlines() + + with open(validation_result['po_file'], mode='w', encoding='utf-8') as f: + for line_number, line_text in enumerate(pofile_lines, start=1): + if line_number in invalid_msgid_lines: + f.write('#, fuzzy\n') + f.write(line_text) + + +def validate_translation_files( + translations_dir='translations', + mark_fuzzy=False, +): """ Run GNU gettext `msgfmt` and print errors to stderr. @@ -49,37 +115,81 @@ def main(translations_dir='translations'): return 1 for invalid translations. """ translations_valid = True + translations_fixed = False - invalid_lines = [] + stderr_lines = [] po_files = get_translation_files(translations_dir) for po_file in po_files: - result = validate_translation_file(po_file) + first_attempt = validate_translation_file(po_file) - if result['valid']: + if first_attempt['valid']: print('VALID: ' + po_file) - print(result['output'], '\n' * 2) + print(first_attempt['output'], '\n' * 2) else: - invalid_lines.append('INVALID: ' + po_file) - invalid_lines.append(result['output'] + '\n' * 2) - translations_valid = False + if mark_fuzzy: + mark_invalid_entries_as_fuzzy(first_attempt) + + second_attempt = validate_translation_file(po_file) + + if second_attempt['valid']: + translations_fixed = True + stderr_lines.append('FIXED: ' + po_file) + stderr_lines.append('This file was invalid, but it was fixed by`make validate_translation_files`.') + stderr_lines.append('Previous errors:') + stderr_lines.append(first_attempt['output'] + '\n') + stderr_lines.append('New output:') + stderr_lines.append(second_attempt['output'] + '\n' * 2) + else: + stderr_lines.append('INVALID: ' + po_file) + stderr_lines.append('Attempted to fix the file, but it is still invalid.') + + if mark_fuzzy: + stderr_lines.append('Previous errors:') + else: + stderr_lines.append('Errors:') + + stderr_lines.append(first_attempt['output'] + '\n') + + if mark_fuzzy: + stderr_lines.append('New errors:') + stderr_lines.append(second_attempt['output'] + '\n' * 2) + translations_valid = False # Print validation errors in the bottom for easy reading - print('\n'.join(invalid_lines), file=sys.stderr) + print('\n'.join(stderr_lines), file=sys.stderr) if translations_valid: - print('-----------------------------------------') - print('SUCCESS: All translation files are valid.') - print('-----------------------------------------') exit_code = 0 + + if translations_fixed: + print('---------------------------------------------', file=sys.stderr) + print('NOTICE: Some translations were fixed.', file=sys.stderr) + print('SUCCESS: All translation files are now valid.', file=sys.stderr) + print('---------------------------------------------', file=sys.stderr) + else: + print('-----------------------------------------') + print('SUCCESS: All translation files are valid.') + print('-----------------------------------------') else: + exit_code = 1 print('---------------------------------------', file=sys.stderr) print('FAILURE: Some translations are invalid.', file=sys.stderr) print('---------------------------------------', file=sys.stderr) - exit_code = 1 return exit_code +def main(): # pragma: no cover + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument('--mark-fuzzy', dest='mark_fuzzy', action='store_true', default=False) + parser.add_argument('--dir', action='store', type=str, default='translations') + args = parser.parse_args() + sys.exit(validate_translation_files( + translations_dir=args.dir, + mark_fuzzy=args.mark_fuzzy, + )) + + if __name__ == '__main__': - sys.exit(main()) # pragma: no cover + main() # pragma: no cover