hplt-project · jelmervdl · Aug 14, 2023 · Apr 3, 2023 · Apr 3, 2023 · Apr 4, 2023
diff --git a/src/opustrainer/alignments.py b/src/opustrainer/alignments.py
@@ -11,14 +11,10 @@ def parse_alignments(pairs:str, src_tokens:Optional[TokenList]=None, trg_tokens:
     ]
 
     if src_tokens is not None and trg_tokens is not None:
-        invalid_pairs = [
-            pair
-            for pair in pairs
-            if pair.src < 0 or pair.src >= len(src_tokens)
-            or pair.trg < 0 or pair.trg >= len(trg_tokens)
-        ]
-        if invalid_pairs:
-            raise ValueError('Out-of-bound alignment pairs: ' + ' '.join(map(repr, invalid_pairs)))
+        for pair in pairs:
+            if pair.src < 0 or pair.src >= len(src_tokens) \
+            or pair.trg < 0 or pair.trg >= len(trg_tokens):
+                raise ValueError('Out-of-bound alignment pairs')
 
     return pairs
 

diff --git a/src/opustrainer/logger.py b/src/opustrainer/logger.py
@@ -28,15 +28,15 @@ def get_log_level(name: str) -> int:
         logging.log(logging.WARNING, f"unknown log level level used: {name} assuming warning...")
         return logging.WARNING
 
-def log(msg: str, loglevel: str = "INFO") -> None:
+def log(msg: str, loglevel: str = "INFO", **kwargs) -> None:
     level = get_log_level(loglevel)
-    logging.log(level, msg)
+    logging.log(level, msg, **kwargs)
 
 
 @lru_cache(None)
-def log_once(msg: str, loglevel: str = "INFO") -> None:
+def log_once(msg: str, loglevel: str = "INFO", **kwargs) -> None:
     """A wrapper to log, to make sure that we only print things once"""
-    log(msg, loglevel)
+    log(msg, loglevel, **kwargs)
 
 
 def setup_logger(outputfilename: Optional[str] = None, loglevel: str = "INFO", disable_stderr: bool=False) -> None:

diff --git a/src/opustrainer/modifiers/placeholders.py b/src/opustrainer/modifiers/placeholders.py
@@ -295,15 +295,9 @@ def __call__(self, line:str) -> str:
         target = trg.split()
         alignments = []
 
-        # Try parsing alignments. If we fail, just treat this sentence pair as one with out any
-        # alignment info.
-        try:
-            alignments = parse_alignments(rest[0], source, target)
-        except IndexError:
-            logger.log_once(f"Encountered empty alignment field, ignoring alignment info for such lines", loglevel="WARNING")
-        except ValueError:
-            logger.log_once(f"Encountered invalid alignments, ignoring alignment info for such lines", loglevel="WARNING")
-
+        # Parse alignments. Missing or invalid alignments raise here; the trainer
+        # logs the exception and skips the line.
+        alignments = parse_alignments(rest[0], source, target)
         candidate_offset = 0;
 
         while self.probability > 0.0:

diff --git a/src/opustrainer/trainer.py b/src/opustrainer/trainer.py
@@ -593,12 +593,13 @@ def state(self) -> EpochTrackerState:
 
 Out = TypeVar('Out')
 
-def trace_map(fn: Callable[[In], Out], items: Iterable[In]) -> Iterable[Out]:
-    for n, item in enumerate(items):
+def try_trace_map(fn: Callable[[In], Out], items: Iterable[In]) -> Iterable[Out]:
+    for item in items:
         try:
             yield fn(item)
         except Exception as exc:
-            raise Exception(f'Exception while processing item {n}: {item!r}') from exc
+            logger.log(f'Exception while processing line, skipping: {item!r}', 'WARNING',
+                exc_info=(type(exc), exc, exc.__traceback__.tb_next)) # skip fn(item) frame
 
 
 class Trainer:
@@ -698,7 +699,7 @@ def run(self, *, batch_size:int=100) -> Iterable[List[str]]:
                 # Apply any modifiers to random lines in the batch, or sentence
                 # (Multiple modifiers can be applied to the same line!)
                 for modifier in modifiers:
-                    batch = list(trace_map(lambda line: modifier(line.rstrip('\r\n')) + '\n', batch))
+                    batch = list(try_trace_map(lambda line: modifier(line.rstrip('\r\n')) + '\n', batch))
 
                 if self.shuffle:
                     random.shuffle(batch)

diff --git a/tests/test_placeholders.py b/tests/test_placeholders.py
@@ -151,19 +151,12 @@ def test_warn_if_tag_modifier_is_not_last(self):
       """))
     self.assertRegex(logger_ctx.output[0], r"Tags modifier should to be the last modifier to be applied")
 
-  def test_warn_if_alignment_is_missing(self):
+  def test_exception_if_alignment_is_missing(self):
     tagger = PlaceholderTagModifier()
-    with self.assertLogs(logger, level='WARNING') as logger_ctx:
-      self.assertEqual(
-        tagger('Hello world\tHallo welt\t'),
-        'Hello world\tHallo welt')
-    self.assertRegex(logger_ctx.output[0], r'empty alignment field')
+    with self.assertRaises(IndexError):
+      tagger('Hello world\tHallo welt\t')
 
-  def test_warn_if_alignment_is_missing(self):
+  def test_exception_if_alignment_is_invalid(self):
     tagger = PlaceholderTagModifier()
-    with self.assertLogs(level='WARNING') as logger_ctx:
-      self.assertEqual(
-        tagger('Hello world\tHallo welt\t0-0 1-2'),
-        'Hello world\tHallo welt')
-    self.assertRegex(logger_ctx.output[0], r'invalid alignments')
-
+    with self.assertRaises(ValueError):
+      tagger('Hello world\tHallo welt\t0-0 1-2')
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
@@ -9,6 +9,7 @@
 from contextlib import closing
 from textwrap import dedent
 from io import StringIO
+from itertools import chain
 
 import yaml
 
@@ -329,7 +330,6 @@ def test_combined_stage_configuration(self):
 		curriculum = CurriculumLoader().load(config)
 		self.assertEqual([modifier.__class__.__name__ for modifier in curriculum.stages['start'].modifiers or []], ['UpperCaseModifier', 'TitleCaseModifier'])
 
-	@unittest.skip('`Tags` no longer raises an exception on invalid alignment pairs')
 	def test_modifier_error_line_context(self):
 		"""Test that when a modifier fails, we get context information about the line that failed"""
 		with tempfile.NamedTemporaryFile('w', encoding='utf-8') as fd:
@@ -357,5 +357,11 @@ def test_modifier_error_line_context(self):
 
 			trainer = Trainer(curriculum)
 
-			with self.assertRaisesRegex(Exception, "Exception while processing item 1:"):
-				list(trainer.run(batch_size=2))
+			with self.assertLogs(level='WARNING') as logger_ctx:
+				output = list(chain.from_iterable(trainer.run(batch_size=1)))
+				# Assert we skipped the line
+				self.assertEqual(len(output), 1)
+				# Assert that we got the general error message
+				self.assertRegex(logger_ctx.output[0], r'Exception while processing line, skipping:')
+				# Assert that we got the specific error as well
+				self.assertRegex(logger_ctx.output[0], r'ValueError: Out-of-bound alignment pairs')