Skip to content

Commit

Permalink
note for later
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeronymous committed Jun 14, 2023
1 parent 8748f8b commit 6ff6414
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion punctuation/recasepunc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -151,6 +151,7 @@ def generate_predictions(config, line, ignore_disfluencies=False):
# also drop punctuation that we may generate
line = ''.join([c for c in line if c not in mapped_punctuation])
if ignore_disfluencies:
+ # TODO: fix when there are several disfluencies in a row ("euh euh")
line = collapse_whitespace(line)
line = re.sub(r"(\w) *' *(\w)", r"\1'\2", line) # glue apostrophes to words
disfluencies, line = remove_simple_disfluences(line)
Expand Down Expand Up @@ -424,7 +425,7 @@ def remove_simple_disfluences(text, language=None):
disfluencies = DISFLUENCIES.get(language, [])
all_hits = []
for disfluency in disfluencies:
- all_hits += re.finditer(r" *"+disfluency+r" *", text)
+ all_hits += re.finditer(r" *\b"+disfluency+r"\b *", text)
all_hits = sorted(all_hits, key=lambda x: x.start())
to_be_inserted = [(hit.start(), hit.group()) for hit in all_hits]
new_text = text
Expand Down

0 comments on commit 6ff6414

Please sign in to comment.