Skip to content

Commit

Permalink
Add an option to skip the MWT in a conllu file when training a tagger
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Oct 17, 2024
1 parent e87f437 commit 98c0b7d
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 6 deletions.
11 changes: 8 additions & 3 deletions src/edu/stanford/nlp/tagger/io/TSVTaggedFileReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ public class TSVTaggedFileReader implements TaggedFileReader {
private final String filename;
private final int wordColumn, tagColumn;
private final boolean usesComments;
// When a CoNLL-U file from UD is read directly, the multi-word token (MWT)
// range lines (first column of the form "N-M") may need to be skipped
private final boolean skipMWT;
private List<TaggedWord> next; // = null;
private int linesRead; // = 0;

Expand All @@ -36,6 +38,7 @@ public TSVTaggedFileReader(TaggedFileRecord record) {
tagColumn = ((record.tagColumn == null) ?
DEFAULT_TAG_COLUMN : record.tagColumn);
usesComments = record.usesComments;
skipMWT = record.skipMWT;
primeNext();
}

Expand Down Expand Up @@ -85,9 +88,11 @@ private void primeNext() {
throw new IllegalArgumentException("File " + filename + " line #" +
linesRead + " too short");
}
String word = pieces[wordColumn];
String tag = pieces[tagColumn];
next.add(new TaggedWord(word, tag));
if (!(skipMWT && pieces[0].matches("[0-9]+-[0-9]+"))) {
String word = pieces[wordColumn];
String tag = pieces[tagColumn];
next.add(new TaggedWord(word, tag));
}
}
try {
line = reader.readLine();
Expand Down
15 changes: 12 additions & 3 deletions src/edu/stanford/nlp/tagger/io/TaggedFileRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public enum Format {
final Integer tagColumn;
final TreeReaderFactory trf;
final boolean usesComments;
final boolean skipMWT;

private TaggedFileRecord(String file, Format format,
String encoding, String tagSeparator,
Expand All @@ -47,7 +48,7 @@ private TaggedFileRecord(String file, Format format,
NumberRangesFileFilter treeRange,
Predicate<Tree> treeFilter,
Integer wordColumn, Integer tagColumn,
boolean usesComments) {
boolean usesComments, boolean skipMWT) {
this.file = file;
this.format = format;
this.encoding = encoding;
Expand All @@ -60,6 +61,7 @@ private TaggedFileRecord(String file, Format format,
this.tagColumn = tagColumn;
this.trf = trf;
this.usesComments = usesComments;
this.skipMWT = skipMWT;
}

public static final String FORMAT = "format";
Expand All @@ -73,6 +75,7 @@ private TaggedFileRecord(String file, Format format,
public static final String TAG_COLUMN = "tagColumn";
public static final String TREE_READER = "trf";
public static final String COMMENTS = "comments";
public static final String SKIP_MWT = "skipMWT";

public String toString() {
StringBuilder s = new StringBuilder();
Expand Down Expand Up @@ -107,6 +110,9 @@ public String toString() {
if (usesComments) {
s.append("," + COMMENTS + "=true");
}
if (skipMWT) {
s.append("," + SKIP_MWT + "=true");
}
return s.toString();
}

Expand Down Expand Up @@ -142,7 +148,7 @@ public static TaggedFileRecord createRecord(Properties config,
return new TaggedFileRecord(description, Format.TEXT,
getEncoding(config),
getTagSeparator(config),
null, null, null, null, null, null, null, false);
null, null, null, null, null, null, null, false, false);
}

String[] args = new String[pieces.length - 1];
Expand All @@ -158,6 +164,7 @@ public static TaggedFileRecord createRecord(Properties config,
Predicate<Tree> treeFilter = null;
Integer wordColumn = null, tagColumn = null;
boolean comments = false;
boolean skipMWT = false;

for (String arg : args) {
String[] argPieces = arg.split("=", 2);
Expand Down Expand Up @@ -188,14 +195,16 @@ public static TaggedFileRecord createRecord(Properties config,
tagColumn = Integer.valueOf(argPieces[1]);
} else if (argPieces[0].equalsIgnoreCase(COMMENTS)) {
comments = Boolean.valueOf(argPieces[1]);
} else if (argPieces[0].equalsIgnoreCase(SKIP_MWT)) {
skipMWT = Boolean.valueOf(argPieces[1]);
} else {
throw new IllegalArgumentException("TaggedFileRecord argument " +
argPieces[0] + " is unknown");
}
}
return new TaggedFileRecord(file, format, encoding, tagSeparator,
treeTransformer, treeNormalizer, trf, treeRange,
treeFilter, wordColumn, tagColumn, comments);
treeFilter, wordColumn, tagColumn, comments, skipMWT);
}

public static String getEncoding(Properties config) {
Expand Down

0 comments on commit 98c0b7d

Please sign in to comment.