Skip to content

Commit

Permalink
Add a prop file for a newer version of the French WikiNER dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Dec 4, 2024
1 parent 766013c commit 698d84b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 1 deletion.
7 changes: 6 additions & 1 deletion scripts/ner/Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

all: chinese genia german hungarian italian all.3class nowiki.3class conll.4class muc.7class spanish
all: chinese french genia german hungarian italian all.3class nowiki.3class conll.4class muc.7class spanish

chinese: chinese.misc.nodistsim.ser.gz chinese.misc.distsim.ser.gz

Expand All @@ -9,6 +9,11 @@ chinese.misc.nodistsim.ser.gz:
chinese.misc.distsim.ser.gz:
java -mx15g edu.stanford.nlp.ie.crf.CRFClassifier -prop chinese.misc.distsim.prop > chinese.misc.distsim.out 2>&1

# Convenience target: builds the French WikiNER 4-class CRF model.
french: french-wikiner-4class.crf.ser.gz

# Trains the model by running CRFClassifier with the settings in
# french.wikiner.nodistsim.4class.prop (whose serializeTo names this target).
# Both stdout and stderr of the run are redirected to
# french.wikiner.nodistsim.out (note: the log name omits the "4class" part of
# the prop file name).
# NOTE(review): -mx15g presumably requests a 15 GB maximum Java heap - confirm
# the build machine has that much memory available.
french-wikiner-4class.crf.ser.gz:
java -mx15g edu.stanford.nlp.ie.crf.CRFClassifier -prop french.wikiner.nodistsim.4class.prop > french.wikiner.nodistsim.out 2>&1

genia: genia-nlpba-2004.crf.gz

genia-nlpba-2004.crf.gz:
Expand Down
45 changes: 45 additions & 0 deletions scripts/ner/french.wikiner.nodistsim.4class.prop
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Training configuration for the French WikiNER 4-class CRF NER model.
# Passed to edu.stanford.nlp.ie.crf.CRFClassifier via -prop from
# scripts/ner/Makefile (target french-wikiner-4class.crf.ser.gz).

# Input data and output model.
# NOTE(review): these are absolute paths under /home/john; they must be
# adjusted before running on any other machine.
# NOTE(review): the .bioes suffix suggests BIOES-tagged data and the "dev" file
# is used as the test set here - confirm this is intended.
trainFileList = /home/john/stanza/data/ner/fr_wikinergold.train.bioes
testFiles = /home/john/stanza/data/ner/fr_wikinergold.dev.bioes
# Must match the Makefile target name so make sees the target as built.
serializeTo = french-wikiner-4class.crf.ser.gz

# This is the "nodistsim" variant: distributional-similarity features are off.
useDistSim = false

# Column layout of the data files: column 0 is the word, column 1 is the answer.
map = word=0,answer=1

# Feature switches and their parameters.
# NOTE(review): the exact meaning of each flag is defined by CoreNLP
# (presumably SeqClassifierFlags / NERFeatureFactory) - consult those docs
# before changing any of them.
useTitle = true
useClassFeature=true
useWord=true
useNGrams=true
noMidNGrams=true
usePrev=true
useNext=true
useLongSequences=true
useSequences=true
usePrevSequences=true
maxLeft=1
useTypeSeqs=true
useTypeSeqs2=true
useTypeySequences=true
useOccurrencePatterns=true
useLastRealWord=true
useNextRealWord=true
normalize=true
wordShape=dan2uselC
useDisjunctive=true
disjunctionWidth=5
# Left disabled by the author.
#useDisjunctiveShapeInteraction=true

# Classifier type.
type=crf

saveFeatureIndexToDisk = true

# Reader class used to parse the column-formatted data files described by "map".
readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter

useObservedSequencesOnly=true

# Optimizer / regularization settings.
# NOTE(review): presumably sigma is the prior strength and useQN/QNsize select
# the quasi-Newton optimizer and its history size - confirm against CoreNLP docs.
sigma = 1
useQN = true
QNsize = 25

# makes it go faster
featureDiffThresh=0.05

0 comments on commit 698d84b

Please sign in to comment.