-
Notifications
You must be signed in to change notification settings - Fork 9
/
embedded-xml-example.sh
29 lines (25 loc) · 1.06 KB
/
embedded-xml-example.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Experiment for testing XML bracketing extension.
# Requires a part of the TenTen Corpus. We first extract all bracketing annotation from the
# TenTen files and then convert it to CoNLL-RDF using the presented XMLTSV2RDF class.
# Note that the TenTen Corpus allows for closing xml tags without opening. Using the
# --repair argument we insert artificial opening nodes where needed.
TENTEN=$1;
CONLL="$TENTEN".conll
TTL="$TENTEN".ttl
if [[ $TENTEN == "" ]]; then
echo "Provide the path to the TenTen file, please."
else
if [ ! -e "$TTL" ]; then
cat $TENTEN | \
# head -n 2500 | \
./run.sh TenTen2XMLTSV -r | tee "$CONLL" | \
./run.sh XMLTSV2RDF http://replace.me TOK LEM POS_COARSE POS MORPH HEAD_A HEAD_B LEM_ALT LEM_ALT2 > "$TTL"
fi;
## Count the number of XML nodes in the intermediate file.
echo "Before conversion: $(awk -F'<' 'NF==2' "$CONLL" |\
grep -c "/>\|<[^/]")"
## Count the number of XML:data in the resulting file
echo "After conversion: $(arq --data="$TTL" \
--query=sparql/count_xml_triples.sparql \
--results=TSV | grep -v '^?')"
fi;