-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f37af21
commit b4ec3d7
Showing
2,372 changed files
with
851 additions
and
158,197 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import os | ||
import argparse | ||
from shutil import copyfile | ||
import shutil | ||
|
||
|
||
def run(logFile,fileDir):
    """Print the XML files in fileDir that contain the sentences listed in logFile.

    logFile is read as UTF-8 text.  Header lines ("No match found for:"),
    separator lines and empty lines are ignored; every other line is treated
    as a sentence.  For each sentence the first remaining XML file whose raw
    content contains it is printed and then dropped from further searches,
    so each file name is reported at most once.
    """
    ignored = ("No match found for:",
               "--------------------------------------")

    # Load the raw content of every entry whose name ends in 'xml'.
    candidates = []
    for entry in os.listdir(fileDir):
        if entry[-3:] != 'xml':
            continue
        with open(os.path.join(fileDir, entry), 'r', encoding='utf-8') as handle:
            candidates.append((entry, handle.read()))

    with open(logFile, "r", encoding='utf-8') as handle:
        sentences = handle.read().split('\n')

    for sentence in sentences:
        if not sentence or sentence in ignored:
            continue
        for position, (entry, content) in enumerate(candidates):
            if sentence in content:
                print(entry)
                # A reported file is never considered again.
                del candidates[position]
                break
|
||
if __name__ == '__main__':
    # Command-line entry point: both options are mandatory.
    cli = argparse.ArgumentParser(
        description='This is a script to find SpRL file data that did not find its way into a core file.')
    for flag, target, helpText in (
            ("--path", "path", 'Path to the input files'),
            ("--file", "file", 'Core SpRL log file')):
        cli.add_argument(flag, dest=target, required=True, help=helpText)
    options = cli.parse_args()
    run(options.file, options.path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
#Michael Furst | ||
#February 4, 2018 | ||
#Read SPRL XML file and parse matching data from other files into new file | ||
|
||
import os | ||
import xml.etree.ElementTree as ET | ||
import re | ||
import argparse | ||
class File:
    """Pairs the text of one sentence with the name of the file it came from."""

    def __init__(self):
        # Both fields start empty and are filled in by the caller.
        self.fileName = ""
        self.text = ""

    def __str__(self):
        # File name and text, each terminated by a newline.
        return "\n".join((self.fileName, self.text, ""))
|
||
def cleanString(string):
    """Return string reduced to its ASCII letters, lower-cased.

    Everything that is not in a-z / A-Z (digits, whitespace, punctuation,
    non-ASCII characters) is dropped so that two sentences can be compared
    regardless of formatting.  None (the .text of an empty XML element) is
    treated as an empty string instead of raising TypeError.
    """
    if string is None:
        return ""
    # Only ASCII letters survive the substitution, so lower() is equivalent
    # to replacing each upper-case letter individually.
    return re.sub('[^a-zA-Z]', "", string).lower()
|
||
def findMatch(text,files):
    """Return the fileName of the first entry in files whose text matches text.

    Two texts match when they are equal after cleanString() normalisation.
    Returns "" when no entry matches.
    """
    # The normalised target does not depend on the candidate, so compute it
    # once instead of once per candidate.
    target = cleanString(text)
    for file in files:
        if cleanString(file.text) == target:
            return file.fileName
    return ""
|
||
def indexFiles(dirPath):
    """Collect the sentence texts of every XML file directly inside dirPath.

    Three document layouts are recognised by their root tag:
      * 'SpRL'       - SENTENCE elements nested one level below the root's children
      * 'SENTENCE'   - the root itself is the sentence
      * 'CleanedXML' - SENTENCE elements directly below the root
    Files with any other root tag contribute nothing.

    Returns a list of File objects (fileName = full path, text = content of
    the sentence's TEXT child).  Entries that cannot be opened or parsed are
    reported with print() and skipped; sentences without a TEXT child or
    with an empty TEXT are skipped because they cannot be matched.
    """
    files = []
    for name in os.listdir(dirPath):
        name = os.path.join(dirPath, name)
        try:
            root = ET.parse(name).getroot()
        except (OSError, ET.ParseError) as e:
            # OSError covers FileNotFoundError as well as sub-directories
            # and unreadable entries, which previously aborted the indexing.
            print(e)
            continue

        if root.tag == 'SpRL':
            sentences = [element
                         for scene in root
                         for element in scene
                         if element.tag == 'SENTENCE']
        elif root.tag == 'SENTENCE':
            sentences = [root]
        elif root.tag == 'CleanedXML':
            sentences = [sentence for sentence in root
                         if sentence.tag == 'SENTENCE']
        else:
            sentences = []

        for sentence in sentences:
            textElem = sentence.find('TEXT')
            if textElem is None or textElem.text is None:
                continue
            file = File()
            file.fileName = name
            file.text = textElem.text
            files.append(file)
    return files
|
||
def _pullElements(root):
    """Return the non-'TEXT' children of every SENTENCE in an 'SpRL' tree.

    SENTENCE elements are looked for exactly two levels below root
    (root -> scene -> SENTENCE); results keep document order.
    """
    collected = []
    for scene in root:
        for node in scene:
            if node.tag != 'SENTENCE':
                continue
            collected.extend(child for child in node if child.tag != 'TEXT')
    return collected
def pullElements(fileName):
    """Parse fileName and return its annotation (non-'TEXT') elements.

    Files whose root tag is 'SpRL' use the older nested layout and are
    delegated to _pullElements(); for any other root the direct children
    of the root that are not 'TEXT' are returned.
    """
    root = ET.parse(fileName).getroot()
    if root.tag == 'SpRL':
        return _pullElements(root)
    return [child for child in root if child.tag != 'TEXT']
|
||
def run(xmlName,pathToCleanedXMLS):
    """Join annotation data from the cleaned files into the core file xmlName.

    For every SENTENCE found two levels below the root of xmlName, the
    sentence text is matched (via findMatch) against the sentences indexed
    from pathToCleanedXMLS:
      * no match  - the whole enclosing scene is removed from the tree and
                    the sentence text is recorded in the log;
      * match     - the sentence's non-'TEXT' children are replaced by the
                    elements pullElements() returns for the matched file.

    The result is written to '<base>-output.xml' and, if anything was
    unmatched, the log is appended to '<base>-log.txt' (UTF-8), where
    <base> is xmlName without its extension.  Nothing is written when
    xmlName cannot be opened or parsed.
    """
    files = indexFiles(pathToCleanedXMLS)
    try:
        tree = ET.parse(xmlName)
    except (OSError,ET.ParseError):
        # OSError includes FileNotFoundError and also covers directory paths.
        print("Could not load '"+xmlName+"'")
        return

    # splitext instead of [:-4] so extensions of any length are handled.
    baseName = os.path.splitext(xmlName)[0]
    logEntries = []
    root = tree.getroot()
    # Iterate over copies because scenes and sub-elements are removed below.
    for scene in list(root):
        for element in list(scene):
            if element.tag!='SENTENCE':
                continue
            text = element.find('TEXT').text
            #find matching text file
            match = findMatch(text,files)
            #if there is no match remove the scene and add the sentence to the log
            if match=="":
                logEntries.append("No match found for:\n"
                                  +text+"\n"
                                  +"--------------------------------------\n")
                root.remove(scene)
                break
            #Remove non-'TEXT' elements
            for sub in list(element):
                if sub.tag!='TEXT':
                    element.remove(sub)
            #Update with new non-'TEXT' elements
            for elem in pullElements(match):
                element.append(elem)

    #write tree to output file
    outputName = baseName+"-output.xml"
    tree.write(outputName)
    print(outputName+" successfully created")

    #write log
    if logEntries:
        logName = baseName+"-log.txt"
        # Explicit UTF-8 so the log reads back identically on every
        # platform; the context manager closes the file even on error.
        with open(logName,'a',encoding='utf-8') as logFile:
            logFile.write("".join(logEntries))
        print("An error log ('"+logName+"') has been generated.")
    return
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: both options are mandatory.
    cli = argparse.ArgumentParser(
        description='This is a script to join SpRL file data into a core file.')
    for flag, target, helpText in (
            ("--path", "path", 'Path to the cleaned files'),
            ("--file", "file", 'Core SpRL file')):
        cli.add_argument(flag, dest=target, required=True, help=helpText)
    options = cli.parse_args()
    run(options.file, options.path)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import os | ||
import argparse | ||
from shutil import copyfile | ||
import shutil | ||
|
||
|
||
def run(logFile,fileDir):
    """Copy every file named in logFile into '<fileDir>/bad' for inspection.

    logFile holds one file name per line.  For each name three files are
    copied into the 'bad' directory:
      * '<fileDir>/input/<name>'        -> 'bad/<name>'
      * '<fileDir>/input/<name>.clean'  -> 'bad/<name>.clean'
      * '<fileDir>/output/<name>'       -> 'bad/<name minus last 4 chars>_merged.xml'

    Blank lines are ignored and the 'bad' directory is created when it
    does not exist.  A missing source file still raises FileNotFoundError.
    """
    inDir = os.path.join(fileDir,"input")
    outDir = os.path.join(fileDir,"output")
    badDir = os.path.join(fileDir,"bad")
    # copyfile() does not create the destination directory.
    os.makedirs(badDir, exist_ok=True)
    with open(logFile,'r') as file:
        for rawLine in file:
            line = rawLine.rstrip('\n')
            if not line:
                # A blank line would resolve to the input directory itself.
                continue
            try:
                copyfile(os.path.join(inDir,line),os.path.join(badDir,line))
                copyfile(os.path.join(inDir,line+".clean"),os.path.join(badDir,line+".clean"))
                copyfile(os.path.join(outDir,line),os.path.join(badDir,line[:-4]+"_merged.xml"))
            except IsADirectoryError:
                # The entry names a directory rather than a file; skip it.
                continue
    return
|
||
if __name__ == '__main__':
    # Command-line entry point: both options are mandatory.  The texts
    # describe what run() does: it copies the files named in the log from
    # the 'input' and 'output' sub-directories into 'bad'.
    parser = argparse.ArgumentParser(
        description='This is a script to copy the input, cleaned and merged versions of the files listed in a log file into the "bad" directory.')
    parser.add_argument(
        "--path",
        dest="path",
        required=True,
        help='Path to the directory containing the "input" and "output" directories')
    parser.add_argument(
        "--file",
        dest="file",
        required=True,
        help='Log file listing one file name per line')
    args=parser.parse_args()
    run(args.file,args.path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import os | ||
import argparse | ||
import xml.etree.ElementTree as ET | ||
from copy import deepcopy | ||
|
||
|
||
def run(fileDir):
    """Merge the SENTENCE elements of each multi-sentence XML file in fileDir.

    Every entry of fileDir whose name ends in 'xml' is parsed; files whose
    root has at most one direct SENTENCE child are left alone.  For the
    others a single sentence is built:
      * its attributes are those of the first SENTENCE, except 'end', which
        is taken from the last SENTENCE that has a TEXT child;
      * its TEXT is the concatenation of all TEXT contents, where a
        character duplicated at a join (last char of the text so far ==
        first char of the next piece) is kept only once;
      * all other child elements of all sentences follow, sorted by
        (tag, id).
    The root is emptied, given the merged sentence as its only child and
    written to '<fileDir>/out/<file name>'.  The 'out' directory is created
    on demand.
    """
    outDir = os.path.join(fileDir,"out")
    for file in os.listdir(fileDir):
        if not file[-3:]=='xml':
            continue
        tree = ET.parse(os.path.join(fileDir,file))
        root = tree.getroot()
        sentences = root.findall('SENTENCE')
        if len(sentences)<=1:
            continue

        # Start from a copy of the first sentence, keeping only its
        # attributes and its TEXT element.
        theSentence = deepcopy(sentences[0])
        textElem = theSentence.find('TEXT')
        for element in list(theSentence):
            theSentence.remove(element)
        if textElem is None:
            textElem = ET.Element('TEXT')
        theSentence.append(textElem)

        end = None
        text = ""
        elements = []
        for sentence in sentences:
            for element in list(sentence):
                if element.tag=='TEXT':
                    # An empty TEXT element has text None.
                    piece = element.text or ""
                    if text and piece and text[-1]==piece[0]:
                        # Drop the character duplicated at the join.
                        text += piece[1:]
                    else:
                        text += piece
                    end = sentence.get('end')
                else:
                    elements.append(element)

        # Only overwrite 'end' with a real value; None cannot be serialised.
        if end is not None:
            theSentence.set('end',end)
        textElem.text = text
        # 'or ""' keeps elements without an id sortable next to ones with an id.
        elements.sort(key=lambda child: (child.tag,child.get('id') or ''))
        for elem in elements:
            theSentence.append(elem)

        # clear() also drops the root's remaining children, attributes and text.
        root.clear()
        root.append(theSentence)
        # tree.write() does not create missing directories.
        os.makedirs(outDir, exist_ok=True)
        tree.write(os.path.join(outDir,file))
|
||
if __name__ == '__main__':
    # Command-line entry point.  The description states what run() does:
    # it merges the SENTENCE elements of each XML file and writes the
    # result into the 'out' sub-directory of the given path.
    parser = argparse.ArgumentParser(
        description='This is a script to merge the SENTENCE elements of each XML file into a single sentence.')
    parser.add_argument(
        "--path",
        dest="path",
        required=True,
        help='Path to the input files')
    args=parser.parse_args()
    run(args.path)
Oops, something went wrong.