Skip to content

Commit

Permalink
unchecked files
Browse files Browse the repository at this point in the history
  • Loading branch information
kordjamshidi committed Nov 10, 2018
1 parent f37af21 commit b4ec3d7
Show file tree
Hide file tree
Showing 2,372 changed files with 851 additions and 158,197 deletions.
43 changes: 43 additions & 0 deletions checkFiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import os
import argparse
from shutil import copyfile
import shutil


def run(logFile,fileDir):
    """Print the name of each XML file that contains an unmatched log sentence.

    Every file in ``fileDir`` whose name ends in ``xml`` is read into memory.
    Each content line of ``logFile`` (the ``No match found for:`` headers,
    the dashed separators and blank lines are ignored) is then searched for
    in those files. The first file containing the line is printed and taken
    out of the candidate pool, so a file is reported at most once.

    Args:
        logFile: Path to the UTF-8 log file to read.
        fileDir: Directory holding the XML files to search.
    """
    # Sorted so that, when several files contain the same sentence, the one
    # reported first does not depend on the file system's listing order.
    texts = []
    for fileName in sorted(os.listdir(fileDir)):
        if not fileName.endswith('xml'):
            continue
        with open(os.path.join(fileDir, fileName), 'r', encoding='utf-8') as xmlFile:
            texts.append((fileName, xmlFile.read()))

    with open(logFile, 'r', encoding='utf-8') as logHandle:
        logContent = logHandle.read()

    # Lines that are log decoration rather than sentence text.
    decoration = ("No match found for:",
                  "--------------------------------------")
    for line in logContent.split('\n'):
        if not line or line in decoration:
            continue
        for entry in texts:
            if line in entry[1]:
                print(entry[0])
                # Safe to mutate: we leave the loop immediately afterwards.
                texts.remove(entry)
                break

if __name__ == '__main__':
    # Command-line entry point: both options are mandatory and are handed
    # straight to run() (log file first, then the directory).
    argParser = argparse.ArgumentParser(description='This is a script to find SpRL file data that did not find its way into a core file.')
    argParser.add_argument("--path", dest="path", required=True, help='Path to the input files')
    argParser.add_argument("--file", dest="file", required=True, help='Core SpRL log file')
    options = argParser.parse_args()
    run(options.file, options.path)
47 changes: 40 additions & 7 deletions combine_trips_xml_into_scene.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def cleanString(string):
('Y','y'),('Z','z')]
for pair in charPairs:
string=string.replace(pair[0],pair[1])
while string[-1]==' ':
string=string[:-1]
return string

def findMatch(text,files):
Expand All @@ -34,12 +36,13 @@ def findMatch(text,files):
t2 = cleanString(text)
if (t1==t2):
return file.fileName
if (t1.find(t2) or t2.find(t1)):
return file.fileName
return ""

def indexFiles(dirPath):
files=[]
#for i in range(0,774):
dirName = dirPath#"/Users/michaelfurst/Desktop/TRIPSCleaned/"
dirName = dirPath
for name in os.listdir(dirName):
name = os.path.join(dirName,name)
try:
Expand All @@ -62,6 +65,14 @@ def indexFiles(dirPath):
file.fileName=name
file.text=text
files+=[file]
elif _tree.getroot().tag=='CleanedXML':
for sentence in _tree.getroot():
if sentence.tag == 'SENTENCE':
text = sentence.find('TEXT').text
file = File()
file.fileName=name
file.text=text
files+=[file]
return files

def _pullElements(root):
Expand Down Expand Up @@ -96,6 +107,11 @@ def run(xmlName,pathToCleanedXMLS):
return
log=""
root = tree.getroot()
removed = 0
# numScenes = len(root.findall("SCENE"))
numSentences = 0
for scene in root.findall("SCENE"):
numSentences += len(scene.findall("SENTENCE"))
for scene in list(root):
for element in list(scene):
if element.tag=='SENTENCE':
Expand All @@ -107,7 +123,8 @@ def run(xmlName,pathToCleanedXMLS):
log+="No match found for:\n"
log+=text+"\n"
log+="--------------------------------------\n"
root.remove(scene)
scene.remove(element)
removed+=1
break
else:
#Remove non-'TEXT' elements
Expand All @@ -118,14 +135,30 @@ def run(xmlName,pathToCleanedXMLS):
for elem in list(pullElements(match)):
element.append(elem)
#write tree to output file
tree.write(xmlName[:-4]+"-output.xml")
print(xmlName[:-4]+"-output.xml successfully created")
tree.write(xmlName[:-4]+"_output.xml")
print(xmlName[:-4]+"_output.xml successfully created")
numScenesAfter = len(root.findall("SCENE"))
numSentencesAfter = 0
for scene in root.findall("SCENE"):
numSentencesAfter += len(scene.findall("SENTENCE"))
#write log
if log!="":
logFile = open(xmlName[:-4]+"-log.txt",'a')
logFile = open(xmlName[:-4]+"_log.txt",'w')
logFile.write(log)
logFile.close()
print("An error log (\'"+xmlName[:-4]+"-log.txt\') has been generated.")
print("An error log (\'"+xmlName[:-4]+"_log.txt\') has been generated.")
with open(xmlName[:-4]+"_results.txt","w") as output:
output.write("Number of scenes: "+str(numScenes)+"\n")
output.write("Number of sentences: "+str(numSentences)+"\n")
output.write("--------------------------\n")
output.write("Number of scenes afterwards: "+str(numScenesAfter)+"\n")
output.write("Number of sentences afterwards: "+str(numSentencesAfter)+"\n")
output.write("Sentences removed: "+str(removed)+"\n")
output.write("Removed "+str(removed*100/numSentences)+"% of sentences.\n")
if numSentencesAfter>numSentences-removed:
output.write("Failed to remove all bad sentences.\n")
elif numSentencesAfter<numSentences-removed:
output.write("Removed extra sentences.\n")
return


Expand Down
155 changes: 155 additions & 0 deletions combine_trips_xml_into_scene_back.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#Michael Furst
#February 4, 2018
#Read SPRL XML file and parse matching data from other files into new file

import os
import xml.etree.ElementTree as ET
import re
import argparse
class File:
    """Record pairing a piece of text with the name of the file it came from."""

    def __init__(self):
        # Both fields start empty and are filled in by the caller.
        self.fileName = ""
        self.text = ""

    def __str__(self):
        # File name and text, each terminated by a newline.
        return self.fileName + "\n" + self.text + "\n"

def cleanString(string):
    """Return ``string`` reduced to its ASCII letters, lower-cased.

    Everything that is not ``a-z`` or ``A-Z`` (digits, whitespace,
    punctuation, non-ASCII characters) is dropped, so two sentences that
    differ only in formatting normalise to the same value.

    Args:
        string: The text to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    # Only ASCII letters survive the substitution, so str.lower() is an exact
    # replacement for mapping each of 'A'..'Z' to its lower-case letter.
    return re.sub('[^a-zA-Z]', "", string).lower()

def findMatch(text,files):
    """Return the file name of the first entry whose text matches ``text``.

    Texts are compared after normalisation with ``cleanString``.

    Args:
        text: The sentence text to look for.
        files: Iterable of objects exposing ``text`` and ``fileName``.

    Returns:
        The ``fileName`` of the first matching entry, or ``""`` when no
        entry matches.
    """
    # The normalised search text is the same for every candidate, so compute
    # it once instead of once per entry.
    target = cleanString(text)
    for candidate in files:
        if cleanString(candidate.text) == target:
            return candidate.fileName
    return ""

def indexFiles(dirPath):
    """Build one ``File`` entry per sentence found in the XML files of a directory.

    Three document layouts are recognised by their root tag:

    * ``SpRL``       - SENTENCE elements nested one level below the root's
      children;
    * ``SENTENCE``   - the root itself is the single sentence;
    * ``CleanedXML`` - SENTENCE elements directly below the root.

    Files with any other root tag contribute nothing. Entries that cannot be
    opened or parsed are reported with ``print`` and skipped.

    Args:
        dirPath: Directory whose entries are parsed.

    Returns:
        A list of ``File`` objects whose ``fileName`` is the joined path and
        whose ``text`` is the sentence's TEXT content.
    """
    def _entry(path, text):
        # Small factory so the three layouts share one construction path.
        entry = File()
        entry.fileName = path
        entry.text = text
        return entry

    files = []
    # Sorted so the index order (and therefore which file wins when several
    # contain the same sentence) does not depend on the file system.
    for name in sorted(os.listdir(dirPath)):
        name = os.path.join(dirPath, name)
        try:
            root = ET.parse(name).getroot()
        except (OSError, ET.ParseError) as e:
            # OSError also covers sub-directories and unreadable files, which
            # previously aborted the whole run.
            print(e)
            continue
        if root.tag == 'SpRL':
            sentences = [element
                         for scene in root
                         for element in scene
                         if element.tag == 'SENTENCE']
        elif root.tag == 'SENTENCE':
            sentences = [root]
        elif root.tag == 'CleanedXML':
            sentences = [child for child in root if child.tag == 'SENTENCE']
        else:
            sentences = []
        for sentence in sentences:
            files.append(_entry(name, sentence.find('TEXT').text))
    return files

def _pullElements(root):
    """Collect the non-TEXT children of every SENTENCE in an SpRL-style tree.

    SENTENCE elements are looked for among the grandchildren of ``root``
    (root -> scene -> SENTENCE).
    """
    return [
        child
        for scene in root
        for element in scene
        if element.tag == 'SENTENCE'
        for child in element
        if child.tag != 'TEXT'
    ]

def pullElements(fileName):
    """Parse ``fileName`` and return its annotation elements.

    For a document rooted at ``SpRL`` the work is delegated to
    ``_pullElements``; for any other root, every direct child of the root
    except TEXT is returned.
    """
    root = ET.parse(fileName).getroot()
    if root.tag == 'SpRL':
        # Older layout: sentences are nested inside scenes.
        return _pullElements(root)
    return [child for child in root if child.tag != 'TEXT']

def run(xmlName,pathToCleanedXMLS):
    """Merge annotations from the cleaned files into the core XML file.

    Every SENTENCE in ``xmlName`` is matched (via ``findMatch``) against the
    sentences indexed from ``pathToCleanedXMLS``:

    * on a match, the sentence's non-TEXT children are replaced by the
      elements returned by ``pullElements`` for the matching file;
    * without a match, the sentence is removed from its scene and recorded
      in the log. The rest of the scene is kept and still processed.

    The result is written to ``<xmlName minus its last 4 chars>-output.xml``.
    If anything was logged, the entries are appended to the corresponding
    ``-log.txt`` file.

    Args:
        xmlName: Path of the core XML file.
        pathToCleanedXMLS: Directory containing the cleaned XML files.
    """
    files = indexFiles(pathToCleanedXMLS)
    try:
        tree = ET.parse(xmlName)
    except (FileNotFoundError,ET.ParseError):
        print("Could not load \'"+xmlName)
        return
    logEntries = []
    root = tree.getroot()
    for scene in list(root):
        # Iterate over a snapshot: sentences may be removed from the scene.
        for element in list(scene):
            if element.tag != 'SENTENCE':
                continue
            text = element.find('TEXT').text
            # Find the cleaned file holding the same sentence.
            match = findMatch(text, files)
            if match == "":
                # No match: drop only this sentence (not the whole scene)
                # and keep going with the remaining sentences.
                logEntries.append("No match found for:\n"
                                  + text + "\n"
                                  + "--------------------------------------\n")
                scene.remove(element)
                continue
            # Remove the existing non-'TEXT' elements ...
            for sub in list(element):
                if sub.tag != 'TEXT':
                    element.remove(sub)
            # ... and replace them with the ones from the matching file.
            for elem in pullElements(match):
                element.append(elem)
    # Write the tree to the output file.
    baseName = xmlName[:-4]
    tree.write(baseName + "-output.xml")
    print(baseName + "-output.xml successfully created")
    # Write the log, if there is anything to report.
    if logEntries:
        with open(baseName + "-log.txt", 'a') as logFile:
            logFile.writelines(logEntries)
        print("An error log (\'" + baseName + "-log.txt\') has been generated.")
    return


if __name__ == '__main__':
    # Command-line entry point: both options are mandatory and are handed
    # straight to run() (core file first, then the cleaned-files directory).
    argParser = argparse.ArgumentParser(description='This is a script to join SpRL file data into a core file.')
    argParser.add_argument("--path", dest="path", required=True, help='Path to the cleaned files')
    argParser.add_argument("--file", dest="file", required=True, help='Core SpRL file')
    options = argParser.parse_args()
    run(options.file, options.path)

35 changes: 35 additions & 0 deletions copyFiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import os
import argparse
from shutil import copyfile
import shutil


def run(logFile,fileDir):
    """Copy every file named in ``logFile`` into the ``bad`` directory.

    ``logFile`` is read as a list of file names, one per line. For each
    name ``N`` three copies are made below ``fileDir``:

    * ``input/N``        -> ``bad/N``
    * ``input/N.clean``  -> ``bad/N.clean``
    * ``output/N``       -> ``bad/<N minus its last 4 chars>_merged.xml``

    Args:
        logFile: Path of the text file listing the file names.
        fileDir: Directory containing the ``input`` and ``output``
            sub-directories; ``bad`` is created there when missing.
    """
    inDir = os.path.join(fileDir, "input")
    outDir = os.path.join(fileDir, "output")
    badDir = os.path.join(fileDir, "bad")
    # copyfile() does not create directories, so make sure the target exists.
    os.makedirs(badDir, exist_ok=True)
    with open(logFile, 'r') as listing:
        names = listing.read().split('\n')
    for line in names:
        # Blank lines (e.g. the one after a trailing newline) would resolve
        # to the directory itself; skip them explicitly rather than relying
        # on the platform raising IsADirectoryError.
        if not line:
            continue
        try:
            copyfile(os.path.join(inDir, line), os.path.join(badDir, line))
            copyfile(os.path.join(inDir, line + ".clean"),
                     os.path.join(badDir, line + ".clean"))
            copyfile(os.path.join(outDir, line),
                     os.path.join(badDir, line[:-4] + "_merged.xml"))
        except IsADirectoryError:
            # A listed name that is a directory is ignored, as before.
            continue
    return

if __name__ == '__main__':
    # Command-line entry point. The description and help texts used to be
    # copied verbatim from another script; they now describe what run()
    # in this file actually does.
    parser = argparse.ArgumentParser(
        description='This is a script to copy the files listed in a log file '
                    'from the "input" and "output" directories into the "bad" directory.')
    parser.add_argument(
        "--path",
        dest="path",
        required=True,
        help='Directory containing the "input", "output" and "bad" sub-directories')
    parser.add_argument(
        "--file",
        dest="file",
        required=True,
        help='Text file listing one file name per line')
    args = parser.parse_args()
    run(args.file, args.path)
56 changes: 56 additions & 0 deletions fixBrokenFiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import argparse
import xml.etree.ElementTree as ET
from copy import deepcopy


def run(fileDir):
    """Collapse multi-sentence XML files into a single SENTENCE each.

    For every file in ``fileDir`` whose name ends in ``xml`` and whose root
    has more than one direct SENTENCE child:

    * the TEXT contents of all sentences are concatenated in order (when a
      piece starts with the character the text so far ends with, that
      duplicated character is dropped);
    * all non-TEXT children are gathered and ordered by ``(tag, id)``;
    * the merged sentence keeps the attributes of the first sentence, with
      ``end`` taken from the last sentence that has a TEXT child;
    * the root is emptied (children, attributes and text) and given the
      merged sentence as its only child;
    * the result is written to ``<fileDir>/out/<same file name>``.

    Files with zero or one sentence are left alone and not copied.

    Args:
        fileDir: Directory containing the XML files to fix.
    """
    outDir = os.path.join(fileDir, "out")
    for file in os.listdir(fileDir):
        if not file.endswith('xml'):
            continue
        tree = ET.parse(os.path.join(fileDir, file))
        root = tree.getroot()
        sentences = root.findall('SENTENCE')
        if len(sentences) <= 1:
            continue

        # Start from a copy of the first sentence, stripped down to its
        # (first) TEXT child; attributes of the sentence are retained.
        theSentence = deepcopy(sentences[0])
        textElem = theSentence.find('TEXT')
        for element in list(theSentence):
            theSentence.remove(element)
        if textElem is None:
            # First sentence had no TEXT: create one to hold the merged text.
            textElem = ET.Element('TEXT')
        theSentence.append(textElem)

        end = None
        text = ""
        elements = []
        for sentence in sentences:
            for element in list(sentence):
                if element.tag == 'TEXT':
                    piece = element.text or ""  # empty TEXT has text None
                    if text and piece and text[-1] == piece[0]:
                        # Pieces overlap by one character: keep it only once.
                        text += piece[1:]
                    else:
                        text += piece
                    end = sentence.get('end')
                else:
                    elements.append(element)

        # Only overwrite 'end' with a real attribute value; a non-string
        # value would make serialisation fail.
        if end is not None:
            theSentence.set('end', end)
        textElem.text = text
        # Elements lacking an id sort first instead of raising TypeError when
        # None is compared with a string.
        elements.sort(key=lambda child: (child.tag, child.get('id') or ""))
        for elem in elements:
            theSentence.append(elem)

        # clear() drops every child (the old sentences included) as well as
        # the root's attributes and text.
        root.clear()
        root.append(theSentence)
        # tree.write() does not create directories.
        os.makedirs(outDir, exist_ok=True)
        tree.write(os.path.join(outDir, file))

if __name__ == '__main__':
    # Command-line entry point. The description used to be copied verbatim
    # from another script; it now describes what run() in this file does.
    parser = argparse.ArgumentParser(
        description='This is a script to merge the SENTENCE elements of each '
                    'multi-sentence XML file into one sentence, written to the "out" sub-directory.')
    parser.add_argument(
        "--path",
        dest="path",
        required=True,
        help='Directory containing the XML files to fix')
    args = parser.parse_args()
    run(args.path)
Loading

0 comments on commit b4ec3d7

Please sign in to comment.