# utilFormat.py

import re
from pynlpl.formats import folia
import os

# Returns an in-memory list of tags (nothing is written to disk).
def conll2raw(tags):
    """Strip the chunk prefix from CoNLL-style tags.

    Every tag is split on '-'. When the split produces more than one piece
    the second piece is kept (e.g. 'B-LOC' -> 'LOC'); a tag without '-'
    (e.g. 'O') is passed through unchanged.

    :param tags: iterable of tag strings.
    :return: new list holding one raw tag per input tag, in order.
    """
    tags_with_pieces = ((whole, whole.split('-')) for whole in tags)
    return [
        pieces[1] if len(pieces) > 1 else whole
        for whole, pieces in tags_with_pieces
    ]

# Returns an in-memory list of tags (nothing is written to disk).
def stanford2raw(tags):
    """Shorten Stanford NER labels to their three-letter raw form.

    'LOCATION' becomes 'LOC', 'PERSON' becomes 'PER' and 'ORGANIZATION'
    becomes 'ORG'; every other tag is kept as it is.

    :param tags: iterable of tag strings.
    :return: new list with the converted tags, in order.
    """
    short_names = {
        'LOCATION': 'LOC',
        'PERSON': 'PER',
        'ORGANIZATION': 'ORG',
    }
    return [short_names.get(label, label) for label in tags]

# Returns an in-memory list of tags (nothing is written to disk).
def conll2stanford(tags):
    """Map CoNLL-style tags onto Stanford NER labels.

    A tag matching the LOC pattern becomes 'LOCATION', otherwise one matching
    the PER pattern becomes 'PERSON', otherwise one matching the ORG pattern
    becomes 'ORGANIZATION'. Tags matching none of the patterns are kept.

    :param tags: iterable of tag strings.
    :return: new list with the converted tags, in order.
    """
    # Checked in this order; the first pattern that matches decides the label.
    rules = (
        ('^.*LOC.*$', 'LOCATION'),
        ('^.*PER.*$', 'PERSON'),
        ('^.*ORG.*$', 'ORGANIZATION'),
    )
    converted = []
    for label in tags:
        for pattern, stanford_name in rules:
            if re.match(pattern, label):
                label = stanford_name
                break
        converted.append(label)
    return converted

# Returns in-memory lists (sentences, tokens, tags); nothing is written to disk.
def conll2sentences(testfile):
    """Read a token-per-line (CoNLL-style) file into sentences, tokens and tags.

    Each non-blank line is split on whitespace: the first field is the token
    and the last field is its tag. A blank line closes the current sentence
    and opens a new one, so a blank line at the end of the file leaves a
    trailing empty sentence in the result.

    :param testfile: path of the file to read.
    :return: list ``[sentences, all_tokens, actual_tags]`` where ``sentences``
        is a list of token lists, ``all_tokens`` is the flat token list and
        ``actual_tags`` is the flat tag list aligned with ``all_tokens``.
    """
    sentences = [[]]
    all_tokens = []
    actual_tags = []
    with open(testfile, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Lines holding only whitespace are separators too; indexing
                # their (empty) field list would raise IndexError.
                sentences.append([])
                continue
            sentences[-1].append(fields[0])
            all_tokens.append(fields[0])
            actual_tags.append(fields[-1])
    return [sentences, all_tokens, actual_tags]

# create error analysis file (token - folia tag - conll tag - predicted tag)
def createerroranalysisfile(sentences, actual_tags, intermediate_tags, pred_tags, erroranalysis_file_name='error_analysis_tokenperline.txt'):
    """Write a token-per-line error-analysis file.

    Each token line has the form
    ``token<TAB><TAB>actual<TAB>intermediate<TAB>predicted``; an empty line is
    written after every sentence. The three tag lists are flat and indexed by
    the running token position across all sentences.

    :param sentences: list of sentences, each a list of token strings.
    :param actual_tags: flat list of tags, one per token.
    :param intermediate_tags: flat list of tags, one per token.
    :param pred_tags: flat list of tags, one per token.
    :param erroranalysis_file_name: path of the file to (over)write.
    :raises IndexError: if a tag list is shorter than the number of tokens.
    """
    idx = 0
    # The context manager closes the file even when a tag lookup fails.
    with open(erroranalysis_file_name, 'w') as result_file:
        for sentence in sentences:
            for word in sentence:
                result_file.write(word + '\t\t' + actual_tags[idx] + '\t' + intermediate_tags[idx] + '\t' + pred_tags[idx] + '\n')
                idx += 1
            result_file.write('\n')

def createconllevalinputfile(sentences, actual_tags, pred_tags, conlleval_inputfile_name='conlleval_input'):
    """Write a space-separated ``token actual predicted`` file.

    One line is written per token and an empty line after every sentence.
    Both tag lists are flat and indexed by the running token position across
    all sentences.

    :param sentences: list of sentences, each a list of token strings.
    :param actual_tags: flat list of tags, one per token.
    :param pred_tags: flat list of tags, one per token.
    :param conlleval_inputfile_name: path of the file to (over)write.
    :raises IndexError: if a tag list is shorter than the number of tokens.
    """
    idx = 0
    # The context manager closes the file even when a tag lookup fails.
    with open(conlleval_inputfile_name, 'w') as result_file:
        for sentence in sentences:
            for word in sentence:
                result_file.write(word + ' ' + actual_tags[idx] + ' ' + pred_tags[idx] + '\n')
                idx += 1
            result_file.write('\n')

#################################################################################################
# intermediate func
def foliaclass2rawtag(e):
    """Translate an entity's class into a raw tag ('PER', 'LOC', 'ORG' or 'O').

    Only the ``cls`` and ``set`` attributes of ``e`` are read. Class 'name'
    maps to 'PER' when the set matches the Target pattern and to 'ORG' when it
    matches the Organizer pattern (Target is checked first). Otherwise
    'loc'/'place'/'place_pub' give 'LOC', 'pname' gives 'PER', 'fname' gives
    'ORG', and anything else gives 'O'.

    :param e: object exposing ``cls`` and ``set``.
    :return: the raw tag string.
    """
    entity_class = e.cls
    if re.match('^.*Target.*$', e.set):
        name_tag = 'PER'
    elif re.match('^.*Organizer.*$', e.set):
        name_tag = 'ORG'
    else:
        name_tag = None
    if name_tag is not None and entity_class == 'name':
        return name_tag

    # Set-independent classes.
    by_class = {
        'loc': 'LOC',
        'place': 'LOC',
        'place_pub': 'LOC',
        'pname': 'PER',
        'fname': 'ORG',
    }
    return by_class.get(entity_class, 'O')

# intermediate func
def foliaclass2stanfordtag(e):
    """Translate an entity's class into a Stanford NER label.

    Only the ``cls`` and ``set`` attributes of ``e`` are read. Class 'name'
    maps to 'PERSON' when the set matches the Target pattern and to
    'ORGANIZATION' when it matches the Organizer pattern (Target is checked
    first). Otherwise 'loc'/'place'/'place_pub' give 'LOCATION', 'pname'
    gives 'PERSON', 'fname' gives 'ORGANIZATION', and anything else gives 'O'.

    :param e: object exposing ``cls`` and ``set``.
    :return: the Stanford label string.
    """
    kind = e.cls
    in_target_set = re.match('^.*Target.*$', e.set)
    if in_target_set:
        if kind == 'name':
            return 'PERSON'
    else:
        in_organizer_set = re.match('^.*Organizer.*$', e.set)
        if in_organizer_set and kind == 'name':
            return 'ORGANIZATION'

    if kind in ('loc', 'place', 'place_pub'):
        return 'LOCATION'
    if kind == 'pname':
        return 'PERSON'
    return 'ORGANIZATION' if kind == 'fname' else 'O'

# Returns in-memory lists (sentences, tokens, tags); nothing is written to disk.
def folia2sentences(path, tagFormat):
    """Read every FoLiA document in a folder into token and tag lists.

    Each word is recorded once (by its id) with the default tag 'O'; words
    whose text is '<P>' are dropped. Entities found in the sentence's entity
    layers then overwrite the tag of the words they reference.

    :param path: folder containing FoLiA files. When ``path`` is not a
        directory a TODO notice is printed and empty lists are returned.
    :param tagFormat: 'stanford' or 'raw'. 'conll' is not implemented.
    :return: list ``[sentences_as_tokens, all_tokens, actual_tags]`` where
        ``sentences_as_tokens`` is a list of token lists, ``all_tokens`` is
        the flat token list and ``actual_tags`` the tag list aligned with it.
    :raises NotImplementedError: when an entity has to be tagged and
        ``tagFormat`` is 'conll'.
    :raises ValueError: when an entity has to be tagged and ``tagFormat`` is
        not a known format.
    """
    sentences_as_tokens = []
    all_tokens = []
    actual_tags = []
    if not os.path.isdir(path):
        print("TODO: Handling of a single Folia file instead of a folder of Folia files.")
        return [sentences_as_tokens, all_tokens, actual_tags]

    if tagFormat == 'stanford':
        entity2tag = foliaclass2stanfordtag
    elif tagFormat == 'raw':
        entity2tag = foliaclass2rawtag
    else:
        # Reported only once an entity actually needs a tag (see below).
        entity2tag = None

    # word id -> position in all_tokens / actual_tags; doubles as the
    # "already seen" check, giving O(1) membership tests.
    id2idx = {}
    for filename in os.listdir(path):
        doc = folia.Document(file=os.path.join(path, filename))
        for sentence in doc.sentences():
            sentence_tokens = []
            for word in sentence.select(folia.Word):
                w_id = word.id
                w_text = word.text()
                if w_id in id2idx or w_text == '<P>':
                    continue
                id2idx[w_id] = len(all_tokens)
                actual_tags.append('O')
                sentence_tokens.append(w_text)
                all_tokens.append(w_text)

            sentences_as_tokens.append(sentence_tokens)
            for layer in sentence.select(folia.EntitiesLayer):
                for entity in layer.select(folia.Entity):
                    for word in entity.wrefs():
                        if entity2tag is None:
                            # Fail with a clear error instead of tripping
                            # over an unassigned tag variable.
                            if tagFormat == 'conll':
                                print('TODO: reuse codes that output files to output objects instead.')
                                raise NotImplementedError("tagFormat 'conll' is not implemented")
                            raise ValueError('unknown tagFormat: %r' % (tagFormat,))
                        _idx = id2idx.get(word.id)
                        if _idx is None:
                            # The word was never indexed (e.g. a dropped
                            # '<P>' token), so there is no slot to tag.
                            continue
                        actual_tags[_idx] = entity2tag(entity)

    return [sentences_as_tokens, all_tokens, actual_tags]

#intermediate func
def tag(type, w_nu, prev_type):
    """Prefix an entity type with 'I-' or 'B-'.

    'B-' is produced only when the previous type is one of 'LOC', 'ORG' or
    'PER', equals ``type``, and ``w_nu`` is not greater than zero. Every
    other combination yields 'I-'.

    :param type: entity type string to prefix.
    :param w_nu: number compared against zero (only consulted when the
        previous type equals ``type``).
    :param prev_type: type of the preceding entity, or None.
    :return: the prefixed tag string.
    """
    inside = 'I-' + type
    if prev_type is None or prev_type not in ('LOC', 'ORG', 'PER'):
        return inside
    if type != prev_type:
        return inside
    if w_nu > 0:
        return inside
    return 'B-' + type

def getfoliatag(e):
    """Return the label written for an entity in the token-per-line output.

    The label is the entity's ``cls``, except that class 'name' becomes
    'target_name' when ``e.set`` matches the Target pattern and
    'organizer_name' when it matches the Organizer pattern (Target is
    checked first).

    :param e: object exposing ``cls`` and ``set``.
    :return: the label string.
    """
    label = e.cls
    if re.match('^.*Target.*$', e.set):
        qualified = 'target_name'
    elif re.match('^.*Organizer.*$', e.set):
        qualified = 'organizer_name'
    else:
        qualified = None

    if qualified is not None and label == 'name':
        return qualified
    return label

# intermediate func
def foliaclass2conlltag(e):
    """Translate an entity's class into the tag written to the CoNLL file.

    Only the ``cls`` and ``set`` attributes of ``e`` are read. Class 'name'
    maps to 'PER' when the set matches the Target pattern and to 'ORG' when it
    matches the Organizer pattern (Target is checked first). Otherwise
    'loc'/'place'/'place_pub' give 'LOC', 'pname' gives 'PER', 'fname' gives
    'ORG', and anything else gives 'O'.

    :param e: object exposing ``cls`` and ``set``.
    :return: the tag string (no 'B-'/'I-' prefix is added here).
    """
    folia_class = e.cls
    is_plain_name = folia_class == 'name'
    if re.match('^.*Target.*$', e.set):
        if is_plain_name:
            return 'PER'
    elif re.match('^.*Organizer.*$', e.set):
        if is_plain_name:
            return 'ORG'

    for classes, conll_tag in (
        (('loc', 'place', 'place_pub'), 'LOC'),
        (('pname',), 'PER'),
        (('fname',), 'ORG'),
    ):
        if folia_class in classes:
            return conll_tag
    return 'O'

def hasEvent(sentence):
    """Tell whether any entity in the sentence's entity layers has class 'etype'.

    :param sentence: object whose ``select`` method yields the entity layers.
    :return: True as soon as one such entity is found, otherwise False.
    """
    return any(
        entity.cls == 'etype'
        for layer in sentence.select(folia.EntitiesLayer)
        for entity in layer.select(folia.Entity)
    )

# intermediate func
def doc2conll(fp, sentences, ids, id2token, id2tag, id2ftag, idx, idx2id, id2idx, conllfile, folia_tokenperline_file, capitalize=True, onlysentenceswithevents=False):
    """Append one FoLiA document to the CoNLL and token-per-line output files.

    For every sentence, each word not yet registered in ``ids`` (and whose
    text is not '<P>') is registered with tag 'O'. Entities in the sentence's
    entity layers then set the CoNLL tag and FoLiA label of the words they
    reference, unless a word already carries 'LOC', 'PER' or 'ORG'. Finally
    one ``token tag`` line per sentence word is written to each output file,
    followed by an empty line.

    The containers ``sentences``, ``ids``, ``id2token``, ``id2tag``,
    ``id2ftag``, ``idx2id`` and ``id2idx`` are updated in place so state can
    be shared across documents.

    :param fp: path of the FoLiA file.
    :param sentences: list receiving one list of word ids per sentence.
    :param ids: list of word ids registered so far.
    :param id2token: dict word id -> token text.
    :param id2tag: dict word id -> CoNLL tag.
    :param id2ftag: dict word id -> FoLiA label.
    :param idx: last running word index handed out so far (-1 for none).
    :param idx2id: dict running index -> word id.
    :param id2idx: dict word id -> running index.
    :param conllfile: writable file object for the CoNLL output.
    :param folia_tokenperline_file: writable file object for the FoLiA
        token-per-line output.
    :param capitalize: when true, tokens whose tag is not 'O' are written
        through ``str.capitalize()``.
    :param onlysentenceswithevents: when true, sentences for which
        ``hasEvent`` is false are skipped entirely.
    :return: the last running index assigned, so callers can pass it on to
        the next call.
    """
    # ``idx`` arrives by value, so a caller that reuses the same start value
    # for every document would make indices collide; continue after the
    # highest index already recorded instead.
    if idx2id:
        idx = max(idx, max(idx2id))
    # Set copy of ``ids`` for O(1) membership tests; kept in sync below.
    seen = set(ids)

    doc = folia.Document(file=fp)
    for sentence in doc.sentences():
        if onlysentenceswithevents and not hasEvent(sentence):
            continue
        sentence_tokens = []  # sentence as token ids

        # First tag every word of the sentence with 'O';
        # skip words that have already been registered.
        for word in sentence.select(folia.Word):
            w_id = word.id
            w_text = word.text()
            if w_id in seen or w_text == '<P>':
                continue
            idx += 1
            sentence_tokens.append(w_id)
            id2token[w_id] = w_text
            id2tag[w_id] = 'O'
            id2ftag[w_id] = 'O'
            ids.append(w_id)
            seen.add(w_id)
            idx2id[idx] = w_id
            id2idx[w_id] = idx

        sentences.append(sentence_tokens)
        for layer in sentence.select(folia.EntitiesLayer):
            for entity in layer.select(folia.Entity):
                for word in entity.wrefs():
                    word_id = word.id
                    pasttag = id2tag.get(word_id)
                    if pasttag is None:
                        # Word was never registered (e.g. a dropped '<P>'
                        # token): nothing to tag.
                        continue
                    # The first LOC/PER/ORG assignment wins.
                    if pasttag not in ['LOC', 'PER', 'ORG']:
                        id2tag[word_id] = foliaclass2conlltag(entity)
                        id2ftag[word_id] = getfoliatag(entity)

        for _id in sentence_tokens:
            if capitalize and id2tag[_id] != 'O':
                # NOTE(review): str.capitalize() also lower-cases the rest of
                # the token -- confirm that is intended.
                token = id2token[_id].capitalize()
            else:
                token = id2token[_id]
            conllfile.write(token + " " + id2tag[_id] + "\n")
            folia_tokenperline_file.write(token + " " + id2ftag[_id] + "\n")

        conllfile.write("\n")
        folia_tokenperline_file.write("\n")

    return idx

# creates folia-tokenperline AND folia_as_conll_tags files.
def folia2conll(flpath, opath, fopath, capitalize=True, onlysentenceswithevents=False):
    """Convert FoLiA input into a CoNLL file and a FoLiA token-per-line file.

    ``flpath`` may be a folder (every entry in it is passed to ``doc2conll``)
    or a single file. Both output files are opened for writing before the
    input is listed and are closed when the function returns or raises.

    :param flpath: FoLiA folder or file.
    :param opath: path of the CoNLL output file (overwritten).
    :param fopath: path of the FoLiA token-per-line output file (overwritten).
    :param capitalize: forwarded to ``doc2conll``.
    :param onlysentenceswithevents: forwarded to ``doc2conll``.
    """
    sentences = []  # A sentence is a list of token ids.
    ids = []
    id2token = {}
    id2tag = {}  # Id to conll tag.
    id2ftag = {}  # Id to folia tag.
    idx2id = {}
    id2idx = {}
    idx = -1

    # The context manager closes both outputs even if a document fails.
    with open(opath, 'w') as conll_file, open(fopath, 'w') as folia_tokenperline_file:
        if os.path.isdir(flpath):
            fpaths = [os.path.join(flpath, filename) for filename in os.listdir(flpath)]
        else:
            fpaths = [flpath]
        for fpath in fpaths:
            doc2conll(fpath, sentences, ids, id2token, id2tag, id2ftag, idx, idx2id, id2idx, conll_file, folia_tokenperline_file, capitalize, onlysentenceswithevents)

    print('Folia docs are converted to conll format')
    print('Folia docs are converted to token-per-line format')

# assumes files are of same number of lines.
def merge2_tokenperline_files(f1,f2,of):
    """Merge two token-per-line files into ``token tag1 tag2`` lines.

    Lines are paired by position. The token and first tag column come from
    ``f1`` (everything after the token, surrounding whitespace stripped); the
    second tag column is everything after the token on the matching line of
    ``f2``. A blank line in ``f1`` is copied as a blank line so sentence
    boundaries are preserved.

    :param f1: path of the first input file; it drives the iteration.
    :param f2: path of the second input file; extra trailing lines are ignored.
    :param of: path of the output file (overwritten).
    :raises IndexError: if ``f2`` has fewer lines than ``f1`` or a non-blank
        line lacks a tag column.
    """
    with open(f1, 'r') as in1, open(f2, 'r') as in2, open(of, 'w') as outfile:
        lines2 = in2.readlines()
        for idx, line1 in enumerate(in1):
            line1stripped = line1.strip()
            line2stripped = lines2[idx].strip()
            if not line1stripped:
                # Sentence separator: emit an empty line rather than
                # repeating the previous merged line.
                outfile.write('\n')
                continue
            ln1 = line1stripped.split(None, 1)
            ln2 = line2stripped.split(None, 1)
            outfile.write(ln1[0] + ' ' + ln1[1] + ' ' + ln2[1] + '\n')

'''m1 = 'india_tokenperline_cap.txt'
m2 = 'india_conll_cap.txt'
m1m2 = 'india_tokenperline_conll_cap.txt'
merge2_tokenperline_files(m1, m2, m1m2)'''

'''
folia_folder = './foliadocs/alladjudicated'
folia_file = './foliadocs/alladjudicated/' \
             'https__timesofindia.indiatimes.com_business_india-business_BSNL-Employees-Union-protests-against-disinvestment_articleshow_972751.folia.xml'

outfile = './india_conll_cap_evt.txt'
foutfile = './india_tokenperline_cap_evt.txt'
capitalize = True
onlysentenceswithevents=True
folia2conll(folia_folder, outfile, foutfile, capitalize, onlysentenceswithevents)'''