-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutil.py
65 lines (52 loc) · 1.94 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import io
import pandas as pd
import matplotlib.pyplot as plt
def get_en_idx(dataframe):
    """Return index labels of rows whose 'word1' and 'word2' both contain '/c/en/'.

    Args:
        dataframe: pandas DataFrame with string columns 'word1' and 'word2'.

    Returns:
        A list of index labels (in the DataFrame's order) for the rows where
        both columns contain the substring '/c/en/'.
    """
    marker = "/c/en/"
    # Literal substring match: the original regex only escaped the slashes, so
    # no regex features are needed. na=False makes missing cells count as
    # non-matches, keeping both masks strictly boolean before they are combined.
    word1_matches = dataframe['word1'].str.contains(marker, regex=False, na=False)
    word2_matches = dataframe['word2'].str.contains(marker, regex=False, na=False)
    return dataframe.index[word1_matches & word2_matches].tolist()
def concatenate_files(files_to_merge, out_path):
    """Merge several text files into one, tagging each line with its source file.

    Every line of every input file is stripped of surrounding whitespace and
    written to ``out_path`` followed by a tab and the name of the file it
    came from. Progress is printed to stdout.

    Args:
        files_to_merge: iterable of paths (strings) of the files to read.
        out_path: path of the UTF-8 output file; it is created or truncated.
    """
    with io.open(out_path, 'w+', encoding='utf8') as outfile:
        for fname in files_to_merge:
            print(fname)
            # Decode at the I/O boundary instead of calling .decode() on each
            # line: text-mode lines are already str on Python 3, where
            # str.decode does not exist. errors='ignore' keeps the original
            # intent of silently dropping undecodable bytes.
            with io.open(fname, 'r', encoding='utf-8', errors='ignore') as infile:
                for line in infile:
                    outfile.write(line.strip() + '\t' + fname + '\n')
    print('Merge Done')
def load_df(path_to_data_csv, sep="\t", columns=None, index=None, nrows=None):
    """Load a delimited text file into a pandas DataFrame.

    Example:
        g_orig = load_df(path_orig, columns=['word1', 'word2', 'score', 'sources', 'relation'])

    Args:
        path_to_data_csv: path or file-like object handed to ``pd.read_csv``.
        sep: field delimiter (tab by default).
        columns: optional list of column names. When given (and non-empty) it
            is passed as ``names``; otherwise ``names`` is not passed at all.
        index: forwarded as ``index_col``.
        nrows: forwarded as ``nrows`` to limit the number of rows read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # Everything after the path is passed by keyword: recent pandas versions
    # reject a positional ``sep``.
    read_options = {'sep': sep, 'index_col': index, 'nrows': nrows}
    if columns:
        # Only supply names when the caller provided some, so the no-columns
        # call is identical to not passing ``names`` at all.
        read_options['names'] = columns
    return pd.read_csv(path_to_data_csv, **read_options)
def plot_hist(input_data, bins=200, log_scale=False, title=None, out_folder=None, cut_title=True):
    """Plot a histogram of ``input_data`` and optionally save it as a PNG.

    Args:
        input_data: values handed to ``plt.hist``.
        bins: number of histogram bins.
        log_scale: when true, the y axis uses a logarithmic scale.
        title: optional text appended to the plot title 'Frequency Histogram '.
        out_folder: when given, the figure is saved there as ``<title>.png``
            at 400 dpi before being shown. The full ``title`` (not the cut
            label) is used as the file stem, so a title containing '/' is
            joined onto ``out_folder`` as a path.
        cut_title: when true, only the part of ``title`` after the last '/'
            is shown in the plot title.
    """
    plt.hist(input_data, bins=bins)
    if log_scale:
        plt.yscale('log')
    plt.grid(False)

    heading = 'Frequency Histogram'
    if title:
        label = title.split('/')[-1] if cut_title else title
        heading = 'Frequency Histogram ' + label
    plt.gca().set(title=heading, ylabel='Frequency')

    if out_folder:
        # Without a title there is nothing to build the file name from;
        # fall back to a fixed stem instead of failing on ``None + '.png'``.
        file_stem = title or 'histogram'
        plt.savefig(os.path.join(out_folder, file_stem + '.png'), dpi=400)
    plt.show()
def word_to_concept(word):
    """Build a '/c/en/' concept identifier from ``word``.

    The argument is converted with ``str`` and its spaces are replaced by
    underscores before the '/c/en/' prefix is prepended.
    """
    normalized = str(word).replace(' ', '_')
    return '/c/en/' + normalized
def word_to_rel(word):
    """Build a '/r/' relation identifier from ``word``.

    The argument is converted with ``str``; an empty string or a lone '-'
    is replaced by 'is'. Spaces become underscores and the '/r/' prefix is
    prepended.
    """
    text = str(word)
    if text == '' or text == '-':
        text = 'is'
    return '/r/' + text.replace(' ', '_')