-
Notifications
You must be signed in to change notification settings - Fork 0
/
statMaker.py
221 lines (182 loc) · 8.06 KB
/
statMaker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import json
import subprocess
import os
import arrow
import csv
import numpy
from tqdm import tqdm
from blamer import clean_checkout
from blame_source import get_status
from blame_source import Data
def _stat_or_blank(compute, *args):
    """Return ``compute(*args)``, or ``''`` if the statistic cannot be computed.

    ``ZeroDivisionError`` and ``ValueError`` are treated as "no data for this
    statistic", so an empty sample leaves a blank CSV cell instead of aborting
    the whole report.
    """
    try:
        return compute(*args)
    except (ValueError, ZeroDivisionError):
        return ''


def _elapsed(func, seconds):
    """Apply ``func`` to ``seconds`` and return the result as a ``timedelta``.

    Raises:
        ValueError: if ``seconds`` is empty.
    """
    from datetime import timedelta

    if not seconds:
        raise ValueError('no time differences were collected')
    return timedelta(seconds=float(func(seconds)))


def _mean(values):
    """Return the arithmetic mean of ``values`` (ZeroDivisionError if empty)."""
    return sum(values) / len(values)


def _git_log_one(commit, fmt):
    """Return the output of ``git log -n 1 --format=<fmt> <commit>`` as text.

    The command is run without a shell, in the current working directory.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    raw = subprocess.check_output(
        ['git', 'log', '-n', '1', '--format=' + fmt, commit]
    )
    return raw.decode('utf8', 'ignore')


def _marked_lines(bug):
    """Return the stripped source lines covered by the bug's marked node.

    Reads ``bug['bugFilePath']`` relative to the current directory. The number
    of lines is derived from the newlines found in the character range
    ``bugNodeStartChar`` .. ``bugNodeStartChar + bugNodeLength``, starting at
    the 1-based line ``bugLineNum``.
    """
    with open('./' + bug['bugFilePath'], 'r', errors='ignore') as fp:
        data = fp.read()
    start = bug['bugNodeStartChar']
    line_count = data[start:start + bug['bugNodeLength']].count('\n')
    stripped = [text.strip() for text in data.split('\n')]
    first = bug['bugLineNum'] - 1
    return stripped[first:first + line_count + 1]


def _blame_marked_lines(bug, blameData, lines):
    """Blame every marked line and summarise who wrote the block.

    Returns:
        A ``(same_author, same_commit)`` pair of booleans. ``same_author`` is
        true when the stub's author also appears on another marked line, or
        when no other author was recorded at all. ``same_commit`` is true when
        all blamed lines report the same first blame field.
    """
    # unique author fields of the lines that are not the stub itself
    authors = set()
    # unique leading blame fields (the commit hash) of all marked lines
    source_hash = set()
    stub_author = ''
    first_line = bug['bugLineNum']
    for offset, line in enumerate(lines):
        line_no = str(first_line + offset)
        blame = subprocess.check_output(
            ['git', 'blame', bug['bugFilePath'], '-L', line_no + ',' + line_no]
        ).decode('utf8', 'ignore')
        fields = blame.split(' ')
        current = fields[1]
        source_hash.add(fields[0])
        # the stub is the line that get_status can find for the given commit
        status = get_status(Data(
            project=bug['projectName'].split('.')[1],
            path=bug['bugFilePath'],
            blame=[blameData + ' ' + line]
        ), False).status[0]
        if status != 'not_found':
            stub_author = current
        else:
            authors.add(current)
    return (stub_author in authors or len(authors) == 0), len(source_hash) == 1


def main():
    """Collect statistics about stubs and write them to ``stub_stats.csv``.

    Reads the bug records from ``checkpoint.json`` in the current directory.
    For every entry of a record's ``blameData`` it enters
    ``./repos/<last dotted part of projectName>``, checks that commit out if
    needed, blames the marked lines, and compares the author and date of the
    introducing commit with those of ``fixCommitSHA1``.

    The time differences and the same-fixer flags are printed, and the
    aggregated statistics are written as ``statName``/``statVal`` rows. A
    statistic that has no data is written as an empty cell.
    """
    import sys

    BASEDIR = os.getcwd()
    # time difference in seconds between adding the stub and fixing it
    diffs = list()
    # whether the stub was fixed by the person who wrote it; kept in step with
    # diffs because stats_per_group pairs the two lists by position
    same_fixer = list()
    # whether the stub's author also wrote other lines of the marked block
    same_auth = list()
    # whether the entire block was written in one commit
    same_commit = list()
    # how many bugs have more than one possible source
    multiple_fix_counter = 0
    # how many stubs did not make it
    error_counter = 0
    # lengths of the stubs that span more than one line
    lengths = list()
    git_date_format = 'ddd MMM D HH:mm:ss YYYY Z'

    with open('checkpoint.json', 'r') as fp:
        bugData = json.load(fp)

    for bug in tqdm(bugData):
        try:
            repo_name = bug['projectName'].split('.')[-1]
            blame_entries = bug['blameData']
            if len(blame_entries) < 1:
                error_counter += 1
                continue
            if len(blame_entries) > 1:
                multiple_fix_counter += 1
            for blameData in set(blame_entries):
                if ' ' in blameData:
                    error_counter += 1
                    continue
                try:
                    os.chdir(f'./repos/{repo_name}')
                except FileNotFoundError:
                    error_counter += 1
                    continue
                try:
                    # skip the checkout when HEAD already names that commit;
                    # the substring test is two-way, presumably because either
                    # hash may be the abbreviated one
                    current_head = subprocess.getoutput('git rev-parse --short HEAD')
                    if not (current_head in blameData or blameData in current_head):
                        clean_checkout(blameData)

                    lines = _marked_lines(bug)
                    if len(lines) > 1:
                        lengths.append(len(lines))
                    author_flag, commit_flag = _blame_marked_lines(bug, blameData, lines)
                    same_auth.append(author_flag)
                    same_commit.append(commit_flag)

                    try:
                        # dates of the stub's addition and of its fix
                        add_date = _git_log_one(blameData, '%ad').strip()
                        fix_date = _git_log_one(bug['fixCommitSHA1'], '%ad').strip()
                        # second output line of the short format is compared
                        # NOTE(review): for a merge commit that line is
                        # "Merge: ..." rather than the author - confirm
                        add_auth = _git_log_one(blameData, 'short').split('\n')[1]
                        fix_auth = _git_log_one(bug['fixCommitSHA1'], 'short').split('\n')[1]
                    except subprocess.CalledProcessError:
                        continue
                    try:
                        # subtracting the parsed dates avoids Arrow's
                        # ``timestamp`` attribute, which is a property in old
                        # releases and a method in new ones
                        elapsed = (arrow.get(fix_date, git_date_format)
                                   - arrow.get(add_date, git_date_format))
                    except arrow.parser.ParserMatchError:
                        continue
                    # record both values together so the lists stay aligned
                    diffs.append(int(elapsed.total_seconds()))
                    same_fixer.append(add_auth == fix_auth)
                finally:
                    # the repository path above is relative to BASEDIR
                    os.chdir(BASEDIR)
        except Exception as exc:
            # best effort: a broken record must not stop the run, but say why
            tqdm.write(f'skipped bug record: {exc!r}', file=sys.stderr)
            continue

    print(diffs)
    print(same_fixer)

    summaries = (
        ('AVG', numpy.average),
        ('STD', numpy.std),
        ('MED', numpy.median),
    )
    rows = [
        (f'{prefix}_time', _stat_or_blank(_elapsed, func, diffs))
        for prefix, func in summaries
    ]
    rows.append(('Same_fixer_perc', _stat_or_blank(print_percent, same_fixer, 'fixer')))
    for label, group in (('same', True), ('diff', False)):
        for prefix, func in summaries:
            rows.append((
                f'{prefix}_time_{label}_fixer',
                _stat_or_blank(stats_per_group, diffs, same_fixer, group, func),
            ))
    rows.extend([
        ('Same_auth_perc', _stat_or_blank(print_percent, same_auth, 'auth')),
        ('Same_commmit_perc', _stat_or_blank(print_percent, same_commit, 'commit')),
        ('Multi_src', multiple_fix_counter),
        ('Error_num', error_counter),
        ('AVG_stub_len', _stat_or_blank(_mean, lengths)),
    ])

    # newline='' lets the csv module control line endings itself
    with open('stub_stats.csv', 'w', newline='') as fp:
        field_names = ['statName', 'statVal']
        writer = csv.DictWriter(fp, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(
            {'statName': stat_name, 'statVal': stat_val}
            for stat_name, stat_val in rows
        )
def print_percent(list, name):
    """Print and return the fraction of truthy entries in ``list``.

    Args:
        list: Sized iterable of flags. The name shadows the builtin but is
            kept because it is part of the function's interface.
        name: Label inserted into the printed message.

    Returns:
        The fraction (0.0 - 1.0) of entries that are truthy, or ``0.0`` when
        ``list`` is empty.
    """
    total = len(list)
    # an empty sample has no ratio; report 0.0 instead of dividing by zero
    list_perc = sum(1 for flag in list if flag) / total if total else 0.0
    print(f'same {name} perc: {list_perc * 100}%')
    return list_perc
def stats_per_group(diffs, same, group, func):
    """Summarise the time differences that belong to one group.

    Args:
        diffs: Time differences in seconds.
        same: Flags paired with ``diffs`` by position.
        group: Flag value selecting which differences to keep.
        func: Callable reducing the kept differences to one number of seconds
            (e.g. ``numpy.average``).

    Returns:
        ``datetime.timedelta`` holding the value ``func`` computed.

    Raises:
        ValueError: if ``func`` yields NaN, as the numpy reducers do for an
            empty selection.
    """
    from datetime import timedelta

    # pair by position: looking a value up with diffs.index() always finds its
    # first occurrence and so misfiles duplicate differences
    new_diffs = [d for d, flag in zip(diffs, same) if flag == group]
    return timedelta(seconds=float(func(new_diffs)))
# Run the statistics collection only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    main()