Skip to content

Commit

Permalink
Optimisations; extension statistics added to record header
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrj committed Jan 4, 2024
1 parent 1f79570 commit 5e70c8b
Show file tree
Hide file tree
Showing 3 changed files with 228 additions and 126 deletions.
94 changes: 76 additions & 18 deletions src/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,8 @@ def scan_rec(self, path, scan_like_data,filenames_set,check_dev=True,dev_call=No
self_scan_rec = self.scan_rec

filenames_set_add = filenames_set.add
self_header_ext_stats = self.header.ext_stats
self_header_ext_stats_size = self.header.ext_stats_size
try:
with scandir(path) as res:

Expand All @@ -402,7 +404,10 @@ def scan_rec(self, path, scan_like_data,filenames_set,check_dev=True,dev_call=No

is_dir,is_file,is_symlink = entry.is_dir(),entry.is_file(),entry.is_symlink()

self.ext_statistics[pathlib_Path(entry).suffix]+=1
ext=pathlib_Path(entry).suffix

if is_file:
self_header_ext_stats[ext]+=1

self.info_line_current = entry_name

Expand Down Expand Up @@ -450,6 +455,7 @@ def scan_rec(self, path, scan_like_data,filenames_set,check_dev=True,dev_call=No
else:
has_files = False
size = int(stat_res.st_size)
self_header_ext_stats_size[ext]+=size

local_folder_size += size

Expand Down Expand Up @@ -478,7 +484,8 @@ def scan(self,cde_list,check_dev=True):

self.header.sum_size = 0

self.ext_statistics=defaultdict(int)
self.header.ext_stats=defaultdict(int)
self.header.ext_stats_size=defaultdict(int)
self.scan_data={}

#########################
Expand Down Expand Up @@ -508,9 +515,6 @@ def scan(self,cde_list,check_dev=True):

self.info_line = ''

#for ext,stat in sorted(self.ext_statistics.items(),key = lambda x : x[1],reverse=True):
# print(ext,stat)

def prepare_customdata_pool_rec(self,scan_like_data,parent_path):
scan_path = self.header.scan_path
self_prepare_customdata_pool_rec = self.prepare_customdata_pool_rec
Expand Down Expand Up @@ -602,6 +606,13 @@ def threaded_cde(timeout_semi_list):
time_start_all = perf_counter()

aborted_string = 'Custom data extraction was aborted.'

files_cde_errors_quant = defaultdict(int)

files_cde_quant = 0
files_cde_size = 0
files_cde_size_extracted = 0

for (scan_like_list,subpath,rule_nr,size) in self.customdata_pool.values():

self.killed=False
Expand Down Expand Up @@ -629,9 +640,7 @@ def threaded_cde(timeout_semi_list):
subprocess = uni_popen(command,shell)
timeout_semi_list[0]=(timeout_val,subprocess)
except Exception as re:
#print('threaded_cde error:',re)
subprocess = None
timeout_semi_list[0]=(timeout_val,subprocess)
timeout_semi_list[0]=None
returncode=201
output = str(re)
else:
Expand All @@ -655,20 +664,21 @@ def threaded_cde(timeout_semi_list):
output_list_append('Killed.')

output = '\n'.join(output_list).strip()
if not output:
returncode=203

#####################################

time_end = perf_counter()
customdata_stats_time[rule_nr]+=time_end-time_start

if returncode or self.killed or aborted:
self_header.files_cde_errors_quant[rule_nr]+=1
self_header.files_cde_errors_quant_all+=1
files_cde_errors_quant[rule_nr]+=1

if not aborted:
self_header.files_cde_quant += 1
self_header.files_cde_size += size
self_header.files_cde_size_extracted += asizeof(output)
files_cde_quant += 1
files_cde_size += size
files_cde_size_extracted += asizeof(output)

new_elem={}
new_elem['cd_ok']= bool(returncode==0 and not self.killed and not aborted)
Expand All @@ -694,6 +704,13 @@ def threaded_cde(timeout_semi_list):

time_end_all = perf_counter()

self_header.files_cde_errors_quant=files_cde_errors_quant
self_header.files_cde_errors_quant_all = sum(files_cde_errors_quant.values())

self_header.files_cde_quant = files_cde_quant
self_header.files_cde_size = files_cde_size
self_header.files_cde_size_extracted = files_cde_size_extracted

customdata_stats_time_all[0]=time_end_all-time_start_all
sys.exit() #thread

Expand All @@ -709,7 +726,7 @@ def threaded_cde(timeout_semi_list):
kill_subprocess(subprocess)
self.killed=True
else:
sleep(0.2)
sleep(0.1)

cde_thread.join()

Expand Down Expand Up @@ -1142,6 +1159,28 @@ def prepare_info(self):
info_list.append(' ' + ' '.join(line_list))
self.txtinfo_basic = self.txtinfo_basic + f'\n\n{loaded_fs_info}\n{loaded_cd_info}'

try:
longest = max({len(ext) for ext in self.header.ext_stats})+2

sublist=[]
for ext,ext_stat in sorted(self.header.ext_stats.items(),key = lambda x : x[1],reverse=True):
sublist.append(f'{ext.ljust(longest)} {fnumber(ext_stat).rjust(12)} {bytes_to_str(self.header.ext_stats_size[ext]).rjust(12)}')
info_list.append('')
info_list.append('Files extensions statistics by quantity:')
info_list.append('========================================')
info_list.extend(sublist)

sublist_size=[]
for ext,ext_stat in sorted(self.header.ext_stats_size.items(),key = lambda x : x[1],reverse=True):
sublist_size.append(f'{ext.ljust(longest)} {bytes_to_str(self.header.ext_stats_size[ext]).rjust(12)} {fnumber(self.header.ext_stats[ext]).rjust(12)}')
info_list.append('')
info_list.append('Files extensions statistics by sum size:')
info_list.append('========================================')
info_list.extend(sublist_size)
except Exception as se:
#print(se)
pass

self.txtinfo = '\n'.join(info_list)

def has_cd(self):
Expand Down Expand Up @@ -1485,20 +1524,39 @@ def find_items_in_records(self,

records_to_process.sort(reverse = True,key = lambda x : x.header.quant_files)

params = (size_min,size_max,
t_min,t_max,
find_filename_search_kind,name_expr,name_case_sens,
find_cd_search_kind,cd_expr,cd_case_sens,
filename_fuzzy_threshold,cd_fuzzy_threshold)

searchinfofile = sep.join([self.db_dir,'searchinfo'])
try:
with open(searchinfofile, "wb") as f:
f.write(ZstdCompressor(level=8,threads=1).compress(dumps(params)))
except Exception as e:
print(e)

record_commnad_list={}
is_frozen = bool(getattr(sys, 'frozen', False))

for record_nr,record in enumerate(records_to_process):
curr_command_list = record_commnad_list[record_nr] = []

if windows:
if is_frozen:
curr_command_list = record_commnad_list[record_nr] = ['record.exe', 'load',record.file_path]
curr_command_list.append('record.exe')
else:
curr_command_list = record_commnad_list[record_nr] = ['python','src\\record.py', 'load',record.file_path]
curr_command_list.extend(['python','src\\record.py'])
else:
if is_frozen:
curr_command_list = record_commnad_list[record_nr] = ['./record', 'load',record.file_path]
curr_command_list.append('./record')
else:
curr_command_list = record_commnad_list[record_nr] = ['python3','./src/record.py', 'load',record.file_path]
curr_command_list.extend(['python3','./src/record.py'])

curr_command_list.extend(['search',record.file_path])

curr_command_list.append(searchinfofile)

if t_min:
curr_command_list.extend( ['--timestamp_min',str(t_min) ] )
Expand Down
10 changes: 9 additions & 1 deletion src/librer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3096,7 +3096,9 @@ def scan(self,compression_level):

self.last_dir = path_to_scan_from_entry

new_record = librer_core.create(self.scan_label_entry_var.get(),path_to_scan_from_entry)
new_label = self.scan_label_entry_var.get()

new_record = librer_core.create(new_label,path_to_scan_from_entry)

self.main_update()

Expand Down Expand Up @@ -3210,6 +3212,12 @@ def scan(self,compression_level):

#############################

try:
with open(sep.join([DATA_DIR,'scaninfo']), "wb") as f:
f.write(ZstdCompressor(level=8,threads=1).compress(dumps([new_label,path_to_scan_from_entry,check_dev,cde_list])))
except Exception as e:
print(e)

scan_thread=Thread(target=lambda : new_record.scan(tuple(cde_list),check_dev),daemon=True)
scan_thread.start()
scan_thread_is_alive = scan_thread.is_alive
Expand Down
Loading

0 comments on commit 5e70c8b

Please sign in to comment.