From 8116820fc5a28cbbd8968ba522053f68c71da01a Mon Sep 17 00:00:00 2001
From: MaximilianVovk
Date: Thu, 17 Oct 2024 22:23:35 -0400
Subject: [PATCH 1/2] Add an AutoRefineFit option to not show plots; add an if
 case in the GUI to read the 'const' data in generated JSON files; fix
 GenerateSimulations to create simulations according to the set magnitude
 given as input, and fix reading of P_0m (P_0M was read instead); add
 EMCCD_PhysProp_GenSym, which generates simulations for a given MetSim result
 to find more possible solutions and define an uncertainty

---
 wmpl/MetSim/AutoRefineFit.py            |   36 +-
 wmpl/MetSim/GUI.py                      |    5 +
 wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py | 5346 +++++++++++++++++++++++
 wmpl/MetSim/ML/GenerateSimulations.py   |  212 +-
 4 files changed, 5536 insertions(+), 63 deletions(-)
 create mode 100644 wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py

diff --git a/wmpl/MetSim/AutoRefineFit.py b/wmpl/MetSim/AutoRefineFit.py
index 6c4a4b2f..65763274 100644
--- a/wmpl/MetSim/AutoRefineFit.py
+++ b/wmpl/MetSim/AutoRefineFit.py
@@ -20,7 +20,7 @@ from wmpl.Utils.Pickling import loadPickle
 
 
-def costFunc(traj, met_obs, sr, mag_sigma, len_sigma, plot_residuals=False):
+def costFunc(traj, met_obs, sr, mag_sigma, len_sigma, plot_residuals=False, hideplots=False):
     """ Compute the difference between the simulated and the observed meteor.
 
     Arguments:
@@ -221,14 +221,21 @@ def costFunc(traj, met_obs, sr, mag_sigma, len_sigma, plot_residuals=False):
 
         plt.tight_layout()
 
         ax_mag.legend()
 
-        plt.show()
+
+        # Show the plot only if hideplots is False
+        if not hideplots:
+            plt.show()
+
+        else:
+            plt.clf()
+            plt.close()
 
 
     return mag_res, len_res, cost
 
 
-def residualFun(params, fit_options, traj, met_obs, const, change_update_params):
+def residualFun(params, fit_options, traj, met_obs, const, change_update_params, hideplots=False):
     """ Take the fit parameters and return the value of the cost function.
 
     Arguments:
@@ -283,7 +290,7 @@ def residualFun(params, fit_options, traj, met_obs, const, change_update_params)
     mag_sigma, len_sigma = fit_options["mag_sigma"], fit_options["len_sigma"]
 
     # Compute the cost function
-    mag_res, len_res, cost = costFunc(traj, met_obs, sr, mag_sigma, len_sigma, plot_residuals=False)
+    mag_res, len_res, cost = costFunc(traj, met_obs, sr, mag_sigma, len_sigma, plot_residuals=False, hideplots=hideplots)
 
     print("Magnitude residual: {:f}".format(mag_res))
     print("Length residual: {:f}".format(len_res))
@@ -294,7 +301,7 @@ def residualFun(params, fit_options, traj, met_obs, const, change_update_params)
 
 
-def autoFit(fit_options, traj, met_obs, const):
+def autoFit(fit_options, traj, met_obs, const, hideplots=False):
     """ Automatically fit the parameters to the observations.
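+
+    Keyword arguments:
+        hideplots: [bool] If True, figures generated during the fit are closed
+            instead of shown on screen, so the fit can run unattended. Set from
+            the command line with the -x/--hideplots flag. False by default.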
""" @@ -460,7 +467,7 @@ def autoFit(fit_options, traj, met_obs, const): # Run the optimization using Nelder-Mead res = scipy.optimize.minimize(residualFun, x0, args=(fit_options, traj, met_obs, const, - change_update_params), + change_update_params, hideplots), method="Nelder-Mead", bounds=bounds) # Extract the optimized parameters into Constants @@ -543,6 +550,9 @@ def loadFitOptions(dir_path, file_name): arg_parser.add_argument('--updated', action='store_true', \ help="Load the updated simulation JSON file insted of the original one.") + + arg_parser.add_argument('-x', '--hideplots', \ + help="Don't show generated plots on the screen, just save them to disk.", action="store_true") # Parse the command line arguments cml_args = arg_parser.parse_args() @@ -749,7 +759,7 @@ def loadFitOptions(dir_path, file_name): print("Done!") # Cost function test - costFunc(traj, met_obs, sr, fit_options["mag_sigma"], fit_options["len_sigma"], plot_residuals=True) + costFunc(traj, met_obs, sr, fit_options["mag_sigma"], fit_options["len_sigma"], plot_residuals=True, hideplots=cml_args.hideplots) # Go though all the fit sets @@ -766,7 +776,7 @@ def loadFitOptions(dir_path, file_name): print() print("#"*80) print("Auto fitting...") - const = autoFit(fit_options, traj, met_obs, const) + const = autoFit(fit_options, traj, met_obs, const, hideplots=cml_args.hideplots) print("Fitting done!") @@ -782,7 +792,7 @@ def loadFitOptions(dir_path, file_name): # Cost function test - costFunc(traj, met_obs, sr, fit_options["mag_sigma"], fit_options["len_sigma"], plot_residuals=True) + costFunc(traj, met_obs, sr, fit_options["mag_sigma"], fit_options["len_sigma"], plot_residuals=True, hideplots=cml_args.hideplots) @@ -968,6 +978,12 @@ def loadFitOptions(dir_path, file_name): plt.tight_layout() plt.subplots_adjust(wspace=0.00) - plt.show() + # Show the plot only if hideplots is False + if not cml_args.hideplots: + plt.show() + + else: + plt.clf() + plt.close() ### ### \ No newline at end of file diff --git a/wmpl/MetSim/GUI.py b/wmpl/MetSim/GUI.py index 73da4150..89364523 100644 --- a/wmpl/MetSim/GUI.py +++ b/wmpl/MetSim/GUI.py @@ -1478,6 +1478,11 @@ def loadConstants(sim_fit_json): for key in const_json: setattr(const, key, const_json[key]) + if 'const' in const_json: + # Open the constants parameter part of .json file for simulaitons + for key in const_json['const']: + setattr(const, key, const_json['const'][key]) + if 'fragmentation_entries' in const_json: diff --git a/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py b/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py new file mode 100644 index 00000000..0a713fca --- /dev/null +++ b/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py @@ -0,0 +1,5346 @@ +""" +The code is used to extract the physical properties of the simulated showers from EMCCD observations +by selecting the most similar simulated events in the PC space using: +- Mode of the siumulated events +- The min of the KDE esults +- Principal Component Regression (PCR) +""" + +import json +import copy +import re +import pandas as pd +import matplotlib.pyplot as plt +from numpy.linalg import inv +import numpy as np +import subprocess +import glob +import os +import pickle +import seaborn as sns +import scipy.spatial.distance +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler +import wmpl +import shutil +from scipy.stats import kurtosis, skew +from wmpl.Utils.OSTools import mkdirP +from matplotlib.ticker import ScalarFormatter +import math +from scipy.stats import gaussian_kde +from 
+from scipy.linalg import svd
+from wmpl.MetSim.GUI import loadConstants, saveConstants,SimulationResults
+from wmpl.MetSim.MetSimErosion import runSimulation, Constants, zenithAngleAtSimulationBegin
+from scipy.interpolate import interp1d
+from matplotlib.colors import Normalize
+from scipy.optimize import minimize
+import scipy.optimize as opt
+import sys
+from scipy.stats import zscore
+import scipy.spatial
+from sklearn.linear_model import LinearRegression
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from wmpl.Utils.AtmosphereDensity import fitAtmPoly
+from sklearn.cluster import KMeans
+import scipy.stats as stats
+from sklearn.preprocessing import PowerTransformer
+from wmpl.MetSim.ML.GenerateSimulations import generateErosionSim,saveProcessedList,MetParam
+from wmpl.Utils.TrajConversions import J2000_JD, date2JD
+import warnings
+import itertools
+import time
+from multiprocessing import Pool
+
+
+# CONSTANTS ###########################################################################################
+
+FPS = 32
+NAME_SUFX_GENSIM = "_GenSim"
+NAME_SUFX_CSV_OBS = "_obs.csv"
+NAME_SUFX_CSV_SIM = "_sim.csv"
+NAME_SUFX_CSV_SIM_NEW = "_sim_new.csv"
+NAME_SUFX_CSV_CURRENT_FIT = "_fit_sim.csv"
+NAME_SUFX_CSV_PHYSICAL_FIT_RESULTS = "_physical_prop.csv"
+
+SAVE_SELECTION_FOLDER='Selection'
+VAR_SEL_DIR_SUFX = '_sel_var_vs_physProp'
+PCA_SEL_DIR_SUFX = '_sel_PCA_vs_physProp'
+SAVE_RESULTS_FOLDER='Results'
+SAVE_RESULTS_FOLDER_EVENTS_PLOTS='Results'+os.sep+'events_plots'
+
+# sigma value of the RMSD that is considered to select a good fit
+SIGMA_ERR = 1 # 1.96 # 95CI
+MAG_RMSD = 0.25
+# MAG_RMSD = 0.25 # for heavy
+# MAG_RMSD = 0.20 # for steep fast
+# MAG_RMSD = 0.15 # for shallow slow
+# MAG_RMSD = 0.05 # for small
+
+LEN_RMSD = 0.04 # 0.02
+# LEN_RMSD = 0.04
+# MAG_RMSD = 0.08
+# LEN_RMSD = 0.04 # 0.025
+
+# # Calculate the cumulative probability for the z-value, the confidence level is the percentage of the area within ±z_value
+# CONFIDENCE_LEVEL = (2 * stats.norm.cdf(SIGMA_ERR) - 1)*100
+
+# Length of data that will be used as an input during training
+DATA_LENGTH = 256
+# Default number of minimum frames for simulation
+MIN_FRAMES_VISIBLE = 4
+
+# python -m EMCCD_PCA_Shower_PhysProp "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation\TEST" "PER" "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation" 1000
+# python -m EMCCD_PCA_Shower_PhysProp "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation\TEST" "PER" "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation" 1000 > output.txt
+
+# FUNCTIONS ###########################################################################################
+
+# Create a txt file where everything that is printed to stdout is also saved
+class Logger(object):
+    def __init__(self, directory=".", filename="log.txt"):
+        self.terminal = sys.stdout
+        # Ensure the directory exists
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+        # Combine the directory and filename to create the full path
+        filepath = os.path.join(directory, filename)
+        self.log = open(filepath, "a")
+
+    def write(self, message):
+        self.terminal.write(message)
+        self.log.write(message)
+
+    def flush(self):
+        # This might be necessary as stdout could call flush
+        self.terminal.flush()
+
+    def close(self):
+        # Close the log file when done
+        self.log.close()
+
+
+def find_closest_index(time_arr, time_sampled):
+    closest_indices = []
+    for sample in time_sampled:
+        closest_index = min(range(len(time_arr)), key=lambda i: abs(time_arr[i] - sample))
+        closest_indices.append(closest_index)
+    return closest_indices
+
+
+def cubic_lag(t, a, b, c, t0):
+    """
+    Cubic lag function.
+    """
+
+    # Only take times <= t0
+    t_before = t[t <= t0]
+
+    # Only take times > t0
+    t_after = t[t > t0]
+
+    # Keep the lag constant before t0
+    l_before = np.zeros_like(t_before)+c
+
+    # Compute the lag cubically after t0
+    l_after = -abs(a)*(t_after - t0)**3 - abs(b)*(t_after - t0)**2 + c
+
+    return np.concatenate((l_before, l_after))
+
+
+def cubic_velocity(t, a, b, v0, t0):
+    """
+    Quadratic velocity function (first derivative of the cubic lag).
+    """
+
+    # Only take times <= t0
+    t_before = t[t <= t0]
+
+    # Only take times > t0
+    t_after = t[t > t0]
+
+    # Keep the velocity constant before t0
+    v_before = np.ones_like(t_before)*v0
+
+    # Compute the velocity quadratically after t0
+    v_after = -3*abs(a)*(t_after - t0)**2 - 2*abs(b)*(t_after - t0) + v0
+
+    return np.concatenate((v_before, v_after))
+
+
+def cubic_acceleration(t, a, b, t0):
+    """
+    Linear acceleration function (second derivative of the cubic lag).
+    """
+
+    # Only take times <= t0
+    t_before = t[t <= t0]
+
+    # Only take times > t0
+    t_after = t[t > t0]
+
+    # No deceleration before t0
+    a_before = np.zeros_like(t_before)
+
+    # Compute the acceleration linearly after t0
+    a_after = -6*abs(a)*(t_after - t0) - 2*abs(b)
+
+    return np.concatenate((a_before, a_after))
+
+
+def lag_residual(params, t_time, l_data):
+    """
+    Residual function for the optimization.
+    """
+
+    return np.sum((l_data - cubic_lag(t_time, *params))**2)
+
+
+def vel_residual(params, t_time, l_data):
+    """
+    Residual function for the optimization.
+ """ + + return np.sum((l_data - cubic_velocity(t_time, *params))**2) + + +def fit_mag_polin2_RMSD(data_mag, time_data): + + # Select the data up to the minimum value + x1 = time_data[:np.argmin(data_mag)] + y1 = data_mag[:np.argmin(data_mag)] + + # Fit the first parabolic curve + coeffs1 = np.polyfit(x1, y1, 2) + fit1 = np.polyval(coeffs1, x1) + + # Select the data from the minimum value onwards + x2 = time_data[np.argmin(data_mag):] + y2 = data_mag[np.argmin(data_mag):] + + # Fit the second parabolic curve + coeffs2 = np.polyfit(x2, y2, 2) + fit2 = np.polyval(coeffs2, x2) + + # concatenate fit1 and fit2 + fit1=np.concatenate((fit1, fit2)) + + residuals_pol = data_mag - fit1 + # avg_residual_pol = np.mean(abs(residuals_pol)) + rmsd_pol = np.sqrt(np.mean(residuals_pol**2)) + + return fit1, residuals_pol, rmsd_pol,'Polinomial Fit' + + +def fit_lag_t0_RMSD_old(lag_data,time_data,velocity_data): + v_init=velocity_data[0] + # initial guess of deceleration decel equal to linear fit of velocity + p0 = [np.mean(lag_data), 0, 0, np.mean(time_data)] + opt_res = opt.minimize(lag_residual, p0, args=(np.array(time_data), np.array(lag_data)), method='Nelder-Mead') + a_t0, b_t0, c_t0, t0 = opt_res.x + fitted_lag_t0 = cubic_lag(np.array(time_data), a_t0, b_t0, c_t0, t0) + + opt_res_vel = opt.minimize(vel_residual, [a_t0, b_t0, v_init, t0], args=(np.array(time_data), np.array(velocity_data)), method='Nelder-Mead') + a_t0, b_t0, v_init_new, t0 = opt_res_vel.x # problem with the small time + fitted_vel_t0 = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init, t0) + + fitted_acc_t0 = cubic_acceleration(np.array(time_data), a_t0, b_t0, t0) + residuals_t0 = lag_data - fitted_lag_t0 + rmsd_t0 = np.sqrt(np.mean(residuals_t0 ** 2)) + + return fitted_lag_t0, residuals_t0, rmsd_t0, 'Cubic Fit', fitted_vel_t0, fitted_acc_t0 + +def fit_lag_t0_RMSD(lag_data, time_data, velocity_data): + v_init = velocity_data[0] + # initial guess of deceleration decel equal to linear fit of velocity + p0 = [np.mean(lag_data), 0, 0, np.mean(time_data)] + opt_res = opt.minimize(lag_residual, p0, args=(np.array(time_data), np.array(lag_data)), method='Nelder-Mead') + a_t0, b_t0, c_t0, t0 = opt_res.x + fitted_lag_t0 = cubic_lag(np.array(time_data), a_t0, b_t0, c_t0, t0) + + # Optimize velocity residual based on initial guess from lag residual + opt_res_vel = opt.minimize(vel_residual, [a_t0, b_t0, v_init, t0], args=(np.array(time_data), np.array(velocity_data)), method='Nelder-Mead') + a_t0_vel, b_t0_vel, v_init_vel, t0_vel = opt_res_vel.x + fitted_vel_t0_vel = cubic_velocity(np.array(time_data), a_t0_vel, b_t0_vel, v_init_vel, t0_vel) + + # # Compute fitted velocity from original lag optimization + # fitted_vel_t0_lag = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init, t0) + + # Compute fitted velocity from original lag optimization + fitted_vel_t0_lag = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init_vel, t0) + + # # Compute fitted velocity from original lag optimization + # fitted_vel_t0_lag_vel = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init_vel, t0) + + # Calculate residuals + residuals_vel_vel = velocity_data - fitted_vel_t0_vel + residuals_vel_lag = velocity_data - fitted_vel_t0_lag + + rmsd_vel_vel = np.sqrt(np.mean(residuals_vel_vel ** 2)) + rmsd_vel_lag = np.sqrt(np.mean(residuals_vel_lag ** 2)) + + # Choose the best fitted velocity based on RMSD + if rmsd_vel_vel < rmsd_vel_lag: + best_fitted_vel_t0 = fitted_vel_t0_vel + best_a_t0, best_b_t0, best_t0 = a_t0_vel, b_t0_vel, t0_vel + else: + 
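+        # The velocity-based refit did not lower the RMSD, so keep the
+        # coefficients from the original lag optimization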
best_fitted_vel_t0 = fitted_vel_t0_lag + best_a_t0, best_b_t0, best_t0 = a_t0, b_t0, t0 + + fitted_acc_t0 = cubic_acceleration(np.array(time_data), best_a_t0, best_b_t0, best_t0) + residuals_t0 = lag_data - fitted_lag_t0 + rmsd_t0 = np.sqrt(np.mean(residuals_t0 ** 2)) + + return fitted_lag_t0, residuals_t0, rmsd_t0, 'Cubic Fit', best_fitted_vel_t0, fitted_acc_t0 + + +def find_noise_of_data(data, plot_case=False): + # make a copy of data_obs + data_obs = copy.deepcopy(data) + + fitted_lag_t0_lag, residuals_t0_lag, rmsd_t0_lag, fit_type_lag, fitted_vel_t0, fitted_acc_t0 = fit_lag_t0_RMSD(data_obs['lag'],data_obs['time'], data_obs['velocities']) + # now do it for fit_mag_polin2_RMSD + fit_pol_mag, residuals_pol_mag, rmsd_pol_mag, fit_type_mag = fit_mag_polin2_RMSD(data_obs['absolute_magnitudes'],data_obs['time']) + + # create a pd dataframe with fit_pol_mag and fitted_vel_t0 and time and height + fit_funct = { + 'velocities': fitted_vel_t0, + 'height': data_obs['height'], + 'absolute_magnitudes': fit_pol_mag, + 'time': data_obs['time'], + 'lag': fitted_lag_t0_lag + } + + if plot_case: + fig, ax = plt.subplots(1, 2, figsize=(14, 6), dpi=300) + # flat the ax + ax = ax.flatten() + plot_side_by_side(data,fig, ax,'go','Obsevation') + + plot_side_by_side(fit_funct,fig, ax,'k--','fit') + + return rmsd_t0_lag, rmsd_pol_mag, fit_pol_mag, fitted_lag_t0_lag, fit_funct, fig, ax + else: + return rmsd_t0_lag, rmsd_pol_mag, fit_pol_mag, fitted_lag_t0_lag, fit_funct + + +#### Generate Observation ######################################################################### + +def generate_observation_realization(data, rmsd_lag, rmsd_mag, fit_funct, name='', fig='', ax='', plot_case=False): + + # print a . so that the next will be on the same line + print('.', end='') + # make a copy of data_obs + data_obs = copy.deepcopy(data) + fit_pol_mag = copy.deepcopy(fit_funct['absolute_magnitudes']) + fitted_lag_t0_lag = copy.deepcopy(fit_funct['lag']) + fitted_lag_t0_vel = copy.deepcopy(fit_funct['velocities']) + + if name!='': + # print(name) + data_obs['name']=name + + data_obs['type']='Realization' + + ### ADD NOISE ### + + # Add noise to magnitude data (Gaussian noise) for each realization + fit_pol_mag += np.random.normal(loc=0.0, scale=rmsd_mag, size=len(data_obs['absolute_magnitudes'])) + data_obs['absolute_magnitudes']=fit_pol_mag + # Add noise to length data (Gaussian noise) for each realization + fitted_lag_t0_lag += np.random.normal(loc=0.0, scale=rmsd_lag, size=len(data_obs['length'])) + data_obs['lag']=fitted_lag_t0_lag + # add noise to velocity data considering the noise as rmsd_lag/(1.0/FPS) + fitted_lag_t0_vel += np.random.normal(loc=0.0, scale=rmsd_lag/(1.0/FPS), size=len(data_obs['velocities'])) + # fitted_lag_t0_vel += np.random.normal(loc=0.0, scale=rmsd_lag*np.sqrt(2)/(1.0/FPS), size=len(data_obs['velocities'])) + data_obs['velocities']=fitted_lag_t0_vel + + ### ### + + # data_obs['lag']=np.array(data_obs['length'])-(data_obs['v_init']*np.array(data_obs['time'])+data_obs['length'][0]) + data_obs['length']= np.array(data_obs['lag'])+(data_obs['v_init']*np.array(data_obs['time'])+data_obs['length'][0]) + + # # get the new velocity with noise + # for vel_ii in range(1,len(data_obs['time'])-1): + # diff_1=abs((data_obs['time'][vel_ii]-data_obs['time'][vel_ii-1])-1.0/FPS) + # diff_2=abs((data_obs['time'][vel_ii+1]-data_obs['time'][vel_ii-1])-1.0/FPS) + + # if diff_1100: + # stop if too many curves are plotted + break + + if res[0] is not None: + # change res[0] extension to .json + res[0] = 
res[0].replace('.pickle', '.json') + print(res[0]) + # get the first value of res + gensim_data_sim = read_GenerateSimulations_output(res[0]) + + plot_side_by_side(gensim_data_sim, fig, ax, 'b-') + jj_plots_curve += 1 + + plot_side_by_side(gensim_data,fig, ax,'go','Obsevation') + + return fig, ax + + + +#### Plot ############################################################################# + + +def check_axis_inversion(ax): + x_min, x_max = ax.get_xlim() + y_min, y_max = ax.get_ylim() + is_x_inverted = x_max < x_min + is_y_inverted = y_max < y_min + return is_x_inverted, is_y_inverted + + +def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals_mag='', residuals_vel='', residual_time_pos='', residual_height_pos='', fit_funct='', mag_noise='', vel_noise='', label_fit=''): + + # check if data1 is None + if data1 is None: + print("Warning: data1 is None. Skipping plot.") + return + + # check if it is in km/s or in m/s + obs1 = copy.deepcopy(data1) + if 'velocities' not in obs1 or 'height' not in obs1: + print("Warning: Required keys missing in obs1. Skipping plot.") + return + + # check if it is in km/s or in m/s + obs1= copy.deepcopy(data1) + if np.mean(obs1['velocities'])>1000: + # convert to km/s + obs1['velocities'] = np.array(obs1['velocities'])/1000 + obs1['height'] = np.array(obs1['height'])/1000 + + + # Plot the simulation results + if residuals_mag != '' and residuals_vel != '' and residual_time_pos!='' and residual_height_pos!='': + + if fig=='' and ax=='': + fig, ax = plt.subplots(2, 3, figsize=(14, 6),gridspec_kw={'height_ratios': [ 3, 1],'width_ratios': [ 3, 0.5, 3]}) # figsize=(10, 5), dpi=300 0.5, 3, 3, 0.5 + # flat the ax + ax = ax.flatten() + return fig, ax + + if fit_funct!='' and mag_noise!='' and vel_noise!='': + obs_time_err=np.array(fit_funct['time']) + abs_mag_sim_err=np.array(fit_funct['absolute_magnitudes']) + height_km_err=np.array(fit_funct['height']) + vel_kms_err=np.array(fit_funct['velocities']) + # from list to array + if np.mean(fit_funct['height'])>1000: + # convert to km/s + height_km_err=np.array(fit_funct['height'])/1000 + vel_kms_err=np.array(fit_funct['velocities'])/1000 + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[0].fill_betweenx(height_km_err, abs_mag_sim_err-mag_noise, abs_mag_sim_err+mag_noise, color='lightgray', alpha=0.5) + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[2].fill_between(obs_time_err, vel_kms_err-vel_noise, vel_kms_err+vel_noise, color='lightgray', alpha=0.5, label=label_fit) + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[1].fill_betweenx(height_km_err, -mag_noise, mag_noise, color='lightgray', alpha=0.5) + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[5].fill_between(obs_time_err, -vel_noise, vel_noise, color='lightgray', alpha=0.5) + + ax[0].plot(obs1['absolute_magnitudes'],obs1['height'], colorline1) + ax[0].set_xlabel('Absolute Magnitude [-]') + ax[0].set_ylabel('Height [km]') + # grid on on both subplot with -- as linestyle and light gray color + ax[0].grid(True) + ax[0].grid(linestyle='--',color='lightgray') + + # flip the y-axis + is_x_inverted, _ =check_axis_inversion(ax[0]) + if is_x_inverted==False: + ax[0].invert_xaxis() + + # Get the color of the last plotted line in graph 0 + line_color = ax[0].get_lines()[-1].get_color() + + # if line_color == '#2ca02c': + # line_color='m' + # ax[0].plot(obs1['absolute_magnitudes'],obs1['height'], colorline1, color='m') + + # plot the 
residuals against time + ax[1].plot(residuals_mag, residual_height_pos, '.', color=line_color) + # ax[1].set_ylabel('Height [km]') + ax[1].set_xlabel('Res.mag [-]') + ax[1].tick_params(axis='x', rotation=45) + + # flip the y-axis + is_x_inverted, _ =check_axis_inversion(ax[1]) + if is_x_inverted==False: + ax[1].invert_xaxis() + + # ax[1].title(f'Lag Residuals') + # ax[1].legend() + is_x_inverted, _ =check_axis_inversion(ax[1]) + if is_x_inverted==False: + ax[1].invert_xaxis() + ax[1].grid(True) + ax[1].grid(linestyle='--',color='lightgray') + ax[1].set_ylim(ax[0].get_ylim()) + + if label1!='': + ax[2].plot(obs1['time'], obs1['velocities'], colorline1, color=line_color, label=label1) + else: + ax[2].plot(obs1['time'], obs1['velocities'], colorline1, color=line_color) + # show the legend + if label1 != '': + ax[2].legend() + + ax[2].set_xlabel('Time [s]') + ax[2].set_ylabel('Velocity [km/s]') + ax[2].grid(True) + ax[2].grid(linestyle='--',color='lightgray') + + # delete the plot in the middle + ax[3].axis('off') + + # # put as the super title the name + # plt.suptitle(name) + ax[4].axis('off') + + # plot the residuals against time + ax[5].plot(residual_time_pos, residuals_vel, '.', color=line_color) + ax[5].set_ylabel('Res.vel [km/s]') + ax[5].grid(True) + ax[5].grid(linestyle='--',color='lightgray') + # use the same limits of ax[3] + ax[5].set_xlim(ax[2].get_xlim()) + + + # # plot the distribution of the residuals along the y axis + # ax[5].hist(residuals_mag, bins=20, color=line_color, alpha=0.5) + # ax[5].set_ylabel('N.data') + # ax[5].set_xlabel('Res.mag [-]') + # is_x_inverted, _ =check_axis_inversion(ax[5]) + # if is_x_inverted==False: + # ax[5].invert_xaxis() + # ax[5].grid(True) + # ax[5].grid(linestyle='--',color='lightgray') + + # # plot the residuals against time + # ax[6].plot(residual_time_pos, residuals_vel, '.', color=line_color) + # # ax[6].set_xlabel('Time [s]') + # ax[6].set_xticks([]) + # ax[6].set_ylabel('Res.vel [km/s]') + # ax[6].invert_yaxis() + # # ax[3].title(f'Absolute Magnitude Residuals') + # # ax[3].legend() + # ax[6].grid(True) + # ax[6].grid(linestyle='--',color='lightgray') + + # # plot the distribution of the residuals along the y axis + # ax[7].hist(residuals_vel, bins=20, color=line_color, alpha=0.5, orientation='horizontal') + # ax[7].set_xlabel('N.data') + # # invert the y axis + # ax[7].invert_yaxis() + # ax[7].set_ylabel('Res.vel [km/s]') + # # delete the the the line at the top ad the right + # ax[7].spines['top'].set_visible(False) + # ax[7].spines['right'].set_visible(False) + # # do not show the y ticks + # # ax[7].set_yticks([]) + # # # show the zero line + # # ax[7].axhline(0, color='k', linewidth=0.5) + # # grid on + # ax[7].grid(True) + # # grid on + # ax[7].grid(linestyle='--',color='lightgray') + + + + else : + if fig=='' and ax=='': + fig, ax = plt.subplots(1, 2, figsize=(14, 6), dpi=300) + # flat the ax + ax = ax.flatten() + return fig, ax + + # plot the magnitude curve with height + ax[0].plot(obs1['absolute_magnitudes'],obs1['height'], colorline1) + + ax[0].set_xlabel('Absolute Magnitude [-]') + ax[0].set_ylabel('Height [km]') + # check if the axis is inverted + is_x_inverted, _ =check_axis_inversion(ax[0]) + if is_x_inverted==False: + ax[0].invert_xaxis() + # grid on + ax[0].grid(True) + + # plot + if label1 == '': + ax[1].plot(obs1['time'], obs1['velocities'], colorline1) + else: + ax[1].plot(obs1['time'], obs1['velocities'], colorline1, label=label1) + + # show the legend + if label1 != '': + ax[1].legend() + + ax[1].set_xlabel('Time 
[s]') + ax[1].set_ylabel('Velocity [km/s]') + ax[1].grid(True) + + # grid on on both subplot with -- as linestyle and light gray color + ax[1].grid(linestyle='--',color='lightgray') + # grid on + ax[0].grid(linestyle='--',color='lightgray') + + plt.tight_layout() + + + +#### Reader ############################################################################# + + +def read_GenerateSimulations_output_to_PCA(file_path, name=''): + if name!='': + print(name) + gensim_data = read_GenerateSimulations_output(file_path) + if gensim_data is None: + return None + else: + pd_datfram_PCA = array_to_pd_dataframe_PCA(gensim_data) + return pd_datfram_PCA + + +def read_GenerateSimulations_output(file_path, real_event=''): + + f = open(file_path,"r") + data = json.loads(f.read()) + + # show processed event + print(file_path) + + if data['ht_sampled']!= None: + + vel_sim=data['simulation_results']['leading_frag_vel_arr'][:-1]#['brightest_vel_arr']#['leading_frag_vel_arr']#['main_vel_arr'] + ht_sim=data['simulation_results']['leading_frag_height_arr'][:-1]#['brightest_height_arr']['leading_frag_height_arr']['main_height_arr'] + time_sim=data['simulation_results']['time_arr'][:-1]#['main_time_arr'] + abs_mag_sim=data['simulation_results']['abs_magnitude'][:-1] + len_sim=data['simulation_results']['brightest_length_arr'][:-1]#['brightest_length_arr'] + Dynamic_pressure= data['simulation_results']['leading_frag_dyn_press_arr'][:-1] + + # ht_obs=data['ht_sampled'] + # try: + # index_ht_sim=next(x for x, val in enumerate(ht_sim) if val <= ht_obs[0]) + # except StopIteration: + # # index_ht_sim = None + # print('The first element of the observation is not in the simulation') + # return None + + # try: + # index_ht_sim_end=next(x for x, val in enumerate(ht_sim) if val <= ht_obs[-1]) + # except StopIteration: + # # index_ht_sim_end = None + # print('The last element of the observation is not in the simulation') + # return None + + if real_event!= '': + mag_obs=real_event['absolute_magnitudes'] + else: + mag_obs=data['mag_sampled'] + + print('read_GenerateSimulations_output mag',mag_obs[0],'-',mag_obs[-1]) + + try: + # find the index of the first element of abs_mag_sim that is smaller than the first element of mag_obs + index_abs_mag_sim_start = next(i for i, val in enumerate(abs_mag_sim) if val <= mag_obs[0]) + index_abs_mag_sim_start = index_abs_mag_sim_start + np.random.randint(2) + except StopIteration: + print("The first observation height is not within the simulation data range.") + return None + try: + index_abs_mag_sim_end = next(i for i, val in enumerate(abs_mag_sim[::-1]) if val <= mag_obs[-1]) + index_abs_mag_sim_end = len(abs_mag_sim) - index_abs_mag_sim_end - 1 + except StopIteration: + print("The first observation height is not within the simulation data range.") + return None + + # print('mag',index_abs_mag_sim_start,'-',index_abs_mag_sim_end,'\nheight',index_ht_sim,'-',index_ht_sim_end) + + abs_mag_sim = abs_mag_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + vel_sim = vel_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + time_sim = time_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + ht_sim = ht_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + len_sim = len_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + Dynamic_pressure = Dynamic_pressure[index_abs_mag_sim_start:index_abs_mag_sim_end] + + + + # abs_mag_sim=abs_mag_sim[index_ht_sim:index_ht_sim_end] + # vel_sim=vel_sim[index_ht_sim:index_ht_sim_end] + # time_sim=time_sim[index_ht_sim:index_ht_sim_end] + # 
ht_sim=ht_sim[index_ht_sim:index_ht_sim_end] + # len_sim=len_sim[index_ht_sim:index_ht_sim_end] + + # closest_indices = find_closest_index(ht_sim, ht_obs) + + # Dynamic_pressure= data['simulation_results']['leading_frag_dyn_press_arr'] + # Dynamic_pressure= Dynamic_pressure[index_ht_sim:index_ht_sim_end] + # Dynamic_pressure=[Dynamic_pressure[jj_index_cut] for jj_index_cut in closest_indices] + + # abs_mag_sim=[abs_mag_sim[jj_index_cut] for jj_index_cut in closest_indices] + # vel_sim=[vel_sim[jj_index_cut] for jj_index_cut in closest_indices] + # time_sim=[time_sim[jj_index_cut] for jj_index_cut in closest_indices] + # ht_sim=[ht_sim[jj_index_cut] for jj_index_cut in closest_indices] + # len_sim=[len_sim[jj_index_cut] for jj_index_cut in closest_indices] + + # divide the vel_sim by 1000 considering is a list + time_sim = [i-time_sim[0] for i in time_sim] + # vel_sim = [i/1000 for i in vel_sim] + len_sim = [i-len_sim[0] for i in len_sim] + # ht_sim = [i/1000 for i in ht_sim] + + # Load the constants + const, _ = loadConstants(file_path) + const.dens_co = np.array(const.dens_co) + + # Compute the erosion energies + erosion_energy_per_unit_cross_section, erosion_energy_per_unit_mass = wmpl.MetSim.MetSimErosion.energyReceivedBeforeErosion(const) + + gensim_data = { + 'name': file_path, + 'type': 'Simulation', + 'v_init': vel_sim[0], # m/s + 'velocities': vel_sim, # m/s + 'height': ht_sim, # m + 'absolute_magnitudes': abs_mag_sim, + 'lag': len_sim-(vel_sim[0]*np.array(time_sim)+len_sim[0]), # m + 'length': len_sim, # m + 'time': time_sim, # s + 'v_avg': np.mean(vel_sim), # m/s + 'v_init_180km': data['params']['v_init']['val'], # m/s + 'Dynamic_pressure_peak_abs_mag': Dynamic_pressure[np.argmin(abs_mag_sim)], + 'zenith_angle': data['params']['zenith_angle']['val']*180/np.pi, + 'mass': data['params']['m_init']['val'], + 'rho': data['params']['rho']['val'], + 'sigma': data['params']['sigma']['val'], + 'erosion_height_start': data['params']['erosion_height_start']['val']/1000, + 'erosion_coeff': data['params']['erosion_coeff']['val'], + 'erosion_mass_index': data['params']['erosion_mass_index']['val'], + 'erosion_mass_min': data['params']['erosion_mass_min']['val'], + 'erosion_mass_max': data['params']['erosion_mass_max']['val'], + 'erosion_range': np.log10(data['params']['erosion_mass_max']['val']) - np.log10(data['params']['erosion_mass_min']['val']), + 'erosion_energy_per_unit_cross_section': erosion_energy_per_unit_cross_section, + 'erosion_energy_per_unit_mass': erosion_energy_per_unit_mass + } + + return gensim_data + + else: + return None + + +def Old_GenSym_json_get_vel_lag(data): + + ht_sim=data['simulation_results']['leading_frag_height_arr'][:-1]#['brightest_height_arr']['leading_frag_height_arr']['main_height_arr'] + ht_obs=data['ht_sampled'] + time_sampled = np.array(data['time_sampled']) + len_sampled = np.array(data['len_sampled']) + + closest_indices = find_closest_index(ht_sim, ht_obs) + + vel_sim=data['simulation_results']['leading_frag_vel_arr'][:-1]#['brightest_vel_arr']#['leading_frag_vel_arr']#['main_vel_arr'] + vel_sim=[vel_sim[jj_index_cut] for jj_index_cut in closest_indices] + + # get the new velocity with noise + for vel_ii in range(1,len(time_sampled)): + if time_sampled[vel_ii]-time_sampled[vel_ii-1]<1.0/FPS: + # if time_sampled[vel_ii] % 0.03125 < 0.000000001: + if vel_ii+1 14): + print('Found values below 14 absolute magnitudes:', combined_obs['absolute_magnitudes'][combined_obs['absolute_magnitudes'] > 14]) + + # delete any values above 10 absolute_magnitudes and 
delete the corresponding values in the other arrays + combined_obs = {key: combined_obs[key][combined_obs['absolute_magnitudes'] < 14] for key in combined_obs.keys()} + + Dynamic_pressure_peak_abs_mag=(wmpl.Utils.Physics.dynamicPressure(lat_dat, lon_dat, combined_obs['height'][np.argmin(combined_obs['absolute_magnitudes'])], jd_dat, combined_obs['velocities'][np.argmin(combined_obs['absolute_magnitudes'])])) + const=Constants() + zenith_angle=zenithAngleAtSimulationBegin(const.h_init, traj.rbeg_ele, traj.orbit.zc, const.r_earth) + + if MetSim_phys_file_path != '': + output_phys = read_MetSim_phyProp_output(MetSim_phys_file_path) + type_sim='MetSim' + + else: + # if no data on weight is 0 + mass=(0) + rho=(0) + sigma=(0) + erosion_height_start=(0) + erosion_coeff=(0) + erosion_mass_index=(0) + erosion_mass_min=(0) + erosion_mass_max=(0) + erosion_range=(0) + erosion_energy_per_unit_cross_section_arr=(0) + erosion_energy_per_unit_mass_arr=(0) + + type_sim='Observation' + + # put all the varible in a array mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr + output_phys = [mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr] + + # delete the elev_data from the combined_obs + del combined_obs['elev_data'] + + # add to combined_obs the avg velocity and the peak dynamic pressure and all the physical parameters + combined_obs['name'] = file_path + combined_obs['v_init'] = combined_obs['velocities'][0] + combined_obs['v_init_180km'] = combined_obs['velocities'][0]+100 + combined_obs['type'] = type_sim + combined_obs['v_avg'] = v_avg + combined_obs['Dynamic_pressure_peak_abs_mag'] = Dynamic_pressure_peak_abs_mag + combined_obs['zenith_angle'] = zenith_angle*180/np.pi + combined_obs['mass'] = output_phys[0] + combined_obs['rho'] = output_phys[1] + combined_obs['sigma'] = output_phys[2] + combined_obs['erosion_height_start'] = output_phys[3] + combined_obs['erosion_coeff'] = output_phys[4] + combined_obs['erosion_mass_index'] = output_phys[5] + combined_obs['erosion_mass_min'] = output_phys[6] + combined_obs['erosion_mass_max'] = output_phys[7] + combined_obs['erosion_range'] = output_phys[8] + combined_obs['erosion_energy_per_unit_cross_section'] = output_phys[9] + combined_obs['erosion_energy_per_unit_mass'] = output_phys[10] + + if obs_sep: + return combined_obs, obs1, obs2 + else: + return combined_obs + + +def read_MetSim_phyProp_output(MetSim_phys_file_path): + + # check if in os.path.join(root, name_file) present and then open the .json file with the same name as the pickle file with in stead of _trajectory.pickle it has _sim_fit_latest.json + if os.path.isfile(MetSim_phys_file_path): + with open(MetSim_phys_file_path,'r') as json_file: # 20210813_061453_sim_fit.json + print('Loading Physical Characteristics MetSim file:', MetSim_phys_file_path) + data = json.load(json_file) + mass=(data['m_init']) + # add also rho sigma erosion_height_start erosion_coeff erosion_mass_index erosion_mass_min erosion_mass_max erosion_range erosion_energy_per_unit_cross_section erosion_energy_per_unit_mass + # mass=(data['m_init']) + rho=(data['rho']) + sigma=(data['sigma']) + erosion_height_start=(data['erosion_height_start']/1000) + erosion_coeff=(data['erosion_coeff']) + 
erosion_mass_index=(data['erosion_mass_index']) + erosion_mass_min=(data['erosion_mass_min']) + erosion_mass_max=(data['erosion_mass_max']) + + # Compute the erosion range + erosion_range=(np.log10(data['erosion_mass_max']) - np.log10(data['erosion_mass_min'])) + + cost_path = os.path.join(MetSim_phys_file_path) + + # Load the constants + const, _ = loadConstants(cost_path) + const.dens_co = np.array(const.dens_co) + + # Compute the erosion energies + erosion_energy_per_unit_cross_section, erosion_energy_per_unit_mass = wmpl.MetSim.MetSimErosion.energyReceivedBeforeErosion(const) + erosion_energy_per_unit_cross_section_arr=(erosion_energy_per_unit_cross_section) + erosion_energy_per_unit_mass_arr=(erosion_energy_per_unit_mass) + + else: + print('No json file:',MetSim_phys_file_path) + + # if no data on weight is 0 + mass=(0) + rho=(0) + sigma=(0) + erosion_height_start=(0) + erosion_coeff=(0) + erosion_mass_index=(0) + erosion_mass_min=(0) + erosion_mass_max=(0) + erosion_range=(0) + erosion_energy_per_unit_cross_section_arr=(0) + erosion_energy_per_unit_mass_arr=(0) + + # put all the varible in a array mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr + output_phys = [mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr] + + return output_phys + + + +def array_to_pd_dataframe_PCA(data): + + if data is None: + # Handle the None case, maybe log an error or return an empty DataFrame + print(f"Warning: 'data' is None for source returning an empty DataFrame.") + return pd.DataFrame() # or any other appropriate action + # do a copy of data_array + data_array = data.copy() + + # compute the linear regression + data_array['v_init'] = data_array['v_init']/1000 + data_array['v_avg'] = data_array['v_avg']/1000 + data_array['velocities'] = [i/1000 for i in data_array['velocities']] # convert m/s to km/s + data_array['height'] = [i/1000 for i in data_array['height']] + data_array['lag']=[i/1000 for i in data_array['lag']] + v0=data_array['v_init'] + + # from 'time_sampled' extract the last element and save it in a list + duration = data_array['time'][-1] + begin_height = data_array['height'][0] + end_height = data_array['height'][-1] + peak_abs_mag = data_array['absolute_magnitudes'][np.argmin(data_array['absolute_magnitudes'])] + F_param = (begin_height - (data_array['height'][np.argmin(data_array['absolute_magnitudes'])])) / (begin_height - end_height) + peak_mag_height = data_array['height'][np.argmin(data_array['absolute_magnitudes'])] + beg_abs_mag = data_array['absolute_magnitudes'][0] + end_abs_mag = data_array['absolute_magnitudes'][-1] + trail_len = data_array['length'][-1] + avg_lag = np.mean(data_array['lag']) + + + kc_par = begin_height + (2.86 - 2*np.log(data_array['v_init']))/0.0612 + + # fit a line to the throught the vel_sim and ht_sim + a, b = np.polyfit(data_array['time'],data_array['velocities'], 1) + acceleration_lin = a + + t0 = np.mean(data_array['time']) + + # initial guess of deceleration decel equal to linear fit of velocity + p0 = [a, 0, 0, t0] + + opt_res = opt.minimize(lag_residual, p0, args=(np.array(data_array['time']), np.array(data_array['lag'])), method='Nelder-Mead') + + # sample the fit for the velocity and acceleration + a_t0, b_t0, c_t0, t0 = opt_res.x + + # compute 
reference decelearation + t_decel_ref = (t0 + np.max(data_array['time']))/2 + decel_t0 = cubic_acceleration(t_decel_ref, a_t0, b_t0, t0)[0] + + a_t0=-abs(a_t0) + b_t0=-abs(b_t0) + + acceleration_parab_t0=a_t0*6 + b_t0*2 + + a3, b3, c3 = np.polyfit(data_array['time'],data_array['velocities'], 2) + acceleration_parab=a3*2 + b3 + + # Assuming the jacchiaVel function is defined as: + def jacchiaVel(t, a1, a2, v_init): + return v_init - np.abs(a1) * np.abs(a2) * np.exp(np.abs(a2) * t) + + # Generating synthetic observed data for demonstration + t_observed = np.array(data_array['time']) # Observed times + + # Residuals function for optimization + def residuals(params): + a1, a2 = params + predicted_velocity = jacchiaVel(t_observed, a1, a2, v0) + return np.sum((data_array['velocities'] - predicted_velocity)**2) + + # Initial guess for a1 and a2 + initial_guess = [0.005, 10] + + # Apply minimize to the residuals + result = minimize(residuals, initial_guess) + + # Results + jac_a1, jac_a2 = abs(result.x) + + acc_jacchia = abs(jac_a1)*abs(jac_a2)**2 + + try: + # fit a line to the throught the obs_vel and ht_sim + index_ht_peak = next(x for x, val in enumerate(data_array['height']) if val <= peak_mag_height) + except StopIteration: + # Handle the case where no height is less than or equal to peak_mag_height + index_ht_peak = len(data_array['height']) // 2 + + # Check if the arrays are non-empty before fitting the polynomial + if len(data_array['height'][:index_ht_peak]) > 0 and len(data_array['absolute_magnitudes'][:index_ht_peak]) > 0: + a3_Inabs, b3_Inabs, c3_Inabs = np.polyfit(data_array['height'][:index_ht_peak], data_array['absolute_magnitudes'][:index_ht_peak], 2) + else: + # Handle the case of empty input arrays + a3_Inabs, b3_Inabs, c3_Inabs = 0, 0, 0 + + # Check if the arrays are non-empty before fitting the polynomial + if len(data_array['height'][index_ht_peak:]) > 0 and len(data_array['absolute_magnitudes'][index_ht_peak:]) > 0: + a3_Outabs, b3_Outabs, c3_Outabs = np.polyfit(data_array['height'][index_ht_peak:], data_array['absolute_magnitudes'][index_ht_peak:], 2) + else: + # Handle the case of empty input arrays + a3_Outabs, b3_Outabs, c3_Outabs = 0, 0, 0 + + # # check if the ht_obs[:index_ht_peak] and abs_mag_obs[:index_ht_peak] are empty + # a3_Inabs, b3_Inabs, c3_Inabs = np.polyfit(data_array['height'][:index_ht_peak], data_array['absolute_magnitudes'][:index_ht_peak], 2) + + # # check if the ht_obs[index_ht_peak:] and abs_mag_obs[index_ht_peak:] are empty + # a3_Outabs, b3_Outabs, c3_Outabs = np.polyfit(data_array['height'][index_ht_peak:], data_array['absolute_magnitudes'][index_ht_peak:], 2) + + + ######## SKEW KURT ################ + # create a new array with the same values as time_pickl + index=[] + # if the distance between two index is smalle than 0.05 delete the second one + for i in range(len(data_array['time'])-1): + if data_array['time'][i+1]-data_array['time'][i] < 0.01: + # save the index as an array + index.append(i+1) + # delete the index from the list + time_pickl = np.delete(data_array['time'], index) + abs_mag_pickl = np.delete(data_array['time'], index) + + abs_mag_pickl = [0 if math.isnan(x) else x for x in abs_mag_pickl] + + # subrtract the max value of the mag to center it at the origin + mag_sampled_norm = (-1)*(abs_mag_pickl - np.max(abs_mag_pickl)) + # check if there is any negative value and add the absolute value of the min value to all the values + mag_sampled_norm = mag_sampled_norm + np.abs(np.min(mag_sampled_norm)) + # normalize the mag so that the sum 
is 1 + time_sampled_norm= time_pickl - np.mean(time_pickl) + # mag_sampled_norm = mag_sampled_norm/np.sum(mag_sampled_norm) + mag_sampled_norm = mag_sampled_norm/np.max(mag_sampled_norm) + # substitute the nan values with zeros + mag_sampled_norm = np.nan_to_num(mag_sampled_norm) + + # create an array with the number the ammount of same number equal to the value of the mag + mag_sampled_distr = [] + mag_sampled_array=np.asarray(mag_sampled_norm*1000, dtype = 'int') + for i in range(len(abs_mag_pickl)): + # create an integer form the array mag_sampled_array[i] and round of the given value + numbs=mag_sampled_array[i] + # invcrease the array number by the mag_sampled_distr numbs + # array_nu=(np.ones(numbs+1)*i_pos).astype(int) + array_nu=(np.ones(numbs+1)*time_sampled_norm[i]) + mag_sampled_distr=np.concatenate((mag_sampled_distr, array_nu)) + + # # # plot the mag_sampled_distr as an histogram + # plt.hist(mag_sampled_distr) + # plt.show() + + # kurtosyness.append(kurtosis(mag_sampled_distr)) + # skewness.append(skew(mag_sampled_distr)) + kurtosyness=kurtosis(mag_sampled_distr) + skewness=skew(mag_sampled_distr) + + ################################# + + + + # Data to populate the dataframe + data_picklefile_pd = { + 'solution_id': [data_array['name']], + 'type': [data_array['type']], + 'vel_init_norot': [data_array['v_init']], + 'vel_avg_norot': [data_array['v_avg']], + 'v_init_180km': [data_array['v_init_180km']], + 'duration': [duration], + 'peak_mag_height': [peak_mag_height], + 'begin_height': [begin_height], + 'end_height': [end_height], + 'peak_abs_mag': [peak_abs_mag], + 'beg_abs_mag': [beg_abs_mag], + 'end_abs_mag': [end_abs_mag], + 'F': [F_param], + 'trail_len': [trail_len], + 't0': [t0], + 'deceleration_lin': [acceleration_lin], + 'deceleration_parab': [acceleration_parab], + 'decel_parab_t0': [acceleration_parab_t0], + 'decel_t0': [decel_t0], + 'decel_jacchia': [acc_jacchia], + 'zenith_angle': [data_array['zenith_angle']], + 'kurtosis': [kurtosyness], + 'skew': [skewness], + 'avg_lag': [avg_lag], + 'kc': [kc_par], + 'Dynamic_pressure_peak_abs_mag': [data_array['Dynamic_pressure_peak_abs_mag']], + 'a_acc': [a3], + 'b_acc': [b3], + 'c_acc': [c3], + 'a_t0': [a_t0], + 'b_t0': [b_t0], + 'c_t0': [c_t0], + 'a1_acc_jac': [jac_a1], + 'a2_acc_jac': [jac_a2], + 'a_mag_init': [a3_Inabs], + 'b_mag_init': [b3_Inabs], + 'c_mag_init': [c3_Inabs], + 'a_mag_end': [a3_Outabs], + 'b_mag_end': [b3_Outabs], + 'c_mag_end': [c3_Outabs], + 'mass': [data_array['mass']], + 'rho': [data_array['rho']], + 'sigma': [data_array['sigma']], + 'erosion_height_start': [data_array['erosion_height_start']], + 'erosion_coeff': [data_array['erosion_coeff']], + 'erosion_mass_index': [data_array['erosion_mass_index']], + 'erosion_mass_min': [data_array['erosion_mass_min']], + 'erosion_mass_max': [data_array['erosion_mass_max']], + 'erosion_range': [data_array['erosion_range']], + 'erosion_energy_per_unit_cross_section': [data_array['erosion_energy_per_unit_cross_section']], + 'erosion_energy_per_unit_mass': [data_array['erosion_energy_per_unit_mass']] + } + + # Create the dataframe + panda_dataframe_PCA = pd.DataFrame(data_picklefile_pd) + + if data_array['mass']==0: + # delete the mass + panda_dataframe_PCA = panda_dataframe_PCA.drop(columns=['mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max', 'erosion_range', 'erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass']) + + return panda_dataframe_PCA + + + +########## Utils 
########################## + +# Function to get trajectory data folder +def find_and_extract_trajectory_files(directory, MetSim_extention): + trajectory_files = [] + file_names = [] + output_folders = [] + input_folders = [] + trajectory_Metsim_file = [] + + for root, dirs, files in os.walk(directory): + + # go in each folder and find the file with the end _trajectory.pickle but skip the folder with the name GenSim + if 'GenSim' in root: + continue + + csv_file_found=False + + for file in files: + if file.endswith(NAME_SUFX_CSV_OBS): + # open + csv_file_found=True + real_data = pd.read_csv(os.path.join(root, file)) + if root not in real_data['solution_id'][0]: + print('The solution_id in the csv file is not the same as the folder name or does not exist in the folder name:', root) + continue + # split real_data['solution_id'][0] in the directory and the name of the file + _ , file_from_csv = os.path.split(real_data['solution_id'][0]) + + base_name = os.path.splitext(file_from_csv)[0] # Remove the file extension + #check if the file_from_csv endswith "_trajectory" if yes then extract the number 20230405_010203 + if base_name.endswith("_trajectory"): + variable_name = base_name.replace("_trajectory", "") # Extract the number 20230405_010203 + output_folder_name = base_name.replace("_trajectory", NAME_SUFX_GENSIM) # _GenSim folder whre all generated simulations are stored + else: + variable_name = base_name + output_folder_name = base_name + NAME_SUFX_GENSIM + + + if file_from_csv.endswith("json"): + # MetSim_phys_file_path = os.path.join(root, file_from_csv) + + # from namefile_sel json file open the json file and save the namefile_sel.const part as file_name_obs+'_sim_fit.json' + with open(os.path.join(root, file_from_csv)) as json_file: + data = json.load(json_file) + const_part = data['const'] + MetSim_phys_file_path = os.path.join(root, output_folder_name)+os.sep+variable_name+'_sim_fit.json' + with open(os.path.join(root, output_folder_name)+os.sep+variable_name+'_sim_fit.json', 'w') as outfile: + json.dump(const_part, outfile, indent=4) + + else: + # check if MetSim_phys_file_path exist + if os.path.isfile(os.path.join(root, variable_name + MetSim_extention)): + # print did not find with th given extention revert to default + MetSim_phys_file_path = os.path.join(root, variable_name + MetSim_extention) + elif os.path.isfile(os.path.join(root, variable_name + '_sim_fit_latest.json')): + print(base_name,': No MetSim file with the given extention', MetSim_extention,'reverting to default extention _sim_fit_latest.json') + MetSim_phys_file_path = os.path.join(root, variable_name + '_sim_fit_latest.json') + else: + # do not save the rest of the files + print(base_name,': No MetSim file with the given extention', MetSim_extention,'do not consider the folder') + continue + + + input_folders.append(root) + trajectory_files.append(os.path.join(root, file)) + file_names.append(variable_name) + output_folders.append(os.path.join(root, output_folder_name)) + trajectory_Metsim_file.append(MetSim_phys_file_path) + + + + if csv_file_found==False: + for file in files: + if file.endswith("_trajectory.pickle"): + base_name = os.path.splitext(file)[0] # Remove the file extension + variable_name = base_name.replace("_trajectory", "") # Extract the number 20230405_010203 + output_folder_name = base_name.replace("_trajectory", NAME_SUFX_GENSIM) # _GenSim folder whre all generated simulations are stored + + # check if MetSim_phys_file_path exist + if os.path.isfile(os.path.join(root, variable_name + 
MetSim_extention)): + # print did not find with th given extention revert to default + MetSim_phys_file_path = os.path.join(root, variable_name + MetSim_extention) + elif os.path.isfile(os.path.join(root, variable_name + '_sim_fit_latest.json')): + print(base_name,': No MetSim file with the given extention', MetSim_extention,'reverting to default extention _sim_fit_latest.json') + MetSim_phys_file_path = os.path.join(root, variable_name + '_sim_fit_latest.json') + else: + # do not save the rest of the files + print(base_name,': No MetSim file with the given extention', MetSim_extention,'do not consider the folder') + continue + + input_folders.append(root) + trajectory_files.append(os.path.join(root, file)) + file_names.append(variable_name) + output_folders.append(os.path.join(root, output_folder_name)) + trajectory_Metsim_file.append(MetSim_phys_file_path) + + input_list = [[trajectory_files[ii], file_names[ii], input_folders[ii], output_folders[ii], trajectory_Metsim_file[ii]] for ii in range(len(trajectory_files))] + + return input_list + + + +def update_sigma_values(file_path, mag_sigma, len_sigma, More_complex_fit=False, Custom_refinement=False): + with open(file_path, 'r') as file: + content = file.read() + + # Modify mag_sigma and len_sigma + content = re.sub(r'"mag_sigma":\s*[\d.]+', f'"mag_sigma": {mag_sigma}', content) + content = re.sub(r'"len_sigma":\s*[\d.]+', f'"len_sigma": {len_sigma}', content) + + if More_complex_fit: + # Enable "More complex fit - overall fit" + content = re.sub( + r'(# More complex fit - overall fit\s*\{[^{}]*"enabled":\s*)false', + r'\1true', + content + ) + else: + # Enable "More complex fit - overall fit" + content = re.sub( + r'(# More complex fit - overall fit\s*\{[^{}]*"enabled":\s*)true', + r'\1false', + content + ) + + if Custom_refinement: + # Enable "Custom refinement of erosion parameters - improves wake" + content = re.sub( + r'(# Custom refinement of erosion parameters - improves wake\s*\{[^{}]*"enabled":\s*)false', + r'\1true', + content + ) + else: + # Enable "Custom refinement of erosion parameters - improves wake" + content = re.sub( + r'(# Custom refinement of erosion parameters - improves wake\s*\{[^{}]*"enabled":\s*)true', + r'\1false', + content + ) + + # Save the modified content back to the file + with open(file_path, 'w') as file: + file.write(content) + + print('modified options file:', file_path) + + + +########## Distance ########################## + + +# Function to find the knee of the distance plot +def find_knee_dist_index(data_meteor_pd, window_of_smothing_avg=3, std_multip_threshold=1, output_path='', around_meteor='', N_sim_sel_force=0): + dist_for_meteor=np.array(data_meteor_pd['distance_meteor']) + #make subtraction of the next element and the previous element of data_for_meteor["distance_meteor"] + # diff_distance_meteor = np.diff(dist_for_meteor[:int(len(dist_for_meteor)/10)]) + diff_distance_meteor = np.diff(dist_for_meteor) + # histogram plot of the difference with the count on the x axis and diff_distance_meteor on the y axis + indices = np.arange(len(diff_distance_meteor)) + # create the cumulative sum of the diff_distance_meteor + cumsum_diff_distance_meteor = np.cumsum(diff_distance_meteor) + # normalize the diff_distance_meteor xnormalized = (x - xminimum) / range of x + diff_distance_meteor_normalized = (diff_distance_meteor - np.min(diff_distance_meteor)) / (np.max(diff_distance_meteor) - np.min(diff_distance_meteor)) + + def moving_average_smoothing(data, window_size): + smoothed_data = np.convolve(data, 
np.ones(window_size)/window_size, mode='same') + return smoothed_data + + # apply the smoothing finction + smoothed_diff_distance_meteor = moving_average_smoothing(diff_distance_meteor_normalized, window_of_smothing_avg) + + # fid the first value of the smoothed_diff_distance_meteor that is smaller than the std of the smoothed_diff_distance_meteor + index10percent = np.where(smoothed_diff_distance_meteor < np.std(smoothed_diff_distance_meteor)*std_multip_threshold)[0][0]-2 + + if N_sim_sel_force!=0: + index10percent = N_sim_sel_force + + if index10percent<0: # below does not work problem with finding the mode on KDE later on + index10percent=0 + + if output_path!='': + + # Define a custom palette + custom_palette_orange = { + 'Real': "darkorange", + 'Simulation': "darkorange", + 'Simulation_sel': "darkorange", + 'MetSim': "darkorange", + 'Realization': "darkorange", + 'Observation': "darkorange" + } + + # dimension of the plot 15,5 + plt.figure(figsize=(15,5)) + + plt.subplot(1,2,2) + sns.histplot(data_meteor_pd, x="distance_meteor", hue="type", kde=True, cumulative=True, bins=len(dist_for_meteor), palette=custom_palette_orange) # , stat='density' to have probability + plt.xlabel('Distance in PCA space') + plt.ylabel('Number of events') + plt.title('Cumulative distance in PCA space') + plt.axvline(x=(dist_for_meteor[index10percent]), color="darkorange", linestyle='--', label='Knee distance') + + if len(dist_for_meteor)>100: + plt.ylim(0,100) + elif len(dist_for_meteor)>50: + plt.ylim(0,50) + + plt.legend() + # delete the legend + plt.legend().remove() + + + plt.subplot(1,2,1) + # sns.histplot(diff_distance_meteor_normalized, kde=True, bins=len(distance_meteor_sel_save)) + #make the bar plot 0.5 transparency + + plt.bar(indices, diff_distance_meteor_normalized,color="darkorange", alpha=0.5, edgecolor='black') + plt.xlabel('Number of events') + plt.ylabel('Normalized difference') + plt.title('Distance difference Normalized') + # put a horizontal line at len(curr_sel['distance_meteor']) + plt.axvline(x=index10percent, color="darkorange", linestyle='--') + if len(dist_for_meteor)>100: + plt.xlim(-1,100) + elif len(dist_for_meteor)>50: + plt.xlim(-1,50) + + # find the mean of the first 100 elements of diff_distance_meteor_normalized and put a horizontal line + # plt.axhline(y=np.std(smoothed_diff_distance_meteor), color="darkorange", linestyle='--') + + # set a sup title + plt.suptitle(around_meteor) + + # give more space + plt.tight_layout() + # plt.show() + + # save the figure maximized and with the right name + plt.savefig(output_path+os.sep+around_meteor+os.sep+around_meteor+'_knee'+str(index10percent+1)+'ev_MAXdist'+str(np.round(dist_for_meteor[index10percent],2))+'.png', dpi=300) + + # close the figure + plt.close() + + return index10percent + +# function to use the mahaloby distance and from the mean of the selected shower +def dist_PCA_space_select_sim(df_sim_PCA, shower_current_PCA_single, cov_inv, meanPCA_current, df_sim_shower, shower_current_single, N_sim_sel_force=0, output_dir=''): + N_sim_sel_all=100 + print('calculate distance for',shower_current_single['solution_id']) + + df_sim_PCA_for_now = df_sim_PCA.drop(['type'], axis=1).values + + distance_current = [] + for i_sim in range(len(df_sim_PCA_for_now)): + distance_current.append(mahalanobis_distance(df_sim_PCA_for_now[i_sim], shower_current_PCA_single, cov_inv)) + + # create an array with lenght equal to the number of simulations and set it to shower_current_PCA['solution_id'][i_shower] + solution_id_dist = 
+    df_sim_shower['solution_id_dist'] = solution_id_dist
+    df_sim_shower['distance_meteor'] = distance_current
+    # sort by distance and select the n_selected closest to the meteor
+    df_sim_shower_dis = df_sim_shower.sort_values(by=['distance_meteor']).reset_index(drop=True)
+    df_sim_selected = df_sim_shower_dis[:N_sim_sel_all].drop(['type'], axis=1)
+    df_sim_selected['type'] = 'Simulation_sel'
+
+    # create a dataframe with the selected simulated shower characteristics
+    df_sim_PCA_dist = df_sim_PCA
+    df_sim_PCA_dist['distance_meteor'] = distance_current
+    df_sim_PCA_dist = df_sim_PCA_dist.sort_values(by=['distance_meteor']).reset_index(drop=True)
+    # delete the shower code
+    df_sim_selected_PCA = df_sim_PCA_dist[:N_sim_sel_all].drop(['type','distance_meteor'], axis=1)
+
+    # make df_sim_selected_PCA an array
+    df_sim_selected_PCA = df_sim_selected_PCA.values
+    distance_current_mean = []
+    for i_shower in range(len(df_sim_selected)):
+        distance_current_mean.append(scipy.spatial.distance.euclidean(meanPCA_current, df_sim_selected_PCA[i_shower]))
+    df_sim_selected['distance_mean']=distance_current_mean # from the mean of the selected shower
+
+    df_curr_sel_curv = df_sim_selected.copy()
+
+    around_meteor=shower_current_single['solution_id']
+    # check if around_meteor is a file in a folder
+    if os.path.exists(around_meteor):
+        # split in file and directory
+        _, around_meteor = os.path.split(around_meteor)
+        around_meteor = around_meteor[:15]
+
+    mkdirP(output_dir+os.sep+around_meteor)
+    window_of_smothing_avg=3
+    std_multip_threshold=1
+    if N_sim_sel_force!=0:
+        print(around_meteor,'select the best',N_sim_sel_force,'simulations')
+        dist_to_cut=find_knee_dist_index(df_curr_sel_curv,window_of_smothing_avg,std_multip_threshold, output_dir, around_meteor, N_sim_sel_force)
+        # cut at the forced number of simulations
+        df_curr_sel_curv=df_curr_sel_curv.iloc[:dist_to_cut]
+    else:
+        dist_to_cut=find_knee_dist_index(df_curr_sel_curv,window_of_smothing_avg,std_multip_threshold, output_dir, around_meteor)
+        print(around_meteor,'index of the knee distance',dist_to_cut+1)
+        # cut at the change of curvature (the knee)
+        df_curr_sel_curv=df_curr_sel_curv.iloc[:dist_to_cut+1]
+
+    return df_sim_selected, df_curr_sel_curv
+
+
+#### Matrix function ############################################################################
+
+
+
+# Function to perform Varimax rotation
+def varimax(Phi, gamma=1.0, q=20, tol=1e-6):
+    p, k = Phi.shape
+    R = np.eye(k)
+    d = 0
+    for i in range(q):
+        d_old = d
+        Lambda = np.dot(Phi, R)
+        u, s, vh = svd(np.dot(Phi.T, np.asarray(Lambda) ** 3 - (gamma / p) * np.dot(Lambda, np.diag(np.diag(np.dot(Lambda.T, Lambda))))))
+        R = np.dot(u, vh)
+        d = np.sum(s)
+        if d_old != 0 and d / d_old < 1 + tol:
+            break
+    return np.dot(Phi, R)
+
+# Function to compute the Mahalanobis distance
+def mahalanobis_distance(x, mean, cov_inv):
+    diff = x - mean
+    return np.sqrt(np.dot(np.dot(diff, cov_inv), diff.T))
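+
+
+# Illustrative sketch (commented out, not executed by the pipeline): how
+# mahalanobis_distance() above is meant to be used. It expects the precomputed
+# inverse covariance matrix; all values below are hypothetical:
+#
+#   import numpy as np
+#   from numpy.linalg import inv
+#   cloud = np.random.rand(500, 3)              # simulated events in PC space
+#   cov_inv_demo = inv(np.cov(cloud.T))
+#   d = mahalanobis_distance(cloud[0], cloud.mean(axis=0), cov_inv_demo)
+#
+# Unlike the Euclidean distance, directions with large spread in the cloud are
+# down-weighted, so the distance is scale-free across the PC axes.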
+
+
+
+# PCA ####################################################################################
+
+def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel=0, variable_PCA=[], No_var_PCA=['kurtosis','skew','a1_acc_jac','a2_acc_jac','a_acc','b_acc','c_acc','c_mag_init','c_mag_end','a_t0', 'b_t0', 'c_t0'], file_name_obs='', cores_parallel=None, PCA_pairplot=False, esclude_real_solution_from_selection=False):
+    '''
+    This function takes the simulated showers generated by the erosion model and applies PCA.
+    It reads the json files in the folder, creates a csv file with the simulated showers, and takes the data from the GenerateSimulations.py output folder.
+    It returns the dataframe of the selected simulated showers.
+
+    'solution_id','type','vel_init_norot','vel_avg_norot','duration',
+    'mass','peak_mag_height','begin_height','end_height','t0','peak_abs_mag','beg_abs_mag','end_abs_mag',
+    'F','trail_len','deceleration_lin','deceleration_parab','decel_jacchia','decel_t0','zenith_angle', 'kurtosis','skew',
+    'kc','Dynamic_pressure_peak_abs_mag',
+    'a_acc','b_acc','c_acc','a1_acc_jac','a2_acc_jac','a_mag_init','b_mag_init','c_mag_init','a_mag_end','b_mag_end','c_mag_end',
+    'rho','sigma','erosion_height_start','erosion_coeff', 'erosion_mass_index',
+    'erosion_mass_min','erosion_mass_max','erosion_range',
+    'erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass'
+
+    '''
+
+    # if variable_PCA is not empty
+    if variable_PCA != []:
+        # add 'type' and 'solution_id' to the variable_PCA array
+        variable_PCA = ['solution_id','type'] + variable_PCA
+        if No_var_PCA != []:
+            # remove from variable_PCA the variables in No_var_PCA
+            for var in No_var_PCA:
+                variable_PCA.remove(var)
+
+    else:
+        # put in variable_PCA all the variables except mass
+        variable_PCA = list(df_obs_shower.columns)
+        # check if mass is in the variable_PCA
+        if 'mass' in variable_PCA:
+            # remove mass from variable_PCA
+            variable_PCA.remove('mass')
+        # if No_var_PCA is not empty
+        if No_var_PCA != []:
+            # remove from variable_PCA the variables in No_var_PCA
+            for var in No_var_PCA:
+                # check if the variable is in the variable_PCA
+                if var in variable_PCA:
+                    variable_PCA.remove(var)
+
+    scaled_sim=df_sim_shower[variable_PCA].copy()
+    scaled_sim=scaled_sim.drop(['type','solution_id'], axis=1)
+
+    print(len(scaled_sim.columns),'Variables for PCA:\n',scaled_sim.columns)
+
+    # Standardize each column separately
+    scaler = StandardScaler()
+    df_sim_var_sel_standardized = scaler.fit_transform(scaled_sim)
+    df_sim_var_sel_standardized = pd.DataFrame(df_sim_var_sel_standardized, columns=scaled_sim.columns)
+
+    # Identify outliers using the Z-score method on the standardized data
+    z_scores = np.abs(zscore(df_sim_var_sel_standardized))
+    threshold = 3
+    outliers = (z_scores > threshold).any(axis=1)
+
+    # outlier number 0 always has to be False
+    if outliers[0]==True:
+        print('The MetSim reduction is an outlier, still keep it for the PCA analysis')
+        outliers[0]=False
+
+    # Assign df_sim_shower to the version without outliers
+    df_sim_shower = df_sim_shower[~outliers].copy()
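+
+    # Illustrative sketch (commented out, not executed here): the Z-score filter
+    # above flags any row with at least one standardized column beyond 3 sigma.
+    # Toy frame with hypothetical values:
+    #
+    #   import numpy as np
+    #   import pandas as pd
+    #   from scipy.stats import zscore
+    #   col = np.r_[np.random.normal(0, 1, 50), 20.0]    # one extreme event
+    #   toy = pd.DataFrame({'a': col, 'b': np.random.normal(0, 1, 51)})
+    #   mask = (np.abs(zscore(toy)) > 3).any(axis=1)
+    #   toy_clean = toy[~mask]                           # the 20.0 row is dropped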
+
+
+    # if PCA_pairplot:
+
+    # scale the data so it can easily be plotted against each other with the same scale
+    df_sim_var_sel = df_sim_shower[variable_PCA].copy()
+    df_sim_var_sel = df_sim_var_sel.drop(['type','solution_id'], axis=1)
+
+    if len(df_sim_var_sel)>10000:
+        # pick randomly 10000 events
+        print('Number of events in the simulated :',len(df_sim_var_sel))
+        df_sim_var_sel=df_sim_var_sel.sample(n=10000)
+
+    # make a subplot of the distribution of the variables
+    fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15))
+    # flatten it
+    axs = axs.flatten()
+    for i, var in enumerate(variable_PCA[2:]):
+        # plot the distribution of the variable
+        sns.histplot(df_sim_var_sel[var], kde=True, ax=axs[i], color='b', alpha=0.5, bins=20)
+        # axs[i//4, i%4].set_title('Distribution of '+var)
+        # put a vertical line for the df_obs_shower[var] value
+        axs[i].axvline(df_obs_shower[var].values[0], color='limegreen', linestyle='--', linewidth=5)
+        # x axis
+        axs[i].set_xlabel(var)
+        # # grid
+        # axs[i//5, i%5].grid()
+        if i != 0 and i != 5 and i != 10 and i != 15 and i != 20:
+            # delete the y axis
+            axs[i].set_ylabel('')
+
+    # delete the plots that are not used
+    for i in range(len(variable_PCA[2:]), len(axs)):
+        fig.delaxes(axs[i])
+
+    # space between the subplots
+    plt.tight_layout()
+
+    # save the figure
+    plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'_var_hist_real.png')
+    # close the figure
+    plt.close()
+
+
+
+    ##################################### delete variables that are not within the 1st and 99th percentile of the simulated shower #####################################
+
+    # check if a log file with this name already exists
+    if os.path.exists(OUT_PUT_PATH+os.sep+"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt"):
+        # remove the file
+        os.remove(OUT_PUT_PATH+os.sep+"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt")
+    sys.stdout = Logger(OUT_PUT_PATH,"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt") # _30var_99%_13PC
+
+    df_all = pd.concat([df_sim_shower[variable_PCA],df_obs_shower[variable_PCA]], axis=0, ignore_index=True)
+    # delete nan
+    df_all = df_all.dropna()
+
+    # create a copy of df_sim_shower for the resampling
+    df_sim_shower_resample=df_sim_shower.copy()
+    # df_obs_shower_resample=df_obs_shower.copy()
+    No_var_PCA_perc=[]
+    # check that each df_obs_shower value of every variable_PCA falls within the 1st and 99th percentile of df_sim_shower
+    # iterate over a copy, because variables may be removed from variable_PCA inside the loop
+    for var in list(variable_PCA):
+        if var != 'type' and var != 'solution_id':
+            # check if the variable is in the df_obs_shower
+            if var in df_obs_shower.columns:
+                # check if the variable is in the df_sim_shower
+                if var in df_sim_shower.columns:
+
+                    ii_all=0
+                    for i_var in range(len(df_obs_shower[var])):
+                        # count the values that fall outside the 1st and 99th percentile of df_sim_shower
+                        if df_obs_shower[var][i_var] < np.percentile(df_sim_shower[var], 1) or df_obs_shower[var][i_var] > np.percentile(df_sim_shower[var], 99):
+                            ii_all += 1
+
+                    print(var)
+
+                    if ii_all==len(df_obs_shower[var]):
+                        print('The observed and all realizations of',var,'are not within the 1st and 99th percentile of the simulated meteors')
+                        # delete the variable from the variable_PCA
+                        variable_PCA.remove(var)
+                        # save the deleted variable
+                        No_var_PCA_perc.append(var)
+
+                        df_all = df_all.drop(var, axis=1)
+                    else:
+                        shapiro_test = stats.shapiro(df_all[var])
+                        print("Initial Shapiro-Wilk Test:", shapiro_test.statistic,"p-val", shapiro_test.pvalue)
+
+                        if var=='zenith_angle':
+                            # # do the cosine of the zenith angle
+                            # df_all[var]=np.cos(np.radians(df_all[var]))
+                            # # df_all[var]=transform_to_gaussian(df_all[var])
+                            # df_sim_shower_resample[var]=np.cos(np.radians(df_sim_shower_resample[var]))
+                            print('Variable ',var,' is not transformed')
+
+                        elif var=='vel_init_norot':
+                            # df_all[var]=transform_to_gaussian(df_all[var])
+                            print('Variable ',var,' is not transformed')
+
+                        else:
+
+                            pt = PowerTransformer(method='yeo-johnson')
+                            df_all[var]=pt.fit_transform(df_all[[var]])
+                            df_sim_shower_resample[var]=pt.fit_transform(df_sim_shower_resample[[var]])
+
+                            shapiro_test = stats.shapiro(df_all[var])
+                            print("NEW Shapiro-Wilk Test:", shapiro_test.statistic,"p-val", shapiro_test.pvalue)
+
+                else:
+                    print('Variable ',var,' is not in the simulated shower')
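+
+            # Illustrative sketch (commented out, not executed here): the Yeo-Johnson
+            # step above reshapes a skewed column towards normality, which the
+            # Shapiro-Wilk p-value should reflect. Hypothetical values:
+            #
+            #   import numpy as np
+            #   from scipy import stats
+            #   from sklearn.preprocessing import PowerTransformer
+            #   skewed = np.random.lognormal(0.0, 1.0, 500).reshape(-1, 1)
+            #   print('before:', stats.shapiro(skewed.ravel()).pvalue)
+            #   transformed = PowerTransformer(method='yeo-johnson').fit_transform(skewed)
+            #   print('after :', stats.shapiro(transformed.ravel()).pvalue)
+            #
+            # Note that above the transformer is fit separately on df_all and on
+            # df_sim_shower_resample, so the two use slightly different lambdas.
+            else:
+                print('Variable ',var,' is not in the observed shower')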
+
+
+
+    # if PCA_pairplot:
+    df_all_nameless_plot=df_all.copy()
+
+    if len(df_all_nameless_plot)>10000:
+        # pick randomly 10000 events
+        print('Number of events in the simulated:',len(df_all_nameless_plot))
+        df_all_nameless_plot=df_all_nameless_plot.sample(n=10000)
+
+    # make a subplot of the distribution of each variable_PCA after the transformation
+    fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15))
+    # flatten it
+    axs = axs.flatten()
+    for i, var in enumerate(variable_PCA[2:]):
+        # plot the distribution of the variable
+        sns.histplot(df_all_nameless_plot[var].values[:len(df_sim_shower[variable_PCA])], kde=True, ax=axs[i], color='b', alpha=0.5, bins=20)
+        # axs[i//4, i%4].set_title('Distribution of '+var)
+        # put a vertical line for the df_obs_shower[var] value
+        # print(df_all_nameless_plot['solution_id'].values[len(df_sim_shower[variable_PCA])])
+        axs[i].axvline(df_all_nameless_plot[var].values[len(df_sim_shower[variable_PCA])], color='limegreen', linestyle='--', linewidth=5)
+        # x axis
+        axs[i].set_xlabel(var)
+        # # grid
+        # axs[i//5, i%5].grid()
+        if i != 0 and i != 5 and i != 10 and i != 15 and i != 20:
+            # delete the y axis
+            axs[i].set_ylabel('')
+
+    # space between the subplots
+    plt.tight_layout()
+
+    # save the figure
+    plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'_var_hist_yeo-johnson.png')
+    # close the figure
+    plt.close()
+
+    ####################################################################################################################
+
+    # Now that we have all the data, apply PCA to the dataframe
+    df_all_nameless=df_all.drop(['type','solution_id'], axis=1)
+
+    # get the data column names
+    df_all_columns_names=(df_all_nameless.columns)
+
+    # Separating out the features
+    scaled_df_all = df_all_nameless[df_all_columns_names].values
+
+    # standardize the data so it is ready for PCA
+    scaled_df_all = StandardScaler().fit_transform(scaled_df_all)
+
+
+    #################################
+    # Applying the PCA function on the data for the number of components
+    pca = PCA(PCA_percent/100) #PCA_percent
+    # pca = PCA() #PCA_percent
+    all_PCA = pca.fit_transform(scaled_df_all) # fit the data and transform it
+
+    # count the number of PC
+    print('Number of PC:',pca.n_components_)
+
+    ################################# Apply Varimax rotation ####################################
+    loadings = pca.components_.T
+
+    rotated_loadings = varimax(loadings)
+
+    # change the loadings to the rotated loadings in the pca components
+    pca.components_ = rotated_loadings.T
+
+    # Transform the original PCA scores with the rotated loadings: ugly PC space but same results
+    # all_PCA = np.dot(all_PCA, rotated_loadings.T[:pca.n_components_, :pca.n_components_])
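+
+    # Illustrative sketch (commented out, not executed here): fitting a PCA that
+    # keeps enough components to explain 99% of the variance, then Varimax-rotating
+    # the loadings with the varimax() helper defined above. Data are hypothetical:
+    #
+    #   import numpy as np
+    #   from sklearn.decomposition import PCA
+    #   from sklearn.preprocessing import StandardScaler
+    #   X = StandardScaler().fit_transform(np.random.rand(200, 6))
+    #   pca_demo = PCA(0.99)
+    #   scores = pca_demo.fit_transform(X)
+    #   rotated = varimax(pca_demo.components_.T)
+    #
+    # Varimax is an orthogonal rotation, so the total explained variance is
+    # preserved; it only concentrates each variable's loading on fewer components.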
+    ############### PCR ########################################################################################
+
+    # Limits for the physical variables, taken from the range covered by the simulations
+    limits = {
+        'mass': (np.min(df_sim_shower['mass']), np.max(df_sim_shower['mass'])), # Example limits
+        'rho': (np.min(df_sim_shower['rho']), np.max(df_sim_shower['rho'])),
+        'sigma': (np.min(df_sim_shower['sigma']), np.max(df_sim_shower['sigma'])),
+        'erosion_height_start': (np.min(df_sim_shower['erosion_height_start']), np.max(df_sim_shower['erosion_height_start'])),
+        'erosion_coeff': (np.min(df_sim_shower['erosion_coeff']), np.max(df_sim_shower['erosion_coeff'])),
+        'erosion_mass_index': (np.min(df_sim_shower['erosion_mass_index']), np.max(df_sim_shower['erosion_mass_index'])),
+        'erosion_mass_min': (np.min(df_sim_shower['erosion_mass_min']), np.max(df_sim_shower['erosion_mass_min'])),
+        'erosion_mass_max': (np.min(df_sim_shower['erosion_mass_max']), np.max(df_sim_shower['erosion_mass_max']))
+    }
+
+    exclude_columns = ['type', 'solution_id']
+    physical_vars = ['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] #, 'erosion_range', 'erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass'
+
+    # Delete specific columns from variable_PCA
+    variable_PCA_no_info = [col for col in variable_PCA if col not in exclude_columns]
+
+    # # Scale the data
+    # scaled_sim = pd.DataFrame(scaler.fit_transform(df_sim_shower[variable_PCA_no_info + physical_vars]), columns=variable_PCA_no_info + physical_vars)
+
+    # Define X and y (X holds the observable parameters, y the physical ones to be predicted)
+    X = df_sim_shower_resample[variable_PCA_no_info]
+    y = df_sim_shower_resample[physical_vars]
+
+    # Split the data into training and testing sets
+    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
+
+    # Report the PCR predictions for the chosen number of PCs
+    print("PCR Predictions with "+str(pca.n_components_)+"PC :")
+
+    pca_copy=copy.deepcopy(pca)
+    # PCR: Principal Component Regression (the predicted variables are always positive)
+    pcr = make_pipeline(StandardScaler(), pca_copy, LinearRegression())
+
+    pcr.fit(X_train, y_train)
+    # Predict using the model
+    y_pred_pcr = pcr.predict(df_sim_shower_resample[variable_PCA_no_info])
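+
+    # Illustrative sketch (commented out, not executed here): Principal Component
+    # Regression is PCA followed by a linear fit on the scores, which the pipeline
+    # above expresses directly. Minimal version with hypothetical data:
+    #
+    #   import numpy as np
+    #   from sklearn.decomposition import PCA
+    #   from sklearn.linear_model import LinearRegression
+    #   from sklearn.pipeline import make_pipeline
+    #   from sklearn.preprocessing import StandardScaler
+    #   X_demo = np.random.rand(300, 10)                 # observable features
+    #   y_demo = X_demo[:, :2] @ np.random.rand(2, 3)    # 3 physical targets
+    #   pcr_demo = make_pipeline(StandardScaler(), PCA(n_components=5), LinearRegression())
+    #   pcr_demo.fit(X_demo, y_demo)
+    #   print(pcr_demo.predict(X_demo[:1]))
+    #
+    # Note that pcr.fit() above refits the deep-copied pca step on X_train, so the
+    # Varimax rotation applied earlier is recomputed rather than reused as-is.
+    # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [s^2/km^2]','erosion height start [km]','erosion coeff [s^2/km^2]','erosion mass index [-]','eros. mass min [kg]','eros. 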
mass max [kg]'] + to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [s$^2$/km$^2$]', r'$h_{e}$ [km]', r'$\eta$ [s$^2$/km$^2$]', r'$s$ [-]', r'$m_{l}$ [kg]', r'$m_{u}$ [kg]'] #,r'log($m_{u}$)-log($m_{l}$) [-]'] + # multiply y_pred_pcr that has the 'erosion_coeff'*1000000 and 'sigma'*1000000 + y_pred_pcr[:,4]=y_pred_pcr[:,4]*1000000 + y_pred_pcr[:,2]=y_pred_pcr[:,2]*1000000 + # Get the real values + real_values = df_sim_shower_resample[physical_vars].iloc[0].values + # multiply the real_values + real_values[4]=real_values[4]*1000000 + real_values[2]=real_values[2]*1000000 + + # # Apply limits to the predictions + # for i, var in enumerate(physical_vars): + # y_pred_pcr[:, i] = np.clip(y_pred_pcr[:, i], limits[var][0], limits[var][1]) + + # Print the predictions alongside the real values + print("Predicted vs Real Values:") + # print(output_dir+os.sep+'PhysicProp'+n_PC_in_PCA+'_'+str(len(curr_sel))+'ev_dist'+str(np.round(np.min(curr_sel['distance_meteor']),2))+'-'+str(np.round(np.max(curr_sel['distance_meteor']),2))+'.png') + for i, unit in enumerate(to_plot_unit): + y_pred_pcr[0, i]= abs(y_pred_pcr[0, i]) + print(f'{unit}: Predicted: {y_pred_pcr[0, i]:.4g}, Real: {real_values[i]:.4g}') + + pcr_results_physical_param = y_pred_pcr.copy() + print('--------------------------') + + ############### PCR ######################################################################################## + + + # # select only the column with in columns_PC with the same number of n_components + columns_PC = ['PC' + str(x) for x in range(1, pca.n_components_+1)] + + # create a dataframe with the PCA space + df_all_PCA = pd.DataFrame(data = all_PCA, columns = columns_PC) + + ### plot var explained by each PC bar + + percent_variance = np.round(pca.explained_variance_ratio_* 100, decimals =2) + + # plot the explained variance ratio of each principal componenets base on the number of column of the original dimension + plt.bar(x= range(1,len(percent_variance)+1), height=percent_variance, tick_label=columns_PC, color='black') + # ad text at the top of the bar with the percentage of variance explained + for i in range(1,len(percent_variance)+1): + # reduce text size + plt.text(i, percent_variance[i-1], str(percent_variance[i-1])+'%', ha='center', va='bottom', fontsize=5) + + plt.ylabel('Percentance of Variance Explained') + plt.xlabel('Principal Component') + # save the figure + plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'PCAexplained_variance_ratio_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png') + # close the figure + plt.close() + # plt.show() + + ### plot covariance matrix + + # make the image big as the screen + # plt.figure(figsize=(20, 20)) + + # Compute the correlation coefficients + # cov_data = pca.components_.T + # varimax rotation + cov_data = rotated_loadings + + # Plot the correlation matrix + img = plt.matshow(cov_data.T, cmap=plt.cm.coolwarm, vmin=-1, vmax=1) + plt.colorbar(img) + + # Mapping of original variable names to LaTeX-style labels + variable_map = { + 'vel_init_norot': r"$v_i$", + 'vel_avg_norot': r"$v_{avg}$", + 'duration': r"$t$", + 'peak_mag_height': r"$h_{p}$", + 'begin_height': r"$h_{beg}$", + 'end_height': r"$h_{end}$", + 'peak_abs_mag': r"$M_{p}$", + 'beg_abs_mag': r"$M_{beg}$", + 'end_abs_mag': r"$M_{end}$", + 'F': r"$F$", + 'trail_len': r"$L$", + 't0': r"$t_0$", + 'deceleration_lin': r"$dAcc_{lin}$", + 'deceleration_parab': r"$dAcc_{par}$", + 'decel_parab_t0': r"$dAcc_{p_{t_0}}$", + 'decel_t0': r"$dAcc_{p1_{t_0}}$", + 
'decel_jacchia': r"$dAcc_{jac}$", + 'zenith_angle': r"$\zeta$", + 'avg_lag': r"$lag_{avg}$", + 'kc': r"$k_c$", + 'Dynamic_pressure_peak_abs_mag': r"$P_p$", + 'a_mag_init': r"$Mfit_{a_{int}}$", + 'b_mag_init': r"$Mfit_{b_{int}}$", + 'a_mag_end': r"$Mfit_{a_{fin}}$", + 'b_mag_end': r"$Mfit_{b_{fin}}$" + } + + # Convert the given array to LaTeX-style labels + latex_labels = [variable_map.get(var, var) for var in variable_PCA] + + rows_8 = [x for x in latex_labels] + + # add to the columns the PC number the percent_variance + columns_PC_with_var = ['PC' + str(x) + ' (' + str(percent_variance[x-1]) + '%)' for x in range(1, pca.n_components_+1)] + + # Add the variable names as labels on the x-axis and y-axis + plt.xticks(range(len(rows_8)-2), rows_8[2:], rotation=90) + # yticks with variance explained + plt.yticks(range(len(columns_PC_with_var)), columns_PC_with_var) + + # plot the influence of each component on the original dimension + for i in range(cov_data.shape[0]): + for j in range(cov_data.shape[1]): + plt.text(i, j, "{:.1f}".format(cov_data[i, j]), size=5, color='black', ha="center", va="center") + # save the figure + plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'PCAcovariance_matrix_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png') + # close the figure + plt.close() + # plt.show() + ### + + # print the number of simulation selected + print('PCA run for', len(df_sim_shower),'simulations, delete ',len(outliers)-len(df_sim_shower),' outliers') + + # if len(No_var_PCA_perc) > 0: + # for No_var_PCA_perc in No_var_PCA_perc: + # print('Observable data variable [',No_var_PCA_perc,'] is not within the 5 and 95 percentile of the simulated shower') + + # print the name of the variables used in PCA + print('Variables used in PCA: ',df_all_nameless.columns) + + print("explained variance ratio: \n",percent_variance) + + print(str(len(variable_PCA)-2)+' var = '+str(PCA_percent)+'% of the variance explained by ',pca.n_components_,' PC') + + + # add the shower code to the dataframe + df_all_PCA['type'] = df_all['type'].values + + # delete the lines after len(df_sim_shower) to have only the simulated shower + df_sim_PCA = df_all_PCA.drop(df_all_PCA.index[len(df_sim_shower):]) + df_obs_PCA = df_all_PCA.drop(df_all_PCA.index[:len(df_sim_shower)]) + + + ########### Distance metric takes in to account varinace explained #################################################################### + + if esclude_real_solution_from_selection: + df_all_PCA_cov = df_all_PCA[df_all_PCA['type'] != 'Real'].copy() + else: + # delete the type Real from + df_all_PCA_cov = df_all_PCA.copy() + + # Get explained variances of principal components + explained_variance = pca.explained_variance_ratio_ + + # Calculate mean and inverse covariance matrix for Mahalanobis distance + cov_matrix = df_all_PCA_cov.drop(['type'], axis=1).cov() + + # Modify covariance matrix based on explained variances + for i in range(len(explained_variance)): + cov_matrix.iloc[i, :] /= explained_variance[i] + + # # Modify covariance matrix to positively reflect variance explained + # for i in range(len(explained_variance)): + # cov_matrix.iloc[i, :] *= explained_variance[i] + + cov_inv = inv(cov_matrix) + + ############## SELECTION ############################################### + + # group them by Observation, Realization type and the other group by MetSim, Simulation + # meanPCA = df_all_PCA.groupby('type').mean() # does not work + + df_all_PCA['solution_id'] = df_all['solution_id'] + # Create a new column to group 
by broader categories + group_mapping = { + 'Observation': 'obs', + 'Realization': 'obs', + 'Real': 'sim', + 'MetSim': 'sim', + 'Simulation': 'sim' + } + df_all_PCA['group'] = df_all_PCA['type'].map(group_mapping) + df_obs_shower['group'] = df_obs_shower['type'].map(group_mapping) + df_obs_PCA['group'] = df_obs_PCA['type'].map(group_mapping) + + # # Group by the new column and calculate the mean + # meanPCA = df_all_PCA.groupby('group').mean() + + # # drop the sim column + # meanPCA = meanPCA.drop(['sim'], axis=0) + + # Ensure that only numeric columns are used in the mean calculation + df_numeric = df_all_PCA.select_dtypes(include=[np.number]) + + # Group by the new column and calculate the mean only for numeric columns + meanPCA = df_numeric.groupby(df_all_PCA['group']).mean() + + # Drop the 'sim' row if it exists + meanPCA = meanPCA.drop(['sim'], axis=0, errors='ignore') + + # print(meanPCA) + + meanPCA_current = meanPCA.loc[(meanPCA.index == 'obs')].values.flatten() + # take only the value of the mean of the first row + shower_current = df_obs_shower[df_obs_shower['group'] == 'obs'] + shower_current_PCA = df_obs_PCA[df_obs_PCA['group'] == 'obs'] + + # trasform the dataframe in an array + shower_current_PCA = shower_current_PCA.drop(['type','group'], axis=1).values + + # define the distance + mkdirP(OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER) + if esclude_real_solution_from_selection: + # delete the type Real from + input_list_obs_dist = [[df_sim_PCA[df_sim_PCA['type'] != 'Real'], shower_current_PCA[ii], cov_inv, meanPCA_current, df_sim_shower[df_sim_shower['type'] != 'Real'], shower_current.iloc[ii], N_sim_sel, OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER] for ii in range(len(shower_current))] + df_sim_selected_both_df = domainParallelizer(input_list_obs_dist, dist_PCA_space_select_sim, cores=cores_parallel) + + else: + input_list_obs_dist = [[df_sim_PCA, shower_current_PCA[ii], cov_inv, meanPCA_current, df_sim_shower, shower_current.iloc[ii], N_sim_sel, OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER] for ii in range(len(shower_current))] + df_sim_selected_both_df = domainParallelizer(input_list_obs_dist, dist_PCA_space_select_sim, cores=cores_parallel) + + + # separet df_sim_selected the '' to a list of dataframe called df_sim_selected_all and df_sim_selected_knee + df_sim_selected_all = [] + df_sim_selected_knee = [] + for item in df_sim_selected_both_df: + if isinstance(item, tuple): + df_sim_selected_all.append(item[0]) + df_sim_selected_knee.append(item[1]) + + df_sim_selected_all = pd.concat(df_sim_selected_all) + df_sel_shower = pd.concat(df_sim_selected_knee) + + # DELETE ALL INDEX + + # Insert the column at the first position + df_sim_selected_all.insert(1, 'distance_mean', df_sim_selected_all.pop('distance_mean')) + df_sim_selected_all.insert(1, 'distance_meteor', df_sim_selected_all.pop('distance_meteor')) + df_sim_selected_all.insert(1, 'solution_id_dist', df_sim_selected_all.pop('solution_id_dist')) + df_sim_selected_all.insert(1, 'type', df_sim_selected_all.pop('type')) + + df_sim_selected_all.reset_index(drop=True, inplace=True) + + df_sim_selected_all.to_csv(OUT_PUT_PATH+os.sep+file_name_obs+'_sim_sel.csv', index=False) + + # Insert the column at the first position + df_sel_shower.insert(1, 'distance_mean', df_sel_shower.pop('distance_mean')) + df_sel_shower.insert(1, 'distance_meteor', df_sel_shower.pop('distance_meteor')) + df_sel_shower.insert(1, 'solution_id_dist', df_sel_shower.pop('solution_id_dist')) + df_sel_shower.insert(1, 'type', df_sel_shower.pop('type')) + + 
df_sel_shower.reset_index(drop=True, inplace=True) + + df_sel_shower.to_csv(OUT_PUT_PATH+os.sep+file_name_obs+'_sim_sel_bf_knee.csv', index=False) + + if isinstance(df_sel_shower, tuple): + df_sel_shower = df_sel_shower[0] + if isinstance(df_sim_selected_all, tuple): + df_sim_selected_all = df_sim_selected_all[0] + + # DELETE ALL old INDEX + + # Create the new DataFrame by filtering df_sim_PCA + df_sel_PCA = df_all_PCA[df_all_PCA['solution_id'].isin(df_sel_shower['solution_id'])] + # change all df_sel_PCA 'type' to Simulation_sel + df_sel_PCA['type'] = 'Simulation_sel' + # reset the index + df_sel_PCA.reset_index(drop=True, inplace=True) + + # df_sel_shower_no_repetitions = df_sim_shower[df_sim_shower['solution_id'].isin(df_sel_shower['solution_id'])] + # # change all df_sel_PCA 'type' to Simulation_sel + # df_sel_shower_no_repetitions['type'] = 'Simulation_sel' + # # reset the index + # df_sel_shower_no_repetitions.reset_index(drop=True, inplace=True) + + df_sel_shower_no_repetitions = df_sel_shower.copy() + + # group by solution_id_dist and keep only n_confront_sel from each group + df_sel_shower_no_repetitions = df_sel_shower_no_repetitions.groupby('solution_id_dist').head(len(df_sel_shower_no_repetitions)) + + # order by distance_meteor + df_sel_shower_no_repetitions = df_sel_shower_no_repetitions.sort_values('distance_meteor') + + # count duplicates and add a column for the number of duplicates + df_sel_shower_no_repetitions['num_duplicates'] = df_sel_shower_no_repetitions.groupby('solution_id')['solution_id'].transform('size') + + df_sel_shower_no_repetitions['solution_id_dist'] = df_obs_shower['solution_id'].values[0] + + df_sel_shower_no_repetitions.drop_duplicates(subset='solution_id', keep='first', inplace=True) + + # save df_sel_shower_real to disk + df_sel_shower_no_repetitions.to_csv(OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_obs+'_sim_sel_to_optimize.csv', index=False) + + + + print('\nSUCCESS: the simulated meteor have been selected\n') + + # Close the Logger to ensure everything is written to the file STOP COPY in TXT file + sys.stdout.close() + + # Reset sys.stdout to its original value if needed + sys.stdout = sys.__stdout__ + + ########### save dist to observed shower ######################################## + + # # save dist also on selected shower + # distance_current = [] + # for i_shower in range(len(shower_current)): + # distance_current.append(scipy.spatial.distance.euclidean(meanPCA_current, shower_current_PCA[i_shower])) + # shower_current['distance_mean']=distance_current # from the mean of the selected shower + # shower_current.to_csv(OUT_PUT_PATH+os.sep+file_name_obs+'_obs_and_dist.csv', index=False) + + # PLOT the selected simulated shower ######################################## + + # dataframe with the simulated and the selected meteors in the PCA space + # df_sim_sel_PCA = pd.concat([df_sim_PCA,df_sel_PCA], axis=0) + + if PCA_pairplot: + + df_sim_shower_small=df_sim_shower.copy() + + if len(df_sim_shower_small)>10000: # w/o takes forever to plot + # pick randomly 10000 events + df_sim_shower_small=df_sim_shower_small.sample(n=10000) + + print('generating sel sim histogram plot...') + + # Define a custom palette + custom_palette = { + 'Real': "r", + 'Simulation': "b", + 'Simulation_sel': "darkorange", + 'MetSim': "k", + 'Realization': "mediumaquamarine", + 'Observation': "limegreen" + } + + + curr_df = pd.concat([df_sim_shower_small,df_sel_shower,df_obs_shower], axis=0) + + curr_df['num_type'] = 
curr_df.groupby('type')['type'].transform('size') + curr_df['weight'] = 1 / curr_df['num_type'] + + + fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15)) + # flatten the axs + axs = axs.flatten() + + # to_plot_unit=['init vel [km/s]','avg vel [km/s]','duration [s]','begin height [km]','peak height [km]','end height [km]','begin abs mag [-]','peak abs mag [-]','end abs mag [-]','F parameter [-]','zenith angle [deg]','deceleration [km/s^2]','trail lenght [km]','kurtosis','skew'] + + # to_plot=['vel_init_norot','vel_avg_norot','duration','begin_height','peak_mag_height','end_height','beg_abs_mag','peak_abs_mag','end_abs_mag','F','zenith_angle','decel_parab_t0','trail_len','kurtosis','skew'] + + # deleter form curr_df the mass + #curr_df=curr_df.drop(['mass'], axis=1) + for ii, var in enumerate(variable_PCA[2:]): + + # if var in ['decel_parab_t0','decel_t0']: + # sns.histplot(curr_df, x=x_plot[x_plot>-500], weights=curr_df['weight'][x_plot>-500],hue='type', ax=axs[ii], kde=True, palette=custom_palette, bins=20) + # axs[ii].set_xticks([np.round(np.min(x_plot[x_plot>-500]),2),np.round(np.max(x_plot[x_plot>-500]),2)]) + + # else: + + sns.histplot(curr_df, x=var, weights=curr_df['weight'], hue='type', ax=axs[ii], kde=True, palette=custom_palette, bins=20) + axs[ii].set_xticks([np.round(np.min(curr_df[var]),2),np.round(np.max(curr_df[var]),2)]) + + # if beg_abs_mag','peak_abs_mag','end_abs_mag inver the x axis + if var in ['beg_abs_mag','peak_abs_mag','end_abs_mag']: + axs[ii].invert_xaxis() + + # Set the x-axis formatter to ScalarFormatter + axs[ii].xaxis.set_major_formatter(ScalarFormatter()) + axs[ii].ticklabel_format(useOffset=False, style='plain', axis='x') + # Set the number of x-axis ticks to 3 + # axs[ii].xaxis.set_major_locator(MaxNLocator(nbins=3)) + + axs[ii].set_ylabel('probability') + axs[ii].set_xlabel(var) + axs[ii].get_legend().remove() + # check if there are more than 3 ticks and if yes only use the first and the last + + # put y axis in log scale + axs[ii].set_yscale('log') + axs[ii].set_ylim(0.01,1) + + + # more space between the subplots + plt.tight_layout() + # # full screen + # figManager = plt.get_current_fig_manager() + # figManager.window.showMaximized() + + # save the figure + fig.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'_Histograms_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png', dpi=300) + plt.close() + + if len(df_sim_PCA)>10000: # w/o takes forever to plot + # df_sim_PCA=df_sim_PCA.sample(n=10000) + # pick only the one with the same index in df_sim_shower_small + df_sim_PCA = df_sim_PCA[df_sim_PCA.index.isin(df_sim_shower_small.index)] + + print('generating PCA space plot...') + + df_sim_sel_PCA = pd.concat([df_sim_PCA,df_sel_PCA,df_obs_PCA], axis=0) + + # Select only the numeric columns for percentile calculations + numeric_columns = df_sim_sel_PCA.select_dtypes(include=[np.number]).columns + + # Create a new column for point sizes + df_sim_sel_PCA['point_size'] = df_sim_sel_PCA['type'].map({ + 'Simulation_sel': 5, + 'Simulation': 5, + 'MetSim': 20, + 'Realization': 20, + 'Observation': 40 + }) + + + # open a new figure to plot the pairplot + fig = plt.figure(figsize=(10, 10), dpi=300) + + # # fig = sns.pairplot(df_sim_sel_PCA, hue='type', plot_kws={'alpha': 0.6, 's': 5, 'edgecolor': 'k'},corner=True) + # fig = sns.pairplot(df_sim_sel_PCA, hue='type',corner=True, palette='bright', diag_kind='kde', plot_kws={'s': 5, 'edgecolor': 'k'}) + # # plt.show() + + # Create the pair plot without points 
initially + fig = sns.pairplot(df_sim_sel_PCA[numeric_columns.append(pd.Index(['type']))], hue='type', corner=True, palette=custom_palette, diag_kind='kde', plot_kws={'s': 5, 'edgecolor': 'k'}) + + # Overlay scatter plots with custom point sizes + for i in range(len(fig.axes)): + for j in range(len(fig.axes)): + if i > j: + # check if the variable is in the list of the numeric_columns and set the axis limit + if df_sim_sel_PCA.columns[j] in numeric_columns and df_sim_sel_PCA.columns[i] in numeric_columns: + + ax = fig.axes[i, j] + sns.scatterplot(data=df_sim_sel_PCA, x=df_sim_sel_PCA.columns[j], y=df_sim_sel_PCA.columns[i], hue='type', size='point_size', sizes=(5, 40), ax=ax, legend=False, edgecolor='k', palette=custom_palette) + + # ax.set_xlim(percentiles_1[df_sim_sel_PCA.columns[j]], percentiles_99[df_sim_sel_PCA.columns[j]]) + # ax.set_ylim(percentiles_1[df_sim_sel_PCA.columns[i]], percentiles_99[df_sim_sel_PCA.columns[i]]) + + # delete the last row of the plot + # fig.axes[-1, -1].remove() + # Hide the last row of plots + # for ax in fig.axes[-1]: + # ax.remove() + + # Adjust the subplots layout parameters to give some padding + plt.subplots_adjust(hspace=0.3, wspace=0.3) + # plt.show() + + # save the figure + fig.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'PCAspace_sim_sel_real_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png') + # close the figure + plt.close() + + print('generating result variable plot...') + + output_folder=OUT_PUT_PATH+os.sep+file_name_obs+VAR_SEL_DIR_SUFX + # check if the output_folder exists + if not os.path.isdir(output_folder): + mkdirP(output_folder) + + # df_sim_PCA,df_sel_PCA,df_obs_PCA + # print(df_sim_shower) + # loop all physical variables + physical_vars = ['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] + for var_phys in physical_vars: + # make a subplot of the rho againist each variable_PCA as a scatter plot + fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15)) + # flat it + axs = axs.flatten() + + for i, var in enumerate(variable_PCA[2:]): + # plot the rho againist the variable with black borders + axs[i].scatter(df_sim_shower_small[var], df_sim_shower_small[var_phys], c='b') #, edgecolors='k', alpha=0.5 + + axs[i].scatter(df_sel_shower[var], df_sel_shower[var_phys], c='orange') #, edgecolors='k', alpha=0.5 + # put a green vertical line for the df_obs_shower[var] value + axs[i].axvline(shower_current[var].values[0], color='limegreen', linestyle='--', linewidth=5) + # put a horizontal line for the rho of the first df_sim_shower_small + axs[i].axhline(df_sim_shower[var_phys].values[0], color='k', linestyle='-', linewidth=2) + # axs[i].set_title(var) + # as a suptitle put the variable_PCA + # fig.suptitle(var_phys) + if i == 0 or i == 5 or i == 10 or i == 15 or i == 20: + # as a suptitle put the variable_PCA + axs[i].set_ylabel(var_phys) + + # x axis + axs[i].set_xlabel(var) + + # grid + axs[i].grid() + # make y axis log if the variable is 'erosion_mass_min' 'erosion_mass_max' + if var_phys == 'erosion_mass_min' or var_phys == 'erosion_mass_max': + axs[i].set_yscale('log') + + plt.tight_layout() + # save the figure + plt.savefig(output_folder+os.sep+file_name_obs+var_phys+'_vs_var_select_PCA.png') + # close the figure + plt.close() + + print('generating PCA position plot...') + + output_folder=OUT_PUT_PATH+os.sep+file_name_obs+PCA_SEL_DIR_SUFX + # check if the output_folder exists + if not 
os.path.isdir(output_folder): + mkdirP(output_folder) + + # loop all pphysical variables + physical_vars = ['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] + for var_phys in physical_vars: + + # make a subplot of the rho againist each variable_PCA as a scatter plot + fig, axs = plt.subplots(int(np.ceil(len(columns_PC)/5)), 5, figsize=(20, 15)) + + # flatten the axs array + axs = axs.flatten() + for i, var in enumerate(columns_PC): + # plot the rho againist the variable with black borders + axs[i].scatter(df_sim_PCA[var], df_sim_shower_small[var_phys], c='b') #, edgecolors='k', alpha=0.5 + + axs[i].scatter(df_sel_PCA[var], df_sel_shower_no_repetitions[var_phys], c='orange') #, edgecolors='k', alpha=0.5 + # put a green vertical line for the df_obs_shower[var] value + axs[i].axvline(df_obs_PCA[var].values[0], color='limegreen', linestyle='--', linewidth=5) + # put a horizontal line for the rho of the first df_sim_shower_small + axs[i].axhline(df_sim_shower[var_phys].values[0], color='k', linestyle='-', linewidth=2) + # axs[i].set_title(var) + # # as a suptitle put the variable_PCA + # fig.suptitle(var_phys) + if i == 0 or i == 5 or i == 10 or i == 15 or i == 20: + # as a suptitle put the variable_PCA + axs[i].set_ylabel(var_phys) + # axis x + axs[i].set_xlabel(var) + # grid + axs[i].grid() + # make y axis log if the variable is 'erosion_mass_min' 'erosion_mass_max' + if var_phys == 'erosion_mass_min' or var_phys == 'erosion_mass_max': + axs[i].set_yscale('log') + + # delete the subplot that are not used + for i in range(len(columns_PC), len(axs)): + fig.delaxes(axs[i]) + + plt.tight_layout() + # save the figure + plt.savefig(output_folder+os.sep+file_name_obs+var_phys+'_vs_var_select_PC_space.png') + # close the figure + plt.close() + + + return df_sel_shower, df_sel_shower_no_repetitions, df_sim_selected_all, pcr_results_physical_param, pca.n_components_ + + + + + + +def PCAcorrelation_selPLOT(curr_sim_init, curr_sel, n_PC_in_PCA='',output_dir=''): + + curr_sim=curr_sim_init.copy() + if len(curr_sim)>10000: + # pick randomly 10000 events + print('Number of events in the simulated :',len(curr_sim)) + curr_sim=curr_sim.sample(n=10000).copy() + + curr_sel=curr_sel.copy() + curr_sel = curr_sel.drop_duplicates(subset='solution_id') + curr_df_sim_sel=pd.concat([curr_sim,curr_sel], axis=0, ignore_index=True) + + # Define your label mappings + label_mappings = { + 'mass': 'mass [kg]', + 'rho': 'rho [kg/m^3]', + 'sigma': 'sigma [s^2/km^2]', + 'erosion_height_start': 'erosion height start [km]', + 'erosion_coeff': 'erosion coeff [s^2/km^2]', + 'erosion_mass_index': 'erosion mass index [-]', + 'erosion_mass_min': 'log eros. mass min [kg]', + 'erosion_mass_max': 'log eros. 
mass max [kg]' + } + + # Define a custom palette + custom_palette = { + 'Real': "r", + 'Simulation': "b", + 'Simulation_sel': "darkorange", + 'MetSim': "k", + 'Realization': "mediumaquamarine", + 'Observation': "limegreen" + } + + to_plot8 = ['type', 'mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max'] + hue_column = 'type' + + + # Create a PairGrid + pairgrid = sns.PairGrid(curr_df_sim_sel[to_plot8], hue=hue_column, palette=custom_palette) + + # Map the plots + pairgrid.map_lower(sns.scatterplot, edgecolor='k', palette=custom_palette) + # for the upper triangle delete x and y axis + # pairgrid.map_diag(sns.kdeplot) + # pairgrid.map_diag(sns.histplot, kde=True, color='k', edgecolor='k') + # pairgrid.add_legend() + + # Update the labels + for ax in pairgrid.axes.flatten(): + if ax is not None: # Check if the axis exists + xlabel = ax.get_xlabel() + ylabel = ax.get_ylabel() + if ylabel in label_mappings: + ax.set_ylabel(label_mappings[ylabel]) + if xlabel in label_mappings: + ax.set_xlabel(label_mappings[xlabel]) + if ylabel in ['erosion_mass_min', 'erosion_mass_max']:#'sigma', + ax.set_yscale('log') + if xlabel in ['erosion_mass_min', 'erosion_mass_max']: #'sigma', + ax.set_xscale('log') + + # # Calculate the correlation matrix + # corr = curr_df_sim_sel[to_plot8[1:]].corr() + + corr = curr_sel[to_plot8[1:]].corr() + + # Find the min and max correlation values + vmin = corr.values.min() + vmax = corr.values.max() + norm = Normalize(vmin=vmin, vmax=vmax) + cmap = sns.color_palette('coolwarm', as_cmap=True) + + # Fill the upper triangle plots with the correlation matrix values and color it with the coolwarm cmap + for i, row in enumerate(to_plot8[1:]): + for j, col in enumerate(to_plot8[1:]): + if i < j: + ax = pairgrid.axes[i, j] # Adjust index to fit the upper triangle + corr_value = corr.loc[row, col] + ax.text(0.5, 0.5, f'{corr_value:.2f}', horizontalalignment='center', verticalalignment='center', fontsize=12, color='black', transform=ax.transAxes) + ax.set_facecolor(cmap(norm(corr_value))) + # cmap = sns.color_palette('coolwarm', as_cmap=True) + # ax.set_facecolor(cmap(corr_value)) + + # Remove the axis labels + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_visible(False) + if i == j: + ax = pairgrid.axes[i, j] + ax.set_axis_off() + + # Adjust layout + plt.tight_layout() + + fig_name = (output_dir+os.sep+'MixPhysicPropPairPlot_'+str(n_PC_in_PCA)+'PC_'+str(len(curr_sel))+'ev.png') + plt.savefig(fig_name, dpi=300) + + # Close the figure + plt.close() + + ########################################################################## + ########################################################################## + + + + + +def PCA_physicalProp_KDE_MODE_PLOT(df_sim, df_obs, df_sel, n_PC_in_PCA, fit_funct, mag_noise_real, len_noise_real, Metsim_folderfile_json='', file_name_obs='', folder_file_name_real='', output_dir='', total_distribution=False, save_log=False): + print('PCA_physicalProp_KDE_MODE_PLOT') + output_dir_OG=output_dir + + pd_datafram_PCA_selected_mode_min_KDE=pd.DataFrame() + + # sigma5=5 + + # 5 sigma confidence interval + # five_sigma=False + # mag_noise = MAG_RMSD*SIGMA_ERR + # len_noise = LEN_RMSD*SIGMA_ERR + mag_noise = mag_noise_real.copy() + len_noise = len_noise_real.copy() + + # # Standard deviation of the magnitude Gaussian noise 1 sigma + # # SD of 
noise in length (m) 1 sigma in km + len_noise= len_noise/1000 + # velocity noise 1 sigma km/s + # vel_noise = (len_noise*np.sqrt(2)/(1/FPS)) + vel_noise = (len_noise/(1/FPS)) + + # check if end with pickle + if folder_file_name_real.endswith('.pickle'): + data_file_real = read_pickle_reduction_file(folder_file_name_real) + elif folder_file_name_real.endswith('.json'): + data_file_real = read_with_noise_GenerateSimulations_output(folder_file_name_real) + + _, _, _, residuals_mag_real, residuals_vel_real, _, residual_time_pos_real, residual_height_pos_real = RMSD_calc_diff(data_file_real, fit_funct) + + if total_distribution: + df_sel['solution_id_dist'] = df_obs['solution_id'].iloc[0] + df_obs=df_obs.iloc[[0]] + + # Get the default color cycle + color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color'] + # Create an infinite cycle of colors + infinite_color_cycle = itertools.cycle(color_cycle) + + for jj in range(len(df_obs)): + + fig, ax = plt.subplots(2, 3, figsize=(14, 6),gridspec_kw={'height_ratios': [ 3, 0.5],'width_ratios': [ 3, 0.5, 3]}) + # fig, ax = plt.subplots(2, 4) + # flat the ax + ax = ax.flatten() + + around_meteor=df_obs.iloc[jj]['solution_id'] + curr_sel = df_sel[df_sel['solution_id_dist'] == around_meteor] + curr_sel['erosion_coeff']=curr_sel['erosion_coeff']*1000000 + curr_sel['sigma']=curr_sel['sigma']*1000000 + + # check if around_meteor is a file in a folder + is_real=False + if os.path.exists(around_meteor): + is_real=True + # split in file and directory + _, around_meteor = os.path.split(around_meteor) + around_meteor = around_meteor[:15] + + if total_distribution==False: + output_dir=output_dir_OG+os.sep+SAVE_SELECTION_FOLDER+os.sep+around_meteor + + plot_side_by_side(data_file_real, fig, ax, 'go', file_name_obs[:15]+'\nRMSDmag '+str(round(mag_noise_real,3))+' RMSDlen '+str(round(len_noise_real/1000,3)), residuals_mag_real, residuals_vel_real, residual_time_pos_real, residual_height_pos_real, fit_funct, mag_noise, vel_noise,'Std.dev. 
realizations') + + densest_point = '' + + print('Number of selected events:',len(curr_sel)) + + if len(curr_sel)<2: + print('Check if the event is below RMSD') + ii=0 + Metsim_flag=False + try: + namefile_sel = curr_sel['solution_id'].iloc[ii] + except IndexError: + # Handle the error + print(f"Index {ii} is out of bounds for 'solution_id' in curr_sel.") + namefile_sel = None + continue + # namefile_sel = curr_sel['solution_id'].iloc[ii] + + # chec if the file exist + if not os.path.isfile(namefile_sel): + print('file '+namefile_sel+' not found') + continue + + else: + if namefile_sel.endswith('.pickle'): + data_file = read_pickle_reduction_file(namefile_sel) + pd_datafram_PCA_sim = array_to_pd_dataframe_PCA(data_file) + + elif namefile_sel.endswith('.json'): + # open the json file with the name namefile_sel + f = open(namefile_sel,"r") + data = json.loads(f.read()) + if 'ht_sampled' in data: + data_file = read_GenerateSimulations_output(namefile_sel, data_file_real) + pd_datafram_PCA_sim = array_to_pd_dataframe_PCA(data_file) + + else: + Metsim_flag=True + _, data_file, pd_datafram_PCA_sim = run_simulation(namefile_sel, data_file_real) + + rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(data_file, fit_funct) + + color_line=next(infinite_color_cycle) + + if Metsim_flag: + + # plot_side_by_side(data_file, fig, ax, '-k', ii, residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + plot_side_by_side(data_file, fig, ax, '-k', 'Metsim data event\n\ +RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ + m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ + rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ + er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ + er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + + + else: + + plot_side_by_side(data_file, fig, ax, '-','RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+' \n\ + m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ + rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ + er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ + er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + # change first line color + ax[0].lines[1].set_color(color_line) + ax[1].lines[1].set_color(color_line) + ax[2].lines[1].set_color(color_line) + ax[5].lines[1].set_color(color_line) + + # pu the leggend putside the plot and adjust the plot base on the screen size + ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.) 
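+
+        # Illustrative sketch (commented out, not executed here) of the mode search
+        # used further below: a Gaussian KDE is built over the selected events and
+        # its density is maximized by minimizing the negative KDE. Hypothetical data:
+        #
+        #   import numpy as np
+        #   from scipy.stats import gaussian_kde
+        #   from scipy.optimize import minimize
+        #   pts = np.random.normal([0, 5], [1, 2], size=(200, 2))  # 200 events, 2 vars
+        #   kde_demo = gaussian_kde(pts.T)
+        #   res = minimize(lambda x: -kde_demo(x)[0], x0=pts.mean(axis=0), method='L-BFGS-B')
+        #   densest = res.x   # should land near (0, 5)
+        #
+        # Starting from several initial guesses (mean, median, KMeans centroids), as
+        # done below, guards against the optimizer stalling in a local minimum.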
+ # the legend do not fit in the plot, so adjust the plot + plt.subplots_adjust(right=.7) + plt.subplots_adjust(wspace=0.2) + + # make more space + plt.tight_layout() + + # split in file and directory + _, name_file = os.path.split(curr_sel['solution_id'].iloc[ii]) + if rmsd_mag 8: + try: + + # def density_function(x): + # # Insert the logic of your objective function here + # # This example uses a simple sum of squares of x + # # Replace it with the actual function you want to minimize + # return np.sum(np.square(x)) + + # # Objective function for maximization (negative density for minimization) + # def objective_function(x): + # return -density_function(x) + + # # Bounds for optimization within all the sim space + # bounds = [(np.min(curr_sel_data[:, i]), np.max(curr_sel_data[:, i])) for i in range(curr_sel_data.shape[1])] + + # # Perform global optimization using differential evolution + # print('Starting global optimization using differential evolution.') + # result = differential_evolution(objective_function, bounds) + + # if result.success: + # densest_point = result.x + # print(f"Densest point found using differential evolution:\n {densest_point}") + # else: + # print('Optimization was unsuccessful.') + # densest_point = '' + + kde = gaussian_kde(dataset=curr_sel_data.T) # Note the transpose to match the expected input shape + + # Negative of the KDE function for optimization + def neg_density(x): + return -kde(x) + + # Bounds for optimization within all the sim space + # data_sim = df_sim[var_kde].values + bounds = [(np.min(curr_sel_data[:, i]), np.max(curr_sel_data[:, i])) for i in range(curr_sel_data.shape[1])] + + # Initial guesses: curr_sel_data mean, curr_sel_data median, and KMeans centroids + mean_guess = np.mean(curr_sel_data, axis=0) + median_guess = np.median(curr_sel_data, axis=0) + + # KMeans centroids as additional guesses + kmeans = KMeans(n_clusters=5, n_init='auto').fit(curr_sel_data) # Adjust n_clusters based on your understanding of the curr_sel_data + centroids = kmeans.cluster_centers_ + + # Combine all initial guesses + initial_guesses = [mean_guess, median_guess] + centroids.tolist() + + # Perform optimization from each initial guess + results = [minimize(neg_density, x0, method='L-BFGS-B', bounds=bounds) for x0 in initial_guesses] + + # Filter out unsuccessful optimizations and find the best result + successful_results = [res for res in results if res.success] + + if successful_results: + best_result = min(successful_results, key=lambda x: x.fun) + densest_point = best_result.x + print("Densest point using KMeans centroid:\n", densest_point) + else: + # raise ValueError('Optimization was unsuccessful. Consider revising the strategy.') + print('Optimization was unsuccessful. Consider revising the strategy.') + # revise the optimization strategy + print('Primary optimization strategies were unsuccessful. 
Trying fallback strategy (Grid Search).') + # Fallback strategy: Grid Search + grid_size = 5 # Define the grid size for the search + grid_points = [np.linspace(bound[0], bound[1], grid_size) for bound in bounds] + grid_combinations = list(itertools.product(*grid_points)) + + best_grid_point = None + best_grid_density = -np.inf + + for point in grid_combinations: + density = kde(point) + if density > best_grid_density: + best_grid_density = density + best_grid_point = point + + if best_grid_point is not None: + densest_point = np.array(best_grid_point) + print("Densest point found using Grid Search:\n", densest_point) + else: + print("None of the strategy worked no KDE result, change the selected simulations") + except np.linalg.LinAlgError as e: + print(f"LinAlgError: {str(e)}") + else: + print('Not enough data to perform the KDE need more than 8 meteors') + + # if pickle change the extension and the code ################################################################################################## + if Metsim_folderfile_json != '': + # Load the nominal simulation parameters + const_nominal, _ = loadConstants(Metsim_folderfile_json) + else: + const_nominal, _ = loadConstants() + + const_nominal.dens_co = np.array(const_nominal.dens_co) + + dens_co=np.array(const_nominal.dens_co) + + # print(const_nominal.__dict__) + + ### Calculate atmosphere density coeffs (down to the bottom observed height, limit to 15 km) ### + + # Determine the height range for fitting the density + dens_fit_ht_beg = const_nominal.h_init + # dens_fit_ht_end = const_nominal.h_final + + # Assign the density coefficients + const_nominal.dens_co = dens_co + + # Turn on plotting of LCs of individual fragments + const_nominal.fragmentation_show_individual_lcs = True + + # # change the sigma of the fragmentation + # const_nominal.sigma = 1.0 + + # 'rho': 209.27575861617834, 'm_init': 1.3339843905562902e-05, 'v_init': 59836.848805126894, 'shape_factor': 1.21, 'sigma': 1.387556841276162e-08, 'zenith_angle': 0.6944268835985749, 'gamma': 1.0, 'rho_grain': 3000, 'lum_eff_type': 5, 'lum_eff': 0.7, 'mu': 3.8180000000000003e-26, 'erosion_on': True, 'erosion_bins_per_10mass': 10, 'erosion_height_start': 117311.48011974395, 'erosion_coeff': 6.356639734390828e-07, 'erosion_height_change': 0, 'erosion_coeff_change': 3.3e-07, 'erosion_rho_change': 3700, 'erosion_sigma_change': 2.3e-08, 'erosion_mass_index': 1.614450928834309, 'erosion_mass_min': 4.773894502090459e-11, 'erosion_mass_max': 7.485333377052805e-10, 'disruption_on': False, 'compressive_strength': 2000, + + # create a copy of the const_nominal + const_nominal_1D_KDE = copy.deepcopy(const_nominal) + const_nominal_allD_KDE = copy.deepcopy(const_nominal) + + var_cost=['m_init','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] + # print for each variable the kde + percent_diff_1D=[] + percent_diff_allD=[] + for i in range(len(var_kde)): + + x=curr_sel[var_kde[i]] + + # Check if dataset has multiple elements + if len(x) < 2: + # If dataset has fewer than 2 elements, duplicate the single element or skip + print(f"Dataset for {var_kde[i]} has less than 2 elements. 
Duplicating elements to compute KDE.") + x = np.concatenate([x, x]) # Duplicate elements to have at least two + + # Compute KDE + kde = gaussian_kde(x) + + # Define the range for which you want to compute KDE values, with more points for higher accuracy + kde_x = np.linspace(x.min(), x.max(), 1000) + kde_values = kde(kde_x) + + # Find the mode (x-value where the KDE curve is at its maximum) + mode_index = np.argmax(kde_values) + mode = kde_x[mode_index] + + real_val=df_sim[var_kde[i]].iloc[0] + + print() + if df_sim['type'].iloc[0]=='MetSim' or df_sim['type'].iloc[0]=='Real': + print(f"MetSim value {var_kde[i]}: {'{:.4g}'.format(real_val)}") + print(f"1D Mode of KDE for {var_kde[i]}: {'{:.4g}'.format(mode)} percent diff: {'{:.4g}'.format(abs((real_val-mode)/(real_val+mode))/2*100)}%") + percent_diff_1D.append(abs((real_val-mode)/(real_val+mode))/2*100) + if densest_point!='': + print(f"Mult.dim. KDE densest {var_kde[i]}: {'{:.4g}'.format(densest_point[i])} percent diff: {'{:.4g}'.format(abs((real_val-densest_point[i])/(real_val+densest_point[i]))/2*100)}%") + percent_diff_allD.append(abs((real_val-densest_point[i])/(real_val+densest_point[i]))/2*100) + # print the value of const_nominal + # print(f"const_nominal {var_cost[i]}: {'{:.4g}'.format(const_nominal.__dict__[var_cost[i]])}") + + if var_cost[i] == 'sigma' or var_cost[i] == 'erosion_coeff': + # put it back as it was + const_nominal_1D_KDE.__dict__[var_cost[i]]=mode/1000000 + if densest_point!='': + const_nominal_allD_KDE.__dict__[var_cost[i]]=densest_point[i]/1000000 + elif var_cost[i] == 'erosion_height_start': + # put it back as it was + const_nominal_1D_KDE.__dict__[var_cost[i]]=mode*1000 + if densest_point!='': + const_nominal_allD_KDE.__dict__[var_cost[i]]=densest_point[i]*1000 + else: + # add each to const_nominal_1D_KDE and const_nominal_allD_KDE + const_nominal_1D_KDE.__dict__[var_cost[i]]=mode + if densest_point!='': + const_nominal_allD_KDE.__dict__[var_cost[i]]=densest_point[i] + + # check if the file output_folder+os.sep+file_name+'_sim_sel_optimized.csv' exists then read + if os.path.exists(output_dir+os.sep+file_name_obs+'_sim_sel_optimized.csv'): + df_sel_optimized_check = pd.read_csv(output_dir+os.sep+file_name_obs+'_sim_sel_optimized.csv') + else: + df_sel_optimized_check = pd.DataFrame() + df_sel_optimized_check['solution_id']='' + + # save the const_nominal as a json file saveConstants(const, dir_path, file_name): + if total_distribution: + if output_dir+os.sep+around_meteor+'_mode_TOT.json' not in df_sel_optimized_check['solution_id'].values: + saveConstants(const_nominal_1D_KDE,output_dir,around_meteor+'_mode_TOT.json') + _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode_TOT.json', data_file_real) + else: + print('already optimized') + _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode_TOT.json', data_file_real) + + else: + if output_dir+os.sep+around_meteor+'_mode.json' not in df_sel_optimized_check['solution_id'].values: + saveConstants(const_nominal_1D_KDE,output_dir,around_meteor+'_mode.json') + _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode.json', data_file_real) + else: + print('already optimized') + _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode.json', data_file_real) + + if pd_datafram_PCA_sim is None: + return pd_datafram_PCA_selected_mode_min_KDE + if gensim_data_sim is None: + return 
pd_datafram_PCA_selected_mode_min_KDE
+
+    rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(gensim_data_sim, fit_funct)
+
+    plot_side_by_side(gensim_data_sim, fig, ax, 'r-', 'MODE : RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\
+        m:'+str('{:.2e}'.format(pd_datafram_PCA_sim.iloc[0]['mass'],1))+' F:'+str(round(pd_datafram_PCA_sim.iloc[0]['F'],2))+'\n\
+        rho:'+str(round(pd_datafram_PCA_sim.iloc[0]['rho']))+' sigma:'+str(round(pd_datafram_PCA_sim.iloc[0]['sigma']*1000000,4))+'\n\
+        er.height:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_height_start'],2))+' er.log:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_range'],1))+'\n\
+        er.coeff:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_coeff']*1000000,3))+' er.index:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos)
+
+    # pd_datafram_PCA_sim['erosion_coeff']=pd_datafram_PCA_sim['erosion_coeff']/1000000
+    # pd_datafram_PCA_sim['sigma']=pd_datafram_PCA_sim['sigma']/1000000
+
+    print('real noise mag', round(mag_noise_real,3), ''+str(SIGMA_ERR)+'sig', round(MAG_RMSD*SIGMA_ERR,3), ''+str(SIGMA_ERR*2)+'sig', round(MAG_RMSD*SIGMA_ERR*2,3), '|| MODE noise mag', round(rmsd_mag,3), '\nreal noise len', round(len_noise_real/1000,3), ''+str(SIGMA_ERR)+'sig', round(LEN_RMSD*SIGMA_ERR,3), ''+str(SIGMA_ERR*2)+'sig', round(LEN_RMSD*SIGMA_ERR*2,3), '|| MODE noise len', round(rmsd_lag,3))
+    select_mode_print = 'No'
+    if rmsd_mag

[...]

+    if common_height_min > common_height_max:  # handle the case where there is no overlap in height
+        print('No overlap in height')
+        return 9999, 9999, 9999, 9999, 9999, 9999, obs_time_err[0], height_km_err[0]
+
+    common_heights = np.linspace(common_height_min, common_height_max, num=len(height_km_err))  # Adjust the number of points as needed
+
+    # Interpolate the magnitudes
+    interp_magnitudes1 = interp1d(height_km, abs_mag_sim, kind='linear', fill_value="extrapolate")
+    interp_magnitudes2 = interp1d(height_km_err, abs_mag_sim_err, kind='linear', fill_value="extrapolate")
+
+    # Get magnitudes at the common heights
+    magnitudes1_common = interp_magnitudes1(common_heights)
+    magnitudes2_common = interp_magnitudes2(common_heights)
+
+    # Calculate the magnitude differences
+    magnitude_differences = magnitudes1_common - magnitudes2_common
+
+    # Calculate the RMSD for magnitudes
+    rmsd_mag = np.sqrt(np.mean(magnitude_differences**2))
+
+    # # Determine the fraction of matching points for magnitudes
+    # total_possible_points_mag = len(common_heights)
+    # matching_points_mag = np.sum((common_heights >= common_height_min) & (common_heights <= common_height_max))
+    # fraction_matching_mag = matching_points_mag / total_possible_points_mag
+
+    # # Apply a penalty to the RMSD for magnitudes based on the fraction of matching points
+    # penalty_factor_mag = 1 / fraction_matching_mag if fraction_matching_mag > 0 else 9999
+    # adjusted_rmsd_mag = rmsd_mag * penalty_factor_mag
+
+    # Interpolate the velocities
+    interp_velocities1 = interp1d(obs_time, vel_kms, kind='linear', fill_value="extrapolate")
+    interp_velocities2 = interp1d(obs_time_err, vel_kms_err, kind='linear', fill_value="extrapolate")
+
+    # Get velocities at the common times
+    common_times_min = max(min(obs_time), min(obs_time_err))
+    common_times_max = min(max(obs_time), max(obs_time_err))
+    common_times = np.linspace(common_times_min, common_times_max, num=len(obs_time_err))
+    velocities1_common = interp_velocities1(common_times)
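The pattern above, repeated below for velocities and lags, is always the same: interpolate both series onto a shared grid and take the root-mean-square of the differences. A minimal self-contained sketch of that idea (the helper name is hypothetical, not part of this patch; inputs are assumed to be numpy arrays):

    import numpy as np
    from scipy.interpolate import interp1d

    def rmsd_on_common_grid(x1, y1, x2, y2, num=100):
        # Interpolate two (x, y) series onto a shared x-grid and return their RMSD
        lo, hi = max(x1.min(), x2.min()), min(x1.max(), x2.max())
        if lo > hi:
            return np.inf  # the two series do not overlap
        grid = np.linspace(lo, hi, num)
        diff = (interp1d(x1, y1, fill_value="extrapolate")(grid)
                - interp1d(x2, y2, fill_value="extrapolate")(grid))
        return np.sqrt(np.mean(diff**2))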
velocities2_common = interp_velocities2(common_times) + + # Calculate the velocity differences + velocity_differences = velocities1_common - velocities2_common + + # Calculate the RMSD for velocities + rmsd_vel = np.sqrt(np.mean(velocity_differences**2)) + + # # Determine the fraction of matching points for velocities + # total_possible_points_vel = len(common_times) + # matching_points_vel = np.sum((common_times >= common_times_min) & (common_times <= common_times_max)) + # fraction_matching_vel = matching_points_vel / total_possible_points_vel + + # # Apply a penalty to the RMSD for velocities based on the fraction of matching points + # penalty_factor_vel = 1 / fraction_matching_vel if fraction_matching_vel > 0 else 9999 + # adjusted_rmsd_vel = rmsd_vel * penalty_factor_vel + + # Interpolate the lag residuals + interp_lag1 = interp1d(obs_time, lag_residual, kind='linear', fill_value="extrapolate") + interp_lag2 = interp1d(obs_time_err, lag_kms_err, kind='linear', fill_value="extrapolate") + + # Get lags at the common times + lags1_common = interp_lag1(common_times) + lags2_common = interp_lag2(common_times) + + # Calculate the lag differences + lag_differences = lags1_common - lags2_common + + # Calculate the RMSD for lags + rmsd_lag = np.sqrt(np.mean(lag_differences**2)) + + # # Determine the fraction of matching points for lags + # total_possible_points_lag = len(common_times) + # matching_points_lag = np.sum((common_times >= min(obs_time)) & (common_times <= max(obs_time))) + # fraction_matching_lag = matching_points_lag / total_possible_points_lag + + # # Apply a penalty to the RMSD for lags based on the fraction of matching points + # penalty_factor_lag = 1 / fraction_matching_lag if fraction_matching_lag > 0 else 9999 + # adjusted_rmsd_lag = rmsd_lag * penalty_factor_lag + + residual_time_pos = common_times + residual_height_pos = common_heights + + # if rmsd_mag is nan give 9999 + if np.isnan(rmsd_mag): + rmsd_mag = 9999 + if np.isnan(rmsd_vel): + rmsd_vel = 9999 + if np.isnan(rmsd_lag): + rmsd_lag = 9999 + + return rmsd_mag, rmsd_vel, rmsd_lag, magnitude_differences, velocity_differences, lag_differences, residual_time_pos, residual_height_pos + + + +def PCA_LightCurveRMSDPLOT_optimize(df_sel_shower, df_obs_shower, output_dir, fit_funct='', gen_Metsim='', mag_noise_real = 0.1, len_noise_real = 20.0, file_name_obs='', number_event_to_optimize=0, run_optimization=True): + + # merge curr_sel and curr_obs + curr_sel = df_sel_shower.copy() + + pd_datafram_PCA_selected_optimized=pd.DataFrame() + + # sigma5=5 + + # 5 sigma confidence interval + # five_sigma=False + # mag_noise = MAG_RMSD*SIGMA_ERR + # len_noise = LEN_RMSD*SIGMA_ERR + mag_noise = mag_noise_real.copy() + len_noise = len_noise_real.copy() + + # # Standard deviation of the magnitude Gaussian noise 1 sigma + # # SD of noise in length (m) 1 sigma in km + len_noise= len_noise/1000 + # velocity noise 1 sigma km/s + # vel_noise = (len_noise*np.sqrt(2)/(1/FPS)) + vel_noise = (len_noise/(1/FPS)) + + # # put the first plot in 2 sublots + # fig, ax = plt.subplots(1, 2, figsize=(17, 5)) + + # # group by solution_id_dist and keep only n_confront_sel from each group + # curr_sel = curr_sel.groupby('solution_id_dist').head(len(number_event_to_optimize)) + # check if distance_meteor is in the columns + no_distance_flag = False + if 'distance_meteor' in curr_sel.columns: + # order by distance_meteor + curr_sel = curr_sel.sort_values('distance_meteor') + else: + no_distance_flag = True + + if number_event_to_optimize == 0: + 
number_event_to_optimize = len(df_sel_shower)
+
+    # pick the first number_event_to_optimize events
+    curr_sel = curr_sel.head(number_event_to_optimize)
+
+    # # count duplicates and add a column for the number of duplicates
+    # curr_sel['num_duplicates'] = curr_sel.groupby('solution_id')['solution_id'].transform('size')
+
+    # curr_sel.drop_duplicates(subset='solution_id', keep='first', inplace=True)
+
+    curr_sel['erosion_coeff']=curr_sel['erosion_coeff']*1000000
+    curr_sel['sigma']=curr_sel['sigma']*1000000
+
+    # check whether the observation file is a .pickle or a .json file
+    if df_obs_shower.iloc[0]['solution_id'].endswith('.pickle'):
+        data_file_real = read_pickle_reduction_file(df_obs_shower.iloc[0]['solution_id'])
+    elif df_obs_shower.iloc[0]['solution_id'].endswith('.json'):
+        data_file_real = read_with_noise_GenerateSimulations_output(df_obs_shower.iloc[0]['solution_id'])
+
+    _, _, _, residuals_mag_real, residuals_vel_real, _, residual_time_pos_real, residual_height_pos_real = RMSD_calc_diff(data_file_real, fit_funct)
+
+    # Get the default color cycle
+    color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']
+
+    # Create an infinite cycle of colors
+    infinite_color_cycle = itertools.cycle(color_cycle)
+
+    for ii in range(len(curr_sel)):
+
+        fig, ax = plt.subplots(2, 3, figsize=(14, 6), gridspec_kw={'height_ratios': [3, 0.5], 'width_ratios': [3, 0.5, 3]})
+        # fig, ax = plt.subplots(2, 4)
+        # flatten the axes array
+        ax = ax.flatten()
+
+        # pick the ii-th element of the solution_id column
+        namefile_sel = curr_sel.iloc[ii]['solution_id']
+        Metsim_flag = False
+
+        # check if the file exists
+        if not os.path.isfile(namefile_sel):
+            print('file '+namefile_sel+' not found')
+            continue
+        else:
+            if namefile_sel.endswith('.pickle'):
+                data_file = read_pickle_reduction_file(namefile_sel)
+
+            elif namefile_sel.endswith('.json'):
+                # open the json file with the name namefile_sel
+                f = open(namefile_sel,"r")
+                data = json.loads(f.read())
+                if 'ht_sampled' in data:
+                    data_file = read_GenerateSimulations_output(namefile_sel, data_file_real)
+
+                else:
+                    if gen_Metsim == '':
+                        print('no data for the Metsim file')
+                        continue
+
+                    else:
+                        # make a copy of gen_Metsim
+                        data_file = gen_Metsim.copy()
+                        # MetSim file
+                        Metsim_flag = True
+
+        rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(data_file, fit_funct)
+
+        print('real noise mag', round(mag_noise_real,3), ''+str(SIGMA_ERR)+'sig', round(MAG_RMSD*SIGMA_ERR,3), ''+str(SIGMA_ERR*2)+'sig', round(MAG_RMSD*SIGMA_ERR*2,3), '|| Event noise mag', round(rmsd_mag,3), '\nreal noise len', round(len_noise_real/1000,3), ''+str(SIGMA_ERR)+'sig', round(LEN_RMSD*SIGMA_ERR,3), ''+str(SIGMA_ERR*2)+'sig', round(LEN_RMSD*SIGMA_ERR*2,3), '|| Event noise len', round(rmsd_lag,3))
+        plot_side_by_side(data_file_real, fig, ax, 'go', file_name_obs[:15]+'\nRMSDmag '+str(round(mag_noise_real,3))+' RMSDlen '+str(round(len_noise_real/1000,3)), residuals_mag_real, residuals_vel_real, residual_time_pos_real, residual_height_pos_real, fit_funct, mag_noise, vel_noise, 'Std.dev. 
realizations') + + color_line=next(infinite_color_cycle) + + if Metsim_flag: + + # plot_side_by_side(data_file, fig, ax, '-k', ii, residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + if no_distance_flag: + plot_side_by_side(data_file, fig, ax, '-k', 'Metsim data event\n\ +RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ + m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ + rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ + er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ + er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + else: + plot_side_by_side(data_file, fig, ax, '-k', 'Metsim data event\n\ +RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ + N°duplic. '+str(round(curr_sel.iloc[ii]['num_duplicates']))+' min dist:'+str(round(curr_sel.iloc[ii]['distance_meteor'],2))+'\n\ + m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ + rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ + er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ + er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + + + else: + + # if color_line == '#2ca02c': + # color_line='m' + + # plot_side_by_side(data_file, fig, ax, '-', ii, residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + if no_distance_flag: + plot_side_by_side(data_file, fig, ax, '-','RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ + m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ + rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ + er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ + er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + + else: + plot_side_by_side(data_file, fig, ax, '-','RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ + N°duplic. 
'+str(round(curr_sel.iloc[ii]['num_duplicates']))+' min dist:'+str(round(curr_sel.iloc[ii]['distance_meteor'],2))+'\n\
+                    m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\
+                    rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\
+                    er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\
+                    er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos)
+
+        # change the color of the first line
+        ax[0].lines[1].set_color(color_line)
+        ax[1].lines[1].set_color(color_line)
+        ax[2].lines[1].set_color(color_line)
+        ax[5].lines[1].set_color(color_line)
+
+        # split the name from the path
+        _, file_name_title = os.path.split(curr_sel.iloc[ii]['solution_id'])
+        # suptitle of the plot
+        fig.suptitle(file_name_title)
+
+        # put the legend outside the plot and adjust the layout to the screen size
+        ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.)
+        # the legend does not fit in the plot, so make room for it
+        plt.subplots_adjust(right=.7)
+        plt.subplots_adjust(wspace=0.2)
+
+        # make more space
+        plt.tight_layout()
+
+        file_json_save_phys_NOoptimized = output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title
+        if Metsim_flag:
+            file_json_save_phys = output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title[:23]+'_fitted.json'
+            file_json_save_results = output_dir+os.sep+SAVE_RESULTS_FOLDER_EVENTS_PLOTS+os.sep+file_name_title[:23]+'_fitted.json'
+            const_nominal, _ = loadConstants(namefile_sel)
+            saveConstants(const_nominal,output_dir,file_name_obs+'_sim_fit.json')
+        else:
+            file_json_save_phys = output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_obs[:15]+'_'+file_name_title[:23]+'_fitted.json'
+            file_json_save_results = output_dir+os.sep+SAVE_RESULTS_FOLDER_EVENTS_PLOTS+os.sep+file_name_obs[:15]+'_'+file_name_title[:23]+'_fitted.json'
+            # open the namefile_sel json file and save its 'const' part as file_name_obs+'_sim_fit.json'
+            with open(namefile_sel) as json_file:
+                data = json.load(json_file)
+                const_part = data['const']
+                with open(output_dir+os.sep+file_name_obs+'_sim_fit.json', 'w') as outfile:
+                    json.dump(const_part, outfile, indent=4)
+
+        shutil.copy(namefile_sel, file_json_save_phys_NOoptimized)
+
+        if run_optimization:
+
+            # check if file_json_save_phys is present
+            if not os.path.isfile(file_json_save_phys):
+
+                if rmsd_mag<=mag_noise_real and rmsd_lag<=len_noise_real/1000:
+                    print('below sigma noise, SAVED')
+
+                    shutil.copy(output_dir+os.sep+file_name_obs+'_sim_fit.json', file_json_save_phys)
+
+                    pd_datafram_PCA_selected_optimized = pd.concat([pd_datafram_PCA_selected_optimized, curr_sel.iloc[ii]], axis=0)
+
+                    # suptitle of the plot
+                    fig.suptitle(file_name_title+' PERFECT below sigma noise')
+
+                    # put the legend outside the plot and adjust the layout to the screen size
+                    ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.)
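A note on the legend handling here: bbox_to_anchor=(1.05, 1.0) anchors the legend just outside the axes, but matplotlib does not automatically reserve figure space for it, which is why a subplots_adjust(right=...) call follows. A standalone illustration of the pattern (toy data):

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1], label="example")
    # anchor the legend just outside the top-right corner of the axes
    ax.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.)
    # shrink the axes so the out-of-axes legend fits inside the figure
    fig.subplots_adjust(right=0.7)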
+                    # the legend does not fit in the plot, so make room for it
+                    plt.subplots_adjust(right=.7)
+                    plt.subplots_adjust(wspace=0.2)
+                    # make more space
+                    plt.tight_layout()
+                    plt.savefig(output_dir+os.sep+SAVE_RESULTS_FOLDER_EVENTS_PLOTS+os.sep+file_name_title[:23]+'_RMSDmag'+str(round(rmsd_mag,2))+'_RMSDlen'+str(round(rmsd_lag,2))+'_Heigh_MagVelCoef.png')
+                    shutil.copy(output_dir+os.sep+file_name_obs+'_sim_fit_fitted.json', file_json_save_results)
+
+                    plt.savefig(output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title[:23]+'_RMSDmag'+str(round(rmsd_mag,2))+'_RMSDlen'+str(round(rmsd_lag,2))+'_Heigh_MagVelCoef.png')
+
+                    # close the plot
+                    plt.close()
+                    continue
+
+                elif rmsd_mag

[...]

+    if len(df_sim_shower_small) > 10000:  # without this cut, plotting takes forever
+        # pick 10000 events at random
+        df_sim_shower_small = df_sim_shower_small.sample(n=10000)
+    if 'MetSim' not in df_sim_shower_small['type'].values and 'Real' not in df_sim_shower_small['type'].values:
+        df_sim_shower_small = pd.concat([df_sim_shower_small.iloc[[0]], df_sim_shower_small])
+
+    if save_log:
+        # check if a file with the name "log"+n_PC_in_PCA+"_"+str(len(df_sel))+"ev.txt" already exists
+        if os.path.exists(output_dir+os.sep+"log_"+file_name[:15]+"_CI"+str(n_PC_in_PCA)+"PC.txt"):
+            # remove the file
+            os.remove(output_dir+os.sep+"log_"+file_name[:15]+"_CI"+str(n_PC_in_PCA)+"PC.txt")
+        sys.stdout = Logger(output_dir,"log_"+file_name[:15]+"_CI"+str(n_PC_in_PCA)+"PC.txt") # _30var_99%_13PC
+
+
+
+    curr_df_sim_sel = pd.concat([df_sim_shower_small,df_sel_shower], axis=0)
+
+    # convert erosion_coeff and sigma from s^2/m^2 to s^2/km^2, and the erosion energies to MJ
+    curr_df_sim_sel['erosion_coeff'] = curr_df_sim_sel['erosion_coeff']*1000000
+    curr_df_sim_sel['sigma'] = curr_df_sim_sel['sigma']*1000000
+    curr_df_sim_sel['erosion_energy_per_unit_cross_section'] = curr_df_sim_sel['erosion_energy_per_unit_cross_section']/1000000
+    curr_df_sim_sel['erosion_energy_per_unit_mass'] = curr_df_sim_sel['erosion_energy_per_unit_mass']/1000000
+
+    group_mapping = {
+        'Simulation_sel': 'selected',
+        'MetSim': 'simulated',
+        'Real': 'simulated',
+        'Simulation': 'simulated'
+    }
+    curr_df_sim_sel['group'] = curr_df_sim_sel['type'].map(group_mapping)
+
+    curr_df_sim_sel['num_group'] = curr_df_sim_sel.groupby('group')['group'].transform('size')
+    curr_df_sim_sel['weight'] = 1 / curr_df_sim_sel['num_group']
+
+    curr_df_sim_sel['num_type'] = curr_df_sim_sel.groupby('type')['type'].transform('size')
+    curr_df_sim_sel['weight_type'] = 1 / curr_df_sim_sel['num_type']
+
+    curr_sel = curr_df_sim_sel[curr_df_sim_sel['group'] == 'selected'].copy()
+    # curr_sim = curr_df_sim_sel[curr_df_sim_sel['group'] == 'simulated'].copy()
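The 1/num_group weights computed above are what make the histograms that follow comparable: with them, each group integrates to the same total regardless of how many events it contains. A small sketch of the idea on toy data (all values illustrative):

    import pandas as pd
    import seaborn as sns

    df = pd.DataFrame({'val': [1, 2, 2, 3, 3, 3], 'group': ['a', 'a', 'b', 'b', 'b', 'b']})
    # each row is weighted by the inverse of its group size, so every group sums to 1
    df['weight'] = 1 / df.groupby('group')['group'].transform('size')
    sns.histplot(df, x='val', weights=df['weight'], hue='group', bins=3)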
+    # color based on the shower, but skip the first 2 columns (shower_code, shower_id)
+    to_plot=['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max','erosion_range','erosion_energy_per_unit_mass','erosion_energy_per_unit_cross_section','erosion_energy_per_unit_cross_section']
+    # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [s^2/km^2]','erosion height start [km]','erosion coeff [s^2/km^2]','erosion mass index [-]','log eros. mass min [kg]','log eros. mass max [kg]','log eros. mass range [-]','erosion energy per unit mass [MJ/kg]','erosion energy per unit cross section [MJ/m^2]','erosion energy per unit cross section [MJ/m^2]']
+    to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [s$^2$/km$^2$]', r'$h_{e}$ [km]', r'$\eta$ [s$^2$/km$^2$]', r'$s$ [-]', r'log($m_{l}$) [-]', r'log($m_{u}$) [-]',r'log($m_{u}$)-log($m_{l}$) [-]']
+
+
+    fig, axs = plt.subplots(3, 3)
+    # flatten the 3x3 axes grid to a 1-D array
+    axs = axs.flatten()
+
+
+
+    print('\\hline')
+    if len(Min_KDE_point) > 0:
+        # print('var & $real$ & $1D_{KDE}$ & $1D_{KDE}\\%_{dif}$ & $allD_{KDE}$ & $allD_{KDE}\\%_{dif}$\\\\')
+        # print('var & real & mode & min$_{KDE}$ & -1\\sigma/+1\\sigma & -2\\sigma/+2\\sigma \\\\')
+        print('Variables & '+str(df_sim_shower['type'].iloc[0])+' & Mode & Dens.Point & 95\\%CIlow & 95\\%CIup \\\\')
+    else:
+        print('Variables & '+str(df_sim_shower['type'].iloc[0])+' & Mode & 95\\%CIlow & 95\\%CIup \\\\')
+
+    ii_densest=0
+    for i in range(9):
+        # legend is placed outside the plot
+        plotvar=to_plot[i]
+
+        if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max':
+            # take the log of the erosion_mass_min and erosion_mass_max
+            curr_df_sim_sel[plotvar]=np.log10(curr_df_sim_sel[plotvar])
+            curr_sel[plotvar]=np.log10(curr_sel[plotvar])
+            if len(Min_KDE_point) > 0:
+                Min_KDE_point[ii_densest]=np.log10(Min_KDE_point[ii_densest])
+                # Min_KDE_point[ii_densest-1]=np.log10(Min_KDE_point[ii_densest-1])
+        # sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'],hue='shower_code', ax=axs[i], kde=True, palette='bright', bins=20)
+        sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette='bright', bins=20)
+        unique_values_count = curr_sel[plotvar].nunique()
+        if unique_values_count > 1:
+            # add the KDE to the plot as a probability density function
+            sns.histplot(curr_sel, x=curr_sel[plotvar], weights=curr_sel['weight'], bins=20, ax=axs[i], fill=False, edgecolor=False, color='r', kde=True, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])])
+            kde_line = axs[i].lines[-1]
+            axs[i].lines[-1].remove()
+        else:
+            kde_line = None
+
+        # if a MetSim or Real event is present, plot its value as a vertical line
+        if 'MetSim' in curr_df_sim_sel['type'].values:
+            # get the value of the observed event
+            axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='MetSim'][plotvar].values[0], color='k', linewidth=2)
+        elif 'Real' in curr_df_sim_sel['type'].values:
+            axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='Real'][plotvar].values[0], color='g', linewidth=2, linestyle='--')
+
+        if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max':
+            # put it back as it was
+            curr_df_sim_sel[plotvar]=10**curr_df_sim_sel[plotvar]
+            curr_sel[plotvar]=10**curr_sel[plotvar]
+
+        # get the 95th, 84.13th, 15.87th, and 5th percentiles of curr_sel[plotvar]
+        sigma_95=np.percentile(curr_sel[plotvar], 95)
+        sigma_84=np.percentile(curr_sel[plotvar], 84.13)
+        sigma_15=np.percentile(curr_sel[plotvar], 15.87)
+        sigma_5=np.percentile(curr_sel[plotvar], 5)
+
+        if kde_line is not None:
+            # Get the x and y data from the KDE line
+            kde_line_Xval = kde_line.get_xdata()
+            kde_line_Yval = kde_line.get_ydata()
+
+            # Find the index of the maximum y value
+            max_index = np.argmax(kde_line_Yval)
+            if i!=8:
+                # Plot a dot 
at the maximum point + # axs[i].plot(kde_line_Xval[max_index], kde_line_Yval[max_index], 'ro') # 'ro' for red dot + axs[i].axvline(x=kde_line_Xval[max_index], color='red', linestyle='-.') + + x_10mode=kde_line_Xval[max_index] + if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': + x_10mode=10**kde_line_Xval[max_index] + + if len(Min_KDE_point) > 0: + if len(Min_KDE_point)>ii_densest: + + # Find the index with the closest value to densest_point[ii_dense] to all y values + densest_index = find_closest_index(kde_line_Xval, [Min_KDE_point[ii_densest]]) + + # add also the densest_point[i] as a blue dot + # axs[i].plot(Min_KDE_point[ii_densest], kde_line_Yval[densest_index[0]], 'bo') + axs[i].axvline(x=Min_KDE_point[ii_densest], color='blue', linestyle='-.') + + if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': + Min_KDE_point[ii_densest]=10**(Min_KDE_point[ii_densest]) + + if i<9: + print('\\hline') #df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0] + # print(f"{to_plot_unit[i]} & ${'{:.4g}'.format(df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0])}$ & ${'{:.4g}'.format(x_10mode)}$ & $ {'{:.2g}'.format(percent_diff_1D[i])}$\\% & $ {'{:.4g}'.format(densest_point[i])}$ & $ {'{:.2g}'.format(percent_diff_allD[i])}$\\% \\\\") + # print(to_plot_unit[i]+'& $'+str(x[max_index])+'$ & $'+str(percent_diff_1D[i])+'$\\% & $'+str(densest_point[ii_densest])+'$ & $'+str(percent_diff_allD[i])+'\\% \\\\') + # print(f"{to_plot_unit[i]} & ${'{:.4g}'.format(df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0])}$ & ${'{:.4g}'.format(x_10mode)}$ & $ {'{:.2g}'.format(percent_diff_1D[i])}$\\% & $ {'{:.4g}'.format(densest_point[i])}$ & $ {'{:.2g}'.format(percent_diff_allD[i])}$\\% \\\\") + # print(f"{to_plot_unit[i]} & {'{:.4g}'.format(df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0])} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(densest_point[i])} & {'{:.4g}'.format(sigma_15)} / {'{:.4g}'.format(sigma_84)} & {'{:.4g}'.format(sigma_2)} / {'{:.4g}'.format(sigma_97)} \\\\") + print(f"{to_plot_unit[i]} & {'{:.4g}'.format(curr_df_sim_sel[plotvar].iloc[0])} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(Min_KDE_point[i])} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(sigma_95)} \\\\") + ii_densest=ii_densest+1 + else: + if i<9: + print('\\hline') + print(f"{to_plot_unit[i]} & {'{:.4g}'.format(curr_df_sim_sel[plotvar].iloc[0])} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(sigma_95)} \\\\") + else: + if i<9: + print('\\hline') + print(f"{to_plot_unit[i]} & {'{:.4g}'.format(curr_df_sim_sel[plotvar].iloc[0])} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(sigma_95)} \\\\") + + axs[i].set_ylabel('probability') + axs[i].set_xlabel(to_plot_unit[i]) + + # check if y axis is above 1 if so set_ylim(0,1) + if axs[i].get_ylim()[1]>1: + axs[i].set_ylim(0,1) + + # # plot the legend outside the plot + # axs[i].legend() + axs[i].get_legend().remove() + + + if i==0: + # place the xaxis exponent in the bottom right corner + axs[i].xaxis.get_offset_text().set_x(1.10) + + # # more space between the subplots erosion_coeff sigma + plt.tight_layout() + + print('\\hline') + + + # save the figure maximized and with the right name + fig.savefig(output_dir+os.sep+file_name+'_PhysicProp'+str(n_PC_in_PCA)+'PC_'+str(len(curr_sel))+'ev.png', dpi=300) # 
_dist'+str(np.round(np.min(curr_sel['distance_meteor']),2))+'-'+str(np.round(np.max(curr_sel['distance_meteor']),2))+'
+
+    # close the figure
+    plt.close()
+
+    if save_log:
+        # Close the Logger to ensure everything is written to the file (stop copying stdout to the TXT file)
+        sys.stdout.close()
+
+        # Reset sys.stdout to its original value if needed
+        sys.stdout = sys.__stdout__
+
+    ii_densest=0
+    if 'solution_id_dist' in df_sel_shower_real.columns:
+        # the plot can get stuck if there are too many realizations
+        if len(df_sel_shower_real['solution_id_dist'].unique())<60:
+            if len(df_sel_shower_real['solution_id_dist'].unique())>1:
+                print('plot the distribution of the realizations',len(df_sel_shower_real['solution_id_dist'].unique()))
+                fig, axs = plt.subplots(3, 3)
+                # flatten the 3x3 axes grid to a 1-D array
+                axs = axs.flatten()
+
+                # ii_densest=0
+                for i in range(9):
+                    # legend is placed outside the plot
+                    plotvar=to_plot[i]
+
+                    if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max':
+
+                        sns.histplot(curr_df_sim_sel, x=np.log10(curr_df_sim_sel[plotvar]), weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette='bright', bins=20, binrange=[np.log10(np.min(curr_df_sim_sel[plotvar])),np.log10(np.max(curr_df_sim_sel[plotvar]))])
+                        # sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'],hue='solution_id_dist', ax=axs[i], multiple="stack", kde=True, bins=20, binrange=[np.min(df_sel_save[plotvar]),np.max(df_sel_save[plotvar])])
+                        sns.histplot(curr_df_sim_sel, x=np.log10(curr_df_sim_sel[plotvar]), weights=curr_df_sim_sel['weight'],hue='solution_id_dist', ax=axs[i], multiple="stack", bins=20, binrange=[np.log10(np.min(curr_df_sim_sel[plotvar])),np.log10(np.max(curr_df_sim_sel[plotvar]))])
+                        # add the KDE to the plot as a probability density function
+                        sns.histplot(curr_sel, x=np.log10(curr_sel[plotvar]), weights=curr_sel['weight'], bins=20, ax=axs[i], multiple="stack", fill=False, edgecolor=False, color='r', kde=True, binrange=[np.log10(np.min(curr_df_sim_sel[plotvar])),np.log10(np.max(curr_df_sim_sel[plotvar]))])
+
+                        kde_line = axs[i].lines[-1]
+                        # remove the KDE line from the plot (its data is kept in kde_line)
+                        axs[i].lines[-1].remove()
+
+                        # if a MetSim or Real event is present, plot its value as a vertical line
+                        if 'MetSim' in curr_df_sim_sel['type'].values:
+                            # get the value of the observed event
+                            axs[i].axvline(x=np.log10(curr_df_sim_sel[curr_df_sim_sel['type']=='MetSim'][plotvar].values[0]), color='k', linewidth=2)
+                        elif 'Real' in curr_df_sim_sel['type'].values:
+                            axs[i].axvline(x=np.log10(curr_df_sim_sel[curr_df_sim_sel['type']=='Real'][plotvar].values[0]), color='g', linewidth=2, linestyle='--')
+
+                        # if len(Min_KDE_point) > 0:
+                        #     Min_KDE_point[ii_densest]=np.log10(Min_KDE_point[ii_densest])
+                        #     # Min_KDE_point[ii_densest-1]=np.log10(Min_KDE_point[ii_densest-1])
+
+                    else:
+
+                        sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette='bright', bins=20, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])])
+                        # sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'],hue='solution_id_dist', ax=axs[i], multiple="stack", kde=True, bins=20, binrange=[np.min(df_sel_save[plotvar]),np.max(df_sel_save[plotvar])])
+                        sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='solution_id_dist', ax=axs[i], multiple="stack", bins=20, 
binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])]) + # # add the kde to the plot as a probability density function + sns.histplot(curr_sel, x=curr_sel[plotvar], weights=curr_sel['weight'], bins=20, ax=axs[i], multiple="stack", fill=False, edgecolor=False, color='r', kde=True, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])]) + + kde_line = axs[i].lines[-1] + + # delete from the plot the axs[i].lines[-1] + axs[i].lines[-1].remove() + + # if the only_select_meteors_from is equal to any curr_df_sim_sel plot the observed event value as a vertical red line + if 'MetSim' in curr_df_sim_sel['type'].values: + # get the value of the observed event + axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='MetSim'][plotvar].values[0], color='k', linewidth=2) + elif 'Real' in curr_df_sim_sel['type'].values: + axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='Real'][plotvar].values[0], color='g', linewidth=2, linestyle='--') + # put the value of diff_percent_1d at th upper left of the line + + axs[i].set_ylabel('probability') + axs[i].set_xlabel(to_plot_unit[i]) + # check if y axis is above 1 if so set_ylim(0,1) + if axs[i].get_ylim()[1]>1: + axs[i].set_ylim(0,1) + + # # plot the legend outside the plot + # axs[i].legend() + axs[i].get_legend().remove() + + # # Get the x and y data from the KDE line + # kde_line_Xval = kde_line.get_xdata() + # kde_line_Yval = kde_line.get_ydata() + + # if i != 8: + # axs[i].plot(kde_line_Xval[max_index], kde_line_Yval[max_index], 'ro') + + # if i==0: + # # place the xaxis exponent in the bottom right corner + # axs[i].xaxis.get_offset_text().set_x(1.10) + # if len(Min_KDE_point) > 0: + # if len(Min_KDE_point)>ii_densest: + + # # Find the index with the closest value to densest_point[ii_dense] to all y values + # densest_index = find_closest_index(kde_line_Xval, [Min_KDE_point[ii_densest]]) + + # # add also the densest_point[i] as a blue dot + # axs[i].plot(Min_KDE_point[ii_densest], kde_line_Yval[densest_index[0]], 'bo') + # ii_densest=ii_densest+1 + # # more space between the subplots erosion_coeff sigma + plt.tight_layout() + + # save the figure maximized and with the right name + fig.savefig(output_dir+os.sep+file_name+'_PhysicProp_Reliazations_'+str(n_PC_in_PCA)+'PC_'+str(len(curr_sel))+'ev.png', dpi=300) + + + +def PCA_LightCurveCoefPLOT(df_sel_shower_real, df_obs_shower, output_dir, fit_funct='', gensim_data_obs='', mag_noise_real= 0.1, len_noise_real = 20.0, file_name_obs='', trajectory_Metsim_file='', output_folder_of_csv=''): + + # number to confront + n_confront_obs=1 + if output_folder_of_csv=='': + n_confront_sel=7 + else: + n_confront_sel=9 + + # number of PC in PCA + with_noise=True + + # is the input data noisy + noise_data_input=False + + # activate jachia + jacchia_fit=False + + # activate parabolic fit + parabolic_fit=False + + t0_fit=False + + mag_fit=False + + # 5 sigma confidence interval + # five_sigma=False + # mag_noise = MAG_RMSD*SIGMA_ERR + # len_noise = LEN_RMSD*SIGMA_ERR + mag_noise = mag_noise_real.copy() + len_noise = len_noise_real.copy() + + # # Standard deviation of the magnitude Gaussian noise 1 sigma + # # SD of noise in length (m) 1 sigma in km + len_noise= len_noise/1000 + # velocity noise 1 sigma km/s + # vel_noise = (len_noise*np.sqrt(2)/(1/FPS)) + vel_noise = (len_noise/(1/FPS)) + + # put the first plot in 2 sublots + fig, ax = plt.subplots(1, 2, figsize=(17, 5)) + + df_sel_shower = df_sel_shower_real.copy() + + # # group by solution_id_dist and keep only 
n_confront_sel from each group
+    # df_sel_shower = df_sel_shower.groupby('solution_id_dist').head(len(df_sel_shower))
+
+    # check if distance_meteor is in the columns
+    if 'distance_meteor' in df_sel_shower.columns:
+        # order by distance_meteor
+        df_sel_shower = df_sel_shower.sort_values('distance_meteor')
+
+    # # count duplicates and add a column for the number of duplicates
+    # df_sel_shower['num_duplicates'] = df_sel_shower.groupby('solution_id')['solution_id'].transform('size')
+
+    # df_sel_shower.drop_duplicates(subset='solution_id', keep='first', inplace=True)
+
+    df_sel_shower['erosion_coeff'] = df_sel_shower['erosion_coeff']*1000000
+    df_sel_shower['sigma'] = df_sel_shower['sigma']*1000000
+
+    if n_confront_obs

[...]

+    if cml_args.min_nres > result_number:
+        print(cml_args.min_nres,'simulated results still to find')
+        while cml_args.min_nres > result_number:
+
+            # reset index
+            pd_datafram_PCA_selected_lowRMSD.reset_index(drop=True, inplace=True)
+
+            pd_datafram_PCA_selected_lowRMSD['type'] = 'Simulation_sel'
+
+            # delete any row that has the same values of mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section, erosion_energy_per_unit_mass
+            if 'mass' in pd_datafram_PCA_selected_lowRMSD.columns:
+                # Drop duplicate rows based on the specified columns
+                pd_datafram_PCA_selected_lowRMSD = pd_datafram_PCA_selected_lowRMSD.drop_duplicates(subset=[
+                    'mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff',
+                    'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max',
+                    'erosion_range', 'erosion_energy_per_unit_cross_section',
+                    'erosion_energy_per_unit_mass'
+                ])
+                pd_datafram_PCA_selected_lowRMSD.reset_index(drop=True, inplace=True)
+
+            pd_results = pd.concat([pd_results, pd_datafram_PCA_selected_lowRMSD])
+
+            # save and update the CSV file on disk
+            pd_results.to_csv(output_folder+os.sep+SAVE_RESULTS_FOLDER+os.sep+file_name+'_sim_sel_results.csv', index=False)
+
+
+            if 'solution_id' in pd_results.columns:
+                print('PLOT: the physical characteristics results')
+                PCA_PhysicalPropPLOT(pd_results, pd_datafram_PCA_sim, pca_N_comp, output_folder+os.sep+SAVE_RESULTS_FOLDER, file_name)
+                print('PLOT: correlation matrix of the results')
+                PCAcorrelation_selPLOT(pd_datafram_PCA_sim, pd_results, pca_N_comp, output_folder+os.sep+SAVE_RESULTS_FOLDER)
+                print('PLOT: best 9 results and add the RMSD value to the selected csv')
+                PCA_LightCurveCoefPLOT(pd_results, pd_dataframe_PCA_obs_real, output_folder+os.sep+SAVE_RESULTS_FOLDER, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, file_name, trajectory_Metsim_file, output_folder+os.sep+SAVE_RESULTS_FOLDER+os.sep+file_name+'_sim_sel_results.csv')
+                print()
+                print('SUCCESS: the physical characteristics range is in the results folder')
+            else:
+                # print('FAIL: Not found any result below magRMSD',rmsd_pol_mag*SIGMA_ERR,'and lenRMSD',rmsd_t0_lag*SIGMA_ERR/1000)
+                print('FAIL: no result found below magRMSD',MAG_RMSD*SIGMA_ERR,'and lenRMSD',LEN_RMSD*SIGMA_ERR)
+                break
+
+
+            # if only one result was found, stop
+            if len(pd_results) == 1:
+                print('Only one result found')
+                # create a dictionary with the physical parameters
+                CI_physical_param = {
+                    'v_init_180km': [pd_results['v_init_180km'].values[0], pd_results['v_init_180km'].values[0]],
+                    'zenith_angle': [pd_results['zenith_angle'].values[0], pd_results['zenith_angle'].values[0]],
+                    'mass': [pd_results['mass'].values[0], pd_results['mass'].values[0]],
+                    'rho': [pd_results['rho'].values[0], pd_results['rho'].values[0]],
+                    'sigma': 
[pd_results['sigma'].values[0], pd_results['sigma'].values[0]], + 'erosion_height_start': [pd_results['erosion_height_start'].values[0], pd_results['erosion_height_start'].values[0]], + 'erosion_coeff': [pd_results['erosion_coeff'].values[0], pd_results['erosion_coeff'].values[0]], + 'erosion_mass_index': [pd_results['erosion_mass_index'].values[0], pd_results['erosion_mass_index'].values[0]], + 'erosion_mass_min': [pd_results['erosion_mass_min'].values[0], pd_results['erosion_mass_min'].values[0]], + 'erosion_mass_max': [pd_results['erosion_mass_max'].values[0], pd_results['erosion_mass_max'].values[0]] + } + + else: + print('Number of results found:',len(pd_results)) + columns_physpar = ['v_init_180km','zenith_angle','mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', + 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max'] + + ############################################################################### + + # # Calculate the quantiles + # quantiles = pd_results[columns_physpar].quantile([0.05, 0.95]) + + # # Convert the quantiles to a dictionary + # CI_physical_param = {col: quantiles[col].tolist() for col in columns_physpar} + + ############################################################################### + + # Calculate the quantiles + quantiles = pd_results[columns_physpar].quantile([0.1, 0.9]) + + # Get the minimum and maximum values + min_val = pd_results[columns_physpar].min() + max_val = pd_results[columns_physpar].max() + + # Calculate the extended range using the logic provided + extended_min = min_val - (quantiles.loc[0.1] - min_val) + # consider the value extended_min<0 Check each column in extended_min and set to min_val if negative + for col in columns_physpar: + if extended_min[col] < 0: + extended_min[col] = min_val[col] + extended_max = max_val + (max_val - quantiles.loc[0.9]) + + # Convert the extended range to a dictionary + CI_physical_param = {col: [extended_min[col], extended_max[col]] for col in columns_physpar} + + ############################################################################### + + + # check if v_init_180km are the same value + if CI_physical_param['v_init_180km'][0] == CI_physical_param['v_init_180km'][1]: + CI_physical_param['v_init_180km'] = [CI_physical_param['v_init_180km'][0] - CI_physical_param['v_init_180km'][0]/1000, CI_physical_param['v_init_180km'][1] + CI_physical_param['v_init_180km'][1]/1000] + if CI_physical_param['zenith_angle'][0] == CI_physical_param['zenith_angle'][1]: + CI_physical_param['zenith_angle'] = [CI_physical_param['zenith_angle'][0] - CI_physical_param['zenith_angle'][0]/10000, CI_physical_param['zenith_angle'][1] + CI_physical_param['zenith_angle'][1]/10000] + if CI_physical_param['mass'][0] == CI_physical_param['mass'][1]: + CI_physical_param['mass'] = [CI_physical_param['mass'][0] - CI_physical_param['mass'][0]/10, CI_physical_param['mass'][1] + CI_physical_param['mass'][1]/10] + if np.round(CI_physical_param['rho'][0]/100) == np.round(CI_physical_param['rho'][1]/100): + CI_physical_param['rho'] = [CI_physical_param['rho'][0] - CI_physical_param['rho'][0]/5, CI_physical_param['rho'][1] + CI_physical_param['rho'][1]/5] + if CI_physical_param['sigma'][0] == CI_physical_param['sigma'][1]: + CI_physical_param['sigma'] = [CI_physical_param['sigma'][0] - CI_physical_param['sigma'][0]/10, CI_physical_param['sigma'][1] + CI_physical_param['sigma'][1]/10] + if CI_physical_param['erosion_height_start'][0] == CI_physical_param['erosion_height_start'][1]: + CI_physical_param['erosion_height_start'] = 
[CI_physical_param['erosion_height_start'][0] - CI_physical_param['erosion_height_start'][0]/100, CI_physical_param['erosion_height_start'][1] + CI_physical_param['erosion_height_start'][1]/100] + if CI_physical_param['erosion_coeff'][0] == CI_physical_param['erosion_coeff'][1]: + CI_physical_param['erosion_coeff'] = [CI_physical_param['erosion_coeff'][0] - CI_physical_param['erosion_coeff'][0]/10, CI_physical_param['erosion_coeff'][1] + CI_physical_param['erosion_coeff'][1]/10] + if CI_physical_param['erosion_mass_index'][0] == CI_physical_param['erosion_mass_index'][1]: + CI_physical_param['erosion_mass_index'] = [CI_physical_param['erosion_mass_index'][0] - CI_physical_param['erosion_mass_index'][0]/10, CI_physical_param['erosion_mass_index'][1] + CI_physical_param['erosion_mass_index'][1]/10] + if CI_physical_param['erosion_mass_min'][0] == CI_physical_param['erosion_mass_min'][1]: + CI_physical_param['erosion_mass_min'] = [CI_physical_param['erosion_mass_min'][0] - CI_physical_param['erosion_mass_min'][0]/10, CI_physical_param['erosion_mass_min'][1] + CI_physical_param['erosion_mass_min'][1]/10] + if CI_physical_param['erosion_mass_max'][0] == CI_physical_param['erosion_mass_max'][1]: + CI_physical_param['erosion_mass_max'] = [CI_physical_param['erosion_mass_max'][0] - CI_physical_param['erosion_mass_max'][0]/10, CI_physical_param['erosion_mass_max'][1] + CI_physical_param['erosion_mass_max'][1]/10] + + + # Multiply the 'erosion_height_start' values by 1000 + CI_physical_param['erosion_height_start'] = [x * 1000 for x in CI_physical_param['erosion_height_start']] + + print('CI_physical_param:',CI_physical_param) + + result_number = len(pd_results) + + if cml_args.min_nres <= result_number: + # print the number of results found + print('SUCCES: Number of results found:',result_number) + break + else: + if old_results_number == result_number: + print('Same number of results found:',result_number) + ii_repeat+=1 + if ii_repeat==3: + print('STOP: After 3 times the same number of results found') + print('STOP: After new simulation within 95%CI no new simulation below magRMSD',MAG_RMSD*SIGMA_ERR,'and lenRMSD',LEN_RMSD*SIGMA_ERR) + print('STOP: Number of results found:',result_number) + break + old_results_number = result_number + print('regenerate new simulation in the CI range') + generate_simulations(pd_dataframe_PCA_obs_real, simulation_MetSim_object, gensim_data_obs, cml_args.min_nres, output_folder, file_name, False, CI_physical_param) + + # look for the good_files = glob.glob(os.path.join(output_folder, '*_good_files.txt')) + good_files = [f for f in os.listdir(output_folder) if f.endswith('_good_files.txt')] + + # Construct the full path to the good file + good_file_path = os.path.join(output_folder, good_files[0]) + + # Read the file, skipping the first line + df_good_files = pd.read_csv(good_file_path, skiprows=1) + + # Rename the columns + df_good_files.columns = ["File name", "lim mag", "lim mag length", "length delay (s)"] + + # Extract the first column into an array + file_names = df_good_files["File name"].to_numpy() + + # Change the file extension to .json + all_jsonfiles = [file_name.replace('.pickle', '.json') for file_name in file_names] + + # open the folder and extract all the json files + os.chdir(input_folder) + + print('Number of simulated files in 95CI : ',len(all_jsonfiles)) + + input_list = [[all_jsonfiles[ii], 'simulation_'+str(ii+1)] for ii in range(len(all_jsonfiles))] + results_list = domainParallelizer(input_list, read_GenerateSimulations_output_to_PCA, 
cores=cml_args.cores) + + # if no read the json files in the folder and create a new csv file + pd_datafram_NEWsim_good = pd.concat(results_list) + + pd_datafram_NEWsim_good.to_csv(output_folder+os.sep+file_name+NAME_SUFX_CSV_SIM_NEW, index=False) + # print saved csv file + print('saved sim csv file:',output_folder+os.sep+file_name+NAME_SUFX_CSV_SIM_NEW) + + input_list_obs = [[pd_datafram_NEWsim_good.iloc[[ii]].reset_index(drop=True), pd_dataframe_PCA_obs_real, output_folder, fit_funct, gensim_data_Metsim, rmsd_pol_mag, rmsd_t0_lag, file_name, 0, False] for ii in range(len(pd_datafram_NEWsim_good))] + results_list = domainParallelizer(input_list_obs, PCA_LightCurveRMSDPLOT_optimize, cores=cml_args.cores) + + # base on the one selected + pd_datafram_PCA_selected_lowRMSD = pd.concat(results_list) + + # Timing end + end_time = time.time() + + # Compute elapsed time + elapsed_time = end_time - start_time + hours, rem = divmod(elapsed_time, 3600) + minutes, seconds = divmod(rem, 60) + # print('Elapsed time in seconds:',elapsed_time) + print(f"Elapsed time: {int(hours):02}:{int(minutes):02}:{int(seconds):02}") + + print() + + diff --git a/wmpl/MetSim/ML/GenerateSimulations.py b/wmpl/MetSim/ML/GenerateSimulations.py index 8138ee3f..0a4664fe 100644 --- a/wmpl/MetSim/ML/GenerateSimulations.py +++ b/wmpl/MetSim/ML/GenerateSimulations.py @@ -86,7 +86,7 @@ def __init__(self): # Power of a zero-magnitude meteor (Watts) - self.P_0M = 840 + self.P_0m = 840 # System FPS self.fps = 80 @@ -235,7 +235,7 @@ def __init__(self): # Power of a zero-magnitude meteor (Watts) - self.P_0M = 840 + self.P_0m = 840 # System FPS self.fps = 80 @@ -390,7 +390,7 @@ def __init__(self, output_dir, erosion_sim_params, random_seed=None): # Init simulation constants self.const = Constants() self.const.dens_co = self.params.dens_co - self.const.P_0M = self.params.P_0M + self.const.P_0m = self.params.P_0m # Set tau to CAMO faint meteor model self.const.lum_eff_type = 5 @@ -409,7 +409,7 @@ def __init__(self, output_dir, erosion_sim_params, random_seed=None): else: local_state = np.random.RandomState() - # Randomly sample physical parameters + # Randomly sample physical parameters for param_name in self.params.param_list: # Get the parameter container @@ -426,11 +426,22 @@ def __init__(self, output_dir, erosion_sim_params, random_seed=None): p.val = 10**(local_state.uniform(np.log10(p.min), np.log10(p.max))) - # Generate meteoroid masses distributed according to a power-law - elif param_name == "m_init": + # # Generate meteoroid masses distributed according to a power-law + # elif param_name == "m_init": - # Use a sampling mass index of 2 - p.val = samplePowerLaw(-2.0, p.min, p.max) + # # # Use a sampling mass index of 2 + # p.val = samplePowerLaw(-2.0, p.min, p.max) + # # # p.val = samplePowerLaw(-1.1, p.min, p.max) + # # # p.val = np.random.uniform(p.min, p.max) + # # # p.val = local_state.uniform(p.min, p.max) + + # Generate meteoroid v_init distributed according to a gaussian-law with their p.min, p.max at 5 sigman + elif param_name == 'v_init' or param_name == 'zenith_angle': + mean = (p.min + p.max) / 2 + std_dev = (p.max - p.min) / 10 # Ensure the range covers ±5 sigma + + # Sample a value from the Gaussian distribution + p.val = local_state.normal(mean, std_dev) # b) Distribute all other values uniformely else: @@ -564,8 +575,10 @@ def runSimulation(self, min_frames_visible=MIN_FRAMES_VISIBLE): self.ht_sampled = None self.len_sampled = None self.mag_sampled = None + self.vel_sampled = None + self.lag_sampled = None if res is 
not None: - _, self.time_sampled, self.ht_sampled, self.len_sampled, self.mag_sampled, _, \ + _, self.time_sampled, self.ht_sampled, self.len_sampled, self.mag_sampled, self.vel_sampled, self.lag_sampled, _, \ _ = extractSimData(self, min_frames_visible=min_frames_visible, check_only=False, param_class=self.params.__class__) @@ -574,45 +587,50 @@ def runSimulation(self, min_frames_visible=MIN_FRAMES_VISIBLE): self.ht_sampled = self.ht_sampled.tolist() self.len_sampled = self.len_sampled.tolist() self.mag_sampled = self.mag_sampled.tolist() + self.vel_sampled = self.vel_sampled.tolist() + self.lag_sampled = self.lag_sampled.tolist() - ### Sort saved files into a directory structure split by velocity and density ### + ### Sort saved files into a directory structure split by velocity and density ### - # Extract the velocity part - split_file = self.file_name.split("_") - vel = float(split_file[2].strip("v")) + # Extract the velocity part + split_file = self.file_name.split("_") + vel = float(split_file[2].strip("v")) - # Make velocity folder name - vel_folder = "v{:02d}".format(int(vel)) - vel_folder_path = os.path.join(self.output_dir, vel_folder) + # Make velocity folder name + vel_folder = "v{:02d}".format(int(vel)) + vel_folder_path = os.path.join(self.output_dir, vel_folder) - # Create the velocity folder if it doesn't already exist - if not os.path.isdir(vel_folder_path): - os.makedirs(vel_folder_path) + # Create the velocity folder if it doesn't already exist + if not os.path.isdir(vel_folder_path): + os.makedirs(vel_folder_path) - # Extract the density part - dens = 100*int(float(split_file[4].strip("rho"))/100) + # Extract the density part + dens = 100*int(float(split_file[4].strip("rho"))/100) - # Make density folder name - dens_folder = "rho{:04d}".format(dens) - dens_folder_path = os.path.join(vel_folder_path, dens_folder) + # Make density folder name + dens_folder = "rho{:04d}".format(dens) + dens_folder_path = os.path.join(vel_folder_path, dens_folder) - # Make the density folder - if not os.path.isdir(dens_folder_path): - os.makedirs(dens_folder_path) + # Make the density folder + if not os.path.isdir(dens_folder_path): + os.makedirs(dens_folder_path) - ### + ### + + # Save results as a JSON file + self.saveJSON(dens_folder_path) - # Save results as a JSON file - self.saveJSON(dens_folder_path) + # Save results as a pickle file + savePickle(self, dens_folder_path, self.file_name + ".pickle") - # Save results as a pickle file - savePickle(self, dens_folder_path, self.file_name + ".pickle") + return os.path.join(dens_folder_path, self.file_name + ".pickle") + else: + return None - return os.path.join(dens_folder_path, self.file_name + ".pickle") @@ -667,6 +685,7 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False, # Draw limiting magnitude and length end magnitude lim_mag = np.random.uniform(params.lim_mag_brightest, params.lim_mag_faintest) + lim_mag_len = np.random.uniform(params.lim_mag_len_end_brightest, params.lim_mag_len_end_faintest) # Draw the length delay @@ -675,8 +694,8 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False, postprocess_params = [lim_mag, lim_mag_len, len_delay] - lim_mag_faintest = np.max([lim_mag, lim_mag_len]) - lim_mag_brightest = np.min([lim_mag, lim_mag_len]) + # lim_mag_faintest = np.max([lim_mag, lim_mag_len]) + # lim_mag_brightest = np.min([lim_mag, lim_mag_len]) ### ### @@ -684,9 +703,11 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False, 
sim.simulation_results.abs_magnitude[np.isnan(sim.simulation_results.abs_magnitude)] \ = np.nanmax(sim.simulation_results.abs_magnitude) - # Get indices that are above the faintest limiting magnitude - indices_visible = sim.simulation_results.abs_magnitude <= lim_mag_faintest + # indices_visible = sim.simulation_results.abs_magnitude <= lim_mag_faintest + + # define the indices that are smller than the lim_mag + indices_visible = sim.simulation_results.abs_magnitude <= params.lim_mag_faintest # If no points were visible, skip this solution if not np.any(indices_visible): @@ -695,17 +716,33 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False, ### CHECK METEOR VISIBILITY WITH THE BRIGTHER (DETECTION) LIMITING MAGNITUDE ### ### (in the CAMO widefield camera) ### - # Get indices of magnitudes above the brighter limiting magnitude - indices_visible_brighter = sim.simulation_results.abs_magnitude >= lim_mag_brightest + # # Get indices of magnitudes above the brighter limiting magnitude + # indices_visible_brighter = sim.simulation_results.abs_magnitude <= lim_mag_brightest + + # define the indices that are smaller than the lim_mag_len + indices_visible_brighter = sim.simulation_results.abs_magnitude <= lim_mag_len # If no points were visible, skip this solution if not np.any(indices_visible_brighter): return None + + # Find the first index of indices_visible + first_visible_index = np.where(indices_visible)[0][0] + + # Find the last index of indices_visible_brighter + last_visible_brighter_index = np.where(indices_visible_brighter)[0][-1] + + # randomly chose between first_visible_index and first_visible_index-1 + first_visible_index = np.random.choice([first_visible_index, first_visible_index-1]) + # Create a mask that includes all points between the first and last indices + indices_range = np.arange(first_visible_index, last_visible_brighter_index + 1) + indices_visible = np.zeros_like(indices_visible, dtype=bool) + indices_visible[indices_range] = True # Compute the minimum time the meteor needs to be visible min_time_visible = min_frames_visible/params.fps + len_delay - time_lim_mag_bright = sim.simulation_results.time_arr[indices_visible_brighter] + time_lim_mag_bright = sim.simulation_results.time_arr[indices_visible] time_lim_mag_bright -= time_lim_mag_bright[0] # Check if the minimum time is satisfied @@ -714,24 +751,28 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False, ### ### - # Get the first index after the magnitude reaches visibility in the wide field - index_first_visibility = np.argwhere(indices_visible_brighter)[0][0] + # # Get the first index after the magnitude reaches visibility + # index_first_visibility = np.argwhere(indices_visible)[0][0] - # Set all visibility indices before the first one visible in the wide field to False - indices_visible[:index_first_visibility] = False + # # Set all visibility indices before the first one visible to False + # indices_visible[:index_first_visibility] = False # Select time, magnitude, height, and length above the visibility limit time_visible = sim.simulation_results.time_arr[indices_visible] mag_visible = sim.simulation_results.abs_magnitude[indices_visible] - ht_visible = sim.simulation_results.leading_frag_height_arr[indices_visible] - len_visible = sim.simulation_results.leading_frag_length_arr[indices_visible] + ht_visible = sim.simulation_results.brightest_height_arr[indices_visible] + len_visible = sim.simulation_results.brightest_length_arr[indices_visible] + 
vel_visible = sim.simulation_results.leading_frag_vel_arr[indices_visible]
+
+    # print('-------------------')
+    # print("mag_visible", mag_visible[-1])
 
     # Resample the time to the system FPS
     mag_interpol = scipy.interpolate.CubicSpline(time_visible, mag_visible)
     ht_interpol = scipy.interpolate.CubicSpline(time_visible, ht_visible)
     len_interpol = scipy.interpolate.CubicSpline(time_visible, len_visible)
+    vel_interpol = scipy.interpolate.CubicSpline(time_visible, vel_visible)
 
     # Sample the time according to the FPS from one camera
     time_sampled_cam1 = np.arange(np.min(time_visible), np.max(time_visible), 1.0/params.fps)
@@ -751,16 +792,43 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False,
     # Cut the time array to the length of the visible data
     time_sampled_cam2 = time_sampled_cam2[(time_sampled_cam2 >= np.min(time_visible)) & (time_sampled_cam2 <= np.max(time_visible))]
-    
+
     # Combine the two camera time arrays
     time_sampled = np.sort(np.concatenate([time_sampled_cam1, time_sampled_cam2]))
-
     # Create new mag, height and length arrays at FPS frequency
     mag_sampled = mag_interpol(time_sampled)
     ht_sampled = ht_interpol(time_sampled)
     len_sampled = len_interpol(time_sampled)
-    
+    vel_sampled = vel_interpol(time_sampled)
+
+    # print("mag_sampled", mag_sampled[-1])
+
+    # if the last sampled magnitude is still brighter than params.lim_mag_len_end_brightest, add another sample at time_sampled[-1]+1.0/params.fps
+    mag_diff = (mag_sampled[-1]-params.lim_mag_len_end_brightest)
+    # check if the difference is negative or positive
+    if mag_diff < 0:
+        time_sampled_temp = np.append(time_sampled, time_sampled[-1]+1.0/params.fps)
+        mag_sampled_temp = mag_interpol(time_sampled_temp)
+        if abs(mag_diff)>abs(mag_sampled_temp[-1]-params.lim_mag_len_end_brightest):
+            time_sampled = time_sampled_temp
+            mag_sampled = mag_sampled_temp
+            ht_sampled = ht_interpol(time_sampled)
+            len_sampled = len_interpol(time_sampled)
+            vel_sampled = vel_interpol(time_sampled)
+
+    elif mag_diff > 0:
+        time_sampled_temp = time_sampled[:-1]
+        mag_sampled_temp = mag_interpol(time_sampled_temp)
+        if abs(mag_diff)>abs(mag_sampled_temp[-1]-params.lim_mag_len_end_brightest): #lim_mag_len_end_faintest
+            time_sampled = time_sampled_temp
+            mag_sampled = mag_sampled_temp
+            ht_sampled = ht_interpol(time_sampled)
+            len_sampled = len_interpol(time_sampled)
+            vel_sampled = vel_interpol(time_sampled)
+
+    # print("NEW")
+    # print("mag_sampled NEW: ", mag_sampled[-1])
 
     # Normalize time to zero
     time_sampled -= time_sampled[0]
@@ -774,8 +842,9 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False,
     ###
 
-    # Set all magnitudes below the brightest limiting magnitude to the faintest magnitude
-    mag_sampled[mag_sampled > lim_mag] = params.lim_mag_len_end_faintest
+    #############WRONG################
+    # # Set all magnitudes below the brightest limiting magnitude to the faintest magnitude
+    # mag_sampled[mag_sampled > lim_mag] = params.lim_mag_len_end_faintest
 
     # Normalize the first length to zero
@@ -841,9 +910,21 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False,
         padOrTruncate(len_normed, params.data_length), \
         padOrTruncate(mag_normed, params.data_length)])
 
+    lag_sampled = len_sampled-(vel_sampled[0]*time_sampled+len_sampled[0])
+
+    # rebuild the velocity, with noise, from the sampled lengths
+    for vel_ii in range(1,len(time_sampled)):
+        if time_sampled[vel_ii]-time_sampled[vel_ii-1]<1.0/params.fps:
+            # if time_sampled[vel_ii] % 0.03125 < 0.000000001:
+            if vel_ii+1

[...]
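The loop above is truncated in this copy of the patch, but it appears to rebuild a noisy velocity profile by finite-differencing the sampled lengths over the sampled times. A minimal self-contained sketch of that idea (variable names and toy values assumed from context, not taken from the missing lines):

    import numpy as np

    time_sampled = np.array([0.0, 1/80, 2/80, 3/80])       # seconds, at an assumed 80 FPS
    len_sampled = np.array([0.0, 750.0, 1495.0, 2235.0])   # metres along the track, toy values

    # velocity as the finite difference of length over time
    vel_sampled = np.empty_like(len_sampled)
    vel_sampled[1:] = np.diff(len_sampled) / np.diff(time_sampled)
    vel_sampled[0] = vel_sampled[1]  # no earlier sample; reuse the first computed velocity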
Date: Mon, 17 Feb 2025 11:45:07 -0500
Subject: [PATCH 2/2] fix a bug in the AutoRefineFit.py plot, add commands,
 avoid generating folders if not present in AutoRefineFit.py, and add the
 latest version of PhysProp_GenSym

---
 wmpl/MetSim/AutoRefineFit.py            |   82 +-
 wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py | 7394 ++++++++++++-----------
 wmpl/MetSim/ML/GenerateSimulations.py   |   30 +-
 3 files changed, 4081 insertions(+), 3425 deletions(-)

diff --git a/wmpl/MetSim/AutoRefineFit.py b/wmpl/MetSim/AutoRefineFit.py
index 65763274..d843fa78 100644
--- a/wmpl/MetSim/AutoRefineFit.py
+++ b/wmpl/MetSim/AutoRefineFit.py
@@ -532,36 +532,27 @@ def loadFitOptions(dir_path, file_name):
 
 
 
-if __name__ == "__main__":
-
-    import argparse
-
-
-    #########################
-
-    # Init the command line arguments parser
-    arg_parser = argparse.ArgumentParser(description="Refine meteoroid ablation model parameters using automated optimization.")
-
-    arg_parser.add_argument('dir_path', metavar='DIR_PATH', type=str, \
-        help="Path to the directory containing the meteor data. The direction has to contain the trajectory pickle file, the simulated parameters .json file, and optionally a METAL .met file with the wide-field lightcurve.")
-
-    arg_parser.add_argument("fit_options_file", metavar="FIT_OPTIONS_FILE", type=str, \
-        help="Name of the file containing the fit options. It is assumed the file is located in the same directory as the meteor data.")
-
-    arg_parser.add_argument('--updated', action='store_true', \
-        help="Load the updated simulation JSON file insted of the original one.")
-
-    arg_parser.add_argument('-x', '--hideplots', \
-        help="Don't show generated plots on the screen, just save them to disk.", action="store_true")
-
-    # Parse the command line arguments
-    cml_args = arg_parser.parse_args()
+def runAutoRefine(dir_path_main, fit_options_file_main, updated_main=False, hideplots_main=True):
+    """
+    Run the automated refinement of the meteoroid ablation model parameters. The four arguments are:
+    - dir_path_main = Path to the directory containing the meteor data. The directory has to contain the trajectory pickle file, the simulated parameters .json file, and optionally a METAL .met file with the wide-field lightcurve.
+    - fit_options_file_main = Name of the file containing the fit options. It is assumed the file is located in the same directory as the meteor data.
+    - updated_main = Load the updated (refined) simulation JSON file instead of the original one. Default is False.
+    - hideplots_main = Don't show generated plots on the screen, just save them to disk. Default is True.
+    """
 
-    #########################
+    # Initialize cml_args so the body below can keep using the argparse-style attributes
+    class cml_args:
+        pass
+    cml_args.dir_path = dir_path_main
+    cml_args.fit_options_file = fit_options_file_main
+    cml_args.updated = updated_main
+    cml_args.hideplots = hideplots_main
 
     # Check that scipy version is at least 1.7.0 - older versions don't support Nelder-Mead with bounds
-    if scipy.__version__ < "1.7.0":
+    if tuple(map(int, scipy.__version__.split('.'))) < (1, 7, 0):
         print("ERROR: scipy version must be at least 1.7.0! Please upgrade scipy.")
         print("If you're using conda, then run:")
         print("conda install -c conda-forge scipy=1.7.3")
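The version check fixed above is worth a note: Python compares strings lexicographically, so the old scipy.__version__ < "1.7.0" breaks as soon as a version component has two digits. A standalone illustration (version_tuple is a hypothetical helper, not part of the patch):

import re

# Lexicographic string comparison gets multi-digit components wrong:
print("1.10.0" < "1.7.0")   # True - wrong, 1.10 is newer than 1.7

def version_tuple(v):
    # Keep only the leading numeric part of each component (handles e.g. "1.11.0rc1")
    return tuple(int(re.match(r"\d+", part).group()) for part in v.split(".")[:3])

print(version_tuple("1.10.0") < version_tuple("1.7.0"))  # False - correct
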
Please upgrade scipy.") print("If you're using conda, then run:") print("conda install -c conda-forge scipy=1.7.3") @@ -869,7 +860,7 @@ def loadFitOptions(dir_path, file_name): mag_filter = obs.absolute_magnitudes < 9 - ax_mag.plot(obs.absolute_magnitudes[mag_filter], obs.model_ht[mag_filter]/1000, marker='x', ms=8, alpha=0.5, label=obs.station_id) + ax_mag.plot(obs.absolute_magnitudes[mag_filter], obs.model_ht[mag_filter]/1000, marker='x', markersize=8, alpha=0.5, label=obs.station_id) # Compute the observed lag @@ -882,12 +873,12 @@ def loadFitOptions(dir_path, file_name): # obs_ht = obs.model_ht # Plot the observed lag - lag_handle = ax_lag.plot(obs_lag, obs_ht/1000, 'x', ms=8, alpha=0.5, linestyle='dashed', + lag_handle = ax_lag.plot(obs_lag, obs_ht/1000, 'x', alpha=0.5, linestyle='dashed', label=obs.station_id, markersize=10, linewidth=2) # Plot the velocity - ax_vel.plot(obs.velocities[1:]/1000, obs_ht[1:]/1000, 'x', ms=8, alpha=0.5, linestyle='dashed', + ax_vel.plot(obs.velocities[1:]/1000, obs_ht[1:]/1000, 'x', alpha=0.5, linestyle='dashed', label=obs.station_id, markersize=10, linewidth=2) # Update the min/max height @@ -986,4 +977,35 @@ def loadFitOptions(dir_path, file_name): plt.clf() plt.close() - ### ### \ No newline at end of file + ### ### + + + +if __name__ == "__main__": + + import argparse + + + ######################### + + # Init the command line arguments parser + arg_parser = argparse.ArgumentParser(description="Refine meteoroid ablation model parameters using automated optimization.") + + arg_parser.add_argument('dir_path', metavar='DIR_PATH', type=str, \ + help="Path to the directory containing the meteor data. The direction has to contain the trajectory pickle file, the simulated parameters .json file, and optionally a METAL .met file with the wide-field lightcurve.") + + arg_parser.add_argument("fit_options_file", metavar="FIT_OPTIONS_FILE", type=str, \ + help="Name of the file containing the fit options. It is assumed the file is located in the same directory as the meteor data.") + + arg_parser.add_argument('--updated', action='store_true', \ + help="Load the updated simulation JSON file that has been refined insted of the original one.") + + arg_parser.add_argument('-x', '--hideplots', \ + help="Do not show generated plots on the screen, just save them to disk.", action="store_true") + + # Parse the command line arguments + cml_args = arg_parser.parse_args() + + ######################### + + runAutoRefine(cml_args.dir_path, cml_args.fit_options_file, cml_args.updated, cml_args.hideplots) diff --git a/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py b/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py index 0a713fca..6daaba2e 100644 --- a/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py +++ b/wmpl/MetSim/ML/EMCCD_PhysProp_GenSym.py @@ -1,9 +1,9 @@ """ -The code is used to extract the physical properties of the simulated showers from EMCCD observations -by selecting the most similar simulated events in the PC space using: -- Mode of the siumulated events -- The min of the KDE esults -- Principal Component Regression (PCR) +The code is used to extract the physical properties of the simulated showers from observations +by selecting the most similar simulated events using a montecarlo method. 
 
import json
@@ -11,6 +11,11 @@
 import re
 import pandas as pd
 import matplotlib.pyplot as plt
+from matplotlib.gridspec import GridSpec
+import matplotlib.gridspec as gridspec
+# import matplotlib
+# matplotlib.use('Agg')
+import matplotlib.ticker as ticker
 from numpy.linalg import inv
 import numpy as np
 import subprocess
@@ -23,15 +28,16 @@
 from sklearn.preprocessing import StandardScaler
 import wmpl
 import shutil
-from scipy.stats import kurtosis, skew
 from wmpl.Utils.OSTools import mkdirP
 from matplotlib.ticker import ScalarFormatter
-import math
 from scipy.stats import gaussian_kde
+from scipy.stats import norm
+from scipy.stats import chi2
 from wmpl.Utils.PyDomainParallelizer import domainParallelizer
 from scipy.linalg import svd
 from wmpl.MetSim.GUI import loadConstants, saveConstants,SimulationResults
 from wmpl.MetSim.MetSimErosion import runSimulation, Constants, zenithAngleAtSimulationBegin
+from wmpl.MetSim.AutoRefineFit import runAutoRefine
 from scipy.interpolate import interp1d
 from matplotlib.colors import Normalize
 from scipy.optimize import minimize
@@ -48,79 +54,45 @@
 from sklearn.preprocessing import PowerTransformer
 from wmpl.MetSim.ML.GenerateSimulations import generateErosionSim,saveProcessedList,MetParam
 from wmpl.Utils.TrajConversions import J2000_JD, date2JD
+from wmpl.Utils.Math import meanAngle
 import warnings
 import itertools
 import time
 from multiprocessing import Pool
-
+from multiprocessing import cpu_count
+import multiprocessing
 
 # CONSTANTS ###########################################################################################
 
-FPS = 32
-NAME_SUFX_GENSIM = "_GenSim"
+NAME_SUFX_GENSIM = "_PhysUncert" # _GenSim
 NAME_SUFX_CSV_OBS = "_obs.csv"
-NAME_SUFX_CSV_SIM = "_sim.csv"
-NAME_SUFX_CSV_SIM_NEW = "_sim_new.csv"
-NAME_SUFX_CSV_CURRENT_FIT = "_fit_sim.csv"
-NAME_SUFX_CSV_PHYSICAL_FIT_RESULTS = "_physical_prop.csv"
-
-SAVE_SELECTION_FOLDER='Selection'
-VAR_SEL_DIR_SUFX = '_sel_var_vs_physProp'
-PCA_SEL_DIR_SUFX = '_sel_PCA_vs_physProp'
-SAVE_RESULTS_FOLDER='Results'
-SAVE_RESULTS_FOLDER_EVENTS_PLOTS='Results'+os.sep+'events_plots'
-
-# sigma value of the RMSD that is considered to select a good fit
-SIGMA_ERR = 1 # 1.96 # 95CI
-MAG_RMSD = 0.25
-# MAG_RMSD = 0.25 # for heavy
-# MAG_RMSD = 0.20 # for steep fast
-# MAG_RMSD = 0.15 # for shallow slow
-# MAG_RMSD = 0.05 # for small
-
-LEN_RMSD = 0.04 # 0.02
-# LEN_RMSD = 0.04
-# MAG_RMSD = 0.08
-# LEN_RMSD = 0.04 # 0.025
-
-# Use the IF function, one of the logical functions, to return one value if a condition is true and another value if it's false. For example: =IF(A2>B2,"Over Budget","OK") =IF(A2=B2,B4-A4,"")
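For reference, the relation between a sigma multiplier and a two-sided confidence level, which the CONFIDENCE_LEVEL line removed just below computed with the same formula; a quick standalone check with scipy.stats.norm:

from scipy.stats import norm

# Two-sided confidence level enclosed within +/- z sigma of a normal distribution
for z in (1.0, 1.96, 2.0):
    conf = (2*norm.cdf(z) - 1)*100
    print(f"z = {z:.2f}  ->  {conf:.1f}% confidence")
# z = 1.00 -> 68.3%, z = 1.96 -> 95.0%, z = 2.00 -> 95.4%
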
-
-# # Calculate the cumulative probability for the z-value, the confidence level is the percentage of the area within ±z_value
-# CONFIDENCE_LEVEL = (2 * stats.norm.cdf(SIGMA_ERR) - 1)*100
+NAME_SUFX_CSV_RESULTS = "_results.csv"
+METEOR_PLOTS_JSON_FILE_FOLDER = 'meteor_files'
+
+OPTIMIZATION_FOLDER='Optimization'
+# these may change through the script
+SAVE_RESULTS_FINAL_FOLDER='Results'
+
+# sensitivity level of the camera in magnitude
+CAMERA_SENSITIVITY_LVL_MAG = np.float64(0.1)
+# sensitivity level of the camera in length
+CAMERA_SENSITIVITY_LVL_LEN = np.float64(0.005)*1000
 
 # Length of data that will be used as an input during training
 DATA_LENGTH = 256
 # Default number of minimum frames for simulation
 MIN_FRAMES_VISIBLE = 4
 
+# Define the maximum difference in magnitude allowed
+MAX_MAG_DIFF = 1
+# Penalty thresholds
+TIME_THRESHOLD = 1 # frames
+HEIGHT_THRESHOLD = 1 # km
+
 # python -m EMCCD_PCA_Shower_PhysProp "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation\TEST" "PER" "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation" 1000
 # python -m EMCCD_PCA_Shower_PhysProp "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation\TEST" "PER" "C:\Users\maxiv\Documents\UWO\Papers\1)PCA\PCA_Error_propagation" 1000 > output.txt
 
-# FUNCTIONS ###########################################################################################
-
-# create a txt file where you save averithing that has been printed
-class Logger(object):
-    def __init__(self, directory=".", filename="log.txt"):
-        self.terminal = sys.stdout
-        # Ensure the directory exists
-        if not os.path.exists(directory):
-            os.makedirs(directory)
-        # Combine the directory and filename to create the full path
-        filepath = os.path.join(directory, filename)
-        self.log = open(filepath, "a")
-
-    def write(self, message):
-        self.terminal.write(message)
-        self.log.write(message)
-
-    def flush(self):
-        # This might be necessary as stdout could call flush
-        self.terminal.flush()
-
-    def close(self):
-        # Close the log file when done
-        self.log.close()
-
+# MATH FUNCTIONS ###########################################################################################
 
 def find_closest_index(time_arr, time_sampled):
     closest_indices = []
@@ -142,12 +114,18 @@ def cubic_lag(t, a, b, c, t0):
     t_after = t[t > t0]
 
     # Compute the lag linearly before t0
-    l_before = np.zeros_like(t_before)+c
+    l_before = np.zeros_like(t_before)#+c
 
     # Compute the lag quadratically after t0
-    l_after = -abs(a)*(t_after - t0)**3 - abs(b)*(t_after - t0)**2 + c
+    l_after = -abs(a)*(t_after - t0)**3 - abs(b)*(t_after - t0)**2 #+ c
 
-    return np.concatenate((l_before, l_after))
+    c=0
+
+    total_lag = np.concatenate((l_before, l_after))
+
+    total_lag = total_lag - total_lag[0]
+
+    return total_lag
 
 
 def cubic_velocity(t, a, b, v0, t0):
@@ -164,7 +142,7 @@ def cubic_velocity(t, a, b, v0, t0):
     # Compute the velocity linearly before t0
     v_before = np.ones_like(t_before)*v0
 
-    # Compute the velocity quadratically after t0
+    # Compute the velocity quadratically after t0
     v_after = -3*abs(a)*(t_after - t0)**2 - 2*abs(b)*(t_after - t0) + v0
 
     return np.concatenate((v_before, v_after))
@@ -234,42 +212,27 @@ def fit_mag_polin2_RMSD(data_mag, time_data):
 
     return fit1, residuals_pol, rmsd_pol,'Polinomial Fit'
 
-def fit_lag_t0_RMSD_old(lag_data,time_data,velocity_data):
-    v_init=velocity_data[0]
-    # initial guess of deceleration decel equal to linear fit of velocity
-    p0 =
[np.mean(lag_data), 0, 0, np.mean(time_data)] - opt_res = opt.minimize(lag_residual, p0, args=(np.array(time_data), np.array(lag_data)), method='Nelder-Mead') - a_t0, b_t0, c_t0, t0 = opt_res.x - fitted_lag_t0 = cubic_lag(np.array(time_data), a_t0, b_t0, c_t0, t0) - - opt_res_vel = opt.minimize(vel_residual, [a_t0, b_t0, v_init, t0], args=(np.array(time_data), np.array(velocity_data)), method='Nelder-Mead') - a_t0, b_t0, v_init_new, t0 = opt_res_vel.x # problem with the small time - fitted_vel_t0 = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init, t0) - - fitted_acc_t0 = cubic_acceleration(np.array(time_data), a_t0, b_t0, t0) - residuals_t0 = lag_data - fitted_lag_t0 - rmsd_t0 = np.sqrt(np.mean(residuals_t0 ** 2)) - - return fitted_lag_t0, residuals_t0, rmsd_t0, 'Cubic Fit', fitted_vel_t0, fitted_acc_t0 - -def fit_lag_t0_RMSD(lag_data, time_data, velocity_data): - v_init = velocity_data[0] +def fit_lag_t0_RMSD(lag_data, time_data, velocity_data, v_init): + # v_init = velocity_data[0] # initial guess of deceleration decel equal to linear fit of velocity p0 = [np.mean(lag_data), 0, 0, np.mean(time_data)] opt_res = opt.minimize(lag_residual, p0, args=(np.array(time_data), np.array(lag_data)), method='Nelder-Mead') a_t0, b_t0, c_t0, t0 = opt_res.x fitted_lag_t0 = cubic_lag(np.array(time_data), a_t0, b_t0, c_t0, t0) + # fitted_lag_t0 = fitted_lag_t0 - fitted_lag_t0[0] # Optimize velocity residual based on initial guess from lag residual opt_res_vel = opt.minimize(vel_residual, [a_t0, b_t0, v_init, t0], args=(np.array(time_data), np.array(velocity_data)), method='Nelder-Mead') a_t0_vel, b_t0_vel, v_init_vel, t0_vel = opt_res_vel.x fitted_vel_t0_vel = cubic_velocity(np.array(time_data), a_t0_vel, b_t0_vel, v_init_vel, t0_vel) + + fitted_vlag_t0_vel = cubic_lag(np.array(time_data), a_t0_vel, b_t0_vel, c_t0, t0_vel) # # Compute fitted velocity from original lag optimization # fitted_vel_t0_lag = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init, t0) # Compute fitted velocity from original lag optimization - fitted_vel_t0_lag = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init_vel, t0) + fitted_vel_t0_lag = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init, t0) # # Compute fitted velocity from original lag optimization # fitted_vel_t0_lag_vel = cubic_velocity(np.array(time_data), a_t0, b_t0, v_init_vel, t0) @@ -280,55 +243,91 @@ def fit_lag_t0_RMSD(lag_data, time_data, velocity_data): rmsd_vel_vel = np.sqrt(np.mean(residuals_vel_vel ** 2)) rmsd_vel_lag = np.sqrt(np.mean(residuals_vel_lag ** 2)) + + best_fitted_vel_t0 = fitted_vel_t0_lag + best_a_t0, best_b_t0, best_t0 = a_t0, b_t0, t0 - # Choose the best fitted velocity based on RMSD - if rmsd_vel_vel < rmsd_vel_lag: - best_fitted_vel_t0 = fitted_vel_t0_vel - best_a_t0, best_b_t0, best_t0 = a_t0_vel, b_t0_vel, t0_vel - else: - best_fitted_vel_t0 = fitted_vel_t0_lag - best_a_t0, best_b_t0, best_t0 = a_t0, b_t0, t0 - + # # Choose the best fitted velocity based on RMSD + # if rmsd_vel_vel < rmsd_vel_lag: + # best_fitted_vel_t0 = fitted_vel_t0_vel + # best_a_t0, best_b_t0, best_t0 = a_t0_vel, b_t0_vel, t0_vel + # else: + # best_fitted_vel_t0 = fitted_vel_t0_lag + # best_a_t0, best_b_t0, best_t0 = a_t0, b_t0, t0 + + # # plot the two curves of lag and velocity + # fig, ax = plt.subplots(1, 2, figsize=(14, 6), dpi=300) + # # flat the ax + # ax = ax.flatten() + # ax[0].plot(time_data, lag_data, 'go', label='Observation') + # ax[0].plot(time_data, fitted_lag_t0, 'k--', label='Cubic Fit lag') + # ax[0].plot(time_data, fitted_vlag_t0_vel, 
+    #                             'r--', label='Cubic Fit vel')
+    # ax[0].set_xlabel('Time (s)')
+    # ax[0].set_ylabel('Lag [m]')
+    # ax[0].legend()
+    # ax[1].plot(time_data, velocity_data, 'go', label='Observation')
+    # ax[1].plot(time_data, fitted_vel_t0_lag, 'k--', label='Cubic Fit lag')
+    # ax[1].plot(time_data, fitted_vel_t0_vel, 'r--', label='Cubic Fit vel')
+    # ax[1].set_ylabel('Velocity (m/s)')
+    # ax[1].set_xlabel('Time (s)')
+    # ax[1].legend()
+    # plt.show()
+
     fitted_acc_t0 = cubic_acceleration(np.array(time_data), best_a_t0, best_b_t0, best_t0)
 
+    # lag can be wrong for short meteors but still the RMSD will be the same as the scatter WILL NOT CHANGE
     residuals_t0 = lag_data - fitted_lag_t0
     rmsd_t0 = np.sqrt(np.mean(residuals_t0 ** 2))
 
-    return fitted_lag_t0, residuals_t0, rmsd_t0, 'Cubic Fit', best_fitted_vel_t0, fitted_acc_t0
+    # # lag can be wrong for short meteors where velocity drops suddenly
+    # fitted_lag_t0 = cubic_lag(np.array(time_data), best_a_t0, best_b_t0, c_t0, best_t0)
+    return fitted_lag_t0, residuals_t0, rmsd_t0, 'Cubic Fit', best_fitted_vel_t0, residuals_vel_vel, fitted_acc_t0
 
 
-def find_noise_of_data(data, plot_case=False):
+def find_noise_of_data(data, fps=32, output_folder='', file_name=''):
+    '''
+    Find the noise of the data
+    '''
 
     # make a copy of data_obs
     data_obs = copy.deepcopy(data)
 
-    fitted_lag_t0_lag, residuals_t0_lag, rmsd_t0_lag, fit_type_lag, fitted_vel_t0, fitted_acc_t0 = fit_lag_t0_RMSD(data_obs['lag'],data_obs['time'], data_obs['velocities'])
+    fitted_lag_t0_lag, residuals_t0_lag, rmsd_t0_lag, fit_type_lag, fitted_vel_t0, residuals_t0_vel, fitted_acc_t0 = fit_lag_t0_RMSD(data_obs['lag'],data_obs['time'], data_obs['velocities'], data_obs['v_init'])
 
     # now do it for fit_mag_polin2_RMSD
     fit_pol_mag, residuals_pol_mag, rmsd_pol_mag, fit_type_mag = fit_mag_polin2_RMSD(data_obs['absolute_magnitudes'],data_obs['time'])
 
+    # lag_sampled=len_sampled-(vel_sampled[0]*time_sampled+len_sampled[0])
+
+    # rebuild the length from the fitted lag and the initial velocity
+    len_t0_extr = fitted_lag_t0_lag + (fitted_vel_t0[0]*data_obs['time'])
+
     # create a pd dataframe with fit_pol_mag and fitted_vel_t0 and time and height
     fit_funct = {
         'velocities': fitted_vel_t0,
         'height': data_obs['height'],
         'absolute_magnitudes': fit_pol_mag,
         'time': data_obs['time'],
-        'lag': fitted_lag_t0_lag
+        'lag': fitted_lag_t0_lag,
+        'length': len_t0_extr,
+        'rmsd_len' : rmsd_t0_lag/1000,
+        'rmsd_mag' : rmsd_pol_mag,
+        'rmsd_vel' : rmsd_t0_lag/1000*np.sqrt(2)/(1.0/fps),
+        'fps': fps
     }
+
+    data_obs['res_absolute_magnitudes'] = residuals_pol_mag
+    data_obs['res_lag'] = residuals_t0_lag
+    data_obs['res_velocities'] = residuals_t0_vel/1000
+    data_obs['rmsd_len'] = rmsd_t0_lag/1000
+    data_obs['rmsd_mag'] = rmsd_pol_mag
+    data_obs['rmsd_vel'] = rmsd_t0_lag/1000*np.sqrt(2)/(1.0/fps)
 
-    if plot_case:
-        fig, ax = plt.subplots(1, 2, figsize=(14, 6), dpi=300)
-        # flat the ax
-        ax = ax.flatten()
-        plot_side_by_side(data,fig, ax,'go','Obsevation')
-
-        plot_side_by_side(fit_funct,fig, ax,'k--','fit')
-
-        return rmsd_t0_lag, rmsd_pol_mag, fit_pol_mag, fitted_lag_t0_lag, fit_funct, fig, ax
-    else:
-        return rmsd_t0_lag, rmsd_pol_mag, fit_pol_mag, fitted_lag_t0_lag, fit_funct
+    # data['name'] is a path and only the name of the file is needed
+    plot_data_with_residuals_and_real(rmsd_pol_mag*1.96, rmsd_t0_lag/1000*np.sqrt(2)/(1.0/fps)*1.96, rmsd_t0_lag/1000*1.96, fit_funct, data_obs, label_real=data['name'].split(os.sep)[-1], file_name=data['name'].split(os.sep)[-1]+'_fit_t0_polin_curve.png', output_dir = output_folder)
 
+    return rmsd_t0_lag, rmsd_pol_mag, fit_pol_mag, fitted_lag_t0_lag, fit_funct
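What find_noise_of_data computes, condensed into a standalone sketch (estimate_noise and fit_lag are hypothetical stand-ins): residuals against smooth fits give the per-point noise, and the velocity noise follows from differencing two noisy positions one frame apart, matching the rmsd_vel = rmsd_len*sqrt(2)/(1/fps) used above.

import numpy as np

def estimate_noise(time_s, lag_m, mag, fit_lag, fps=32):
    """Per-point noise as the RMSD of residuals against smooth fits; fit_lag
    stands in for the t0-cubic lag fit above, and the magnitude is smoothed
    here with a simple polynomial for illustration."""
    lag_fit = fit_lag(time_s, lag_m)                          # smooth lag model
    mag_fit = np.polyval(np.polyfit(time_s, mag, 3), time_s)  # smooth light curve
    rmsd_lag = np.sqrt(np.mean((lag_m - lag_fit)**2))
    rmsd_mag = np.sqrt(np.mean((mag - mag_fit)**2))
    # A velocity sample is the difference of two noisy positions one frame apart,
    # so its scatter is the position noise times sqrt(2) divided by the frame time
    rmsd_vel = rmsd_lag*np.sqrt(2)/(1.0/fps)
    return rmsd_mag, rmsd_lag, rmsd_vel
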
 #### Generate Observation #########################################################################
 
-def generate_observation_realization(data, rmsd_lag, rmsd_mag, fit_funct, name='', fig='', ax='', plot_case=False):
+def generate_observation_realization(data, rmsd_lag, rmsd_mag, fit_funct, name='', fps=32, fig='', ax='', plot_case=False):
 
     # print a . so that the next will be on the same line
     print('.', end='')
@@ -352,9 +351,9 @@ def generate_observation_realization(data, rmsd_lag, rmsd_mag, fit_funct, name='
     # Add noise to length data (Gaussian noise) for each realization
     fitted_lag_t0_lag += np.random.normal(loc=0.0, scale=rmsd_lag, size=len(data_obs['length']))
     data_obs['lag']=fitted_lag_t0_lag
-    # add noise to velocity data considering the noise as rmsd_lag/(1.0/FPS)
-    fitted_lag_t0_vel += np.random.normal(loc=0.0, scale=rmsd_lag/(1.0/FPS), size=len(data_obs['velocities']))
-    # fitted_lag_t0_vel += np.random.normal(loc=0.0, scale=rmsd_lag*np.sqrt(2)/(1.0/FPS), size=len(data_obs['velocities']))
+    # add noise to velocity data considering the noise as rmsd_lag*np.sqrt(2)/(1.0/fps)
+    # fitted_lag_t0_vel += np.random.normal(loc=0.0, scale=rmsd_lag/(1.0/fps), size=len(data_obs['velocities']))
+    fitted_lag_t0_vel += np.random.normal(loc=0.0, scale=rmsd_lag*np.sqrt(2)/(1.0/fps), size=len(data_obs['velocities']))
     data_obs['velocities']=fitted_lag_t0_vel
     ### ###
@@ -364,8 +363,8 @@ def generate_observation_realization(data, rmsd_lag, rmsd_mag, fit_funct, name='
     # # get the new velocity with noise
     # for vel_ii in range(1,len(data_obs['time'])-1):
-    #     diff_1=abs((data_obs['time'][vel_ii]-data_obs['time'][vel_ii-1])-1.0/FPS)
-    #     diff_2=abs((data_obs['time'][vel_ii+1]-data_obs['time'][vel_ii-1])-1.0/FPS)
+    #     diff_1=abs((data_obs['time'][vel_ii]-data_obs['time'][vel_ii-1])-1.0/fps)
+    #     diff_2=abs((data_obs['time'][vel_ii+1]-data_obs['time'][vel_ii-1])-1.0/fps)
     #     if diff_160000:
+    #     erosion_sim_params.dt = 0.005
+    # elif v_init_180km<20000:
+    #     erosion_sim_params.dt = 0.01
+    # else:
+    #     erosion_sim_params.dt = (-1)*0.000000125*v_init_180km+0.0125
 
-    # Initial velocity range (m/s)
-    erosion_sim_params.v_init = MetParam(v_init_180km-1000, v_init_180km+1000) # 60091.41691
 
     # Zenith angle range
-    erosion_sim_params.zenith_angle = MetParam(np.radians(real_data['zenith_angle'].iloc[0]-0.01), np.radians(real_data['zenith_angle'].iloc[0]+0.01)) # 43.466538
+    erosion_sim_params.zenith_angle = MetParam(np.radians(real_data['zenith_angle'].iloc[0]-0.1), np.radians(real_data['zenith_angle'].iloc[0]+0.1)) # 43.466538
+
+    ###### PANDAS DATAFRAME RANGES ######
 
-    # erosion_sim_params.erosion_height_start = MetParam(real_data['peak_mag_height'].iloc[0]*1000+(real_data['begin_height'].iloc[0]-real_data['peak_mag_height'].iloc[0])*1000/2, real_data['begin_height'].iloc[0]*1000+(real_data['begin_height'].iloc[0]-real_data['peak_mag_height'].iloc[0])*1000/2) # 43.466538
-    erosion_sim_params.erosion_height_start = MetParam(real_data['begin_height'].iloc[0]*1000-1000, real_data['begin_height'].iloc[0]*1000+4000) # 43.466538
+    erosion_range_min=(np.log10(erosion_sim_params.erosion_mass_max.max) - np.log10(erosion_sim_params.erosion_mass_min.min))
+    erosion_range_max=(np.log10(erosion_sim_params.erosion_mass_max.min) - np.log10(erosion_sim_params.erosion_mass_min.max))
+
+    const = simulation_MetSim_object.const
+
+    # Load the constants
+    # const, _ = loadConstants(cost_path)
+    const.dens_co = np.array(const.dens_co)
+
+    # copy const in const_min and const_max
+    const_min = copy.deepcopy(const)
+    const_max = copy.deepcopy(const)
+
+
const_min.erosion_height_start = erosion_sim_params.erosion_height_start.min + const_min.v_init = erosion_sim_params.v_init.min + const_min.zenith_angle = erosion_sim_params.zenith_angle.min + const_min.m_init = erosion_sim_params.m_init.min + const_min.rho = erosion_sim_params.rho.min + + const_max.erosion_height_start = erosion_sim_params.erosion_height_start.max + const_max.v_init = erosion_sim_params.v_init.max + const_max.zenith_angle = erosion_sim_params.zenith_angle.max + const_max.m_init = erosion_sim_params.m_init.max + const_max.rho = erosion_sim_params.rho.max + + # Compute the erosion energies + erosion_energy_per_unit_cross_section_min, erosion_energy_per_unit_mass_min = wmpl.MetSim.MetSimErosion.energyReceivedBeforeErosion(const_min) + erosion_energy_per_unit_cross_section_max, erosion_energy_per_unit_mass_max = wmpl.MetSim.MetSimErosion.energyReceivedBeforeErosion(const_max) + + pd_dataframe_col = ['mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max', 'erosion_range', 'erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass'] + pd_dataframe_ranges = pd.DataFrame(columns=pd_dataframe_col) + pd_dataframe_ranges.loc[0] = [erosion_sim_params.m_init.min, erosion_sim_params.rho.min, erosion_sim_params.sigma.min, erosion_sim_params.erosion_height_start.min/1000, erosion_sim_params.erosion_coeff.min, erosion_sim_params.erosion_mass_index.min, erosion_sim_params.erosion_mass_min.min, erosion_sim_params.erosion_mass_max.min, erosion_range_min, erosion_energy_per_unit_cross_section_min, erosion_energy_per_unit_mass_min] + pd_dataframe_ranges.loc[1] = [erosion_sim_params.m_init.max, erosion_sim_params.rho.max, erosion_sim_params.sigma.max, erosion_sim_params.erosion_height_start.max/1000, erosion_sim_params.erosion_coeff.max, erosion_sim_params.erosion_mass_index.max, erosion_sim_params.erosion_mass_min.max, erosion_sim_params.erosion_mass_max.max, erosion_range_max, erosion_energy_per_unit_cross_section_max, erosion_energy_per_unit_mass_max] + + # # erosion_sim_params.erosion_height_start = MetParam(real_data['peak_mag_height'].iloc[0]*1000+(real_data['begin_height'].iloc[0]-real_data['peak_mag_height'].iloc[0])*1000/2, real_data['begin_height'].iloc[0]*1000+(real_data['begin_height'].iloc[0]-real_data['peak_mag_height'].iloc[0])*1000/2) # 43.466538 + # erosion_sim_params.erosion_height_start = MetParam(real_data['begin_height'].iloc[0]*1000-1000, real_data['begin_height'].iloc[0]*1000+4000) # 43.466538 + return erosion_sim_params, pd_dataframe_ranges + + + + +def generate_simulations(real_data,simulation_MetSim_object,gensim_data_obs,fit_funct,n_res_to_find,cores_to_run,result_folder,output_folder,file_name, fps, dens_co, flag_manual_metsim=True, CI_physical_param=''): + ''' + Generate simulations for the given real data + ''' + + # Init simulation parameters with the given class name + erosion_sim_params, _ = range_gen_simulations(real_data,simulation_MetSim_object, fps, dens_co, flag_manual_metsim) if CI_physical_param!='': - erosion_sim_params.v_init = MetParam(CI_physical_param['v_init_180km'][0], CI_physical_param['v_init_180km'][1]) # 60091.41691 + erosion_sim_params.v_init = MetParam(CI_physical_param['vel_180km'][0], CI_physical_param['vel_180km'][1]) # 60091.41691 erosion_sim_params.zenith_angle = MetParam(np.radians(CI_physical_param['zenith_angle'][0]), np.radians(CI_physical_param['zenith_angle'][1])) # 43.466538 erosion_sim_params.m_init = MetParam(CI_physical_param['mass'][0], 
CI_physical_param['mass'][1]) erosion_sim_params.rho = MetParam(CI_physical_param['rho'][0], CI_physical_param['rho'][1]) @@ -650,17 +805,17 @@ def generate_simulations(real_data,simulation_MetSim_object,gensim_data,numb_sim if os.path.exists(output_folder+os.sep+"log_"+file_name[:15]+"_GenereateSimulations_range_NEW.txt"): # remove the file os.remove(output_folder+os.sep+"log_"+file_name[:15]+"_GenereateSimulations_range_NEW.txt") - sys.stdout = Logger(output_folder,"log_"+file_name[:15]+"_GenereateSimulations_range_NEW.txt") # _30var_99%_13PC + sys.stdout = Logger(output_folder,"log_"+file_name[:15]+"_GenereateSimulations_range_NEW.txt") # _30var_99perc_13PC else: + # check if a file with the name "log"+n_PC_in_PCA+"_"+str(len(df_sel))+"ev.txt" already exist if os.path.exists(output_folder+os.sep+"log_"+file_name[:15]+"_GenereateSimulations_range.txt"): # remove the file - os.remove(output_folder+os.sep+"log_"+file_name[:15]+"GenereateSimulations_range.txt") - sys.stdout = Logger(output_folder,"log_"+file_name[:15]+"GenereateSimulations_range.txt") # _30var_99%_13PC + os.remove(output_folder+os.sep+"log_"+file_name[:15]+"_GenereateSimulations_range.txt") + sys.stdout = Logger(output_folder,"log_"+file_name[:15]+"_GenereateSimulations_range.txt") # _30var_99perc_13PC - print('Run',numb_sim,'simulations with :') - # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [s^2/km^2]','erosion height start [km]','erosion coeff [s^2/km^2]','erosion mass index [-]','eros. mass min [kg]','eros. mass max [kg]'] + # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [kg/MJ]','erosion height start [km]','erosion coeff [kg/MJ]','erosion mass index','eros. mass min [kg]','eros. mass max [kg]'] print('\\hline') #df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0] print('Variables & min.val. & MAX.val. \\\\') @@ -672,17 +827,17 @@ def generate_simulations(real_data,simulation_MetSim_object,gensim_data,numb_sim print('\\hline') # - zenith angle: min 28.736969960110045 - MAX 28.75696996011005 # print('- zenith angle: min',np.degrees(erosion_sim_params.zenith_angle.min),'- MAX',np.degrees(erosion_sim_params.zenith_angle.max)) - print(f"Zenith ang. [deg] & {'{:.4g}'.format(np.degrees(erosion_sim_params.zenith_angle.min))} & {'{:.4g}'.format(np.degrees(erosion_sim_params.zenith_angle.max))} \\\\") + print(f"Zenith ang. [deg] & {'{:.6g}'.format(np.degrees(erosion_sim_params.zenith_angle.min))} & {'{:.6g}'.format(np.degrees(erosion_sim_params.zenith_angle.max))} \\\\") print('\\hline') # - Initial mag: min 5.45949291900601 - MAX 5.43949291900601 # print('- Initial mag: min',erosion_sim_params.lim_mag_faintest,'- MAX',erosion_sim_params.lim_mag_brightest) - print(f"Init. mag [-] & {'{:.4g}'.format(erosion_sim_params.lim_mag_faintest)} & {'{:.4g}'.format(erosion_sim_params.lim_mag_brightest)} \\\\") + print(f"Init. mag & {'{:.4g}'.format(erosion_sim_params.lim_mag_faintest)} & {'{:.4g}'.format(erosion_sim_params.lim_mag_brightest)} \\\\") print('\\hline') # - Final mag: min 6.0268141526507435 - MAX 6.006814152650744 # print('- Final mag: min',erosion_sim_params.lim_mag_len_end_faintest,'- MAX',erosion_sim_params.lim_mag_len_end_brightest) - print(f"Fin. mag [-] & {'{:.4g}'.format(erosion_sim_params.lim_mag_len_end_faintest)} & {'{:.4g}'.format(erosion_sim_params.lim_mag_len_end_brightest)} \\\\") + print(f"Fin. 
mag & {'{:.4g}'.format(erosion_sim_params.lim_mag_len_end_faintest)} & {'{:.4g}'.format(erosion_sim_params.lim_mag_len_end_brightest)} \\\\")
     print('\\hline')
 
     # - Mass: min 5.509633400654068e-07 - MAX 1.5509633400654067e-06
@@ -697,7 +852,7 @@ def generate_simulations(real_data,simulation_MetSim_object,gensim_data,numb_sim
     print('\\hline')
     # - sigma : min 8e-09 - MAX 3e-08
     # print('- sigma : min',erosion_sim_params.sigma.min,'- MAX',erosion_sim_params.sigma.max)
-    print(f"sigma [s^2/km^2] & {'{:.4g}'.format(erosion_sim_params.sigma.min*1000000)} & {'{:.4g}'.format(erosion_sim_params.sigma.max*1000000)} \\\\")
+    print(f"sigma [kg/MJ] & {'{:.4g}'.format(erosion_sim_params.sigma.min*1000000)} & {'{:.4g}'.format(erosion_sim_params.sigma.max*1000000)} \\\\")
     print('\\hline')
 
     # - erosion_height_start : min 107622.04437691614 - MAX 117622.04437691614
@@ -707,12 +862,12 @@ def generate_simulations(real_data,simulation_MetSim_object,gensim_data,numb_sim
     print('\\hline')
     # - erosion_coeff : min 0.0 - MAX 1e-06
     # print('- erosion_coeff : min',erosion_sim_params.erosion_coeff.min,'- MAX',erosion_sim_params.erosion_coeff.max)
-    print(f"Eros.coeff. [s^2/km^2] & {'{:.4g}'.format(erosion_sim_params.erosion_coeff.min*1000000)} & {'{:.4g}'.format(erosion_sim_params.erosion_coeff.max*1000000)} \\\\")
+    print(f"Eros.coeff. [kg/MJ] & {'{:.4g}'.format(erosion_sim_params.erosion_coeff.min*1000000)} & {'{:.4g}'.format(erosion_sim_params.erosion_coeff.max*1000000)} \\\\")
     print('\\hline')
 
     # - erosion_mass_index : min 1.5 - MAX 2.5
     # print('- erosion_mass_index : min',erosion_sim_params.erosion_mass_index.min,'- MAX',erosion_sim_params.erosion_mass_index.max)
-    print(f"Eros.mass index [-] & {'{:.4g}'.format(erosion_sim_params.erosion_mass_index.min)} & {'{:.4g}'.format(erosion_sim_params.erosion_mass_index.max)} \\\\")
+    print(f"Eros.mass index & {'{:.4g}'.format(erosion_sim_params.erosion_mass_index.min)} & {'{:.4g}'.format(erosion_sim_params.erosion_mass_index.max)} \\\\")
     print('\\hline')
 
     # - erosion_mass_min : min 5e-12 - MAX 1e-10
@@ -733,70 +888,234 @@ def generate_simulations(real_data,simulation_MetSim_object,gensim_data,numb_sim
     # Reset sys.stdout to its original value if needed
     sys.stdout = sys.__stdout__
 
-    input_list = [(output_folder, copy.deepcopy(erosion_sim_params), np.random.randint(0, 2**31 - 1), MIN_FRAMES_VISIBLE) for _ in range(numb_sim)]
-    with Pool(cml_args.cores) as pool:
-        results_list = pool.map(safe_generate_erosion_sim, input_list)
+    mkdirP(output_folder+os.sep+OPTIMIZATION_FOLDER)
+
+    # check if gensim_data_obs['name'] ends with pickle
+    if gensim_data_obs['name'].endswith('.pickle'):
+        print('The detected meteor is in pickle format')
+        # pickle observations can be optimized
+        optimiz_case = True
+    else:
+        print('The detected meteor is not saved in pickle format')
+        # it cannot optimize the json files
+        optimiz_case = False
+
+    # Create a Pool with 'cores_to_run' processes
+    pool = multiprocessing.Pool(processes=cores_to_run)
+
+    try:
+        # Keep submitting tasks as long as we don't have enough JSON files
+        while True:
+            # Check how many JSON files we have so far
+            all_jsonfiles = get_json_files(result_folder)
+            current_count = len(all_jsonfiles)
+            print(f"{n_res_to_find} needed, current count: {current_count}, still need {n_res_to_find - current_count}")
+            # If we have enough, break
+            if current_count >= n_res_to_find:
+                print(f"SUCCESS: Found {current_count} JSON files, stopping.")
+                # Terminate *all* running/pending tasks immediately
+                pool.terminate()
+                # Join so we wait for worker processes to actually exit
+                pool.join()
+                # (Optional) cleanup
+                for folder in os.listdir(output_folder):
+                    # Example: remove folders that match vNN, e.g. v60
+                    if re.match(r'v\d{2}', folder):
+                        shutil.rmtree(os.path.join(output_folder, folder))
+
+                # delete the (output_folder+os.sep+OPTIMIZATION_FOLDER)
+                shutil.rmtree(output_folder+os.sep+OPTIMIZATION_FOLDER)
+
+                break
+
+            # If not enough yet, spawn *one* new task
+            pool.apply_async(
+                search_for_good_results,
+                args=(
+                    n_res_to_find,
+                    gensim_data_obs,
+                    fit_funct,
+                    result_folder,
+                    output_folder,
+                    copy.deepcopy(erosion_sim_params),
+                    optimiz_case,
+                    MIN_FRAMES_VISIBLE
+                )
+            )
+
+            # Sleep a little so we don't spawn tasks too rapidly in a tight loop
+            # This also gives the pool time to schedule tasks and start them
+            time.sleep(0.1)
+
+    finally:
+        # Once we have enough files, or if there's an error, close the pool
+        pool.close()
+        pool.join()
 
-    count_none = sum(res is None for res in results_list)
-    saveProcessedList(output_folder, results_list, erosion_sim_params.__class__.__name__, MIN_FRAMES_VISIBLE)
-    print('Resulted simulations:', numb_sim - count_none)
-    print('Failed simulations:', count_none)
-    print('Saved', numb_sim - count_none, 'simulations in', output_folder)
 
-    #########################
-    # # Generate simulations using multiprocessing
-    # input_list = [[output_folder, copy.deepcopy(erosion_sim_params), \
-    #     np.random.randint(0, 2**31 - 1),MIN_FRAMES_VISIBLE] for _ in range(numb_sim)]
-    # results_list = domainParallelizer(input_list, generateErosionSim, cores=cml_args.cores)
 
-    # # print(results_list)
 
-    # # count how many None are in the results_list
-    # count_none=0
-    # for res in results_list:
-    #     if res is None:
-    #         count_none+=1
-    #         continue
 
-    # saveProcessedList(output_folder, results_list, erosion_sim_params.__class__.__name__, \
-    #     MIN_FRAMES_VISIBLE)
-    # print('Resulted simulations:', numb_sim-count_none)
-    # print('Failed siulations', len(results_list)/100*count_none,'%')
-    # print('Saved',numb_sim-count_none,'simulations in',output_folder)
-    #########################
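The generate_simulations loop above boils down to this pattern (a sketch under the assumption that one_attempt is a picklable top-level function that writes its own output file, and count_results re-scans the results folder):

import multiprocessing
import time

def run_until_enough(n_needed, n_cores, one_attempt, count_results):
    """Keep spawning single attempts until count_results() reports enough output files."""
    pool = multiprocessing.Pool(processes=n_cores)
    try:
        while True:
            if count_results() >= n_needed:
                pool.terminate()   # stop running/pending attempts immediately
                pool.join()
                break
            pool.apply_async(one_attempt)  # one fire-and-forget attempt per pass
            time.sleep(0.1)                # give the pool time to schedule work
    finally:
        pool.close()   # no-op if already terminated
        pool.join()

Like the code above, this trades a little oversubmission (tasks keep being queued while workers are busy) for simplicity; the stopping condition is re-checked against the output folder on every pass rather than tracked per task.
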
+ """ - # plot the pickle files data that are not none in the results_list - # do not plot more than 10 curves - if plot_case: + # walk thorought the directories and find all the json files inside each folder inside the directory + file_name_obs = os.path.basename(gensim_data_obs['name'])[:15] + real_event_copy = copy.deepcopy(gensim_data_obs) - fig, ax = plt.subplots(1, 2, figsize=(14, 6), dpi=300) - # flat the ax - ax = ax.flatten() + print('CPU:',multiprocessing.current_process().name) + # very random seed np.random.seed(None) + results_list = safe_generate_erosion_sim([output_folder, erosion_sim_params, np.random.seed(None), minframvis]) # np.random.randint(0, 2**31 - 1) - jj_plots_curve=0 - for res in results_list: - if res is not None: - if jj_plots_curve>100: - # stop if too many curves are plotted - break - - if res[0] is not None: - # change res[0] extension to .json - res[0] = res[0].replace('.pickle', '.json') - print(res[0]) - # get the first value of res - gensim_data_sim = read_GenerateSimulations_output(res[0]) - - plot_side_by_side(gensim_data_sim, fig, ax, 'b-') - jj_plots_curve += 1 - - plot_side_by_side(gensim_data,fig, ax,'go','Obsevation') + print('results_list',results_list) + + # chnage the extension of results_list[0] to json + results_json = results_list[0].replace('.pickle','.json') + gensim_data = read_GenerateSimulations_output(results_json, real_event_copy) + flag_results_found = create_json_file_and_optimiz(gensim_data, file_name_obs, real_event_copy, fit_funct, result_folder, output_folder, '',1.1, optimiz_case) + if flag_results_found: + print('Results found for',results_json) + else: + print('Results not found for',results_json) + + # remove the results + os.remove(results_json) + os.remove(results_list[0]) + + + + +def create_json_file_and_optimiz(gensim_data, file_name_obs, gensim_data_obs, fit_funct, result_folder, output_folder, filetype='', opt_multip=2, optim=False): + + results_json = gensim_data['name'] + # take the folder and the name of the file + _, results_json_name = os.path.split(results_json) + + # results_pickle_name = results_list[0].split(os.sep)[-1] + + image_name=results_json_name[:-5]+'.png' - return fig, ax + chi2_red_mag, chi2_red_vel, chi2_red_len, rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos , lag_km_sim = RMSD_calc_diff(gensim_data, gensim_data_obs) #, fit_funct + # Interpolation on the fit data's height grid + interp_ht_time = interp1d(gensim_data_obs['height'], gensim_data_obs['time'], kind='linear', bounds_error=False, fill_value='extrapolate') + # Interpolated fit on data grid + sim_time_pos = interp_ht_time(gensim_data['height']) + + # copy the data to the mode + data_file = gensim_data.copy() + data_file['time'] = sim_time_pos + data_file['res_absolute_magnitudes'] = residuals_mag + data_file['res_velocities'] = residuals_vel + data_file['res_lag'] = residuals_len * 1000 + data_file['lag'] = lag_km_sim * 1000 + data_file['rmsd_mag'] = rmsd_mag + data_file['rmsd_vel'] = rmsd_vel + data_file['rmsd_len'] = rmsd_lag + data_file['chi2_red_mag'] = chi2_red_mag + data_file['chi2_red_len'] = chi2_red_len + + print('REAL rmsd_mag',gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'],'SIM rmsd_mag',rmsd_mag,'check') + print('REAL rmsd_lag',gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'],'SIM rmsd_lag',rmsd_lag,'check') + if gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score']>rmsd_mag and gensim_data_obs['rmsd_len']*gensim_data_obs['z_score']>rmsd_lag: + 
+    if gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'] > rmsd_mag and gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'] > rmsd_lag:
+        print('SUCCESS: Below RMSD threshold RMSD: MAG sim',np.round(rmsd_mag,3),'REAL',np.round(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'],3),'|| LAG',np.round(rmsd_lag,3),'REAL',np.round(gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'],3))
+
+        plot_data_with_residuals_and_real(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_vel']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'], fit_funct, gensim_data_obs, gensim_data_obs['name'].split(os.sep)[-1], image_name, result_folder, data_file, filetype)
+
+        # save the results
+        shutil.copy(results_json, result_folder+os.sep+results_json_name)
+        return True
+
+    # try to optimize if within opt_multip times the threshold
+    elif gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score']*opt_multip > rmsd_mag and gensim_data_obs['rmsd_len']*gensim_data_obs['z_score']*opt_multip > rmsd_lag and optim==True:
+        print('... : Try optimization as it is below the RMSD threshold: MAG sim',np.round(rmsd_mag,3),'REAL',np.round(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score']*opt_multip,3),'|| LAG',np.round(rmsd_lag,3),'REAL',np.round(gensim_data_obs['rmsd_len']*gensim_data_obs['z_score']*opt_multip,3))
+
+        # file_json_save_phys=output_folder+os.sep+OPTIMIZATION_FOLDER+os.sep+results_json_name[:-5]+'_fitted.json'
+        output_dir_optimized = output_folder+os.sep+OPTIMIZATION_FOLDER+os.sep+'Optimization_'+results_json_name[:-5]
+        # file_json_save_phys=output_dir_optimized+os.sep+results_json_name[:-5]+'_fitted.json'
+        file_optim_results = output_dir_optimized+os.sep+file_name_obs+'_sim_fit_fitted.json'
+        # image_name=results_json_name[:-5]+'_fitted.png'
+        # results_json_name_fit = results_json_name[:-5]+'_fitted.json'
+        mkdirP(output_dir_optimized)
+
+        plot_data_with_residuals_and_real(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_vel']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'], fit_funct, gensim_data_obs, gensim_data_obs['name'].split(os.sep)[-1], image_name, output_dir_optimized, data_file, filetype)
+
+        # open the selected json file and save its 'const' section as file_name_obs+'_sim_fit.json'
+        with open(results_json) as json_file:
+            data = json.load(json_file)
+            const_part = data['const']
+            with open(output_dir_optimized+os.sep+file_name_obs+'_sim_fit.json', 'w') as outfile:
+                json.dump(const_part, outfile, indent=4)
+
+        shutil.copy(output_folder+os.sep+'AutoRefineFit_options.txt', output_dir_optimized+os.sep+'AutoRefineFit_options.txt')
+        update_sigma_values(output_dir_optimized+os.sep+'AutoRefineFit_options.txt', gensim_data_obs['rmsd_mag'], gensim_data_obs['rmsd_len'], False, False) # More_complex_fit=False, Custom_refinement=False
+        # run the optimization
+        shutil.copy(gensim_data_obs['name'], output_dir_optimized+os.sep+os.path.basename(gensim_data_obs['name']))
+        print('running the optimization... CPU:',multiprocessing.current_process().name)
+        # this creates a new file called output_dir+os.sep+file_name_obs+'_sim_fit_fitted.json'
+        runAutoRefine(output_dir_optimized, 'AutoRefineFit_options.txt', updated_main=False, hideplots_main=True)
+
+        _, gensim_data_optimized, pd_datafram_PCA_sim_optimized = run_simulation(file_optim_results, gensim_data_obs)
+
+        chi2_red_mag, chi2_red_vel, chi2_red_len, rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos, lag_km_sim = RMSD_calc_diff(gensim_data_optimized, gensim_data_obs)
+
+        # Interpolated fit on data grid
+        sim_time_pos = interp_ht_time(gensim_data_optimized['height'])
+
+        # copy the optimized simulation into the plotting dictionary
+        data_file_sim_opt = gensim_data_optimized.copy()
+        data_file_sim_opt['time'] = sim_time_pos
+        data_file_sim_opt['res_absolute_magnitudes'] = residuals_mag
+        data_file_sim_opt['res_velocities'] = residuals_vel
+        data_file_sim_opt['res_lag'] = residuals_len * 1000
+        data_file_sim_opt['lag'] = lag_km_sim * 1000
+        data_file_sim_opt['rmsd_mag'] = rmsd_mag
+        data_file_sim_opt['rmsd_vel'] = rmsd_vel
+        data_file_sim_opt['rmsd_len'] = rmsd_lag
+        data_file_sim_opt['chi2_red_mag'] = chi2_red_mag
+        data_file_sim_opt['chi2_red_len'] = chi2_red_len
+        plot_data_with_residuals_and_real(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_vel']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'], fit_funct, gensim_data_obs, gensim_data_obs['name'].split(os.sep)[-1], image_name, output_dir_optimized, data_file, filetype, data_file_sim_opt, 'Optimized')
+
+        if gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'] > rmsd_mag and gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'] > rmsd_lag:
+            print('SUCCESS: OPTIMIZATION Below RMSD threshold RMSD: MAG sim',np.round(rmsd_mag,3),'REAL',np.round(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'],3),'|| LAG',np.round(rmsd_lag,3),'REAL',np.round(gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'],3))
+
+            # plot the optimized fit over the observation
+            plot_data_with_residuals_and_real(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_vel']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'], fit_funct, gensim_data_obs, gensim_data_obs['name'].split(os.sep)[-1], image_name, result_folder, data_file, filetype, data_file_sim_opt, 'Optimized')
+
+            # save the results
+            shutil.copy(file_optim_results, result_folder+os.sep+results_json_name)
+            # shutil.move(results_list[0], result_folder+os.sep+results_pickle_name)
+
+            # # remove the folder of the optimization
+            # shutil.rmtree(output_dir_optimized)
+            return True
+
+        else:
+            print('FAIL: OPTIMIZATION Above RMSD threshold RMSD: MAG sim',np.round(rmsd_mag,3),'REAL',np.round(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'],3),'|| LAG',np.round(rmsd_lag,3),'REAL',np.round(gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'],3))
+            # # remove the folder of the optimization
+            # shutil.rmtree(output_dir_optimized)
+            return False
+
+    # case in which the filetype is Metsim but above the RMSD
+    elif filetype == 'Metsim':
+
+        print('FAIL: Bad Metsim, redo manually or try optimization!\nAbove RMSD threshold RMSD: MAG sim',np.round(rmsd_mag,3),'REAL',np.round(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'],3),'|| LAG',np.round(rmsd_lag,3),'REAL',np.round(gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'],3))
plot_data_with_residuals_and_real(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_vel']*gensim_data_obs['z_score'], gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'], fit_funct, gensim_data_obs, gensim_data_obs['name'].split(os.sep)[-1], image_name, result_folder, data_file, filetype) + return False + + else: + print('FAIL: Above RMSD threshold RMSD: MAG sim',np.round(rmsd_mag,3),'REAL',np.round(gensim_data_obs['rmsd_mag']*gensim_data_obs['z_score'],3),'|| LAG',np.round(rmsd_lag,3),'REAL',np.round(gensim_data_obs['rmsd_len']*gensim_data_obs['z_score'],3)) + + return False + #### Plot ############################################################################# @@ -808,8 +1127,522 @@ def check_axis_inversion(ax): is_y_inverted = y_max < y_min return is_x_inverted, is_y_inverted +def plot_data_with_residuals_and_real(rmsd_mag, rmsd_vel, rmsd_len, fit_funct_original, real_original, label_real='', file_name='', output_dir = '', data_original='', label_data='', data_opt_or_desns_original='', label_opt_or_desns=''): + + # copy the data + fit_funct = copy.deepcopy(fit_funct_original) + real = copy.deepcopy(real_original) + data = copy.deepcopy(data_original) + data_opt_or_desns = copy.deepcopy(data_opt_or_desns_original) + + if fit_funct['height'][1] > 1000: + fit_funct['velocities'] = fit_funct['velocities']/1000 + fit_funct['height'] = fit_funct['height']/1000 + + if real['height'][1] > 1000: + real['velocities'] = real['velocities']/1000 + real['height'] = real['height']/1000 + + if data != '': + if data['height'][1] > 1000: + data['velocities'] = data['velocities']/1000 + data['height'] = data['height']/1000 + if data_opt_or_desns != '': + if data_opt_or_desns['height'][1] > 1000: + data_opt_or_desns['velocities'] = data_opt_or_desns['velocities']/1000 + data_opt_or_desns['height'] = data_opt_or_desns['height']/1000 + + def line_and_color_plot(label,color_line1=None): + if label=='Mode': + return '','-','r' + elif label=='Metsim': + return '','-','k' + elif label=='Dens.point': + return '','-','b' + elif label=='Optimized': + return 'x',':', color_line1 + else: + return '','-',None + + # Create the figure and main GridSpec with specified height ratios + fig = plt.figure(figsize=(14, 6)) + gs_main = gridspec.GridSpec(2, 4, figure=fig, height_ratios=[3, 0.5], width_ratios=[1, 1, 1, 1]) + + # Create a sub GridSpec for Plot 0 and Plot 1 with width ratios + gs01 = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=gs_main[0, 0:2], wspace=0, width_ratios=[3, 1]) + + # Plot 0 and 1: Side by side, sharing the y-axis + ax0 = fig.add_subplot(gs01[0]) + ax1 = fig.add_subplot(gs01[1], sharey=ax0) + + # Insert fill_between for magnitude + height_km_err = real['height'] + abs_mag_sim_err = fit_funct['absolute_magnitudes'] + mag_noise = real['rmsd_mag'] + ax0.fill_betweenx(height_km_err, abs_mag_sim_err - mag_noise, abs_mag_sim_err + mag_noise, color='darkgray', alpha=0.2) + ax0.fill_betweenx(height_km_err, abs_mag_sim_err - mag_noise * real_original['z_score'], abs_mag_sim_err + mag_noise * real_original['z_score'], color='lightgray', alpha=0.2) + ax0.plot(real['absolute_magnitudes'], real['height'], 'go') + if data != '': + line1, = ax0.plot(data['absolute_magnitudes'], data['height']) + _, _, color_line1= line_and_color_plot(label_data) + if color_line1!=None: + # set the color of line1 to color_line1 + line1.set_color(color_line1) + # get line1 color + color_line1 = line1.get_color() + if data_opt_or_desns!='': + line2, = 
ax0.plot(data_opt_or_desns['absolute_magnitudes'], data_opt_or_desns['height']) + line_marker2, line_sty2, color_line2 = line_and_color_plot(label_opt_or_desns,color_line1) + if color_line2!=None: + # set the color of line2 to color_line2 + line2.set_color(color_line2) + # set the linestyle of line2 to line_sty2 + line2.set_linestyle(line_sty2) + # set the marker of line2 to line_marker2 + line2.set_marker(line_marker2) + else: + ax0.plot(fit_funct['absolute_magnitudes'], fit_funct['height'], 'k--') + ax0.set_xlabel('Absolute Magnitudes') + # flip the x-axis + ax0.invert_xaxis() + # ax0.tick_params(axis='x', rotation=45) + ax0.set_ylabel('Height (km)') + ax0.grid(True, linestyle='--', color='lightgray') + + ax1.fill_betweenx(height_km_err, -mag_noise, mag_noise, color='darkgray', alpha=0.2) + ax1.fill_betweenx(height_km_err, -mag_noise * real_original['z_score'], mag_noise * real_original['z_score'], color='lightgray', alpha=0.2) + ax1.plot([0, 0], [fit_funct['height'][0], fit_funct['height'][-1]],color='lightgray') + # Plot 1: Height vs. Res.Mag, without y-axis tick labels + if data != '': + # Plot 0: Height vs. Absolute Magnitudes with two lines + ax1.plot(data['res_absolute_magnitudes'], real['height'],'.',color=color_line1) + if data_opt_or_desns!='': + if line_marker2!='': + ax1.plot(data_opt_or_desns['res_absolute_magnitudes'], real['height'],line_marker2,color=color_line2) + else: + ax1.plot(data_opt_or_desns['res_absolute_magnitudes'], real['height'],'.',color=color_line2) + else: + ax1.plot(real['res_absolute_magnitudes'], real['height'], 'g.') + ax1.set_xlabel('Res.Mag') + # flip the x-axis + ax1.invert_xaxis() + # ax1.tick_params(axis='x', rotation=45) + ax1.tick_params(labelleft=False) # Hide y-axis tick labels + ax1.grid(True, linestyle='--', color='lightgray') + + + # Plot 4: Custom legend for Plot 0 with two columns + ax4 = fig.add_subplot(gs_main[1, 0]) + ax4.axis('off') + # mag$\chi^2_{red}$'+str(round(data['chi2_red_mag'],2))+' lag$\chi^2_{red}$'+str(round(data['chi2_red_len'],2))+'\n\ + # mag$\chi^2_{red}$'+str(round(data_opt_or_desns['chi2_red_mag'],2))+' lag$\chi^2_{red}$'+str(round(data_opt_or_desns['chi2_red_len'],2))+'\n\ + # mag$\chi^2_{red}$'+str(round(data['chi2_red_mag'],2))+' lag$\chi^2_{red}$'+str(round(data['chi2_red_len'],2))+'\n\ + if data_opt_or_desns!='': + label_line1= label_data+' mag$_{RMSD}$ '+str(round(data['rmsd_mag'],3))+' lag$_{RMSD}$ '+str(round(data['rmsd_len']*1000,1))+'m\n\ +$m_0$:'+str('{:.2e}'.format(data['mass'],1))+'kg $\\rho$:'+str(round(data['rho']))+'kg/m$^3$\n\ +$\sigma$:'+str(round(data['sigma']*1000000,4))+'kg/MJ $\eta$:'+str(round(data['erosion_coeff']*1000000,3))+'kg/MJ\n\ +$h_e$:'+str(round(data['erosion_height_start'],1))+'km $s$:'+str(round(data['erosion_mass_index'],2))+'\n\ +$m_l$:'+str('{:.2e}'.format(data['erosion_mass_min'],1))+'kg $m_u$:'+str('{:.2e}'.format(data['erosion_mass_max'],1))+'kg' + label_line2 = label_opt_or_desns+' mag$_{RMSD}$ '+str(round(data_opt_or_desns['rmsd_mag'],3))+' lag$_{RMSD}$ '+str(round(data_opt_or_desns['rmsd_len']*1000,1))+'m\n\ +$m_0$:'+str('{:.2e}'.format(data_opt_or_desns['mass'],1))+'kg $\\rho$:'+str(round(data_opt_or_desns['rho']))+'kg/m$^3$\n\ +$\sigma$:'+str(round(data_opt_or_desns['sigma']*1000000,1))+'kg/MJ $\eta$:'+str(round(data_opt_or_desns['erosion_coeff']*1000000,3))+'kg/MJ\n\ +$h_e$:'+str(round(data_opt_or_desns['erosion_height_start'],1))+'km $s$:'+str(round(data_opt_or_desns['erosion_mass_index'],2))+'\n\ +$m_l$:'+str('{:.2e}'.format(data_opt_or_desns['erosion_mass_min'],1))+'kg 
$m_u$:'+str('{:.2e}'.format(data_opt_or_desns['erosion_mass_max'],1))+'kg'
+        ax4.legend([line1, line2], [label_line1, label_line2], loc='center', ncol=2, fontsize=7)
+    elif data!='':
+        label_line1=label_data+' mag$_{RMSD}$ '+str(round(data['rmsd_mag'],3))+' lag$_{RMSD}$ '+str(round(data['rmsd_len']*1000,1))+'m\n\
+$m_0$:'+str('{:.2e}'.format(data['mass'],1))+'kg $\\rho$:'+str(round(data['rho']))+'kg/m$^3$\n\
+$\sigma$:'+str(round(data['sigma']*1000000,4))+'kg/MJ $\eta$:'+str(round(data['erosion_coeff']*1000000,3))+'kg/MJ\n\
+$h_e$:'+str(round(data['erosion_height_start'],1))+'km $s$:'+str(round(data['erosion_mass_index'],2))+'\n\
+$m_l$:'+str('{:.2e}'.format(data['erosion_mass_min'],1))+'kg $m_u$:'+str('{:.2e}'.format(data['erosion_mass_max'],1))+'kg'
+        ax4.legend([line1], [label_line1], loc='center left', ncol=1)
+
+    # Plot 5: Custom legend with green dot, dashed line, and shaded areas
+    ax5 = fig.add_subplot(gs_main[1, 1])
+    ax5.axis('off')
+    ax5.plot([], [], 'go', label=label_real[:15]+'\nmag$_{RMSD}$ '+str(round(real['rmsd_mag'],3))+'\nvel$_{RMSD}$ '+str(round(real['rmsd_vel'],3))+'km/s\nlag$_{RMSD}$ '+str(round(real['rmsd_len']*1000,1))+'m') # Green dot
+    if data == '':
+        ax5.plot([], [], 'k--', label='Fit') # Black dashed line
+    ax5.fill_between([], [], [], color='darkgray', alpha=0.2, label='1$\sigma$')
+    ax5.fill_between([], [], [], color='lightgray', alpha=0.2, label='2$\sigma$')
+    ax5.legend(loc='right', fontsize=8) # upper right
+
+
+    # Plot 2 and 6: Vertically stacked, sharing the x-axis (Time) with height ratios
+    gs_col2 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=gs_main[:, 2], hspace=0, height_ratios=[3, 1])
+    ax2 = fig.add_subplot(gs_col2[0, 0])
+    ax6 = fig.add_subplot(gs_col2[1, 0], sharex=ax2)
+
+
+    # Remaining subplots with fill_between
+    residual_time_pos = real['time']
+    vel_kms_err = fit_funct['velocities']
+    vel_noise = real['rmsd_vel']
+    ax2.fill_between(residual_time_pos, vel_kms_err - vel_noise, vel_kms_err + vel_noise, color='darkgray', alpha=0.2)
+    ax2.fill_between(residual_time_pos, vel_kms_err - vel_noise * real_original['z_score'], vel_kms_err + vel_noise * real_original['z_score'], color='lightgray', alpha=0.2)
+    # Plot 2: Velocity vs. Time, without x-axis tick labels
+    ax2.plot(real['time'], real['velocities'], 'go')
+    if data != '':
+        ax2.plot(data['time'], data['velocities'], color=color_line1)
+        if data_opt_or_desns!='':
+            ax2.plot(data_opt_or_desns['time'], data_opt_or_desns['velocities'], line_marker2+line_sty2, color=color_line2)
+    else:
+        ax2.plot(fit_funct['time'], fit_funct['velocities'], 'k--')
+    ax2.set_ylabel('Velocity [km/s]')
+    ax2.tick_params(labelbottom=False) # Hide x-axis tick labels
+    ax2.grid(True, linestyle='--', color='lightgray')
+
+    # Plot 6: Res.Vel vs. Time
+    ax6.fill_between(residual_time_pos, -vel_noise, vel_noise, color='darkgray', alpha=0.2)
+    ax6.fill_between(residual_time_pos, -vel_noise * real_original['z_score'], vel_noise * real_original['z_score'], color='lightgray', alpha=0.2)
+    ax6.plot([fit_funct['time'][0], fit_funct['time'][-1]], [0, 0], color='lightgray')
+    if data != '':
+        ax6.plot(real['time'], data['res_velocities'], '.', color=color_line1)
+        if data_opt_or_desns!='':
+            if line_marker2!='':
+                ax6.plot(real['time'], data_opt_or_desns['res_velocities'], line_marker2, color=color_line2)
+            else:
+                ax6.plot(real['time'], data_opt_or_desns['res_velocities'], '.', color=color_line2)
+    else:
+        ax6.plot(real['time'], real['res_velocities'], 'g.')
+    ax6.set_xlabel('Time [s]')
+    ax6.set_ylabel('Res.Vel [km/s]')
+    ax6.grid(True, linestyle='--', color='lightgray')
+
+    # Plot 3 and 7: Vertically stacked, sharing the x-axis (Time) with height ratios
+    gs_col3 = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec=gs_main[:, 3], hspace=0, height_ratios=[3, 1])
+    ax3 = fig.add_subplot(gs_col3[0, 0])
+    ax7 = fig.add_subplot(gs_col3[1, 0], sharex=ax3)
+
+    lag_km_err = fit_funct['lag']
+    lag_noise = real['rmsd_len'] * 1000
+    ax3.fill_between(residual_time_pos, lag_km_err - lag_noise, lag_km_err + lag_noise, color='darkgray', alpha=0.2)
+    ax3.fill_between(residual_time_pos, lag_km_err - lag_noise * real_original['z_score'], lag_km_err + lag_noise * real_original['z_score'], color='lightgray', alpha=0.2)
+    # Plot 3: Lag vs. Time, without x-axis tick labels
+    ax3.plot(real['time'], real['lag'], 'go')
+    if data != '':
+        ax3.plot(data['time'], data['lag'], color=color_line1)
+        if data_opt_or_desns!='':
+            ax3.plot(data_opt_or_desns['time'], data_opt_or_desns['lag'], line_marker2+line_sty2, color=color_line2)
+    else:
+        ax3.plot(fit_funct['time'], fit_funct['lag'], 'k--')
+    ax3.set_ylabel('Lag [m]')
+    ax3.tick_params(labelbottom=False) # Hide x-axis tick labels
+    ax3.grid(True, linestyle='--', color='lightgray')
+
+    # Plot 7: Res.Lag vs. Time
+    ax7.fill_between(residual_time_pos, -lag_noise, lag_noise, color='darkgray', alpha=0.2)
+    ax7.fill_between(residual_time_pos, -lag_noise * real_original['z_score'], lag_noise * real_original['z_score'], color='lightgray', alpha=0.2)
+    ax7.plot([fit_funct['time'][0], fit_funct['time'][-1]], [0, 0], color='lightgray')
+    if data != '':
+        ax7.plot(real['time'], data['res_lag'], '.', color=color_line1)
+        if data_opt_or_desns!='':
+            if line_marker2!='':
+                ax7.plot(real['time'], data_opt_or_desns['res_lag'], line_marker2, color=color_line2)
+            else:
+                ax7.plot(real['time'], data_opt_or_desns['res_lag'], '.', color=color_line2)
+    else:
+        ax7.plot(real['time'], real['res_lag'], 'g.')
+    ax7.set_xlabel('Time [s]')
+    ax7.set_ylabel('Res.Lag [m]')
+    ax7.grid(True, linestyle='--', color='lightgray')
+
+    # Adjust the overall layout to prevent overlap
+    plt.subplots_adjust(wspace=0.4, hspace=0.4)
+
+    select_data=''
+    if data!='':
+        # select the fit if its reduced chi-squared values lie within [0.5, 1.5]
+        if data['chi2_red_mag'] >= 0.5 and data['chi2_red_mag'] <= 1.5 and data['chi2_red_len'] >= 0.5 and data['chi2_red_len'] <= 1.5:
+            select_data=label_data+' SELECTED'
+        else:
+            select_data=label_data+' NOT SELECTED'
+    if data_opt_or_desns !='':
+        if data_opt_or_desns['chi2_red_mag'] >= 0.5 and data_opt_or_desns['chi2_red_mag'] <= 1.5 and data_opt_or_desns['chi2_red_len'] >= 0.5 and data_opt_or_desns['chi2_red_len'] <= 1.5:
+            select_data=select_data+' '+label_opt_or_desns+' SELECTED'
+        else:
+            select_data=select_data+' '+label_opt_or_desns+' NOT SELECTED'
+
+    file_name_title=file_name
+    # if the file_name has an extension, strip it for the title
+    if '.pickle' in file_name:
+        file_name_title=file_name[:15]
+    elif '.json' in file_name:
+        # find in which position is '.json'
+        pos=file_name.find('.json')
+        # delete the '.json' from the file_name and all the characters after it
+        file_name_title=file_name[:pos]
+    elif '.png' in file_name:
+        file_name_title=file_name[:-4]
+    fig.suptitle(file_name_title+' '+select_data)
+
+    # Save the plot
+    print('file saved: '+output_dir +os.sep+ file_name)
+    fig.savefig(output_dir +os.sep+ file_name, dpi=300)
+
+    # Close the figure after saving
+    plt.close(fig)
+
+
+
+
+
+
+def sigma_waterfallPLOT(df_result, df_sim_range, realRMSD_mag, realRMSD_lag, output_directory, name_file,
+                        sigma_values=[2, 1.9, 1.8, 1.7, 1.6, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0]):
+    df = df_result.copy()
+    sim_range_plot = df_sim_range.copy()
+    # take the first row as df_obs_real
+    # df_obs_real = df_result.iloc[0]
+    # find the index of the row where df_result['type'] == 'MetSim' or 'Real'
+    if 'MetSim' in df_result['type'].values:
+        # find the index of the rows where df_result['type'] == 'MetSim'
+        idx = df_result.index[df_result['type'] == 'MetSim']
+        df_obs_real = df_result.iloc[idx].copy()
+    elif 'Real' in df_result['type'].values:
+        # find the index of the rows where df_result['type'] == 'Real'
+        idx = df_result.index[df_result['type'] == 'Real']
+        df_obs_real = df_result.iloc[idx].copy()
+    else:
+        # fall back to the first row
+        df_obs_real = df_result.iloc[0]
+
+
+    # Columns to plot
+    to_plot = [
+        'mass',
+        'rho',
+        'sigma',
+        'erosion_height_start',
+        'erosion_coeff',
+        'erosion_mass_index',
+        'erosion_mass_min',
+        'erosion_mass_max',
+        'erosion_range',
+        'erosion_energy_per_unit_cross_section',
+        'erosion_energy_per_unit_mass'
+    ]
+
+    # Corresponding units/labels
+    to_plot_unit = [
+        r'$m_0$ [kg]',
+        r'$\rho$ [kg/m$^3$]',
+        r'$\sigma$ [kg/MJ]',
+        r'$h_{e}$ [km]',
+        r'$\eta$ [kg/MJ]',
+        r'$s$',
+        r'log($m_{l}$)',
+        r'log($m_{u}$)',
+        r'log($m_{u}$)-log($m_{l}$)',
+        r'$E_{S}$ [MJ/m$^2$]',
+        r'$E_{V}$ [MJ/kg]'
+    ]
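+    # Unit note: sigma and erosion_coeff are stored in SI units (kg/J, equivalently
+    # s^2/m^2), so the *1e6 factor below converts them to kg/MJ (s^2/km^2); the
+    # erosion energies are divided by 1e6 to go from J to MJ, and the grain mass
+    # limits are log10-scaled, matching the axis labels defined above.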
+
+    # convert the ablation and erosion coefficients from kg/J to kg/MJ
+    df['erosion_coeff'] = df['erosion_coeff'] * 1000000
+    df['sigma'] = df['sigma'] * 1000000
+    df['erosion_energy_per_unit_cross_section'] = df['erosion_energy_per_unit_cross_section'] / 1000000
+    df['erosion_energy_per_unit_mass'] = df['erosion_energy_per_unit_mass'] / 1000000
+    df['erosion_mass_min'] = np.log10(df['erosion_mass_min'])
+    df['erosion_mass_max'] = np.log10(df['erosion_mass_max'])
+
+    sim_range_plot['erosion_coeff'] = sim_range_plot['erosion_coeff'] * 1000000
+    sim_range_plot['sigma'] = sim_range_plot['sigma'] * 1000000
+    sim_range_plot['erosion_energy_per_unit_cross_section'] = sim_range_plot['erosion_energy_per_unit_cross_section'] / 1000000
+    sim_range_plot['erosion_energy_per_unit_mass'] = sim_range_plot['erosion_energy_per_unit_mass'] / 1000000
+    sim_range_plot['erosion_mass_min'] = np.log10(sim_range_plot['erosion_mass_min'])
+    sim_range_plot['erosion_mass_max'] = np.log10(sim_range_plot['erosion_mass_max'])
+
+    # apply the same unit conversions to the reference (MetSim/Real) row
+    df_obs_real['erosion_coeff'] = df_obs_real['erosion_coeff'] * 1000000
+    df_obs_real['sigma'] = df_obs_real['sigma'] * 1000000
+    df_obs_real['erosion_energy_per_unit_cross_section'] = df_obs_real['erosion_energy_per_unit_cross_section'] / 1000000
+    df_obs_real['erosion_energy_per_unit_mass'] = df_obs_real['erosion_energy_per_unit_mass'] / 1000000
+    df_obs_real['erosion_mass_min'] = np.log10(df_obs_real['erosion_mass_min'])
+    df_obs_real['erosion_mass_max'] = np.log10(df_obs_real['erosion_mass_max'])
+
+    df_limits = sim_range_plot.copy()
+
+    used_sigmas = sigma_values
+
+    fig, axs = plt.subplots(3, 4, figsize=(15, 10))
+    axes = axs.flatten()  # Flatten axes for easier iteration
+
+    sc = None  # For scatter plot reference (for the colorbar)
+
+    data_for_table = []
+    lendata_sigma = []
+    # Plot data for each sigma on the same set of subplots
+    for i, s in enumerate(used_sigmas):
+        # Filter the dataframe based on sigma threshold
+        filtered_df = df[
+            (df['rmsd_mag'] < s * realRMSD_mag) &
+            (df['rmsd_len'] < s * realRMSD_lag)
+        ]
+
+        # lendata_sigma.append(f'$({len(filtered_df)})~{s}\\sigma$')
+        lendata_sigma.append(f'${s}~$RMSD$~-~{len(filtered_df)}$')
+
+        # Format RMSD with one decimal place, even for whole numbers
+        data_for_table.append([f"{s:.1f}", f"{len(filtered_df)}"])
+
+        # Choose a distinct alpha or marker for each sigma to differentiate them
+        # (Optional: You could also use different markers or colors per sigma.)
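+        # KDE note: below, gaussian_kde(x)(x) evaluates a Gaussian kernel density
+        # estimate of the filtered parameter values at the sample points themselves;
+        # x[np.argmax(density)] is then the densest sample, taken as the "mode"
+        # (red dot), while the arithmetic mean is drawn as a blue square. The KDE
+        # needs several distinct points, hence the len(x) > 3 guard below.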
+        alpha_val = max(0.2, 1 - (i*0.07))  # Decrease alpha with each sigma
+        # Plot each variable in its corresponding subplot
+        for ax_index, var in enumerate(to_plot):
+            ax = axes[ax_index]
+
+            # if 'MetSim'==df_obs_real['type'].iloc[0]:
+            #     # make a black vertical line at the real value
+            #     ax.axvline(df_obs_real[var].iloc[0], color='black', linewidth=2)
+            # el
+            if 'Real'==df_obs_real['type'].iloc[0]:
+                # make a black vertical line at the real value
+                ax.axvline(df_obs_real[var].iloc[0], color='black', linewidth=2)
+
+            data = filtered_df[var].dropna()
+            if data.empty:
+                # No data after filtering, just continue
+                continue
+            else:
+                # make an array of the current sigma value, one entry per data point
+                y = np.ones(len(data)) * s
+                # Compute density along the variable's values
+                x = data.values
+
+                if len(x) > 3:
+                    density = gaussian_kde(x)(x)
+                    # Normalize density to [0, 1]
+                    density = (density - density.min()) / (density.max() - density.min())
+
+                    sc = ax.scatter(x, y, c=density, cmap='viridis', vmin=0, vmax=1, s=20, edgecolor='none')  # , alpha=alpha_val
+
+                    # Find the densest point (highest density)
+                    densest_index = np.argmax(density)
+                    densest_point = x[densest_index]
+
+                    # put a blue square at the mean value
+                    ax.plot(np.mean(x), s, 'bs', markersize=5)
+                    # You can now use densest_point as your "mode" or representative value
+                    ax.plot(densest_point, s, 'ro', markersize=5)
+
+                else:
+                    # If there are too few points for a KDE, set density to mid-range
+                    density = np.ones(len(data)) * 0.5
+
+                    sc = ax.scatter(x, y, c=density, cmap='viridis', vmin=0, vmax=1, s=20, edgecolor='none')  # , alpha=alpha_val
+
+                    densest_point = np.mean(x)
+
+
+
+
+    # Set titles and labels
+    for ax_index, var in enumerate(to_plot):
+        ax = axes[ax_index]
+        # ax.set_title(var, fontsize=10)
+        ax.set_xlabel(to_plot_unit[ax_index], fontsize=9)
+        # set the x axis range from the smallest to the largest value in the simulation range
+        ax.set_xlim([df_limits[var].min(), df_limits[var].max()])
+        # tilt ticks 45 degrees
+        # ax.tick_params(axis='x', rotation=45)
+        ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
+        # ax.set_ylabel('$\sigma$', fontsize=9)
+        ax.set_ylabel('RMSD', fontsize=9)
+        # # set ticks along the y axis as lendata
+        # ax.set_yticks(sigma_values)
+        # ax.set_yticklabels(lendata_sigma)
+        # put the -- in the grids
+        ax.grid(True, linestyle='--', color='lightgray')
+        # set the y axis
+        ax.set_ylim([np.min(sigma_values)-np.min(sigma_values)/10, np.max(sigma_values)+np.min(sigma_values)/10])
+
+    # The last subplot (axes[11]) is used for the legend only
+    axes[11].axis('off')
+
+    # Create the table
+    table = axes[11].table(
+        cellText=data_for_table,
+        colLabels=["RMSD", "Count"],
+        loc='center left',
+        bbox=[-0.1, 0.0, 0.35, 1.0]  # Adjust these values as needed
+    )
+
+    # Adjust table formatting
+    table.auto_set_font_size(False)
+    table.set_fontsize(8)
+    table.auto_set_column_width(col=list(range(2)))  # Auto-adjust column widths
+
+    # Align text in cells (optional)
+    for (row, col), cell in table.get_celld().items():
+        # Make header bold and aligned center
+        if row == 0:
+            cell.set_text_props(ha='center', va='center', fontweight='bold')
+        else:
+            # center the text in all data cells (RMSD and Count columns alike)
+            cell.set_text_props(ha='center', va='center')
+
+    # Create custom legend entries
+    import matplotlib.patches as mpatches
+    from matplotlib.lines import Line2D
+
+    mode_line = Line2D([0], [0], color='red', label='Mode', marker='o', linestyle='None')
+    mean_line = Line2D([0], [0], color='blue', label='Mean', marker='s', linestyle='None')
+    # if 'MetSim' in df_obs_real['type'].values:
+    # if 'MetSim' == df_obs_real['type'].iloc[0]:
+    #     metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Metsim')
+    #     legend_elements = [metsim_line, mean_line, mode_line]
+    # el
+    if 'Real' == df_obs_real['type'].iloc[0]:
+        metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Real')
+        legend_elements = [metsim_line, mean_line, mode_line]
+    else:
+        # # put the len of x in the legend followed by the sigma value
+        # sigma_values = Line2D([], [], color='none', marker='', linestyle='None', label=lendata_sigma)
+        legend_elements = [mean_line, mode_line]
+
+    axes[11].legend(handles=legend_elements, loc='upper center')  # , fontsize=8
+
+    # Adjust layout and add a single colorbar to the figure
+    fig.subplots_adjust(right=0.85)
+    cbar_ax = fig.add_axes([0.9, 0.15, 0.02, 0.7])
+    cbar = plt.colorbar(sc, cax=cbar_ax, label='Density (normalized)')
+
+    plt.tight_layout(rect=[0, 0, 0.9, 1])
+
+    # Save the figure instead of showing it
+    if not os.path.exists(output_directory):
+        os.makedirs(output_directory)
+    plt.savefig(os.path.join(output_directory, name_file + '_waterfall_sigma'+str(np.max(sigma_values))+'max'+str(np.min(sigma_values))+'min.png'), dpi=300)
+    plt.close(fig)
+
+
+    # if len(df_sim_shower_NEW_inter) > 0:
+    #     iter_patch = mpatches.Patch(color='limegreen', label='Iterative', alpha=0.5, edgecolor='black')
+    # if 'MetSim' in curr_df_sim_sel['type'].values:
+    #     metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Metsim Solution')
+    # else:
+    #     metsim_line = Line2D([0], [0], color='green', linestyle='--', linewidth=2, label='Real Solution')
+
+
+
+
-def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals_mag='', residuals_vel='', residual_time_pos='', residual_height_pos='', fit_funct='', mag_noise='', vel_noise='', label_fit=''):
-
+def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals_mag='', residuals_vel='', residual_time_pos='', residual_height_pos='', residuals_lag='', fit_funct='', mag_noise='', vel_noise='',lag_noise='', sim_lag='', sim_time=''):
 
     # check if data1 is None
     if data1 is None:
@@ -833,6 +1666,12 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals
 
     # Plot the simulation results
     if residuals_mag != '' and residuals_vel != '' and residual_time_pos!='' and residual_height_pos!='':
+        residual_time_pos_err=residual_time_pos
+        if len(residual_time_pos) != len(obs1['velocities']):
+            # interpolate from residual_time_pos[0] to residual_time_pos[-1] with len(obs1['velocities'])
+            residual_time_pos = obs1['time']  # np.linspace(residual_time_pos[0], residual_time_pos[-1], len(obs1['velocities']))
+
+
         if fig=='' and ax=='':
             fig, ax = plt.subplots(2, 3, figsize=(14, 6),gridspec_kw={'height_ratios': [ 3, 1],'width_ratios': [ 3, 0.5, 3]})  # figsize=(10, 5), dpi=300 0.5, 3, 3, 0.5
             # flat the ax
@@ -844,6 +1683,10 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.',
label1='', residuals abs_mag_sim_err=np.array(fit_funct['absolute_magnitudes']) height_km_err=np.array(fit_funct['height']) vel_kms_err=np.array(fit_funct['velocities']) + len_km_err=np.array(fit_funct['length']) + lag_km_err=np.array(fit_funct['lag']) + #lag_kms_err=len_km_err - (obs1['velocities'][0]/1000*obs_time_err) + #_err=lag_kms_err - lag_kms_err[0] # from list to array if np.mean(fit_funct['height'])>1000: # convert to km/s @@ -851,19 +1694,46 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals vel_kms_err=np.array(fit_funct['velocities'])/1000 # plot noisy area around vel_kms for vel_noise for the fix height_km - ax[0].fill_betweenx(height_km_err, abs_mag_sim_err-mag_noise, abs_mag_sim_err+mag_noise, color='lightgray', alpha=0.5) + ax[0].fill_betweenx(height_km_err, abs_mag_sim_err-mag_noise, abs_mag_sim_err+mag_noise, color='darkgray', alpha=0.2) + ax[0].fill_betweenx(height_km_err, abs_mag_sim_err-mag_noise*1.96, abs_mag_sim_err+mag_noise*1.96, color='lightgray', alpha=0.2) + ax[0].plot(abs_mag_sim_err,height_km_err, 'k--') # plot noisy area around vel_kms for vel_noise for the fix height_km - ax[2].fill_between(obs_time_err, vel_kms_err-vel_noise, vel_kms_err+vel_noise, color='lightgray', alpha=0.5, label=label_fit) + ax[1].fill_betweenx(height_km_err, -mag_noise, mag_noise, color='darkgray', alpha=0.2) + ax[1].fill_betweenx(height_km_err, -mag_noise*1.96, mag_noise*1.96, color='lightgray', alpha=0.2) - # plot noisy area around vel_kms for vel_noise for the fix height_km - ax[1].fill_betweenx(height_km_err, -mag_noise, mag_noise, color='lightgray', alpha=0.5) + if lag_noise != '': + lag_noise = lag_noise * 1000 - # plot noisy area around vel_kms for vel_noise for the fix height_km - ax[5].fill_between(obs_time_err, -vel_noise, vel_noise, color='lightgray', alpha=0.5) + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[2].fill_between(residual_time_pos, vel_kms_err-vel_noise, vel_kms_err+vel_noise, color='darkgray', alpha=0.2) + ax[2].fill_between(residual_time_pos, vel_kms_err-vel_noise*1.96, vel_kms_err+vel_noise*1.96, color='lightgray', alpha=0.2) + ax[2].plot(residual_time_pos, vel_kms_err, 'k--') + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[3].fill_between(residual_time_pos, lag_km_err-lag_noise, lag_km_err+lag_noise, color='darkgray', alpha=0.2, label='1$\sigma$') + ax[3].fill_between(residual_time_pos, lag_km_err-lag_noise*1.96, lag_km_err+lag_noise*1.96, color='lightgray', alpha=0.2, label='2$\sigma$') + ax[3].plot(residual_time_pos, lag_km_err, 'k--', label='Fit') + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[6].fill_between(residual_time_pos, -vel_noise, vel_noise, color='darkgray', alpha=0.2) + ax[6].fill_between(residual_time_pos, -vel_noise*1.96, vel_noise*1.96, color='lightgray', alpha=0.2) + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[7].fill_between(residual_time_pos, -lag_noise, lag_noise, color='darkgray', alpha=0.2) + ax[7].fill_between(residual_time_pos, -lag_noise*1.96, lag_noise*1.96, color='lightgray', alpha=0.2) + + else: + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[2].fill_between(residual_time_pos, vel_kms_err-vel_noise, vel_kms_err+vel_noise, color='darkgray', alpha=0.2, label='1$\sigma$') + ax[2].fill_between(residual_time_pos, vel_kms_err-vel_noise*1.96, vel_kms_err+vel_noise*1.96, color='lightgray', alpha=0.2, label='2$\sigma$') + ax[2].plot(residual_time_pos, 
vel_kms_err, 'k--', label='Fit') + + # plot noisy area around vel_kms for vel_noise for the fix height_km + ax[5].fill_between(residual_time_pos, -vel_noise, vel_noise, color='lightgray', alpha=0.5) ax[0].plot(obs1['absolute_magnitudes'],obs1['height'], colorline1) - ax[0].set_xlabel('Absolute Magnitude [-]') + ax[0].set_xlabel('Absolute Magnitude') ax[0].set_ylabel('Height [km]') # grid on on both subplot with -- as linestyle and light gray color ax[0].grid(True) @@ -882,9 +1752,10 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals # ax[0].plot(obs1['absolute_magnitudes'],obs1['height'], colorline1, color='m') # plot the residuals against time - ax[1].plot(residuals_mag, residual_height_pos, '.', color=line_color) + if fit_funct=='' and mag_noise=='' and vel_noise=='': + ax[1].plot(residuals_mag, residual_height_pos, '.', color=line_color) # ax[1].set_ylabel('Height [km]') - ax[1].set_xlabel('Res.mag [-]') + ax[1].set_xlabel('Res.mag') ax[1].tick_params(axis='x', rotation=45) # flip the y-axis @@ -901,74 +1772,111 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals ax[1].grid(linestyle='--',color='lightgray') ax[1].set_ylim(ax[0].get_ylim()) - if label1!='': - ax[2].plot(obs1['time'], obs1['velocities'], colorline1, color=line_color, label=label1) - else: - ax[2].plot(obs1['time'], obs1['velocities'], colorline1, color=line_color) - # show the legend - if label1 != '': - ax[2].legend() - ax[2].set_xlabel('Time [s]') - ax[2].set_ylabel('Velocity [km/s]') - ax[2].grid(True) - ax[2].grid(linestyle='--',color='lightgray') + if residuals_lag!='': + if sim_time!='': + ax[2].plot(sim_time, obs1['velocities'], colorline1, color=line_color) + else: + ax[2].plot(residual_time_pos, obs1['velocities'], colorline1, color=line_color) - # delete the plot in the middle - ax[3].axis('off') - - # # put as the super title the name - # plt.suptitle(name) - ax[4].axis('off') + ax[2].set_xlabel('Time [s]') + ax[2].set_ylabel('Velocity [km/s]') + ax[2].grid(True) + ax[2].grid(linestyle='--',color='lightgray') - # plot the residuals against time - ax[5].plot(residual_time_pos, residuals_vel, '.', color=line_color) - ax[5].set_ylabel('Res.vel [km/s]') - ax[5].grid(True) - ax[5].grid(linestyle='--',color='lightgray') - # use the same limits of ax[3] - ax[5].set_xlim(ax[2].get_xlim()) - - - # # plot the distribution of the residuals along the y axis - # ax[5].hist(residuals_mag, bins=20, color=line_color, alpha=0.5) - # ax[5].set_ylabel('N.data') - # ax[5].set_xlabel('Res.mag [-]') - # is_x_inverted, _ =check_axis_inversion(ax[5]) - # if is_x_inverted==False: - # ax[5].invert_xaxis() - # ax[5].grid(True) - # ax[5].grid(linestyle='--',color='lightgray') - - # # plot the residuals against time - # ax[6].plot(residual_time_pos, residuals_vel, '.', color=line_color) - # # ax[6].set_xlabel('Time [s]') - # ax[6].set_xticks([]) - # ax[6].set_ylabel('Res.vel [km/s]') - # ax[6].invert_yaxis() - # # ax[3].title(f'Absolute Magnitude Residuals') - # # ax[3].legend() - # ax[6].grid(True) - # ax[6].grid(linestyle='--',color='lightgray') - - # # plot the distribution of the residuals along the y axis - # ax[7].hist(residuals_vel, bins=20, color=line_color, alpha=0.5, orientation='horizontal') - # ax[7].set_xlabel('N.data') - # # invert the y axis - # ax[7].invert_yaxis() - # ax[7].set_ylabel('Res.vel [km/s]') - # # delete the the the line at the top ad the right - # ax[7].spines['top'].set_visible(False) - # ax[7].spines['right'].set_visible(False) - # # do not show 
the y ticks - # # ax[7].set_yticks([]) - # # # show the zero line - # # ax[7].axhline(0, color='k', linewidth=0.5) - # # grid on - # ax[7].grid(True) - # # grid on - # ax[7].grid(linestyle='--',color='lightgray') + if label1!='': + if sim_lag!='': + if sim_time!='': + ax[3].plot(sim_time, sim_lag*1000, colorline1, color=line_color, label=label1) + else: + ax[3].plot(residual_time_pos, sim_lag*1000, colorline1, color=line_color, label=label1) + else: + if sim_time!='': + ax[3].plot(sim_time, obs1['lag'], colorline1, color=line_color, label=label1) + else: + ax[3].plot(residual_time_pos, obs1['lag'], colorline1, color=line_color, label=label1) + else: + if sim_lag!='': + if sim_time!='': + ax[3].plot(sim_time, sim_lag*1000, colorline1, color=line_color) + else: + ax[3].plot(residual_time_pos, sim_lag*1000, colorline1, color=line_color) + else: + if sim_time!='': + ax[3].plot(sim_time, obs1['lag'], colorline1, color=line_color) + else: + ax[3].plot(residual_time_pos, obs1['lag'], colorline1, color=line_color) + + # show the legend + if label1 != '': + ax[3].legend() + + ax[3].set_xlabel('Time [s]') + ax[3].set_ylabel('Lag [m]') + ax[3].grid(True) + ax[3].grid(linestyle='--',color='lightgray') + + # delete the plot in the middle + ax[4].axis('off') + + # # put as the super title the name + # plt.suptitle(name) + ax[5].axis('off') + + # plot the residuals against time + if fit_funct=='' and mag_noise=='' and vel_noise=='': + ax[6].plot(residual_time_pos_err, residuals_vel, '.', color=line_color) + ax[6].set_ylabel('Res.vel [km/s]') + ax[6].grid(True) + ax[6].grid(linestyle='--',color='lightgray') + # use the same limits of ax[3] + ax[6].set_xlim(ax[2].get_xlim()) + + # plot the residuals against time + if fit_funct=='' and mag_noise=='' and vel_noise=='': + ax[7].plot(residual_time_pos_err, residuals_lag*1000, '.', color=line_color) + ax[7].set_ylabel('Res.lag [m]') + ax[7].grid(True) + ax[7].grid(linestyle='--',color='lightgray') + # use the same limits of ax[3] + ax[7].set_xlim(ax[3].get_xlim()) + + else: + if label1!='': + if sim_time!='': + ax[2].plot(sim_time, obs1['velocities'], colorline1, color=line_color, label=label1) + else: + ax[2].plot(residual_time_pos, obs1['velocities'], colorline1, color=line_color, label=label1) + else: + if sim_time!='': + ax[2].plot(sim_time, obs1['velocities'], colorline1, color=line_color) + else: + ax[2].plot(residual_time_pos, obs1['velocities'], colorline1, color=line_color) + # show the legend + if label1 != '': + ax[2].legend() + + ax[2].set_xlabel('Time [s]') + ax[2].set_ylabel('Velocity [km/s]') + ax[2].grid(True) + ax[2].grid(linestyle='--',color='lightgray') + + # delete the plot in the middle + ax[3].axis('off') + + # # put as the super title the name + # plt.suptitle(name) + ax[4].axis('off') + + # plot the residuals against time + if fit_funct=='' and mag_noise=='' and vel_noise=='': + ax[5].plot(residual_time_pos_err, residuals_vel, '.', color=line_color) + ax[5].set_ylabel('Res.vel [km/s]') + ax[5].grid(True) + ax[5].grid(linestyle='--',color='lightgray') + # use the same limits of ax[3] + ax[5].set_xlim(ax[2].get_xlim()) else : @@ -981,7 +1889,7 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals # plot the magnitude curve with height ax[0].plot(obs1['absolute_magnitudes'],obs1['height'], colorline1) - ax[0].set_xlabel('Absolute Magnitude [-]') + ax[0].set_xlabel('Absolute Magnitude') ax[0].set_ylabel('Height [km]') # check if the axis is inverted is_x_inverted, _ =check_axis_inversion(ax[0]) @@ -1012,150 
+1920,510 @@ def plot_side_by_side(data1, fig='', ax='', colorline1='.', label1='', residuals plt.tight_layout() +def plot_histogram_PCA_dist(df_sim_selected_all, save_folder, dist, maxcutdist=0): + -#### Reader ############################################################################# + plt.figure(figsize=(10, 5)) + if maxcutdist != 0: + plt.hist(df_sim_selected_all[df_sim_selected_all['distance_meteor'] < maxcutdist]['distance_meteor'], bins=100, alpha=0.5, color='b', cumulative=True) # , color='b', edgecolor='black', linewidth=1.2 + + # put a vertical line at the dist + plt.axvline(x=dist, color='blue', linestyle='--', label='Real event distance') + plt.xlabel('PC distance') + plt.ylabel('Cumulative Count') + plt.savefig(save_folder + os.sep + 'HistogramsCUMUL_'+ str(np.round(dist,3)) + 'PCdist.png', dpi=300) + plt.close() -def read_GenerateSimulations_output_to_PCA(file_path, name=''): - if name!='': - print(name) - gensim_data = read_GenerateSimulations_output(file_path) - if gensim_data is None: - return None - else: - pd_datfram_PCA = array_to_pd_dataframe_PCA(gensim_data) - return pd_datfram_PCA +def plot_gray_dist(pd_datafram_PCA_selected, mindist, maxdist, distance_metric, df_obs_shower, output_dir, fit_funct, gensim_data_obs='', mag_noise_real=0.1, len_noise_real=20.0, fps=32, file_name_obs='', trajectory_Metsim_file=''): + # Number of observations and selections to plot + n_confront_obs = 1 -def read_GenerateSimulations_output(file_path, real_event=''): + # Flags for additional fits (set to False as default) + with_noise = True - f = open(file_path,"r") - data = json.loads(f.read()) + # Convert length noise to km and calculate velocity noise + lag_noise = len_noise_real + len_noise = len_noise_real / 1000 + vel_noise = (len_noise * np.sqrt(2) / (1 / fps)) - # show processed event - print(file_path) + # Increase figure size to provide more space for the table + fig = plt.figure(figsize=(10, 10)) + # Adjust width_ratios to allocate more space to the table + gs = GridSpec(2, 2) # Allocated equal space to the table , width_ratios=[1, 1, 1] - if data['ht_sampled']!= None: + # Create axes for the two plots + ax0 = fig.add_subplot(gs[0, 0]) + ax1 = fig.add_subplot(gs[0, 1]) + ax2 = fig.add_subplot(gs[1, 0]) + ax3 = fig.add_subplot(gs[1, 1]) - vel_sim=data['simulation_results']['leading_frag_vel_arr'][:-1]#['brightest_vel_arr']#['leading_frag_vel_arr']#['main_vel_arr'] - ht_sim=data['simulation_results']['leading_frag_height_arr'][:-1]#['brightest_height_arr']['leading_frag_height_arr']['main_height_arr'] - time_sim=data['simulation_results']['time_arr'][:-1]#['main_time_arr'] - abs_mag_sim=data['simulation_results']['abs_magnitude'][:-1] - len_sim=data['simulation_results']['brightest_length_arr'][:-1]#['brightest_length_arr'] - Dynamic_pressure= data['simulation_results']['leading_frag_dyn_press_arr'][:-1] - - # ht_obs=data['ht_sampled'] - # try: - # index_ht_sim=next(x for x, val in enumerate(ht_sim) if val <= ht_obs[0]) - # except StopIteration: - # # index_ht_sim = None - # print('The first element of the observation is not in the simulation') - # return None - - # try: - # index_ht_sim_end=next(x for x, val in enumerate(ht_sim) if val <= ht_obs[-1]) - # except StopIteration: - # # index_ht_sim_end = None - # print('The last element of the observation is not in the simulation') - # return None - - if real_event!= '': - mag_obs=real_event['absolute_magnitudes'] + df_sel_shower = pd_datafram_PCA_selected.copy() + + df_sel_shower = df_sel_shower[df_sel_shower[distance_metric] < 
maxdist]
+
+    # order the df_sel_shower from the highest distance_metric to the lowest
+    df_sel_shower = df_sel_shower.sort_values(by=distance_metric, ascending=False)
+
+    # Adjust units for erosion coefficients
+    df_sel_shower['erosion_coeff'] = df_sel_shower['erosion_coeff'] * 1e6
+    df_sel_shower['sigma'] = df_sel_shower['sigma'] * 1e6
+
+    # Limit observations and selections if necessary
+    if n_confront_obs < len(df_obs_shower):
+        df_obs_shower = df_obs_shower.head(n_confront_obs)
+
+    # Concatenate observation and selection DataFrames
+    curr_sel = pd.concat([df_obs_shower, df_sel_shower], axis=0).reset_index(drop=True)
+
+    # Loop over the observations and selected simulations
+    for ii in range(len(curr_sel)):
+        namefile_sel = curr_sel.iloc[ii]['solution_id']
+        Metsim_flag = False
+        print('real', trajectory_Metsim_file, '- sel', namefile_sel)
+
+        # Check if the file exists
+        if not os.path.isfile(namefile_sel):
+            print('file ' + namefile_sel + ' not found')
+            continue
+        else:
+            # Read the appropriate data file
+            if namefile_sel.endswith('.pickle'):
+                data_file = read_pickle_reduction_file(namefile_sel)
+                data_file_real = data_file.copy()
+
+            elif namefile_sel.endswith('.json'):
+                with open(namefile_sel, "r") as f:
+                    data = json.loads(f.read())
+                if 'ht_sampled' in data:
+                    if ii == 0:
+                        data_file = read_with_noise_GenerateSimulations_output(namefile_sel, fps)
+                        data_file_real = data_file.copy()
+                    else:
+                        data_file = read_GenerateSimulations_output(namefile_sel, gensim_data_obs)
+                        data_file_real = data_file.copy()
+                else:
+                    if trajectory_Metsim_file == '':
+                        print('no data for the Metsim file')
+                        continue
-    print('read_GenerateSimulations_output mag',mag_obs[0],'-',mag_obs[-1])
+                    trajectory_Metsim_file_name = trajectory_Metsim_file.split(os.sep)[-1]
+                    namefile_sel_name = namefile_sel.split(os.sep)[-1]
-    try:
-        # find the index of the first element of abs_mag_sim that is smaller than the first element of mag_obs
-        index_abs_mag_sim_start = next(i for i, val in enumerate(abs_mag_sim) if val <= mag_obs[0])
-        index_abs_mag_sim_start = index_abs_mag_sim_start + np.random.randint(2)
-    except StopIteration:
-        print("The first observation height is not within the simulation data range.")
-        return None
-    try:
-        index_abs_mag_sim_end = next(i for i, val in enumerate(abs_mag_sim[::-1]) if val <= mag_obs[-1])
-        index_abs_mag_sim_end = len(abs_mag_sim) - index_abs_mag_sim_end - 1
-    except StopIteration:
-        print("The first observation height is not within the simulation data range.")
-        return None
+                    if trajectory_Metsim_file_name == namefile_sel_name:
+                        _, data_file, _ = run_simulation(trajectory_Metsim_file, gensim_data_obs, fit_funct)
+                        Metsim_flag = True
+                    else:
+                        _, data_file, _ = run_simulation(namefile_sel, gensim_data_obs, fit_funct)
+
+        if ii == 0:
+            # give the name of the file
+            file_name_only = os.path.basename(namefile_sel)
+
+        # Extract necessary data from the data file
+        height_km = np.array(data_file['height']) / 1000
+        abs_mag_sim = np.array(data_file['absolute_magnitudes'])
+        obs_time = np.array(data_file['time'])
+        vel_kms = np.array(data_file['velocities']) / 1000
+        if ii == 0:
+            lag_m = np.array(data_file['lag'])
+        else:
+            _, _, _, _, _, _, _, _, _, _, _, lag_m_sim = RMSD_calc_diff(data_file, gensim_data_obs)
+            lag_m = np.array(lag_m_sim) * 1000  # np.array(data_file['lag']) / 1000
+
+        if ii == 0:
+            # Plotting the observed data (green line)
+            if with_noise and fit_funct != '':
+                height_km_err = np.array(fit_funct['height']) / 1000
+                abs_mag_sim_err =
np.array(fit_funct['absolute_magnitudes']) + + # Plot confidence intervals (filled areas) + ax0.fill_betweenx( + height_km_err, + abs_mag_sim_err - mag_noise_real, + abs_mag_sim_err + mag_noise_real, + color='darkgray', + label='1$\sigma$ '+str(np.round(mag_noise_real,3)), + alpha=0.2 + ) + ax0.fill_betweenx( + height_km_err, + abs_mag_sim_err - mag_noise_real * 1.96, + abs_mag_sim_err + mag_noise_real * 1.96, + color='lightgray', + alpha=0.2 + ) + + obs_time_err = np.array(fit_funct['time']) + vel_kms_err = np.array(fit_funct['velocities']) / 1000 + lag_m_err = np.array(fit_funct['lag']) + + # Plot velocity confidence intervals + ax1.fill_between( + obs_time_err, + vel_kms_err - vel_noise, + vel_kms_err + vel_noise, + color='darkgray', + label='1$\sigma$ '+str(np.round(len_noise*1000,1))+' m', + alpha=0.2 + ) + ax1.fill_between( + obs_time_err, + vel_kms_err - vel_noise * 1.96, + vel_kms_err + vel_noise * 1.96, + color='lightgray', + alpha=0.2 + ) + + # Plot velocity confidence intervals + ax3.fill_between( + obs_time_err, + lag_m_err - lag_noise, + lag_m_err + lag_noise, + color='darkgray', + label='1$\sigma$ '+str(np.round(len_noise*1000,1))+' m', + alpha=0.2 + ) + ax3.fill_between( + obs_time_err, + lag_m_err - lag_noise * 1.96, + lag_m_err + lag_noise * 1.96, + color='lightgray', + alpha=0.2 + ) + + + # Store real observation data + real_time = obs_time + real_abs_mag = abs_mag_sim + real_height_km = height_km + + # Plot the observed data (green markers) + ax0.plot(abs_mag_sim, height_km, 'o', color='g') + ax1.plot(obs_time, vel_kms, 'o', color='g') + ax3.plot(obs_time, lag_m, 'o', color='g') + + # Optionally, include observed data in the table + # Uncomment the following lines if you want to include observed data + # curve_data = [ + # '', # Placeholder for color + # 'N/A', # mag$_{RMSD}$ + # 'N/A', # len$_{RMSD}$ + # 'N/A', # m0 + # 'N/A', # rho + # 'N/A', # sigma + # 'N/A', # eta + # 'N/A', # he + # 'N/A', # s + # 'N/A', # ml + # 'N/A' # mu + # ] + # row_colors.append('g') # Color of the observed data + # table_data.append(curve_data) + + else: + + # Interpolate time positions based on height + interp_ht_time = interp1d( + real_height_km, + real_time, + kind='linear', + bounds_error=False, + fill_value='extrapolate' + ) + residual_time_pos = interp_ht_time(height_km) + + if mindist > curr_sel.iloc[ii][distance_metric]: + + # Plot the selected simulation data + if Metsim_flag: + # For Metsim data, plot in black + line_sel0, = ax0.plot(abs_mag_sim, height_km, color='k') + line, = ax1.plot(residual_time_pos, vel_kms, color='k') + line, = ax3.plot(residual_time_pos, lag_m, color='k') + line_color = 'k' + else: + line_sel0, = ax0.plot(abs_mag_sim, height_km) + line_color = line_sel0.get_color() + if line_color == '#2ca02c': + line_color='m' + # change the color of line_sel0 + line_sel0.set_color('m') + line, = ax1.plot(residual_time_pos, vel_kms, color=line_color) + line, = ax3.plot(residual_time_pos, lag_m, color=line_color) + else: + # Plot the selected simulation data in gray + line_sel0, = ax0.plot(abs_mag_sim, height_km, color='dimgray', linewidth=0.1) # alpha=0.2, + line_color = line_sel0.get_color() + line, = ax1.plot(residual_time_pos, vel_kms, color=line_color, linewidth=0.1) + line, = ax3.plot(residual_time_pos, lag_m, color=line_color, linewidth=0.1) + + # ax2.hist(pd_datafram_PCA_selected[distance_metric], bins=100, alpha=0.5, color='b') # color='b', edgecolor='black', linewidth=1.2 + # make the cumulative distribution of pd_datafram_PCA_selected[distance_metric] + # 
ax2.hist(pd_datafram_PCA_selected[distance_metric], bins=100, alpha=0.5, color='b', cumulative=True, edgecolor='black', linewidth=1.2) + ax2.hist(df_sel_shower[distance_metric], edgecolor='black', alpha=0.5, color='b') #, cumulative=True , color='b', edgecolor='black', linewidth=1.2 + # put a vertical line at the dist + ax2.axvline(x=mindist, color='blue', linestyle='--', label='Real event distance') + + plot_xaxis_dist = distance_metric + if distance_metric == 'distance_meteor': + plot_xaxis_dist = 'PC distance' + elif distance_metric == 'distance_mean': + plot_xaxis_dist = 'PC Mean reliz. distance' + elif distance_metric == 'multiple_rmsd': + plot_xaxis_dist = 'RMSD' + + ax2.set_xlabel(plot_xaxis_dist) + ax2.set_ylabel('Count') # Cumulative + # # make th y axis logarithmic + # ax2.set_yscale('log') + # remove the right and upper border + ax2.spines['right'].set_visible(False) + ax2.spines['top'].set_visible(False) + + + # Adjust the plot styles and axes + ax0.invert_xaxis() + ax1.grid(linestyle='--', color='lightgray') + ax0.grid(linestyle='--', color='lightgray') + ax3.grid(linestyle='--', color='lightgray') + + ax1.set_xlabel('Time [s]') + ax1.set_ylabel('Velocity [km/s]') + ax3.set_xlabel('Time [s]') + ax3.set_ylabel('Lag [m]') + ax0.set_xlabel('Absolute Magnitude') + ax0.set_ylabel('Height [km]') + + # Remove legends from both plots if any + if ax0.get_legend() is not None: + ax0.get_legend().remove() + if ax1.get_legend() is not None: + ax1.get_legend().remove() + if ax3.get_legend() is not None: + ax3.get_legend().remove() + + plt.savefig(output_dir + os.sep +file_name_obs+'_grayPlot_'+str(len(df_sel_shower))+'ev_'+str(mindist)+'distPC.png', bbox_inches='tight') + plt.close() + + + + + + + + + +def plot_PCA_mindist_RMSD(pd_datafram_PCA_selected_before_knee_NO_repetition_all, mindist, maxdist, output_PCA_dist, pd_dataframe_PCA_obs_real, pd_datafram_PCA_sim, fit_funct, gensim_data_obs, rmsd_pol_mag, mag_RMSD_real, rmsd_t0_lag, len_RMSD_real, fps, file_name, trajectory_Metsim_file, PCAn_comp): + pd_datafram_PCA_selected_before_knee_NO_repetition = pd_datafram_PCA_selected_before_knee_NO_repetition_all[pd_datafram_PCA_selected_before_knee_NO_repetition_all['distance_meteor'] < mindist] + pd_datafram_PCA_selected_before_maxdist = pd_datafram_PCA_selected_before_knee_NO_repetition_all[pd_datafram_PCA_selected_before_knee_NO_repetition_all['distance_meteor'] < maxdist] + # check if pd_datafram_PCA_selected_before_knee_NO_repetition is not empty and check if folder do not exist + if len(pd_datafram_PCA_selected_before_knee_NO_repetition) != 0: # and not os.path.isdir(output_PCA_dist) + mkdirP(output_PCA_dist) - # print('mag',index_abs_mag_sim_start,'-',index_abs_mag_sim_end,'\nheight',index_ht_sim,'-',index_ht_sim_end) + print('PLOT: histogram of the PC distance meteor') + # plot the histogram of the distance_meteor + plot_histogram_PCA_dist(pd_datafram_PCA_selected_before_knee_NO_repetition_all, output_PCA_dist, mindist, maxdist) + + print('PLOT: all simulations selected and max dist in gray') + # plot all simulations selected and max in gray + plot_gray_dist(pd_datafram_PCA_selected_before_knee_NO_repetition_all, mindist, maxdist,'distance_meteor', pd_dataframe_PCA_obs_real, output_PCA_dist, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file) + + print('PLOT: best 10 simulations selected and add the RMSD value to csv selected') + # order pd_datafram_PCA_selected_before_knee_NO_repetition to distance_mean + 
pd_datafram_PCA_selected_before_knee_NO_repetition = pd_datafram_PCA_selected_before_knee_NO_repetition.sort_values(by=['distance_meteor'], ascending=True) # distance_mean + # plot of the best 10 selected simulations and add the RMSD value to csv selected + LightCurveCoefPLOT(pd_datafram_PCA_selected_before_knee_NO_repetition, pd_dataframe_PCA_obs_real, output_PCA_dist, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file, vel_lagplot='lag', pca_N_comp=PCAn_comp) + LightCurveCoefPLOT(pd_datafram_PCA_selected_before_knee_NO_repetition, pd_dataframe_PCA_obs_real, output_PCA_dist, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file, vel_lagplot='vel', pca_N_comp=PCAn_comp) + + print('PLOT: the physical characteristics of the selected simulations Mode and KDE') + PhysicalPropPLOT(pd_datafram_PCA_selected_before_knee_NO_repetition, pd_datafram_PCA_sim, output_PCA_dist, file_name, pca_N_comp=PCAn_comp) + + print('PLOT: correlation of the selected simulations (takes a long time)') + # plot correlation function of the selected simulations + correlation_selPLOT(pd_datafram_PCA_sim, pd_datafram_PCA_selected_before_knee_NO_repetition, output_PCA_dist, pca_N_comp=PCAn_comp) + + # from pd_datafram_PCA_selected_before_knee_NO_repetition delete the one that pd_datafram_PCA_selected_before_knee_NO_repetition['rmsd_mag'].iloc[i] > mag_RMSD_real or pd_datafram_PCA_selected_before_knee_NO_repetition['rmsd_len'].iloc[i] > len_RMSD_real: + pd_datafram_PCA_selected_before_knee_NO_repetition_RMSD = pd_datafram_PCA_selected_before_knee_NO_repetition[(pd_datafram_PCA_selected_before_knee_NO_repetition['rmsd_mag'] < mag_RMSD_real) & (pd_datafram_PCA_selected_before_knee_NO_repetition['rmsd_len'] < len_RMSD_real)] + # check if there are any selected simulations + if len(pd_datafram_PCA_selected_before_knee_NO_repetition_RMSD) != 0: + PCA_RMSD_folder=output_PCA_dist+os.sep+'PCA+RMSD' + mkdirP(PCA_RMSD_folder) + print('PLOT: best 10 simulations selected and add the RMSD value to csv selected') + # plot of the best 10 selected simulations and add the RMSD value to csv selected + LightCurveCoefPLOT(pd_datafram_PCA_selected_before_knee_NO_repetition_RMSD, pd_dataframe_PCA_obs_real, PCA_RMSD_folder, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file, vel_lagplot='lag', pca_N_comp=PCAn_comp) + LightCurveCoefPLOT(pd_datafram_PCA_selected_before_knee_NO_repetition_RMSD, pd_dataframe_PCA_obs_real, PCA_RMSD_folder, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file, vel_lagplot='vel', pca_N_comp=PCAn_comp) + + print('PLOT: the physical characteristics of the selected simulations Mode and KDE') + PhysicalPropPLOT(pd_datafram_PCA_selected_before_knee_NO_repetition_RMSD, pd_datafram_PCA_sim, PCA_RMSD_folder, file_name, pca_N_comp=PCAn_comp) + + print('PLOT: correlation of the selected simulations (takes a long time)') + # plot correlation function of the selected simulations + correlation_selPLOT(pd_datafram_PCA_sim, pd_datafram_PCA_selected_before_knee_NO_repetition_RMSD, PCA_RMSD_folder, pca_N_comp=PCAn_comp) + else: + print('Results already present or No selected simulations below min PCA distance',mindist) + + +#### Reader ############################################################################# + + +def read_GenerateSimulations_output_to_PCA(file_path, name='', fit_funct='', real_event='', flag_for_PCA=False): + real_event_copy = copy.deepcopy(real_event) 
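+    # The readers below use the observed magnitudes to trim each simulation to the
+    # observed range; real_event is deep-copied first, presumably to guard against
+    # any in-place modification of the caller's event dict across repeated calls.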
+ if name!='': + print(name) + gensim_data = read_GenerateSimulations_output(file_path, real_event_copy, flag_for_PCA) + if gensim_data is None: + return None + else: + pd_datfram_PCA = array_to_pd_dataframe_PCA(gensim_data, real_event_copy) + return pd_datfram_PCA + + +def read_GenerateSimulations_output(file_path, real_event, flag_for_PCA=False): + # check if present the file_path + if os.path.isfile(file_path): + f = open(file_path,"r") + data = json.loads(f.read()) + + # show processed event + print(file_path) + + # check if there is 'ht_sampled' in the data + if 'ht_sampled' not in data: + print("Warning: 'ht_sampled' not in data. Skipping.") + return None + if data['ht_sampled']!= None: + + vel_sim=data['simulation_results']['leading_frag_vel_arr'][:-1]#['brightest_vel_arr']#['leading_frag_vel_arr']#['main_vel_arr'] + ht_sim=data['simulation_results']['leading_frag_height_arr'][:-1]#['brightest_height_arr']['leading_frag_height_arr']['main_height_arr'] + time_sim=data['simulation_results']['time_arr'][:-1]#['main_time_arr'] + abs_mag_sim=data['simulation_results']['abs_magnitude'][:-1] + len_sim=data['simulation_results']['leading_frag_length_arr'][:-1]#['brightest_length_arr'] + Dynamic_pressure= data['simulation_results']['leading_frag_dyn_press_arr'][:-1] - abs_mag_sim = abs_mag_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] - vel_sim = vel_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] - time_sim = time_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] - ht_sim = ht_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] - len_sim = len_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] - Dynamic_pressure = Dynamic_pressure[index_abs_mag_sim_start:index_abs_mag_sim_end] + # ht_obs=data['ht_sampled'] + # try: + # index_ht_sim=next(x for x, val in enumerate(ht_sim) if val <= ht_obs[0]) + # except StopIteration: + # # index_ht_sim = None + # print('The first element of the observation is not in the simulation') + # return None + + # try: + # index_ht_sim_end=next(x for x, val in enumerate(ht_sim) if val <= ht_obs[-1]) + # except StopIteration: + # # index_ht_sim_end = None + # print('The last element of the observation is not in the simulation') + # return None + + # if real_event!= '': + # mag_obs=real_event['absolute_magnitudes'] + # else: + # mag_obs=data['mag_sampled'] + mag_obs=real_event['absolute_magnitudes'] + # print('read_GenerateSimulations_output mag',mag_obs[0],'-',mag_obs[-1]) - # abs_mag_sim=abs_mag_sim[index_ht_sim:index_ht_sim_end] - # vel_sim=vel_sim[index_ht_sim:index_ht_sim_end] - # time_sim=time_sim[index_ht_sim:index_ht_sim_end] - # ht_sim=ht_sim[index_ht_sim:index_ht_sim_end] - # len_sim=len_sim[index_ht_sim:index_ht_sim_end] + try: + # find the index of the first element of abs_mag_sim that is smaller than the first element of mag_obs + index_abs_mag_sim_start = next(i for i, val in enumerate(abs_mag_sim) if val <= mag_obs[0]) + if flag_for_PCA: + index_abs_mag_sim_start = index_abs_mag_sim_start - 1 + np.random.randint(2) + else: + index_abs_mag_sim_start = index_abs_mag_sim_start - 1 # + np.random.randint(2) + except StopIteration: + print("The first observation height is not within the simulation data range.") + return None + try: + index_abs_mag_sim_end = next(i for i, val in enumerate(abs_mag_sim[::-1]) if val <= mag_obs[-1]) + if flag_for_PCA: + index_abs_mag_sim_end = len(abs_mag_sim) - index_abs_mag_sim_end + 1 - np.random.randint(2) + else: + index_abs_mag_sim_end = len(abs_mag_sim) - index_abs_mag_sim_end + 1 + except StopIteration: + print("The 
first observation height is not within the simulation data range.") + return None + + # print('mag',index_abs_mag_sim_start,'-',index_abs_mag_sim_end,'\nheight',index_ht_sim,'-',index_ht_sim_end) + + abs_mag_sim = abs_mag_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + vel_sim = vel_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + time_sim = time_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + ht_sim = ht_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + len_sim = len_sim[index_abs_mag_sim_start:index_abs_mag_sim_end] + Dynamic_pressure = Dynamic_pressure[index_abs_mag_sim_start:index_abs_mag_sim_end] - # closest_indices = find_closest_index(ht_sim, ht_obs) - # Dynamic_pressure= data['simulation_results']['leading_frag_dyn_press_arr'] - # Dynamic_pressure= Dynamic_pressure[index_ht_sim:index_ht_sim_end] - # Dynamic_pressure=[Dynamic_pressure[jj_index_cut] for jj_index_cut in closest_indices] - # abs_mag_sim=[abs_mag_sim[jj_index_cut] for jj_index_cut in closest_indices] - # vel_sim=[vel_sim[jj_index_cut] for jj_index_cut in closest_indices] - # time_sim=[time_sim[jj_index_cut] for jj_index_cut in closest_indices] - # ht_sim=[ht_sim[jj_index_cut] for jj_index_cut in closest_indices] - # len_sim=[len_sim[jj_index_cut] for jj_index_cut in closest_indices] + # abs_mag_sim=abs_mag_sim[index_ht_sim:index_ht_sim_end] + # vel_sim=vel_sim[index_ht_sim:index_ht_sim_end] + # time_sim=time_sim[index_ht_sim:index_ht_sim_end] + # ht_sim=ht_sim[index_ht_sim:index_ht_sim_end] + # len_sim=len_sim[index_ht_sim:index_ht_sim_end] - # divide the vel_sim by 1000 considering is a list - time_sim = [i-time_sim[0] for i in time_sim] - # vel_sim = [i/1000 for i in vel_sim] - len_sim = [i-len_sim[0] for i in len_sim] - # ht_sim = [i/1000 for i in ht_sim] + # closest_indices = find_closest_index(ht_sim, ht_obs) - # Load the constants - const, _ = loadConstants(file_path) - const.dens_co = np.array(const.dens_co) + # Dynamic_pressure= data['simulation_results']['leading_frag_dyn_press_arr'] + # Dynamic_pressure= Dynamic_pressure[index_ht_sim:index_ht_sim_end] + # Dynamic_pressure=[Dynamic_pressure[jj_index_cut] for jj_index_cut in closest_indices] - # Compute the erosion energies - erosion_energy_per_unit_cross_section, erosion_energy_per_unit_mass = wmpl.MetSim.MetSimErosion.energyReceivedBeforeErosion(const) + # abs_mag_sim=[abs_mag_sim[jj_index_cut] for jj_index_cut in closest_indices] + # vel_sim=[vel_sim[jj_index_cut] for jj_index_cut in closest_indices] + # time_sim=[time_sim[jj_index_cut] for jj_index_cut in closest_indices] + # ht_sim=[ht_sim[jj_index_cut] for jj_index_cut in closest_indices] + # len_sim=[len_sim[jj_index_cut] for jj_index_cut in closest_indices] - gensim_data = { - 'name': file_path, - 'type': 'Simulation', - 'v_init': vel_sim[0], # m/s - 'velocities': vel_sim, # m/s - 'height': ht_sim, # m - 'absolute_magnitudes': abs_mag_sim, - 'lag': len_sim-(vel_sim[0]*np.array(time_sim)+len_sim[0]), # m - 'length': len_sim, # m - 'time': time_sim, # s - 'v_avg': np.mean(vel_sim), # m/s - 'v_init_180km': data['params']['v_init']['val'], # m/s - 'Dynamic_pressure_peak_abs_mag': Dynamic_pressure[np.argmin(abs_mag_sim)], - 'zenith_angle': data['params']['zenith_angle']['val']*180/np.pi, - 'mass': data['params']['m_init']['val'], - 'rho': data['params']['rho']['val'], - 'sigma': data['params']['sigma']['val'], - 'erosion_height_start': data['params']['erosion_height_start']['val']/1000, - 'erosion_coeff': data['params']['erosion_coeff']['val'], - 'erosion_mass_index': 
data['params']['erosion_mass_index']['val'], - 'erosion_mass_min': data['params']['erosion_mass_min']['val'], - 'erosion_mass_max': data['params']['erosion_mass_max']['val'], - 'erosion_range': np.log10(data['params']['erosion_mass_max']['val']) - np.log10(data['params']['erosion_mass_min']['val']), - 'erosion_energy_per_unit_cross_section': erosion_energy_per_unit_cross_section, - 'erosion_energy_per_unit_mass': erosion_energy_per_unit_mass - } + # divide the vel_sim by 1000 considering is a list + time_sim = [i-time_sim[0] for i in time_sim] + # vel_sim = [i/1000 for i in vel_sim] + len_sim = [i-len_sim[0] for i in len_sim] + # ht_sim = [i/1000 for i in ht_sim] - return gensim_data - + # Load the constants + const, _ = loadConstants(file_path) + const.dens_co = np.array(const.dens_co) + + # Compute the erosion energies + erosion_energy_per_unit_cross_section, erosion_energy_per_unit_mass = wmpl.MetSim.MetSimErosion.energyReceivedBeforeErosion(const) + + gensim_data = { + 'name': file_path, + 'type': 'Simulation', + 'v_init': vel_sim[0], # m/s + 'velocities': np.array(vel_sim), # m/s + 'height': np.array(ht_sim), # m + 'absolute_magnitudes': np.array(abs_mag_sim), + 'lag': np.array(len_sim-(vel_sim[0]*np.array(time_sim))), # m +len_sim[0] + 'length': np.array(len_sim), # m + 'time': np.array(time_sim), # s + 'v_avg': np.mean(vel_sim), # m/s + 'vel_180km': data['params']['v_init']['val'], # m/s + 'Dynamic_pressure_peak_abs_mag': Dynamic_pressure[np.argmin(abs_mag_sim)], + 'zenith_angle': data['params']['zenith_angle']['val']*180/np.pi, + 'mass': data['params']['m_init']['val'], + 'rho': data['params']['rho']['val'], + 'sigma': data['params']['sigma']['val'], + 'erosion_height_start': data['params']['erosion_height_start']['val']/1000, + 'erosion_coeff': data['params']['erosion_coeff']['val'], + 'erosion_mass_index': data['params']['erosion_mass_index']['val'], + 'erosion_mass_min': data['params']['erosion_mass_min']['val'], + 'erosion_mass_max': data['params']['erosion_mass_max']['val'], + 'erosion_range': np.log10(data['params']['erosion_mass_max']['val']) - np.log10(data['params']['erosion_mass_min']['val']), + 'erosion_energy_per_unit_cross_section': erosion_energy_per_unit_cross_section, + 'erosion_energy_per_unit_mass': erosion_energy_per_unit_mass + } + + return gensim_data + + else: + return None else: return None -def Old_GenSym_json_get_vel_lag(data): +def Old_GenSym_json_get_vel_lag(data, fps=32): ht_sim=data['simulation_results']['leading_frag_height_arr'][:-1]#['brightest_height_arr']['leading_frag_height_arr']['main_height_arr'] ht_obs=data['ht_sampled'] @@ -1169,7 +2437,7 @@ def Old_GenSym_json_get_vel_lag(data): # get the new velocity with noise for vel_ii in range(1,len(time_sampled)): - if time_sampled[vel_ii]-time_sampled[vel_ii-1]<1.0/FPS: + if time_sampled[vel_ii]-time_sampled[vel_ii-1]<1.0/fps: # if time_sampled[vel_ii] % 0.03125 < 0.000000001: if vel_ii+1 14): - print('Found values below 14 absolute magnitudes:', combined_obs['absolute_magnitudes'][combined_obs['absolute_magnitudes'] > 14]) + # check if any value is below 10 absolute_magnitudes and print find values below 8 absolute_magnitudes + if np.any(combined_obs['absolute_magnitudes'] > 8): + print('Found values below 8 absolute magnitudes:', combined_obs['absolute_magnitudes'][combined_obs['absolute_magnitudes'] > 8]) # delete any values above 10 absolute_magnitudes and delete the corresponding values in the other arrays - combined_obs = {key: combined_obs[key][combined_obs['absolute_magnitudes'] < 14] for key 
in combined_obs.keys()} + combined_obs = {key: combined_obs[key][combined_obs['absolute_magnitudes'] < 8] for key in combined_obs.keys()} + + dens_fit_ht_beg = 180000 + dens_fit_ht_end = traj.rend_ele - 5000 + if dens_fit_ht_end < 14000: + dens_fit_ht_end = 14000 + + lat_mean = np.mean([traj.rbeg_lat, traj.rend_lat]) + lon_mean = meanAngle([traj.rbeg_lon, traj.rend_lon]) + jd_dat=traj.jdt_ref + + # Fit the polynomail describing the density + dens_co = fitAtmPoly(lat_mean, lon_mean, dens_fit_ht_end, dens_fit_ht_beg, jd_dat) - Dynamic_pressure_peak_abs_mag=(wmpl.Utils.Physics.dynamicPressure(lat_dat, lon_dat, combined_obs['height'][np.argmin(combined_obs['absolute_magnitudes'])], jd_dat, combined_obs['velocities'][np.argmin(combined_obs['absolute_magnitudes'])])) + Dynamic_pressure_peak_abs_mag=(wmpl.Utils.Physics.dynamicPressure(lat_mean, lon_mean, combined_obs['height'][np.argmin(combined_obs['absolute_magnitudes'])], jd_dat, combined_obs['velocities'][np.argmin(combined_obs['absolute_magnitudes'])])) # , gamma=traj.const.gamma const=Constants() zenith_angle=zenithAngleAtSimulationBegin(const.h_init, traj.rbeg_ele, traj.orbit.zc, const.r_earth) @@ -1443,19 +2806,34 @@ def read_pickle_reduction_file(file_path, MetSim_phys_file_path='', obs_sep=Fals erosion_range=(0) erosion_energy_per_unit_cross_section_arr=(0) erosion_energy_per_unit_mass_arr=(0) + v_180km=v_init type_sim='Observation' # put all the varible in a array mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr - output_phys = [mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr] + output_phys = [mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr, v_180km] - # delete the elev_data from the combined_obs - del combined_obs['elev_data'] + # # delete the elev_data from the combined_obs + # del combined_obs['elev_data'] # add to combined_obs the avg velocity and the peak dynamic pressure and all the physical parameters combined_obs['name'] = file_path - combined_obs['v_init'] = combined_obs['velocities'][0] - combined_obs['v_init_180km'] = combined_obs['velocities'][0]+100 + combined_obs['v_init'] = v_init + combined_obs['vel_180km'] = output_phys[11] + combined_obs['lag'] = combined_obs['lag']-combined_obs['lag'][0] + combined_obs['dens_co'] = dens_co + combined_obs['obs1_time'] = obs1_time + combined_obs['obs2_time'] = obs2_time + combined_obs['obs1_length'] = obs1_length + combined_obs['obs2_length'] = obs2_length + combined_obs['obs1_height'] = obs1_height + combined_obs['obs2_height'] = obs2_height + combined_obs['obs1_velocities'] = obs1_velocities + combined_obs['obs2_velocities'] = obs2_velocities + combined_obs['obs1_absolute_magnitudes'] = obs1_absolute_magnitudes + combined_obs['obs2_absolute_magnitudes'] = obs2_absolute_magnitudes + combined_obs['obs1_lag'] = obs1_lag + combined_obs['obs2_lag'] = obs2_lag combined_obs['type'] = type_sim combined_obs['v_avg'] = v_avg combined_obs['Dynamic_pressure_peak_abs_mag'] = Dynamic_pressure_peak_abs_mag @@ -1487,7 +2865,7 @@ def read_MetSim_phyProp_output(MetSim_phys_file_path): data = json.load(json_file) mass=(data['m_init']) # add also rho 
sigma erosion_height_start erosion_coeff erosion_mass_index erosion_mass_min erosion_mass_max erosion_range erosion_energy_per_unit_cross_section erosion_energy_per_unit_mass - # mass=(data['m_init']) + v_180km=(data['v_init']) rho=(data['rho']) sigma=(data['sigma']) erosion_height_start=(data['erosion_height_start']/1000) @@ -1525,15 +2903,14 @@ def read_MetSim_phyProp_output(MetSim_phys_file_path): erosion_range=(0) erosion_energy_per_unit_cross_section_arr=(0) erosion_energy_per_unit_mass_arr=(0) + v_180km=(0) # put all the varible in a array mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr - output_phys = [mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr] + output_phys = [mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section_arr, erosion_energy_per_unit_mass_arr, v_180km] return output_phys - - -def array_to_pd_dataframe_PCA(data): +def array_to_pd_dataframe_PCA(data, test_data=[]): if data is None: # Handle the None case, maybe log an error or return an empty DataFrame @@ -1564,56 +2941,68 @@ def array_to_pd_dataframe_PCA(data): kc_par = begin_height + (2.86 - 2*np.log(data_array['v_init']))/0.0612 + + try: + # fit a line to the throught the vel_sim and ht_sim + a, b = np.polyfit(data_array['time'],data_array['velocities'], 1) + acceleration_lin = a - # fit a line to the throught the vel_sim and ht_sim - a, b = np.polyfit(data_array['time'],data_array['velocities'], 1) - acceleration_lin = a - - t0 = np.mean(data_array['time']) + t0 = np.mean(data_array['time']) - # initial guess of deceleration decel equal to linear fit of velocity - p0 = [a, 0, 0, t0] + # initial guess of deceleration decel equal to linear fit of velocity + # p0 = [a, 0, 0, t0] + p0 = [avg_lag, 0, 0, t0] - opt_res = opt.minimize(lag_residual, p0, args=(np.array(data_array['time']), np.array(data_array['lag'])), method='Nelder-Mead') + opt_res = opt.minimize(lag_residual, p0, args=(np.array(data_array['time']), np.array(data_array['lag'])), method='Nelder-Mead') - # sample the fit for the velocity and acceleration - a_t0, b_t0, c_t0, t0 = opt_res.x + # sample the fit for the velocity and acceleration + a_t0, b_t0, c_t0, t0 = opt_res.x - # compute reference decelearation - t_decel_ref = (t0 + np.max(data_array['time']))/2 - decel_t0 = cubic_acceleration(t_decel_ref, a_t0, b_t0, t0)[0] + # compute reference decelearation + t_decel_ref = (t0 + np.max(data_array['time']))/2 + decel_t0 = cubic_acceleration(t_decel_ref, a_t0, b_t0, t0)[0] - a_t0=-abs(a_t0) - b_t0=-abs(b_t0) + a_t0=-abs(a_t0) + b_t0=-abs(b_t0) - acceleration_parab_t0=a_t0*6 + b_t0*2 + acceleration_parab_t0=a_t0*6 + b_t0*2 - a3, b3, c3 = np.polyfit(data_array['time'],data_array['velocities'], 2) - acceleration_parab=a3*2 + b3 + a3, b3, c3 = np.polyfit(data_array['time'],data_array['velocities'], 2) + acceleration_parab=a3*2 + b3 - # Assuming the jacchiaVel function is defined as: - def jacchiaVel(t, a1, a2, v_init): - return v_init - np.abs(a1) * np.abs(a2) * np.exp(np.abs(a2) * t) + # Assuming the jacchiaVel function is defined as: + def jacchiaVel(t, a1, a2, v_init): + return v_init - np.abs(a1) * np.abs(a2) * np.exp(np.abs(a2) * t) - # Generating 
synthetic observed data for demonstration - t_observed = np.array(data_array['time']) # Observed times + # Generating synthetic observed data for demonstration + t_observed = np.array(data_array['time']) # Observed times - # Residuals function for optimization - def residuals(params): - a1, a2 = params - predicted_velocity = jacchiaVel(t_observed, a1, a2, v0) - return np.sum((data_array['velocities'] - predicted_velocity)**2) + # Residuals function for optimization + def residuals(params): + a1, a2 = params + predicted_velocity = jacchiaVel(t_observed, a1, a2, v0) + return np.sum((data_array['velocities'] - predicted_velocity)**2) - # Initial guess for a1 and a2 - initial_guess = [0.005, 10] + # Initial guess for a1 and a2 + initial_guess = [0.005, 10] - # Apply minimize to the residuals - result = minimize(residuals, initial_guess) + # Apply minimize to the residuals + result = minimize(residuals, initial_guess) - # Results - jac_a1, jac_a2 = abs(result.x) + # Results + jac_a1, jac_a2 = abs(result.x) - acc_jacchia = abs(jac_a1)*abs(jac_a2)**2 + acc_jacchia = abs(jac_a1)*abs(jac_a2)**2 + except Exception as e: + # Handle exceptions and provide default values + print(f"Error in computation: {e}, filling with default zeros.") + acceleration_lin = 0 + a_t0 = b_t0 = c_t0 = t0 = 0 + decel_t0 = 0 + acceleration_parab_t0 = 0 + a3 = b3 = c3 = 0 + acceleration_parab = 0 + jac_a1 = jac_a2 = acc_jacchia = 0 try: # fit a line to the throught the obs_vel and ht_sim @@ -1643,50 +3032,19 @@ def residuals(params): # a3_Outabs, b3_Outabs, c3_Outabs = np.polyfit(data_array['height'][index_ht_peak:], data_array['absolute_magnitudes'][index_ht_peak:], 2) - ######## SKEW KURT ################ - # create a new array with the same values as time_pickl - index=[] - # if the distance between two index is smalle than 0.05 delete the second one - for i in range(len(data_array['time'])-1): - if data_array['time'][i+1]-data_array['time'][i] < 0.01: - # save the index as an array - index.append(i+1) - # delete the index from the list - time_pickl = np.delete(data_array['time'], index) - abs_mag_pickl = np.delete(data_array['time'], index) - - abs_mag_pickl = [0 if math.isnan(x) else x for x in abs_mag_pickl] - - # subrtract the max value of the mag to center it at the origin - mag_sampled_norm = (-1)*(abs_mag_pickl - np.max(abs_mag_pickl)) - # check if there is any negative value and add the absolute value of the min value to all the values - mag_sampled_norm = mag_sampled_norm + np.abs(np.min(mag_sampled_norm)) - # normalize the mag so that the sum is 1 - time_sampled_norm= time_pickl - np.mean(time_pickl) - # mag_sampled_norm = mag_sampled_norm/np.sum(mag_sampled_norm) - mag_sampled_norm = mag_sampled_norm/np.max(mag_sampled_norm) - # substitute the nan values with zeros - mag_sampled_norm = np.nan_to_num(mag_sampled_norm) - - # create an array with the number the ammount of same number equal to the value of the mag - mag_sampled_distr = [] - mag_sampled_array=np.asarray(mag_sampled_norm*1000, dtype = 'int') - for i in range(len(abs_mag_pickl)): - # create an integer form the array mag_sampled_array[i] and round of the given value - numbs=mag_sampled_array[i] - # invcrease the array number by the mag_sampled_distr numbs - # array_nu=(np.ones(numbs+1)*i_pos).astype(int) - array_nu=(np.ones(numbs+1)*time_sampled_norm[i]) - mag_sampled_distr=np.concatenate((mag_sampled_distr, array_nu)) - - # # # plot the mag_sampled_distr as an histogram - # plt.hist(mag_sampled_distr) - # plt.show() + ######## RMSD ############### + # 
print('fit_funct RMSD mag',fit_funct['rmsd_mag'],' vel',fit_funct['rmsd_vel'], ' lag',fit_funct['rmsd_len']) + if test_data == []: + rmsd_lag = 0 + rmsd_mag = 0 + chi2_red_lag = 0 + chi2_red_mag = 0 - # kurtosyness.append(kurtosis(mag_sampled_distr)) - # skewness.append(skew(mag_sampled_distr)) - kurtosyness=kurtosis(mag_sampled_distr) - skewness=skew(mag_sampled_distr) + else: + # Compute the residuals + chi2_red_mag, chi2_red_vel, chi2_red_lag, rmsd_mag, rmsd_vel, rmsd_lag, magnitude_differences, velocity_differences, lag_differences, residual_time_pos, residual_height_pos, lag_kms_sim = RMSD_calc_diff(data, test_data) #, fit_funct + + # print(data_array['name'],'rmsd_mag',rmsd_mag,'rmsd_vel',rmsd_vel,'rmsd_len',rmsd_lag) ################################# @@ -1696,9 +3054,13 @@ def residuals(params): data_picklefile_pd = { 'solution_id': [data_array['name']], 'type': [data_array['type']], - 'vel_init_norot': [data_array['v_init']], - 'vel_avg_norot': [data_array['v_avg']], - 'v_init_180km': [data_array['v_init_180km']], + 'rmsd_mag': [rmsd_mag], + 'rmsd_len': [rmsd_lag], + 'chi2_red_mag': [chi2_red_mag], + 'chi2_red_len': [chi2_red_lag], + 'vel_1st_frame': [data_array['v_init']], + 'vel_avg': [data_array['v_avg']], + 'vel_180km': [data_array['vel_180km']], 'duration': [duration], 'peak_mag_height': [peak_mag_height], 'begin_height': [begin_height], @@ -1715,8 +3077,6 @@ def residuals(params): 'decel_t0': [decel_t0], 'decel_jacchia': [acc_jacchia], 'zenith_angle': [data_array['zenith_angle']], - 'kurtosis': [kurtosyness], - 'skew': [skewness], 'avg_lag': [avg_lag], 'kc': [kc_par], 'Dynamic_pressure_peak_abs_mag': [data_array['Dynamic_pressure_peak_abs_mag']], @@ -1758,107 +3118,219 @@ def residuals(params): -########## Utils ########################## - -# Function to get trajectory data folder -def find_and_extract_trajectory_files(directory, MetSim_extention): - trajectory_files = [] - file_names = [] - output_folders = [] - input_folders = [] - trajectory_Metsim_file = [] - - for root, dirs, files in os.walk(directory): - # go in each folder and find the file with the end _trajectory.pickle but skip the folder with the name GenSim - if 'GenSim' in root: - continue - - csv_file_found=False - for file in files: - if file.endswith(NAME_SUFX_CSV_OBS): - # open - csv_file_found=True - real_data = pd.read_csv(os.path.join(root, file)) - if root not in real_data['solution_id'][0]: - print('The solution_id in the csv file is not the same as the folder name or does not exist in the folder name:', root) - continue - # split real_data['solution_id'][0] in the directory and the name of the file - _ , file_from_csv = os.path.split(real_data['solution_id'][0]) - - base_name = os.path.splitext(file_from_csv)[0] # Remove the file extension - #check if the file_from_csv endswith "_trajectory" if yes then extract the number 20230405_010203 - if base_name.endswith("_trajectory"): - variable_name = base_name.replace("_trajectory", "") # Extract the number 20230405_010203 - output_folder_name = base_name.replace("_trajectory", NAME_SUFX_GENSIM) # _GenSim folder whre all generated simulations are stored - else: - variable_name = base_name - output_folder_name = base_name + NAME_SUFX_GENSIM - +########## Utils ########################## - if file_from_csv.endswith("json"): - # MetSim_phys_file_path = os.path.join(root, file_from_csv) +# create a txt file where you save averithing that has been printed +class Logger(object): + def __init__(self, directory=".", filename="log.txt"): + self.terminal = sys.stdout 
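+        # Tee pattern: keep a handle on the real stdout while mirroring every
+        # write() into a log file. Typical usage (hypothetical file names):
+        #   sys.stdout = Logger(directory='./output_dir', filename='log.txt')
+        #   ... print() as usual ...
+        #   sys.stdout.close(); sys.stdout = sys.__stdout__  # restore stdout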
+ # Ensure the directory exists + if not os.path.exists(directory): + os.makedirs(directory) + # Combine the directory and filename to create the full path + filepath = os.path.join(directory, filename) + self.log = open(filepath, "a") - # from namefile_sel json file open the json file and save the namefile_sel.const part as file_name_obs+'_sim_fit.json' - with open(os.path.join(root, file_from_csv)) as json_file: - data = json.load(json_file) - const_part = data['const'] - MetSim_phys_file_path = os.path.join(root, output_folder_name)+os.sep+variable_name+'_sim_fit.json' - with open(os.path.join(root, output_folder_name)+os.sep+variable_name+'_sim_fit.json', 'w') as outfile: - json.dump(const_part, outfile, indent=4) + def write(self, message): + self.terminal.write(message) + self.log.write(message) - else: - # check if MetSim_phys_file_path exist - if os.path.isfile(os.path.join(root, variable_name + MetSim_extention)): - # print did not find with th given extention revert to default - MetSim_phys_file_path = os.path.join(root, variable_name + MetSim_extention) - elif os.path.isfile(os.path.join(root, variable_name + '_sim_fit_latest.json')): - print(base_name,': No MetSim file with the given extention', MetSim_extention,'reverting to default extention _sim_fit_latest.json') - MetSim_phys_file_path = os.path.join(root, variable_name + '_sim_fit_latest.json') - else: - # do not save the rest of the files - print(base_name,': No MetSim file with the given extention', MetSim_extention,'do not consider the folder') - continue + def flush(self): + # This might be necessary as stdout could call flush + self.terminal.flush() + def close(self): + # Close the log file when done + self.log.close() - input_folders.append(root) - trajectory_files.append(os.path.join(root, file)) - file_names.append(variable_name) - output_folders.append(os.path.join(root, output_folder_name)) - trajectory_Metsim_file.append(MetSim_phys_file_path) +# update solution_id directory saved in CSV files +def update_solution_ids(base_dir, new_base_dir): + # Iterate through all subdirectories + for root, dirs, files in os.walk(new_base_dir): + for file in files: + # Only process CSV files + if file.endswith('.csv'): + file_path = os.path.join(root, file) + print(f"Processing file: {file_path}") + try: + # Load the CSV file as DataFrame + df = pd.read_csv(file_path) + + # Check if 'solution_id' column exists + if 'solution_id' in df.columns: + # Update each value in 'solution_id' + df['solution_id'] = df['solution_id'].apply( + lambda x: x.replace(base_dir, new_base_dir) if isinstance(x, str) else x + ) + + # Write updated DataFrame back to CSV + df.to_csv(file_path, index=False) + print(f"Updated solution_id in: {file_path}") + + except Exception as e: + print(f"Failed to process file {file_path} due to: {e}") + + +class SetUpObservationFolders: + def __init__(self, input_folder, metsim_json): + """ + Loads the observation data from the given folder or file and MetSim json file. + The observation data can be in the form of a CSV file, pickle file, or JSON file. + + Parameters: + input_folder: str - The path to the folder containing the observation data. + metsim_json: str - JSON file extension for MetSim constants, default '_sim_fit_latest.json'. 
+ """ + self.input_folder = input_folder + self.metsim_json = metsim_json + self.input_folder_file = self._get_observation_files() + + def __repr__(self): + return f"SetUpObservationFolders({self.input_folder}, {self.metsim_json})" + + def __str__(self): + return f"SetUpObservationFolders: input folder={self.input_folder}, MetSim json file end={self.metsim_json}" + + def _get_observation_files(self): + """ + Determines if the input is a directory or a single file and processes accordingly. + """ + if os.path.isdir(self.input_folder): + return self._find_trajectory_files(self.input_folder) + elif os.path.isfile(self.input_folder): + return self._process_single_file(self.input_folder) + else: + print('The provided path or file does not exist') + sys.exit() - if csv_file_found==False: - for file in files: - if file.endswith("_trajectory.pickle"): - base_name = os.path.splitext(file)[0] # Remove the file extension - variable_name = base_name.replace("_trajectory", "") # Extract the number 20230405_010203 - output_folder_name = base_name.replace("_trajectory", NAME_SUFX_GENSIM) # _GenSim folder whre all generated simulations are stored - - # check if MetSim_phys_file_path exist - if os.path.isfile(os.path.join(root, variable_name + MetSim_extention)): - # print did not find with th given extention revert to default - MetSim_phys_file_path = os.path.join(root, variable_name + MetSim_extention) - elif os.path.isfile(os.path.join(root, variable_name + '_sim_fit_latest.json')): - print(base_name,': No MetSim file with the given extention', MetSim_extention,'reverting to default extention _sim_fit_latest.json') - MetSim_phys_file_path = os.path.join(root, variable_name + '_sim_fit_latest.json') - else: - # do not save the rest of the files - print(base_name,': No MetSim file with the given extention', MetSim_extention,'do not consider the folder') - continue - - input_folders.append(root) - trajectory_files.append(os.path.join(root, file)) - file_names.append(variable_name) - output_folders.append(os.path.join(root, output_folder_name)) - trajectory_Metsim_file.append(MetSim_phys_file_path) + def _process_single_file(self, filepath): + """ + Processes a single file, extracts relevant information, and determines the output folder and MetSim file path. + """ + trajectory_files = [filepath] + file_name = os.path.splitext(os.path.basename(filepath))[0] + input_folder = os.path.dirname(filepath) + output_folder = os.path.splitext(filepath)[0] + NAME_SUFX_GENSIM + + # Get the MetSim file path, or create it if the input is a JSON file + if filepath.endswith('.json'): + with open(filepath) as json_file: + const_part = json.load(json_file)['const'] + metsim_path = os.path.join(input_folder, f'{file_name}{self.metsim_json}') + with open(metsim_path, 'w') as outfile: + json.dump(const_part, outfile, indent=4) + return [[trajectory_files[0], file_name, input_folder, output_folder, metsim_path]] + else: + metsim_path = self._get_metsim_file(input_folder, file_name) + return [[trajectory_files[0], file_name, input_folder, output_folder, metsim_path]] + + def _find_trajectory_files(self, directory): + """ + Walks through the directory to find and process trajectory files. 
+ """ + trajectory_files, file_names, input_folders, output_folders, metsim_files = [], [], [], [], [] + + for root, _, files in os.walk(directory): + # Skip folders with the name NAME_SUFX_GENSIM + if NAME_SUFX_GENSIM in root: + continue - input_list = [[trajectory_files[ii], file_names[ii], input_folders[ii], output_folders[ii], trajectory_Metsim_file[ii]] for ii in range(len(trajectory_files))] + csv_found = False + # Look for CSV files first + for file in files: + if file.endswith(NAME_SUFX_CSV_OBS): + csv_found = True + self._process_csv_file(root, file, trajectory_files, file_names, input_folders, output_folders, metsim_files) + break - return input_list + # If no CSV file is found, look for pickle files + if not csv_found: + for file in files: + if file.endswith('_trajectory.pickle'): + self._process_pickle_file(root, file, trajectory_files, file_names, input_folders, output_folders, metsim_files) + + return [[trajectory_files[i], file_names[i], input_folders[i], output_folders[i], metsim_files[i]] for i in range(len(trajectory_files))] + + def _process_csv_file(self, root, file, trajectory_files, file_names, input_folders, output_folders, metsim_files): + """ + Processes a CSV file to extract relevant information and determine the output folder and MetSim file path. + """ + real_data = pd.read_csv(os.path.join(root, file)) + solution_id = real_data['solution_id'][0] + if root not in solution_id: + print('The solution_id in the CSV file does not match the folder name:', root) + return + + _, file_from_csv = os.path.split(solution_id) + base_name = os.path.splitext(file_from_csv)[0] + variable_name, output_folder_name = self._get_variable_and_output(base_name) + + # Get the MetSim file path, or create it if the input is a JSON file + metsim_path = self._get_metsim_file(root, variable_name) + if file_from_csv.endswith('.json'): + with open(os.path.join(root, file_from_csv)) as json_file: + const_part = json.load(json_file)['const'] + metsim_path = os.path.join(root, output_folder_name, f'{variable_name}_sim_fit.json') + os.makedirs(os.path.join(root, output_folder_name), exist_ok=True) + with open(metsim_path, 'w') as outfile: + json.dump(const_part, outfile, indent=4) + self._add_file_details(root, file, variable_name, output_folder_name, metsim_path, trajectory_files, file_names, input_folders, output_folders, metsim_files) + + def _process_pickle_file(self, root, file, trajectory_files, file_names, input_folders, output_folders, metsim_files): + """ + Processes a pickle file to extract relevant information and determine the output folder and MetSim file path. + """ + base_name = os.path.splitext(file)[0] + variable_name, output_folder_name = self._get_variable_and_output(base_name) + metsim_path = self._get_metsim_file(root, variable_name) + self._add_file_details(root, file, variable_name, output_folder_name, metsim_path, trajectory_files, file_names, input_folders, output_folders, metsim_files) + + def _get_variable_and_output(self, base_name): + """ + Determines the variable name and output folder name based on the base file name. + """ + if base_name.endswith('_trajectory'): + variable_name = base_name.replace('_trajectory', '') + output_folder_name = f'{variable_name}{NAME_SUFX_GENSIM}' + else: + variable_name = base_name + output_folder_name = f'{base_name}{NAME_SUFX_GENSIM}' + return variable_name, output_folder_name + + def _get_metsim_file(self, folder, variable_name): + """ + Gets the path to the MetSim file, falling back to a default if necessary. 
+        """
+        metsim_path = os.path.join(folder, f'{variable_name}{self.metsim_json}')
+        if os.path.isfile(metsim_path):
+            return metsim_path
+        default_path = os.path.join(folder, f'{variable_name}_sim_fit_latest.json')
+        if os.path.isfile(default_path):
+            print(f'{variable_name}: No MetSim file with the given extension {self.metsim_json}, reverting to default extension _sim_fit_latest.json')
+            return default_path
+        print(f'{variable_name}: No MetSim file found, creating a first guess.')
+        const_nominal = Constants()
+        const_dict = const_nominal.to_dict()
+        first_guess = os.path.join(folder, f'{variable_name}_first_guess.json')
+        with open(first_guess, 'w') as outfile:
+            json.dump(const_dict, outfile, indent=4)
+        return first_guess
+
+    def _add_file_details(self, root, file, variable_name, output_folder_name, metsim_path, trajectory_files, file_names, input_folders, output_folders, metsim_files):
+        """
+        Adds the file details to the respective lists if the MetSim file path is valid.
+        """
+        if metsim_path:
+            trajectory_files.append(os.path.join(root, file))
+            file_names.append(variable_name)
+            input_folders.append(root)
+            output_folders.append(os.path.join(root, output_folder_name))
+            metsim_files.append(metsim_path)
 
 
 def update_sigma_values(file_path, mag_sigma, len_sigma, More_complex_fit=False, Custom_refinement=False):
@@ -1906,170 +3378,128 @@ def update_sigma_values(file_path, mag_sigma, len_sigma, More_complex_fit=False,
     print('modified options file:', file_path)
 
 
+def CI_range_gen_sim(pd_results, ii_repeat, old_results_number,pd_dataframe_MAX_min_ranges=pd.DataFrame()):
+    
+    result_number = len(pd_results)
+
+    # check if only one result is available
+    if result_number == 1:
+        print('Only one result found')
+        # create a dictionary with the physical parameters
+        CI_physical_param = {
+            'vel_180km': [pd_results['vel_180km'].values[0], pd_results['vel_180km'].values[0]],
+            'zenith_angle': [pd_results['zenith_angle'].values[0], pd_results['zenith_angle'].values[0]],
+            'mass': [pd_results['mass'].values[0], pd_results['mass'].values[0]],
+            'rho': [pd_results['rho'].values[0], pd_results['rho'].values[0]],
+            'sigma': [pd_results['sigma'].values[0], pd_results['sigma'].values[0]],
+            'erosion_height_start': [pd_results['erosion_height_start'].values[0], pd_results['erosion_height_start'].values[0]],
+            'erosion_coeff': [pd_results['erosion_coeff'].values[0], pd_results['erosion_coeff'].values[0]],
+            'erosion_mass_index': [pd_results['erosion_mass_index'].values[0], pd_results['erosion_mass_index'].values[0]],
+            'erosion_mass_min': [pd_results['erosion_mass_min'].values[0], pd_results['erosion_mass_min'].values[0]],
+            'erosion_mass_max': [pd_results['erosion_mass_max'].values[0], pd_results['erosion_mass_max'].values[0]]
+        }
 
-
-########## Distance ##########################
-
 
+    else:
+        print('Number of results found:',len(pd_results))
+        columns_physpar = ['vel_180km','zenith_angle','mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff',
+                'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max']
+        
+        if ii_repeat > 1 and old_results_number == result_number:
+            ###############################################################################
+            # focus on the results that already fit well
+            
+            # Calculate the quantiles
+            quantiles = pd_results[columns_physpar].quantile([0.2, 0.8])
 
-# Function to find the knee of the distance plot
-def find_knee_dist_index(data_meteor_pd, window_of_smothing_avg=3, std_multip_threshold=1, output_path='', around_meteor='', N_sim_sel_force=0):
-    
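As an aside on the widening rule used by CI_range_gen_sim: when the latest iteration still turns up new results, the next batch of simulations samples from an interval stretched past the observed extremes by the distance between each extreme and the nearby decile. A minimal pandas sketch of that stretch (toy values; the real code loops over every physical parameter):

import pandas as pd

# Toy illustration of the extended search range (hypothetical rho values)
results = pd.DataFrame({'rho': [150.0, 300.0, 420.0, 510.0]})
q10, q90 = results['rho'].quantile([0.1, 0.9])
lo, hi = results['rho'].min(), results['rho'].max()
ext_lo = lo - (q10 - lo)        # stretch below the observed minimum
if ext_lo < 0:
    ext_lo = lo                 # clamp to the plain minimum, as done above
ext_hi = hi + (hi - q90)        # stretch above the observed maximum
print([ext_lo, ext_hi])         # [105.0, 537.0]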
dist_for_meteor=np.array(data_meteor_pd['distance_meteor']) - #make subtraction of the next element and the previous element of data_for_meteor["distance_meteor"] - # diff_distance_meteor = np.diff(dist_for_meteor[:int(len(dist_for_meteor)/10)]) - diff_distance_meteor = np.diff(dist_for_meteor) - # histogram plot of the difference with the count on the x axis and diff_distance_meteor on the y axis - indices = np.arange(len(diff_distance_meteor)) - # create the cumulative sum of the diff_distance_meteor - cumsum_diff_distance_meteor = np.cumsum(diff_distance_meteor) - # normalize the diff_distance_meteor xnormalized = (x - xminimum) / range of x - diff_distance_meteor_normalized = (diff_distance_meteor - np.min(diff_distance_meteor)) / (np.max(diff_distance_meteor) - np.min(diff_distance_meteor)) + # Convert the quantiles to a dictionary + CI_physical_param = {col: quantiles[col].tolist() for col in columns_physpar} - def moving_average_smoothing(data, window_size): - smoothed_data = np.convolve(data, np.ones(window_size)/window_size, mode='same') - return smoothed_data + ############################################################################### + else: + # try and look for other results that might be around - # apply the smoothing finction - smoothed_diff_distance_meteor = moving_average_smoothing(diff_distance_meteor_normalized, window_of_smothing_avg) - - # fid the first value of the smoothed_diff_distance_meteor that is smaller than the std of the smoothed_diff_distance_meteor - index10percent = np.where(smoothed_diff_distance_meteor < np.std(smoothed_diff_distance_meteor)*std_multip_threshold)[0][0]-2 - - if N_sim_sel_force!=0: - index10percent = N_sim_sel_force - - if index10percent<0: # below does not work problem with finding the mode on KDE later on - index10percent=0 - - if output_path!='': - - # Define a custom palette - custom_palette_orange = { - 'Real': "darkorange", - 'Simulation': "darkorange", - 'Simulation_sel': "darkorange", - 'MetSim': "darkorange", - 'Realization': "darkorange", - 'Observation': "darkorange" - } + # Calculate the quantiles + quantiles = pd_results[columns_physpar].quantile([0.1, 0.9]) - # dimension of the plot 15,5 - plt.figure(figsize=(15,5)) + # Get the minimum and maximum values + min_val = pd_results[columns_physpar].min() + max_val = pd_results[columns_physpar].max() - plt.subplot(1,2,2) - sns.histplot(data_meteor_pd, x="distance_meteor", hue="type", kde=True, cumulative=True, bins=len(dist_for_meteor), palette=custom_palette_orange) # , stat='density' to have probability - plt.xlabel('Distance in PCA space') - plt.ylabel('Number of events') - plt.title('Cumulative distance in PCA space') - plt.axvline(x=(dist_for_meteor[index10percent]), color="darkorange", linestyle='--', label='Knee distance') + # Calculate the extended range using the logic provided + extended_min = min_val - (quantiles.loc[0.1] - min_val) + # consider the value extended_min<0 Check each column in extended_min and set to min_val if negative + for col in columns_physpar: + if extended_min[col] < 0: + extended_min[col] = min_val[col] + extended_max = max_val + (max_val - quantiles.loc[0.9]) - if len(dist_for_meteor)>100: - plt.ylim(0,100) - elif len(dist_for_meteor)>50: - plt.ylim(0,50) + # Convert the extended range to a dictionary + CI_physical_param = {col: [extended_min[col], extended_max[col]] for col in columns_physpar} + + ############################################################################### + + + # check if v_init_180km are the same value + if 
CI_physical_param['vel_180km'][0] == CI_physical_param['vel_180km'][1]: + CI_physical_param['vel_180km'] = [CI_physical_param['vel_180km'][0] - CI_physical_param['vel_180km'][0]/1000, CI_physical_param['vel_180km'][1] + CI_physical_param['vel_180km'][1]/1000] + if CI_physical_param['zenith_angle'][0] == CI_physical_param['zenith_angle'][1]: + CI_physical_param['zenith_angle'] = [CI_physical_param['zenith_angle'][0] - CI_physical_param['zenith_angle'][0]/10000, CI_physical_param['zenith_angle'][1] + CI_physical_param['zenith_angle'][1]/10000] + if CI_physical_param['mass'][0] == CI_physical_param['mass'][1]: + CI_physical_param['mass'] = [CI_physical_param['mass'][0] - CI_physical_param['mass'][0]/10, CI_physical_param['mass'][1] + CI_physical_param['mass'][1]/10] + if np.round(CI_physical_param['rho'][0]/100) == np.round(CI_physical_param['rho'][1]/100): + if CI_physical_param['rho'][0] - 100<0: + CI_physical_param['rho'] = [CI_physical_param['rho'][0]/10, CI_physical_param['rho'][1] + 100] + else: + CI_physical_param['rho'] = [CI_physical_param['rho'][0] - 100, CI_physical_param['rho'][1] + 100] # - CI_physical_param['rho'][0]/5 + if CI_physical_param['sigma'][0] == CI_physical_param['sigma'][1]: + CI_physical_param['sigma'] = [CI_physical_param['sigma'][0] - CI_physical_param['sigma'][0]/10, CI_physical_param['sigma'][1] + CI_physical_param['sigma'][1]/10] + if CI_physical_param['erosion_height_start'][0] == CI_physical_param['erosion_height_start'][1]: + CI_physical_param['erosion_height_start'] = [CI_physical_param['erosion_height_start'][0] - CI_physical_param['erosion_height_start'][0]/100, CI_physical_param['erosion_height_start'][1] + CI_physical_param['erosion_height_start'][1]/100] + if CI_physical_param['erosion_coeff'][0] == CI_physical_param['erosion_coeff'][1]: + CI_physical_param['erosion_coeff'] = [CI_physical_param['erosion_coeff'][0] - CI_physical_param['erosion_coeff'][0]/10, CI_physical_param['erosion_coeff'][1] + CI_physical_param['erosion_coeff'][1]/10] + if CI_physical_param['erosion_mass_index'][0] == CI_physical_param['erosion_mass_index'][1]: + CI_physical_param['erosion_mass_index'] = [CI_physical_param['erosion_mass_index'][0] - CI_physical_param['erosion_mass_index'][0]/10, CI_physical_param['erosion_mass_index'][1] + CI_physical_param['erosion_mass_index'][1]/10] + if CI_physical_param['erosion_mass_min'][0] == CI_physical_param['erosion_mass_min'][1]: + CI_physical_param['erosion_mass_min'] = [CI_physical_param['erosion_mass_min'][0] - CI_physical_param['erosion_mass_min'][0]/10, CI_physical_param['erosion_mass_min'][1] + CI_physical_param['erosion_mass_min'][1]/10] + if CI_physical_param['erosion_mass_max'][0] == CI_physical_param['erosion_mass_max'][1]: + CI_physical_param['erosion_mass_max'] = [CI_physical_param['erosion_mass_max'][0] - CI_physical_param['erosion_mass_max'][0]/10, CI_physical_param['erosion_mass_max'][1] + CI_physical_param['erosion_mass_max'][1]/10] - plt.legend() - # delete the legend - plt.legend().remove() + # Multiply the 'erosion_height_start' values by 1000 + CI_physical_param['erosion_height_start'] = [x * 1000 for x in CI_physical_param['erosion_height_start']] - plt.subplot(1,2,1) - # sns.histplot(diff_distance_meteor_normalized, kde=True, bins=len(distance_meteor_sel_save)) - #make the bar plot 0.5 transparency - - plt.bar(indices, diff_distance_meteor_normalized,color="darkorange", alpha=0.5, edgecolor='black') - plt.xlabel('Number of events') - plt.ylabel('Normalized difference') - plt.title('Distance difference Normalized') - 
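A note on the degenerate-interval guards above: when every selected simulation shares the same value for a parameter, the confidence interval collapses to zero width and the next round of generated simulations could not explore at all, so each bound is padded by a small relative fraction (the patch uses parameter-specific fractions, e.g. 1/10 for masses, 1/1000 for velocity, 1/10000 for zenith angle). A toy sketch of the same idea, with a hypothetical helper that is not part of the patch:

def pad_if_collapsed(lo, hi, rel=0.1):
    # widen a collapsed [x, x] interval by a relative fraction, as above
    if lo == hi:
        return lo - abs(lo) * rel, hi + abs(hi) * rel
    return lo, hi

print(pad_if_collapsed(2.1, 2.1))   # approximately (1.89, 2.31)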
# put a horizontal line at len(curr_sel['distance_meteor']) - plt.axvline(x=index10percent, color="darkorange", linestyle='--') - if len(dist_for_meteor)>100: - plt.xlim(-1,100) - elif len(dist_for_meteor)>50: - plt.xlim(-1,50) - - # find the mean of the first 100 elements of diff_distance_meteor_normalized and put a horizontal line - # plt.axhline(y=np.std(smoothed_diff_distance_meteor), color="darkorange", linestyle='--') - - # set a sup title - plt.suptitle(around_meteor) - - # give more space - plt.tight_layout() - # plt.show() - - # save the figure maximized and with the right name - plt.savefig(output_path+os.sep+around_meteor+os.sep+around_meteor+'_knee'+str(index10percent+1)+'ev_MAXdist'+str(np.round(dist_for_meteor[index10percent],2))+'.png', dpi=300) - - # close the figure - plt.close() + # # check if pd_dataframe_ranges is not empty + # if pd_dataframe_MAX_min_ranges.empty == False: + # # make sure that all the values are within the pd_dataframe_MAX_min_ranges + # for key in CI_physical_param: + # if CI_physical_param[key][0] < pd_dataframe_MAX_min_ranges[key].min(): + # CI_physical_param[key][0] = pd_dataframe_MAX_min_ranges[key].min() + # print(key,'min is below the allowd value',CI_physical_param[key][0]) + # if CI_physical_param[key][1] > pd_dataframe_MAX_min_ranges[key].max(): + # CI_physical_param[key][1] = pd_dataframe_MAX_min_ranges[key].max() + # print(key,'max is above the allowd value',CI_physical_param[key][1]) - return index10percent - -# function to use the mahaloby distance and from the mean of the selected shower -def dist_PCA_space_select_sim(df_sim_PCA, shower_current_PCA_single, cov_inv, meanPCA_current, df_sim_shower, shower_current_single, N_sim_sel_force=0, output_dir=''): - N_sim_sel_all=100 - print('calculate distance for',shower_current_single['solution_id']) - - df_sim_PCA_for_now = df_sim_PCA.drop(['type'], axis=1).values - - distance_current = [] - for i_sim in range(len(df_sim_PCA_for_now)): - distance_current.append(mahalanobis_distance(df_sim_PCA_for_now[i_sim], shower_current_PCA_single, cov_inv)) - - # create an array with lenght equal to the number of simulations and set it to shower_current_PCA['solution_id'][i_shower] - solution_id_dist = [shower_current_single['solution_id']] * len(df_sim_PCA_for_now) - df_sim_shower['solution_id_dist'] = solution_id_dist - df_sim_shower['distance_meteor'] = distance_current - # sort the distance and select the n_selected closest to the meteor - df_sim_shower_dis = df_sim_shower.sort_values(by=['distance_meteor']).reset_index(drop=True) - df_sim_selected = df_sim_shower_dis[:N_sim_sel_all].drop(['type'], axis=1) - df_sim_selected['type'] = 'Simulation_sel' - - # create a dataframe with the selected simulated shower characteristics - df_sim_PCA_dist = df_sim_PCA - df_sim_PCA_dist['distance_meteor'] = distance_current - df_sim_PCA_dist = df_sim_PCA_dist.sort_values(by=['distance_meteor']).reset_index(drop=True) - # delete the shower code - df_sim_selected_PCA = df_sim_PCA_dist[:N_sim_sel_all].drop(['type','distance_meteor'], axis=1) - - # make df_sim_selected_PCA an array - df_sim_selected_PCA = df_sim_selected_PCA.values - distance_current_mean = [] - for i_shower in range(len(df_sim_selected)): - distance_current_mean.append(scipy.spatial.distance.euclidean(meanPCA_current, df_sim_selected_PCA[i_shower])) - df_sim_selected['distance_mean']=distance_current_mean # from the mean of the selected shower - - df_curr_sel_curv = df_sim_selected.copy() - - around_meteor=shower_current_single['solution_id'] - # check 
if around_meteor is a file in a folder - if os.path.exists(around_meteor): - # split in file and directory - _, around_meteor = os.path.split(around_meteor) - around_meteor = around_meteor[:15] - - mkdirP(output_dir+os.sep+around_meteor) - window_of_smothing_avg=3 - std_multip_threshold=1 - if N_sim_sel_force!=0: - print(around_meteor,'select the best',N_sim_sel_force,'simulations') - dist_to_cut=find_knee_dist_index(df_curr_sel_curv,window_of_smothing_avg,std_multip_threshold, output_dir, around_meteor, N_sim_sel_force) - # change of curvature print - df_curr_sel_curv=df_curr_sel_curv.iloc[:dist_to_cut] - else: - dist_to_cut=find_knee_dist_index(df_curr_sel_curv,window_of_smothing_avg,std_multip_threshold, output_dir, around_meteor) - print(around_meteor,'index of the knee distance',dist_to_cut+1) - # change of curvature print - df_curr_sel_curv=df_curr_sel_curv.iloc[:dist_to_cut+1] + return CI_physical_param - return df_sim_selected, df_curr_sel_curv +def get_json_files(results_event_dir): + json_files = [] + for file_name in os.listdir(results_event_dir): + if file_name.endswith('.json'): + json_files.append(results_event_dir+os.sep+file_name) + return json_files -#### Matrix function ############################################################################ -# Function to perform Varimax rotation -def varimax(Phi, gamma=1.0, q=20, tol=1e-6): + + + + +#### Matrix function ############################################################################ + +# Function to perform Varimax rotation +def varimax(Phi, gamma=1.0, q=20, tol=1e-6): p, k = Phi.shape R = np.eye(k) d = 0 @@ -2090,17 +3520,18 @@ def mahalanobis_distance(x, mean, cov_inv): + # PCA #################################################################################### -def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel=0, variable_PCA=[], No_var_PCA=['kurtosis','skew','a1_acc_jac','a2_acc_jac','a_acc','b_acc','c_acc','c_mag_init','c_mag_end','a_t0', 'b_t0', 'c_t0'], file_name_obs='', cores_parallel=None, PCA_pairplot=False, esclude_real_solution_from_selection=False): +def PCASim(df_sim_shower, df_obs_shower, save_results_folder_PCA, PCA_percent=99, variable_PCA=[], No_var_PCA=['chi2_red_mag', 'chi2_red_len', 'rmsd_mag', 'rmsd_len', 'vel_180km','a1_acc_jac','a2_acc_jac','a_acc','b_acc','c_acc','c_mag_init','c_mag_end','a_t0', 'b_t0', 'c_t0'], file_name_obs=''): ''' This function generate the simulated shower from the erosion model and apply PCA. The function read the json file in the folder and create a csv file with the simulated shower and take the data from GenerateSimulation.py folder. The function return the dataframe of the selected simulated shower. 
- 'solution_id','type','vel_init_norot','vel_avg_norot','duration', + 'solution_id','type','vel_1st_frame','vel_avg','duration', 'mass','peak_mag_height','begin_height','end_height','t0','peak_abs_mag','beg_abs_mag','end_abs_mag', - 'F','trail_len','deceleration_lin','deceleration_parab','decel_jacchia','decel_t0','zenith_angle', 'kurtosis','skew', + 'F','trail_len','deceleration_lin','deceleration_parab','decel_jacchia','decel_t0','zenith_angle', 'kc','Dynamic_pressure_peak_abs_mag', 'a_acc','b_acc','c_acc','a1_acc_jac','a2_acc_jac','a_mag_init','b_mag_init','c_mag_init','a_mag_end','b_mag_end','c_mag_end', 'rho','sigma','erosion_height_start','erosion_coeff', 'erosion_mass_index', @@ -2109,104 +3540,18 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel ''' - # if variable_PCA is not empty - if variable_PCA != []: - # add to variable_PCA array 'type','solution_id' - variable_PCA = ['solution_id','type'] + variable_PCA - if No_var_PCA != []: - # remove from variable_PCA the variables in No_var_PCA - for var in No_var_PCA: - variable_PCA.remove(var) - - else: - # put in variable_PCA all the variables except mass - variable_PCA = list(df_obs_shower.columns) - # check if mass is in the variable_PCA - if 'mass' in variable_PCA: - # remove mass from variable_PCA - variable_PCA.remove('mass') - # if No_var_PCA is not empty - if No_var_PCA != []: - # remove from variable_PCA the variables in No_var_PCA - for var in No_var_PCA: - # check if the variable is in the variable_PCA - if var in variable_PCA: - variable_PCA.remove(var) - - scaled_sim=df_sim_shower[variable_PCA].copy() - scaled_sim=scaled_sim.drop(['type','solution_id'], axis=1) - - print(len(scaled_sim.columns),'Variables for PCA:\n',scaled_sim.columns) - - # Standardize each column separately - scaler = StandardScaler() - df_sim_var_sel_standardized = scaler.fit_transform(scaled_sim) - df_sim_var_sel_standardized = pd.DataFrame(df_sim_var_sel_standardized, columns=scaled_sim.columns) - - # Identify outliers using Z-score method on standardized data - z_scores = np.abs(zscore(df_sim_var_sel_standardized)) - threshold = 3 - outliers = (z_scores > threshold).any(axis=1) - - # outlier number 0 has alway to be the False - if outliers[0]==True: - print('The MetSim reduction is an outlier, still keep it for the PCA analysis') - outliers[0]=False - - # Assign df_sim_shower to the version without outliers - df_sim_shower = df_sim_shower[~outliers].copy() - - # if PCA_pairplot: - - # scale the data so to be easily plot against each other with the same scale - df_sim_var_sel = df_sim_shower[variable_PCA].copy() - df_sim_var_sel = df_sim_var_sel.drop(['type','solution_id'], axis=1) - - if len(df_sim_var_sel)>10000: - # pick randomly 10000 events - print('Number of events in the simulated :',len(df_sim_var_sel)) - df_sim_var_sel=df_sim_var_sel.sample(n=10000) - - # make a subplot of the distribution of the variables - fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15)) - # flat it - axs = axs.flatten() - for i, var in enumerate(variable_PCA[2:]): - # plot the distribution of the variable - sns.histplot(df_sim_var_sel[var], kde=True, ax=axs[i], color='b', alpha=0.5, bins=20) - # axs[i//4, i%4].set_title('Distribution of '+var) - # put a vertical line for the df_obs_shower[var] value - axs[i].axvline(df_obs_shower[var].values[0], color='limegreen', linestyle='--', linewidth=5) - # x axis - axs[i].set_xlabel(var) - # # grid - # axs[i//5, i%5].grid() - if i != 0 and i != 5 and i != 10 and i 
!= 15 and i != 20: - # delete the y axis - axs[i].set_ylabel('') - - # delete the plot that are not used - for i in range(len(variable_PCA[2:]), len(axs)): - fig.delaxes(axs[i]) - - # space between the subplots - plt.tight_layout() - - # save the figure - plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'_var_hist_real.png') - # close the figure - plt.close() - + df_sim_shower, variable_PCA, outliers = process_PCA_variables(variable_PCA, No_var_PCA, df_obs_shower, df_sim_shower, save_results_folder_PCA, file_name_obs, False) + variable_PCA_initial = variable_PCA.copy() ##################################### delete var that are not in the 5 and 95 percentile of the simulated shower ##################################### # check if a file with the name "log"+n_PC_in_PCA+"_"+str(len(df_sel))+"ev.txt" already exist - if os.path.exists(OUT_PUT_PATH+os.sep+"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt"): + if os.path.exists(save_results_folder_PCA+os.sep+"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt"): # remove the file - os.remove(OUT_PUT_PATH+os.sep+"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt") - sys.stdout = Logger(OUT_PUT_PATH,"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt") # _30var_99%_13PC + os.remove(save_results_folder_PCA+os.sep+"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt") + sys.stdout = Logger(save_results_folder_PCA,"log_"+file_name_obs[:15]+"_"+str(len(variable_PCA)-2)+"var_"+str(PCA_percent)+"%.txt") # _30var_99perc_13PC df_all = pd.concat([df_sim_shower[variable_PCA],df_obs_shower[variable_PCA]], axis=0, ignore_index=True) # delete nan @@ -2215,97 +3560,6 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel # create a copy of df_sim_shower for the resampling df_sim_shower_resample=df_sim_shower.copy() # df_obs_shower_resample=df_obs_shower.copy() - No_var_PCA_perc=[] - # check that all the df_obs_shower for variable_PCA is within th 5 and 95 percentie of df_sim_shower of variable_PCA - for var in variable_PCA: - if var != 'type' and var != 'solution_id': - # check if the variable is in the df_obs_shower - if var in df_obs_shower.columns: - # check if the variable is in the df_sim_shower - if var in df_sim_shower.columns: - - ii_all=0 - for i_var in range(len(df_obs_shower[var])): - # check if all the values are outside the 5 and 95 percentile of the df_sim_shower if so delete the variable from the variable_PCA - if df_obs_shower[var][i_var] < np.percentile(df_sim_shower[var], 1) or df_obs_shower[var][i_var] > np.percentile(df_sim_shower[var], 99): - ii_all=+ii_all - - print(var) - - if ii_all==len(df_obs_shower[var]): - print('The observed and all realization',var,'are not within the 1 and 99 percentile of the simulated meteors') - # delete the variable from the variable_PCA - variable_PCA.remove(var) - # save the var deleted in a variable - No_var_PCA_perc.append(var) - - df_all = df_all.drop(var, axis=1) - else: - shapiro_test = stats.shapiro(df_all[var]) - print("Initial Shapiro-Wilk Test:", shapiro_test.statistic,"p-val", shapiro_test.pvalue) - - if var=='zenith_angle': - # # do the cosine of the zenith angle - # df_all[var]=np.cos(np.radians(df_all[var])) - # # df_all[var]=transform_to_gaussian(df_all[var]) - # df_sim_shower_resample[var]=np.cos(np.radians(df_sim_shower_resample[var])) - print('Variable ',var,' is not transformed') - - elif 
var=='vel_init_norot': - # do the cosine of the zenith angle - # df_all[var]=transform_to_gaussian(df_all[var]) - print('Variable ',var,' is not transformed') - - else: - - pt = PowerTransformer(method='yeo-johnson') - df_all[var]=pt.fit_transform(df_all[[var]]) - df_sim_shower_resample[var]=pt.fit_transform(df_sim_shower_resample[[var]]) - - shapiro_test = stats.shapiro(df_all[var]) - print("NEW Shapiro-Wilk Test:", shapiro_test.statistic,"p-val", shapiro_test.pvalue) - - else: - print('Variable ',var,' is not in the simulated shower') - else: - print('Variable ',var,' is not in the observed shower') - - - - # if PCA_pairplot: - df_all_nameless_plot=df_all.copy() - - if len(df_all_nameless_plot)>10000: - # pick randomly 10000 events - print('Number of events in the simulated:',len(df_all_nameless_plot)) - df_all_nameless_plot=df_all_nameless_plot.sample(n=10000) - - # make a subplot of the rho againist each variable_PCA as a scatter plot - fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15)) - # flat it - axs = axs.flatten() - for i, var in enumerate(variable_PCA[2:]): - # plot the distribution of the variable - sns.histplot(df_all_nameless_plot[var].values[:len(df_sim_shower[variable_PCA])], kde=True, ax=axs[i], color='b', alpha=0.5, bins=20) - # axs[i//4, i%4].set_title('Distribution of '+var) - # put a vertical line for the df_obs_shower[var] value - # print(df_all_nameless_plot['solution_id'].values[len(df_sim_shower[variable_PCA])]) - axs[i].axvline(df_all_nameless_plot[var].values[len(df_sim_shower[variable_PCA])], color='limegreen', linestyle='--', linewidth=5) - # x axis - axs[i].set_xlabel(var) - # # grid - # axs[i//5, i%5].grid() - if i != 0 and i != 5 and i != 10 and i != 15 and i != 20: - # delete the y axis - axs[i].set_ylabel('') - - # space between the subplots - plt.tight_layout() - - # save the figure - plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'_var_hist_yeo-johnson.png') - # close the figure - plt.close() #################################################################################################################### @@ -2321,7 +3575,6 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel # performing preprocessing part so to make it readeble for PCA scaled_df_all = StandardScaler().fit_transform(scaled_df_all) - ################################# # Applying PCA function on the data for the number of components pca = PCA(PCA_percent/100) #PCA_percent @@ -2382,8 +3635,8 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel pcr.fit(X_train, y_train) # Predict using the models y_pred_pcr = pcr.predict(df_sim_shower_resample[variable_PCA_no_info]) - # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [s^2/km^2]','erosion height start [km]','erosion coeff [s^2/km^2]','erosion mass index [-]','eros. mass min [kg]','eros. mass max [kg]'] - to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [s$^2$/km$^2$]', r'$h_{e}$ [km]', r'$\eta$ [s$^2$/km$^2$]', r'$s$ [-]', r'$m_{l}$ [kg]', r'$m_{u}$ [kg]'] #,r'log($m_{u}$)-log($m_{l}$) [-]'] + # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [kg/MJ]','erosion height start [km]','erosion coeff [kg/MJ]','erosion mass index','eros. mass min [kg]','eros. 
mass max [kg]'] + to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [kg/MJ]', r'$h_{e}$ [km]', r'$\eta$ [kg/MJ]', r'$s$', r'$m_{l}$ [kg]', r'$m_{u}$ [kg]'] #,r'log($m_{u}$)-log($m_{l}$)'] # multiply y_pred_pcr that has the 'erosion_coeff'*1000000 and 'sigma'*1000000 y_pred_pcr[:,4]=y_pred_pcr[:,4]*1000000 y_pred_pcr[:,2]=y_pred_pcr[:,2]*1000000 @@ -2407,7 +3660,7 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel pcr_results_physical_param = y_pred_pcr.copy() print('--------------------------') - ############### PCR ######################################################################################## + ############### PC plots ######################################################################################## # # select only the column with in columns_PC with the same number of n_components @@ -2430,18 +3683,13 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel plt.ylabel('Percentance of Variance Explained') plt.xlabel('Principal Component') # save the figure - plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'PCAexplained_variance_ratio_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png') + plt.savefig(save_results_folder_PCA+os.sep+file_name_obs+'PCAexplained_variance_ratio_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'perc_'+str(pca.n_components_)+'PC.png') # close the figure plt.close() # plt.show() ### plot covariance matrix - # make the image big as the screen - # plt.figure(figsize=(20, 20)) - - # Compute the correlation coefficients - # cov_data = pca.components_.T # varimax rotation cov_data = rotated_loadings @@ -2451,33 +3699,33 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel # Mapping of original variable names to LaTeX-style labels variable_map = { - 'vel_init_norot': r"$v_i$", - 'vel_avg_norot': r"$v_{avg}$", - 'duration': r"$t$", - 'peak_mag_height': r"$h_{p}$", + 'vel_1st_frame': r"$v_0$", + 'vel_avg': r"$v_{avg}$", + 'vel_180km': r"$v_{180km}$", + 'duration': r"$T$", + 'peak_mag_height': r"$h_{peak}$", 'begin_height': r"$h_{beg}$", 'end_height': r"$h_{end}$", - 'peak_abs_mag': r"$M_{p}$", + 'peak_abs_mag': r"$M_{peak}$", 'beg_abs_mag': r"$M_{beg}$", 'end_abs_mag': r"$M_{end}$", 'F': r"$F$", 'trail_len': r"$L$", 't0': r"$t_0$", - 'deceleration_lin': r"$dAcc_{lin}$", - 'deceleration_parab': r"$dAcc_{par}$", - 'decel_parab_t0': r"$dAcc_{p_{t_0}}$", - 'decel_t0': r"$dAcc_{p1_{t_0}}$", - 'decel_jacchia': r"$dAcc_{jac}$", - 'zenith_angle': r"$\zeta$", - 'avg_lag': r"$lag_{avg}$", + 'deceleration_lin': r"$\bar{a}$", + 'deceleration_parab': r"$a_{quad}(1~s)$", + 'decel_parab_t0': r"$\bar{a}_{poly}(1~s)$", + 'decel_t0': r"$\bar{a}_{poly}$", + 'decel_jacchia': r"$a_0 k$", + 'zenith_angle': r"$z_c$", + 'avg_lag': r"$\bar{\ell}$", 'kc': r"$k_c$", - 'Dynamic_pressure_peak_abs_mag': r"$P_p$", - 'a_mag_init': r"$Mfit_{a_{int}}$", - 'b_mag_init': r"$Mfit_{b_{int}}$", - 'a_mag_end': r"$Mfit_{a_{fin}}$", - 'b_mag_end': r"$Mfit_{b_{fin}}$" + 'Dynamic_pressure_peak_abs_mag': r"$Q_{peak}$", + 'a_mag_init': r"$d_1$", + 'b_mag_init': r"$s_1$", + 'a_mag_end': r"$d_2$", + 'b_mag_end': r"$s_2$" } - # Convert the given array to LaTeX-style labels latex_labels = [variable_map.get(var, var) for var in variable_PCA] @@ -2496,19 +3744,152 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel for j in range(cov_data.shape[1]): plt.text(i, j, "{:.1f}".format(cov_data[i, j]), size=5, color='black', ha="center", 
va="center") # save the figure - plt.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'PCAcovariance_matrix_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png') + plt.savefig(save_results_folder_PCA+os.sep+file_name_obs+'PCAcovariance_matrix_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'perc_'+str(pca.n_components_)+'PC.png') # close the figure plt.close() # plt.show() + + ######### importance of each variable in the PCA space #################################################################### + + # Define variable categories by their original names + general_trajectory_vars = { + 'duration', 'trail_len', 'zenith_angle', 'begin_height', + 'peak_mag_height', 'end_height', 'kc', 'Dynamic_pressure_peak_abs_mag' + } + + dynamics_vars = { + 'vel_1st_frame', 'vel_avg', 'avg_lag', 't0', + 'decel_t0', 'decel_parab_t0', 'deceleration_lin', 'deceleration_parab', 'decel_jacchia' + } + + light_curve_vars = { + 'beg_abs_mag', 'peak_abs_mag', 'end_abs_mag', 'F', + 'a_mag_init', 'b_mag_init', 'a_mag_end', 'b_mag_end' + } + + # Calculate variable importance + explained_variance = pca.explained_variance_ratio_ + variable_importance = np.sum(np.abs(rotated_loadings) * explained_variance[:rotated_loadings.shape[1]], axis=1) + variable_importance_percent = variable_importance * 100 + + # Map variable names to LaTeX labels + variable_labels = [variable_map.get(var, var) for var in variable_PCA_no_info] + + # We also want to keep track of original variable names so we can color-code by category + sorted_data = sorted(zip(variable_importance_percent, variable_labels, variable_PCA_no_info), + key=lambda x: x[0], reverse=True) + sorted_importance, sorted_labels, sorted_original_names = zip(*sorted_data) + + # Assign a color based on the category + colors = [] + for var_name in sorted_original_names: + if var_name in general_trajectory_vars: + colors.append('red') + elif var_name in dynamics_vars: + colors.append('green') + elif var_name in light_curve_vars: + colors.append('blue') + else: + # If not categorized, just use a default color + colors.append('gray') + + # Plot the sorted variable importance as a bar plot + plt.figure(figsize=(12, 6)) + bars = plt.bar(sorted_labels, sorted_importance, color=colors, alpha=0.7) + + # save the labels and the importance of the variable and the colors in a csv file + df_variable_importance = pd.DataFrame(list(zip(sorted_labels, sorted_importance, colors)), columns=['Variable', 'Importance', 'Color']) + df_variable_importance.to_csv(save_results_folder_PCA+os.sep+file_name_obs+'_PCA_sorted_variable_importance_percent.csv', index=False) + + # Add percentage value on top of each bar + for bar, importance in zip(bars, sorted_importance): + plt.text( + bar.get_x() + bar.get_width() / 2, + bar.get_height(), + f"{importance:.1f}%", # Display the percentage value + ha='center', + va='bottom', + fontsize=8, + ) + + # Customize plot + plt.xticks(rotation=90) + plt.ylabel("Variable Contribution (%)") + plt.xlabel("Variables") + plt.tight_layout() + + # Save the figure + plt.savefig(save_results_folder_PCA + os.sep + file_name_obs + '_PCA_sorted_variable_importance_percent.png') + plt.close() + + ### Denis Plot #################################################################################################### + + # Assuming cov_data is your loadings matrix with shape (n_variables, n_PCs) + n_variables, n_PCs = cov_data.shape + + # Create LaTeX-style labels for your variables using variable_PCA_no_info + latex_labels = [variable_map.get(var, var) for 
var in variable_PCA_no_info] + + # Initialize a list to keep track of selected variable indices + selected_vars = [] + + # Step 1: For each PC, create a list of variable indices sorted by absolute loading + sorted_indices_per_pc = [] + for pc_idx in range(n_PCs): + # Get the loadings for PC pc_idx + pc_loadings = cov_data[:, pc_idx] + # Get indices sorted by absolute value of loadings, from highest to lowest + sorted_indices = np.argsort(-np.abs(pc_loadings)) + sorted_indices_per_pc.append(sorted_indices) + + # Step 2: Initialize a list to keep track of positions in each PC's sorted indices + positions_in_pc = [0] * n_PCs # This will keep track of the next variable to consider in each PC + + # Step 3: While not all variables are selected, select variables in round-robin fashion + while len(selected_vars) < n_variables: + for pc_idx in range(n_PCs): + # Get the sorted indices for this PC + sorted_indices = sorted_indices_per_pc[pc_idx] + # Find the next variable not yet selected + while positions_in_pc[pc_idx] < n_variables: + var_idx = sorted_indices[positions_in_pc[pc_idx]] + positions_in_pc[pc_idx] += 1 # Move to next position for this PC + if var_idx not in selected_vars: + selected_vars.append(var_idx) + break # Move to next PC + if len(selected_vars) == n_variables: + break # All variables have been selected + + # Step 4: Rearrange cov_data and labels according to selected_vars + cov_data_selected = cov_data[selected_vars, :] + latex_labels_selected = [latex_labels[i] for i in selected_vars] + + # Step 5: Plot the rearranged covariance matrix + img = plt.matshow(cov_data_selected.T, cmap=plt.cm.coolwarm, vmin=-1, vmax=1) + plt.colorbar(img) + + # Add variable names as labels on the x-axis + plt.xticks(range(len(latex_labels_selected)), latex_labels_selected, rotation=90) + + # Add PCs with variance explained as labels on the y-axis + columns_PC_with_var = ['PC' + str(x) + ' (' + str(percent_variance[x-1]) + '%)' for x in range(1, pca.n_components_+1)] + plt.yticks(range(len(columns_PC_with_var)), columns_PC_with_var) + + # Annotate each cell with the covariance value + for i in range(cov_data_selected.shape[0]): + for j in range(cov_data_selected.shape[1]): + plt.text(i, j, "{:.1f}".format(cov_data_selected[i, j]), size=5, color='black', ha="center", va="center") + + # Save and close the figure + plt.savefig(save_results_folder_PCA + os.sep + file_name_obs + 'PCA_Den_covariance_matrix_' + str(len(variable_PCA_no_info)-2) + 'var_' + str(PCA_percent) + 'perc_' + str(pca.n_components_) + 'PC.png') + plt.close() + + ### # print the number of simulation selected print('PCA run for', len(df_sim_shower),'simulations, delete ',len(outliers)-len(df_sim_shower),' outliers') - # if len(No_var_PCA_perc) > 0: - # for No_var_PCA_perc in No_var_PCA_perc: - # print('Observable data variable [',No_var_PCA_perc,'] is not within the 5 and 95 percentile of the simulated shower') - # print the name of the variables used in PCA print('Variables used in PCA: ',df_all_nameless.columns) @@ -2525,455 +3906,370 @@ def PCASim(df_sim_shower, df_obs_shower, OUT_PUT_PATH, PCA_percent=99, N_sim_sel df_obs_PCA = df_all_PCA.drop(df_all_PCA.index[:len(df_sim_shower)]) - ########### Distance metric takes in to account varinace explained #################################################################### - - if esclude_real_solution_from_selection: - df_all_PCA_cov = df_all_PCA[df_all_PCA['type'] != 'Real'].copy() - else: - # delete the type Real from - df_all_PCA_cov = df_all_PCA.copy() - - # Get explained variances of 
principal components - explained_variance = pca.explained_variance_ratio_ + # Close the Logger to ensure everything is written to the file STOP COPY in TXT file + sys.stdout.close() - # Calculate mean and inverse covariance matrix for Mahalanobis distance - cov_matrix = df_all_PCA_cov.drop(['type'], axis=1).cov() + # Reset sys.stdout to its original value if needed + sys.stdout = sys.__stdout__ - # Modify covariance matrix based on explained variances - for i in range(len(explained_variance)): - cov_matrix.iloc[i, :] /= explained_variance[i] - # # Modify covariance matrix to positively reflect variance explained - # for i in range(len(explained_variance)): - # cov_matrix.iloc[i, :] *= explained_variance[i] + # PLOT the selected simulated shower ######################################## - cov_inv = inv(cov_matrix) + # Copy the DataFrame + df_sim_shower_small = df_sim_shower.copy() - ############## SELECTION ############################################### + # Store necessary values before sampling + # For example, store the first value of var_phys + physical_vars = ['mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max'] + var_phys_values = {} + for var_phys in physical_vars: + var_phys_values[var_phys] = df_sim_shower[var_phys].values[0] - # group them by Observation, Realization type and the other group by MetSim, Simulation - # meanPCA = df_all_PCA.groupby('type').mean() # does not work + # if len(df_sim_shower_small) >10000: # Avoid long plotting times + # # Randomly sample 10,000 events + # df_sim_shower_small = df_sim_shower_small.sample(n=10000) - df_all_PCA['solution_id'] = df_all['solution_id'] - # Create a new column to group by broader categories - group_mapping = { - 'Observation': 'obs', - 'Realization': 'obs', - 'Real': 'sim', - 'MetSim': 'sim', - 'Simulation': 'sim' - } - df_all_PCA['group'] = df_all_PCA['type'].map(group_mapping) - df_obs_shower['group'] = df_obs_shower['type'].map(group_mapping) - df_obs_PCA['group'] = df_obs_PCA['type'].map(group_mapping) + if len(df_sim_shower_small) > 10000: # Limit to 10,000 rows for performance + # Separate rows with 'MetSim' or 'Real' types + metsim_or_real_rows = df_sim_shower_small[df_sim_shower_small['type'].isin(['MetSim', 'Real'])] - # # Group by the new column and calculate the mean - # meanPCA = df_all_PCA.groupby('group').mean() + # Sample the remaining rows excluding 'MetSim' and 'Real' + other_rows = df_sim_shower_small[~df_sim_shower_small['type'].isin(['MetSim', 'Real'])] + sampled_other_rows = other_rows.sample(n=10000 - len(metsim_or_real_rows), random_state=42) - # # drop the sim column - # meanPCA = meanPCA.drop(['sim'], axis=0) + # Combine the sampled rows with 'MetSim' or 'Real' rows + df_sim_shower_small = pd.concat([metsim_or_real_rows, sampled_other_rows], axis=0) - # Ensure that only numeric columns are used in the mean calculation - df_numeric = df_all_PCA.select_dtypes(include=[np.number]) + print('Generating selected simulation histogram plot...') - # Group by the new column and calculate the mean only for numeric columns - meanPCA = df_numeric.groupby(df_all_PCA['group']).mean() + # Define a custom palette + custom_palette = { + 'Real': "g", + 'Simulation': "b", + 'Simulation_sel': "darkorange", + 'MetSim': "k", + 'Realization': "mediumaquamarine", + 'Observation': "limegreen", + 'Iteration': "gold" + } - # Drop the 'sim' row if it exists - meanPCA = meanPCA.drop(['sim'], axis=0, errors='ignore') + # Concatenate DataFrames + curr_df = 
pd.concat([df_sim_shower_small, df_obs_shower], axis=0) + + # Compute weights + curr_df['num_type'] = curr_df.groupby('type')['type'].transform('size') + curr_df['weight'] = 1 / curr_df['num_type'] + + # Sampling df_sim_PCA consistently + if len(df_sim_PCA) >10000: + # Use the same indices as in df_sim_shower_small + df_sim_PCA = df_sim_PCA.loc[df_sim_shower_small.index] + + print('Generating PCA space plot... (it takes a while)') + + df_sim_sel_PCA = pd.concat([df_sim_PCA, df_obs_PCA], axis=0) + + # Select only numeric columns + numeric_columns = df_sim_sel_PCA.select_dtypes(include=[np.number]).columns + + # Map point sizes + df_sim_sel_PCA['point_size'] = df_sim_sel_PCA['type'].map({ + 'Simulation_sel': 5, + 'Simulation': 5, + 'MetSim': 20, + 'Real': 20, + 'Realization': 20, + 'Observation': 40 + }) + + # Create the pair plot + fig = sns.pairplot( + df_sim_sel_PCA[numeric_columns.append(pd.Index(['type']))], + hue='type', + corner=True, + palette=custom_palette, + diag_kind='kde', + plot_kws={'s': 5, 'edgecolor': 'k'} + ) + + # Overlay scatter plots with custom point sizes + for i in range(len(fig.axes)): + for j in range(len(fig.axes)): + if i > j: + ax = fig.axes[i, j] + sns.scatterplot( + data=df_sim_sel_PCA, + x=df_sim_sel_PCA.columns[j], + y=df_sim_sel_PCA.columns[i], + hue='type', + size='point_size', + sizes=(5, 40), + ax=ax, + legend=False, + edgecolor='k', + palette=custom_palette + ) - # print(meanPCA) + plt.subplots_adjust(hspace=0.3, wspace=0.3) + fig.savefig(save_results_folder_PCA + os.sep + file_name_obs + 'PCAspace_sim_sel_real_' + str(len(variable_PCA) - 2) + 'var_' + str(PCA_percent) + 'perc_' + str(pca.n_components_) + 'PC.png', dpi=300) + plt.close() - meanPCA_current = meanPCA.loc[(meanPCA.index == 'obs')].values.flatten() - # take only the value of the mean of the first row - shower_current = df_obs_shower[df_obs_shower['group'] == 'obs'] - shower_current_PCA = df_obs_PCA[df_obs_PCA['group'] == 'obs'] + print('Generating result variable plot... 
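The weight column built above gives every 'type' the same total weight, so a handful of observations is not drowned out by thousands of simulations in the weighted histograms. A minimal sketch of the same normalization:

import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.default_rng(0)
df = pd.DataFrame({
    'peak_abs_mag': np.concatenate([rng.normal(5, 1, 5000), rng.normal(6, 0.2, 10)]),
    'type': ['Simulation'] * 5000 + ['Observation'] * 10,
})
df['num_type'] = df.groupby('type')['type'].transform('size')
df['weight'] = 1 / df['num_type']  # each type now contributes a total weight of 1
sns.histplot(df, x='peak_abs_mag', weights=df['weight'], hue='type', bins=20)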
(it takes a while)') - # trasform the dataframe in an array - shower_current_PCA = shower_current_PCA.drop(['type','group'], axis=1).values + output_folder = save_results_folder_PCA + os.sep + file_name_obs + '_sel_var_vs_physProp' + if not os.path.isdir(output_folder): + mkdirP(output_folder) - # define the distance - mkdirP(OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER) - if esclude_real_solution_from_selection: - # delete the type Real from - input_list_obs_dist = [[df_sim_PCA[df_sim_PCA['type'] != 'Real'], shower_current_PCA[ii], cov_inv, meanPCA_current, df_sim_shower[df_sim_shower['type'] != 'Real'], shower_current.iloc[ii], N_sim_sel, OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER] for ii in range(len(shower_current))] - df_sim_selected_both_df = domainParallelizer(input_list_obs_dist, dist_PCA_space_select_sim, cores=cores_parallel) + # Loop over physical variables + for var_phys in physical_vars: + # Create subplots + fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:]) / 5)), 5, figsize=(20, 15)) + axs = axs.flatten() - else: - input_list_obs_dist = [[df_sim_PCA, shower_current_PCA[ii], cov_inv, meanPCA_current, df_sim_shower, shower_current.iloc[ii], N_sim_sel, OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER] for ii in range(len(shower_current))] - df_sim_selected_both_df = domainParallelizer(input_list_obs_dist, dist_PCA_space_select_sim, cores=cores_parallel) + for i, var in enumerate(variable_PCA[2:]): + # Plot simulation data + axs[i].scatter(df_sim_shower_small[var], df_sim_shower_small[var_phys], c='darkorange') + # Plot vertical line using stored value + axs[i].axvline(df_obs_shower[var].values[0], color='limegreen', linestyle='--', linewidth=5) - # separet df_sim_selected the '' to a list of dataframe called df_sim_selected_all and df_sim_selected_knee - df_sim_selected_all = [] - df_sim_selected_knee = [] - for item in df_sim_selected_both_df: - if isinstance(item, tuple): - df_sim_selected_all.append(item[0]) - df_sim_selected_knee.append(item[1]) + # Plot horizontal line using stored value + axs[i].axhline(var_phys_values[var_phys], color='k', linestyle='-', linewidth=2) - df_sim_selected_all = pd.concat(df_sim_selected_all) - df_sel_shower = pd.concat(df_sim_selected_knee) + if i % 5 == 0: + axs[i].set_ylabel(var_phys) - # DELETE ALL INDEX + axs[i].set_xlabel(var) + axs[i].grid() - # Insert the column at the first position - df_sim_selected_all.insert(1, 'distance_mean', df_sim_selected_all.pop('distance_mean')) - df_sim_selected_all.insert(1, 'distance_meteor', df_sim_selected_all.pop('distance_meteor')) - df_sim_selected_all.insert(1, 'solution_id_dist', df_sim_selected_all.pop('solution_id_dist')) - df_sim_selected_all.insert(1, 'type', df_sim_selected_all.pop('type')) + # Log scale for specific variables + if var_phys in ['erosion_mass_min', 'erosion_mass_max']: + axs[i].set_yscale('log') - df_sim_selected_all.reset_index(drop=True, inplace=True) + # Remove unused subplots + for i in range(len(variable_PCA[2:]), len(axs)): + fig.delaxes(axs[i]) - df_sim_selected_all.to_csv(OUT_PUT_PATH+os.sep+file_name_obs+'_sim_sel.csv', index=False) + plt.tight_layout() + plt.savefig(output_folder + os.sep + file_name_obs + var_phys + '_vs_var_select_PCA.png', dpi=300) + plt.close() - # Insert the column at the first position - df_sel_shower.insert(1, 'distance_mean', df_sel_shower.pop('distance_mean')) - df_sel_shower.insert(1, 'distance_meteor', df_sel_shower.pop('distance_meteor')) - df_sel_shower.insert(1, 'solution_id_dist', df_sel_shower.pop('solution_id_dist')) - 
df_sel_shower.insert(1, 'type', df_sel_shower.pop('type')) + print('Generating PCA position plot... (it takes a while)') - df_sel_shower.reset_index(drop=True, inplace=True) + output_folder = save_results_folder_PCA + os.sep + file_name_obs + '_sel_PCA_vs_physProp' + if not os.path.isdir(output_folder): + mkdirP(output_folder) - df_sel_shower.to_csv(OUT_PUT_PATH+os.sep+file_name_obs+'_sim_sel_bf_knee.csv', index=False) + # Loop over physical variables + for var_phys in physical_vars: + fig, axs = plt.subplots(int(np.ceil(len(columns_PC) / 5)), 5, figsize=(20, 15)) + axs = axs.flatten() - if isinstance(df_sel_shower, tuple): - df_sel_shower = df_sel_shower[0] - if isinstance(df_sim_selected_all, tuple): - df_sim_selected_all = df_sim_selected_all[0] + for i, var in enumerate(columns_PC): + # Plot simulation data + axs[i].scatter(df_sim_PCA[var], df_sim_shower_small[var_phys], c='darkorange') - # DELETE ALL old INDEX + # Plot vertical line + axs[i].axvline(df_obs_PCA[var].values[0], color='limegreen', linestyle='--', linewidth=5) - # Create the new DataFrame by filtering df_sim_PCA - df_sel_PCA = df_all_PCA[df_all_PCA['solution_id'].isin(df_sel_shower['solution_id'])] - # change all df_sel_PCA 'type' to Simulation_sel - df_sel_PCA['type'] = 'Simulation_sel' - # reset the index - df_sel_PCA.reset_index(drop=True, inplace=True) + # Plot horizontal line using stored value + axs[i].axhline(var_phys_values[var_phys], color='k', linestyle='-', linewidth=2) - # df_sel_shower_no_repetitions = df_sim_shower[df_sim_shower['solution_id'].isin(df_sel_shower['solution_id'])] - # # change all df_sel_PCA 'type' to Simulation_sel - # df_sel_shower_no_repetitions['type'] = 'Simulation_sel' - # # reset the index - # df_sel_shower_no_repetitions.reset_index(drop=True, inplace=True) - - df_sel_shower_no_repetitions = df_sel_shower.copy() + if i % 5 == 0: + axs[i].set_ylabel(var_phys) - # group by solution_id_dist and keep only n_confront_sel from each group - df_sel_shower_no_repetitions = df_sel_shower_no_repetitions.groupby('solution_id_dist').head(len(df_sel_shower_no_repetitions)) + axs[i].set_xlabel(var) + axs[i].grid() - # order by distance_meteor - df_sel_shower_no_repetitions = df_sel_shower_no_repetitions.sort_values('distance_meteor') + # Log scale for specific variables + if var_phys in ['erosion_mass_min', 'erosion_mass_max']: + axs[i].set_yscale('log') - # count duplicates and add a column for the number of duplicates - df_sel_shower_no_repetitions['num_duplicates'] = df_sel_shower_no_repetitions.groupby('solution_id')['solution_id'].transform('size') - - df_sel_shower_no_repetitions['solution_id_dist'] = df_obs_shower['solution_id'].values[0] + # Remove unused subplots + for i in range(len(columns_PC), len(axs)): + fig.delaxes(axs[i]) - df_sel_shower_no_repetitions.drop_duplicates(subset='solution_id', keep='first', inplace=True) + plt.tight_layout() + plt.savefig(output_folder + os.sep + file_name_obs + var_phys + '_vs_var_select_PC_space.png', dpi=300) + plt.close() - # save df_sel_shower_real to disk - df_sel_shower_no_repetitions.to_csv(OUT_PUT_PATH+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_obs+'_sim_sel_to_optimize.csv', index=False) + return pcr_results_physical_param, pca.n_components_ - print('\nSUCCESS: the simulated meteor have been selected\n') +def process_PCA_variables(variable_PCA, No_var_PCA, df_obs_shower, df_sim_shower, OUT_PUT_PATH, file_name_obs, PCA_pairplot=False): + # if variable_PCA is not empty + if variable_PCA != []: + # add to variable_PCA array 'type','solution_id' 
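Below, process_PCA_variables standardizes every observable and drops simulations that sit more than 3 sigma out in any column, while always keeping the first (MetSim) row. A minimal sketch of that filter on dummy data (column names are illustrative):

import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(500, 3)), columns=['vel_avg', 'begin_height', 'peak_abs_mag'])
df.iloc[10] = 50.0  # plant an obvious outlier

scaled = StandardScaler().fit_transform(df)
outliers = (np.abs(zscore(scaled)) > 3).any(axis=1)
outliers[0] = False  # never drop the reference solution
df_clean = df[~outliers].copy()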
+ variable_PCA = ['solution_id','type'] + variable_PCA + if No_var_PCA != []: + # remove from variable_PCA the variables in No_var_PCA + for var in No_var_PCA: + variable_PCA.remove(var) - # Close the Logger to ensure everything is written to the file STOP COPY in TXT file - sys.stdout.close() + else: + # put in variable_PCA all the variables except mass + variable_PCA = list(df_obs_shower.columns) + # check if mass is in the variable_PCA + if 'mass' in variable_PCA: + # remove mass from variable_PCA + variable_PCA.remove('mass') + # if No_var_PCA is not empty + if No_var_PCA != []: + # remove from variable_PCA the variables in No_var_PCA + for var in No_var_PCA: + # check if the variable is in the variable_PCA + if var in variable_PCA: + variable_PCA.remove(var) - # Reset sys.stdout to its original value if needed - sys.stdout = sys.__stdout__ + scaled_sim = df_sim_shower[variable_PCA].copy() + scaled_sim = scaled_sim.drop(['type', 'solution_id'], axis=1) - ########### save dist to observed shower ######################################## + # Standardize each column separately + scaler = StandardScaler() + df_sim_var_sel_standardized = scaler.fit_transform(scaled_sim) + df_sim_var_sel_standardized = pd.DataFrame(df_sim_var_sel_standardized, columns=scaled_sim.columns) - # # save dist also on selected shower - # distance_current = [] - # for i_shower in range(len(shower_current)): - # distance_current.append(scipy.spatial.distance.euclidean(meanPCA_current, shower_current_PCA[i_shower])) - # shower_current['distance_mean']=distance_current # from the mean of the selected shower - # shower_current.to_csv(OUT_PUT_PATH+os.sep+file_name_obs+'_obs_and_dist.csv', index=False) + # Identify outliers using Z-score method on standardized data + z_scores = np.abs(zscore(df_sim_var_sel_standardized)) + threshold = 3 + outliers = (z_scores > threshold).any(axis=1) - # PLOT the selected simulated shower ######################################## + # Ensure the first element is not an outlier + if outliers[0]: + print('The MetSim reduction is an outlier') # Still keep it for the PCA analysis + outliers[0] = False - # dataframe with the simulated and the selected meteors in the PCA space - # df_sim_sel_PCA = pd.concat([df_sim_PCA,df_sel_PCA], axis=0) + # Filter out outliers + df_sim_shower = df_sim_shower[~outliers].copy() if PCA_pairplot: - - df_sim_shower_small=df_sim_shower.copy() - - if len(df_sim_shower_small)>10000: # w/o takes forever to plot - # pick randomly 10000 events - df_sim_shower_small=df_sim_shower_small.sample(n=10000) - - print('generating sel sim histogram plot...') - - # Define a custom palette - custom_palette = { - 'Real': "r", - 'Simulation': "b", - 'Simulation_sel': "darkorange", - 'MetSim': "k", - 'Realization': "mediumaquamarine", - 'Observation': "limegreen" + # Mapping of original variable names to LaTeX-style labels + variable_map = { + 'vel_1st_frame': r"$v_0$ [km/s]", + 'vel_avg': r"$v_{avg}$ [km/s]", + 'vel_180km': r"$v_{180km}$ [m/s]", + 'duration': r"$T$ [s]", + 'peak_mag_height': r"$h_{peak}$ [km]", + 'begin_height': r"$h_{beg}$ [km]", + 'end_height': r"$h_{end}$ [km]", + 'peak_abs_mag': r"$M_{peak}$ [mag]", + 'beg_abs_mag': r"$M_{beg}$ [mag]", + 'end_abs_mag': r"$M_{end}$ [mag]", + 'F': r"$F$", + 'trail_len': r"$L$ [km]", + 't0': r"$t_0$ [s]", + 'deceleration_lin': r"$\bar{a}$ [km/s$^{2}$]", + 'deceleration_parab': r"$a_{quad}(1~s)$ [km/s$^{2}$]", + 'decel_parab_t0': r"$\bar{a}_{poly}(1~s)$ [km/s$^{2}$]", + 'decel_t0': r"$\bar{a}_{poly}$ [km/s$^{2}$]", + 'decel_jacchia': 
r"$a_0 k$ [km/s$^{2}$]", + 'zenith_angle': r"$z_c$ [deg]", + 'avg_lag': r"$\bar{\ell}$ [m]", + 'kc': r"$k_c$ [km]", + 'Dynamic_pressure_peak_abs_mag': r"$Q_{peak}$ [kPa]", + 'a_mag_init': r"$d_1$ [mag/s$^{2}$]", + 'b_mag_init': r"$s_1$ [mag/s]", + 'a_mag_end': r"$d_2$ [mag/s$^{2}$]", + 'b_mag_end': r"$s_2$ [mag/s]" } + latex_labels = [variable_map.get(var, var) for var in variable_PCA[2:]] + df_sim_var_sel = df_sim_shower[variable_PCA].copy().drop(['type', 'solution_id'], axis=1) - curr_df = pd.concat([df_sim_shower_small,df_sel_shower,df_obs_shower], axis=0) - - curr_df['num_type'] = curr_df.groupby('type')['type'].transform('size') - curr_df['weight'] = 1 / curr_df['num_type'] - + # Sample 10,000 events if the dataset is large + if len(df_sim_var_sel) > 10000: + print('Number of events in the simulated:', len(df_sim_var_sel)) + df_sim_var_sel = df_sim_var_sel.sample(n=10000) - fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15)) - # flatten the axs + # Setup the plot grid + fig, axs = plt.subplots(int(np.ceil(len(latex_labels) / 5)), 5, figsize=(20, 15)) axs = axs.flatten() - # to_plot_unit=['init vel [km/s]','avg vel [km/s]','duration [s]','begin height [km]','peak height [km]','end height [km]','begin abs mag [-]','peak abs mag [-]','end abs mag [-]','F parameter [-]','zenith angle [deg]','deceleration [km/s^2]','trail lenght [km]','kurtosis','skew'] + for i, (var, label) in enumerate(zip(variable_PCA[2:], latex_labels)): + sim_data = df_sim_var_sel[var].values + obs_data = df_obs_shower[var].values - # to_plot=['vel_init_norot','vel_avg_norot','duration','begin_height','peak_mag_height','end_height','beg_abs_mag','peak_abs_mag','end_abs_mag','F','zenith_angle','decel_parab_t0','trail_len','kurtosis','skew'] + # chek if the var is trail_len or Dynamic_pressure_peak_abs_mag if so divide by 1000 + if var in ['trail_len', 'Dynamic_pressure_peak_abs_mag']: + sim_data = sim_data / 1000.0 + obs_data = obs_data / 1000.0 + elif var == 'avg_lag': + sim_data = sim_data * 1000.0 + obs_data = obs_data * 1000.0 - # deleter form curr_df the mass - #curr_df=curr_df.drop(['mass'], axis=1) - for ii, var in enumerate(variable_PCA[2:]): + # Determine bin range + all_values = np.concatenate([sim_data, obs_data]) + min_value, max_value = np.min(all_values), np.max(all_values) - # if var in ['decel_parab_t0','decel_t0']: - # sns.histplot(curr_df, x=x_plot[x_plot>-500], weights=curr_df['weight'][x_plot>-500],hue='type', ax=axs[ii], kde=True, palette=custom_palette, bins=20) - # axs[ii].set_xticks([np.round(np.min(x_plot[x_plot>-500]),2),np.round(np.max(x_plot[x_plot>-500]),2)]) - - # else: + # Normalize simulation data + sim_counts, sim_bins = np.histogram(sim_data, bins=20, range=(min_value, max_value)) + sim_norm = sim_counts / sim_counts.max() - sns.histplot(curr_df, x=var, weights=curr_df['weight'], hue='type', ax=axs[ii], kde=True, palette=custom_palette, bins=20) - axs[ii].set_xticks([np.round(np.min(curr_df[var]),2),np.round(np.max(curr_df[var]),2)]) + # Normalize observation data + obs_counts, obs_bins = np.histogram(obs_data, bins=20, range=(min_value, max_value)) + obs_norm = obs_counts / obs_counts.max() - # if beg_abs_mag','peak_abs_mag','end_abs_mag inver the x axis - if var in ['beg_abs_mag','peak_abs_mag','end_abs_mag']: - axs[ii].invert_xaxis() + # Plot simulation data + axs[i].bar(sim_bins[:-1], sim_norm, width=np.diff(sim_bins), align='edge', color='darkorange', alpha=0.5, label='Simulated') - # Set the x-axis formatter to ScalarFormatter - 
axs[ii].xaxis.set_major_formatter(ScalarFormatter()) - axs[ii].ticklabel_format(useOffset=False, style='plain', axis='x') - # Set the number of x-axis ticks to 3 - # axs[ii].xaxis.set_major_locator(MaxNLocator(nbins=3)) + # # Plot observed data + # axs[i].bar(obs_bins[:-1], obs_norm, width=np.diff(obs_bins), align='edge', color='cyan', alpha=0.5, label='Observed') - axs[ii].set_ylabel('probability') - axs[ii].set_xlabel(var) - axs[ii].get_legend().remove() - # check if there are more than 3 ticks and if yes only use the first and the last + axs[i].axvline(obs_data[0], color='black', linewidth=3) + axs[i].set_xlabel(label) + axs[i].xaxis.set_major_locator(ticker.MaxNLocator(5)) + axs[i].set_ylabel('Normalized Density') - # put y axis in log scale - axs[ii].set_yscale('log') - axs[ii].set_ylim(0.01,1) + for i in range(len(latex_labels), len(axs)): + fig.delaxes(axs[i]) - - # more space between the subplots plt.tight_layout() - # # full screen - # figManager = plt.get_current_fig_manager() - # figManager.window.showMaximized() - - # save the figure - fig.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'_Histograms_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png', dpi=300) + plt.savefig(os.path.join(OUT_PUT_PATH, f"{file_name_obs}_var_hist_real.png")) plt.close() - if len(df_sim_PCA)>10000: # w/o takes forever to plot - # df_sim_PCA=df_sim_PCA.sample(n=10000) - # pick only the one with the same index in df_sim_shower_small - df_sim_PCA = df_sim_PCA[df_sim_PCA.index.isin(df_sim_shower_small.index)] - - print('generating PCA space plot...') - - df_sim_sel_PCA = pd.concat([df_sim_PCA,df_sel_PCA,df_obs_PCA], axis=0) - - # Select only the numeric columns for percentile calculations - numeric_columns = df_sim_sel_PCA.select_dtypes(include=[np.number]).columns - - # Create a new column for point sizes - df_sim_sel_PCA['point_size'] = df_sim_sel_PCA['type'].map({ - 'Simulation_sel': 5, - 'Simulation': 5, - 'MetSim': 20, - 'Realization': 20, - 'Observation': 40 - }) - - - # open a new figure to plot the pairplot - fig = plt.figure(figsize=(10, 10), dpi=300) - - # # fig = sns.pairplot(df_sim_sel_PCA, hue='type', plot_kws={'alpha': 0.6, 's': 5, 'edgecolor': 'k'},corner=True) - # fig = sns.pairplot(df_sim_sel_PCA, hue='type',corner=True, palette='bright', diag_kind='kde', plot_kws={'s': 5, 'edgecolor': 'k'}) - # # plt.show() - - # Create the pair plot without points initially - fig = sns.pairplot(df_sim_sel_PCA[numeric_columns.append(pd.Index(['type']))], hue='type', corner=True, palette=custom_palette, diag_kind='kde', plot_kws={'s': 5, 'edgecolor': 'k'}) + return df_sim_shower, variable_PCA, outliers - # Overlay scatter plots with custom point sizes - for i in range(len(fig.axes)): - for j in range(len(fig.axes)): - if i > j: - # check if the variable is in the list of the numeric_columns and set the axis limit - if df_sim_sel_PCA.columns[j] in numeric_columns and df_sim_sel_PCA.columns[i] in numeric_columns: - ax = fig.axes[i, j] - sns.scatterplot(data=df_sim_sel_PCA, x=df_sim_sel_PCA.columns[j], y=df_sim_sel_PCA.columns[i], hue='type', size='point_size', sizes=(5, 40), ax=ax, legend=False, edgecolor='k', palette=custom_palette) - - # ax.set_xlim(percentiles_1[df_sim_sel_PCA.columns[j]], percentiles_99[df_sim_sel_PCA.columns[j]]) - # ax.set_ylim(percentiles_1[df_sim_sel_PCA.columns[i]], percentiles_99[df_sim_sel_PCA.columns[i]]) - - # delete the last row of the plot - # fig.axes[-1, -1].remove() - # Hide the last row of plots - # for ax in fig.axes[-1]: - # ax.remove() 
- - # Adjust the subplots layout parameters to give some padding - plt.subplots_adjust(hspace=0.3, wspace=0.3) - # plt.show() - - # save the figure - fig.savefig(OUT_PUT_PATH+os.sep+file_name_obs+'PCAspace_sim_sel_real_'+str(len(variable_PCA)-2)+'var_'+str(PCA_percent)+'%_'+str(pca.n_components_)+'PC.png') - # close the figure - plt.close() +def correlation_selPLOT(pd_dataframe_ranges, curr_sel, output_dir='', pca_N_comp=0): - print('generating result variable plot...') - - output_folder=OUT_PUT_PATH+os.sep+file_name_obs+VAR_SEL_DIR_SUFX - # check if the output_folder exists - if not os.path.isdir(output_folder): - mkdirP(output_folder) - - # df_sim_PCA,df_sel_PCA,df_obs_PCA - # print(df_sim_shower) - # loop all physical variables - physical_vars = ['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] - for var_phys in physical_vars: - # make a subplot of the rho againist each variable_PCA as a scatter plot - fig, axs = plt.subplots(int(np.ceil(len(variable_PCA[2:])/5)), 5, figsize=(20, 15)) - # flat it - axs = axs.flatten() - - for i, var in enumerate(variable_PCA[2:]): - # plot the rho againist the variable with black borders - axs[i].scatter(df_sim_shower_small[var], df_sim_shower_small[var_phys], c='b') #, edgecolors='k', alpha=0.5 - - axs[i].scatter(df_sel_shower[var], df_sel_shower[var_phys], c='orange') #, edgecolors='k', alpha=0.5 - # put a green vertical line for the df_obs_shower[var] value - axs[i].axvline(shower_current[var].values[0], color='limegreen', linestyle='--', linewidth=5) - # put a horizontal line for the rho of the first df_sim_shower_small - axs[i].axhline(df_sim_shower[var_phys].values[0], color='k', linestyle='-', linewidth=2) - # axs[i].set_title(var) - # as a suptitle put the variable_PCA - # fig.suptitle(var_phys) - if i == 0 or i == 5 or i == 10 or i == 15 or i == 20: - # as a suptitle put the variable_PCA - axs[i].set_ylabel(var_phys) - - # x axis - axs[i].set_xlabel(var) - - # grid - axs[i].grid() - # make y axis log if the variable is 'erosion_mass_min' 'erosion_mass_max' - if var_phys == 'erosion_mass_min' or var_phys == 'erosion_mass_max': - axs[i].set_yscale('log') - - plt.tight_layout() - # save the figure - plt.savefig(output_folder+os.sep+file_name_obs+var_phys+'_vs_var_select_PCA.png') - # close the figure - plt.close() - - print('generating PCA position plot...') - - output_folder=OUT_PUT_PATH+os.sep+file_name_obs+PCA_SEL_DIR_SUFX - # check if the output_folder exists - if not os.path.isdir(output_folder): - mkdirP(output_folder) - - # loop all pphysical variables - physical_vars = ['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] - for var_phys in physical_vars: - - # make a subplot of the rho againist each variable_PCA as a scatter plot - fig, axs = plt.subplots(int(np.ceil(len(columns_PC)/5)), 5, figsize=(20, 15)) - - # flatten the axs array - axs = axs.flatten() - for i, var in enumerate(columns_PC): - # plot the rho againist the variable with black borders - axs[i].scatter(df_sim_PCA[var], df_sim_shower_small[var_phys], c='b') #, edgecolors='k', alpha=0.5 - - axs[i].scatter(df_sel_PCA[var], df_sel_shower_no_repetitions[var_phys], c='orange') #, edgecolors='k', alpha=0.5 - # put a green vertical line for the df_obs_shower[var] value - axs[i].axvline(df_obs_PCA[var].values[0], color='limegreen', linestyle='--', linewidth=5) - # put a horizontal line for the rho of the first df_sim_shower_small - 
axs[i].axhline(df_sim_shower[var_phys].values[0], color='k', linestyle='-', linewidth=2) - # axs[i].set_title(var) - # # as a suptitle put the variable_PCA - # fig.suptitle(var_phys) - if i == 0 or i == 5 or i == 10 or i == 15 or i == 20: - # as a suptitle put the variable_PCA - axs[i].set_ylabel(var_phys) - # axis x - axs[i].set_xlabel(var) - # grid - axs[i].grid() - # make y axis log if the variable is 'erosion_mass_min' 'erosion_mass_max' - if var_phys == 'erosion_mass_min' or var_phys == 'erosion_mass_max': - axs[i].set_yscale('log') - - # delete the subplot that are not used - for i in range(len(columns_PC), len(axs)): - fig.delaxes(axs[i]) - - plt.tight_layout() - # save the figure - plt.savefig(output_folder+os.sep+file_name_obs+var_phys+'_vs_var_select_PC_space.png') - # close the figure - plt.close() - - - return df_sel_shower, df_sel_shower_no_repetitions, df_sim_selected_all, pcr_results_physical_param, pca.n_components_ - - - - - - -def PCAcorrelation_selPLOT(curr_sim_init, curr_sel, n_PC_in_PCA='',output_dir=''): - - curr_sim=curr_sim_init.copy() - if len(curr_sim)>10000: - # pick randomly 10000 events - print('Number of events in the simulated :',len(curr_sim)) - curr_sim=curr_sim.sample(n=10000).copy() + pd_dataframe_ranges=pd_dataframe_ranges.copy() curr_sel=curr_sel.copy() curr_sel = curr_sel.drop_duplicates(subset='solution_id') - curr_df_sim_sel=pd.concat([curr_sim,curr_sel], axis=0, ignore_index=True) + curr_df_sim_sel=curr_sel.copy() + + curr_sel['erosion_coeff'] = curr_sel['erosion_coeff'] * 1000000 + curr_sel['sigma'] = curr_sel['sigma'] * 1000000 + curr_df_sim_sel['erosion_coeff'] = curr_df_sim_sel['erosion_coeff'] * 1000000 + curr_df_sim_sel['sigma'] = curr_df_sim_sel['sigma'] * 1000000 + pd_dataframe_ranges['erosion_coeff'] = pd_dataframe_ranges['erosion_coeff'] * 1000000 + pd_dataframe_ranges['sigma'] = pd_dataframe_ranges['sigma'] * 1000000 + # Define your label mappings label_mappings = { - 'mass': 'mass [kg]', - 'rho': 'rho [kg/m^3]', - 'sigma': 'sigma [s^2/km^2]', - 'erosion_height_start': 'erosion height start [km]', - 'erosion_coeff': 'erosion coeff [s^2/km^2]', - 'erosion_mass_index': 'erosion mass index [-]', - 'erosion_mass_min': 'log eros. mass min [kg]', - 'erosion_mass_max': 'log eros. 
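In the updated correlation_selPLOT below, every pair-plot panel is clamped to the full simulated range taken from pd_dataframe_ranges so that panels stay comparable across events. A minimal sketch of the same pattern, with hypothetical ranges and data:

import numpy as np
import pandas as pd
import seaborn as sns

rng = np.random.default_rng(3)
ranges = pd.DataFrame({'rho': [100, 1000], 'sigma': [0.001, 0.03]})  # full simulated span
sel = pd.DataFrame({'rho': rng.uniform(300, 500, 40), 'sigma': rng.uniform(0.01, 0.02, 40)})

g = sns.pairplot(sel, corner=True)
for ax in g.axes.flatten():
    if ax is None:
        continue  # corner=True leaves the upper triangle empty
    if ax.get_xlabel() in ranges.columns:
        ax.set_xlim(ranges[ax.get_xlabel()].min(), ranges[ax.get_xlabel()].max())
    if ax.get_ylabel() in ranges.columns:
        ax.set_ylim(ranges[ax.get_ylabel()].min(), ranges[ax.get_ylabel()].max())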
mass max [kg]' + 'mass': '$m_0$ [kg]', + 'rho': '$\\rho$ [kg/m$^3$]', + 'sigma': '$\sigma$ [kg/MJ]', + 'erosion_height_start': '$h_e$ [km]', + 'erosion_coeff': '$\eta$ [kg/MJ]', + 'erosion_mass_index': '$s$', + 'erosion_mass_min': '$m_{l}$ [kg]', + 'erosion_mass_max': '$m_{u}$ [kg]' } + # to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [kg/MJ]', r'$h_{e}$ [km]', r'$\eta$ [kg/MJ]', r'$s$', r'log($m_{l}$)', r'log($m_{u}$)',r'log($m_{u}$)-log($m_{l}$)'] + # Define a custom palette custom_palette = { - 'Real': "r", + 'Real': "g", 'Simulation': "b", 'Simulation_sel': "darkorange", 'MetSim': "k", 'Realization': "mediumaquamarine", - 'Observation': "limegreen" + 'Observation': "limegreen", + 'Iteration': "gold" } to_plot8 = ['type', 'mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max'] @@ -2995,6 +4291,11 @@ def PCAcorrelation_selPLOT(curr_sim_init, curr_sel, n_PC_in_PCA='',output_dir='' if ax is not None: # Check if the axis exists xlabel = ax.get_xlabel() ylabel = ax.get_ylabel() + # set the x and y lim base on the ylabel and xlabel and give min and max val of pd_dataframe_ranges + if xlabel in pd_dataframe_ranges.columns: + ax.set_xlim(pd_dataframe_ranges[xlabel].min(), pd_dataframe_ranges[xlabel].max()) + if ylabel in pd_dataframe_ranges.columns: + ax.set_ylim(pd_dataframe_ranges[ylabel].min(), pd_dataframe_ranges[ylabel].max()) if ylabel in label_mappings: ax.set_ylabel(label_mappings[ylabel]) if xlabel in label_mappings: @@ -3039,680 +4340,785 @@ def PCAcorrelation_selPLOT(curr_sim_init, curr_sel, n_PC_in_PCA='',output_dir='' # Adjust layout plt.tight_layout() - - fig_name = (output_dir+os.sep+'MixPhysicPropPairPlot_'+str(n_PC_in_PCA)+'PC_'+str(len(curr_sel))+'ev.png') - plt.savefig(fig_name, dpi=300) + + if pca_N_comp!=0: + # Save the figure + plt.savefig(output_dir+os.sep+'PCA'+str(pca_N_comp)+'PC_MixPhysicPropPairPlot_'+str(len(curr_sel))+'ev.png', dpi=300) + else: + # Save the figure + plt.savefig(output_dir+os.sep+'MixPhysicPropPairPlot_'+str(len(curr_sel))+'ev.png', dpi=300) # Close the figure plt.close() - ########################################################################## - ########################################################################## - - + # Calculate the correlation matrix + corr = curr_sel[to_plot8[1:]].corr() + # Saving correlation matrix to a text file + if pca_N_comp!=0: + corr_filename = os.path.join(output_dir, f'correlation_matrix_PCA.txt') + else: + corr_filename = os.path.join(output_dir, f'correlation_matrix.txt') + corr.to_csv(corr_filename, sep='\t', float_format="%.2f") # Save as a tab-separated file with 2 decimal precision + print(f"Correlation matrix saved to: {corr_filename}") + ########################################################################## + ########################################################################## -def PCA_physicalProp_KDE_MODE_PLOT(df_sim, df_obs, df_sel, n_PC_in_PCA, fit_funct, mag_noise_real, len_noise_real, Metsim_folderfile_json='', file_name_obs='', folder_file_name_real='', output_dir='', total_distribution=False, save_log=False): - print('PCA_physicalProp_KDE_MODE_PLOT') - output_dir_OG=output_dir - pd_datafram_PCA_selected_mode_min_KDE=pd.DataFrame() +# Custom objective function with time-based limit +class TimeLimitedObjective: + def __init__(self, func, time_limit): + self.func = func + self.start_time = None + self.time_limit = time_limit - # sigma5=5 + def __call__(self, x): + if self.start_time is None: + 
self.start_time = time.time() + elif time.time() - self.start_time > self.time_limit: + raise TimeoutError("Time limit exceeded during optimization.") + return self.func(x) - # 5 sigma confidence interval - # five_sigma=False - # mag_noise = MAG_RMSD*SIGMA_ERR - # len_noise = LEN_RMSD*SIGMA_ERR - mag_noise = mag_noise_real.copy() - len_noise = len_noise_real.copy() - # # Standard deviation of the magnitude Gaussian noise 1 sigma - # # SD of noise in length (m) 1 sigma in km - len_noise= len_noise/1000 - # velocity noise 1 sigma km/s - # vel_noise = (len_noise*np.sqrt(2)/(1/FPS)) - vel_noise = (len_noise/(1/FPS)) - # check if end with pickle - if folder_file_name_real.endswith('.pickle'): - data_file_real = read_pickle_reduction_file(folder_file_name_real) - elif folder_file_name_real.endswith('.json'): - data_file_real = read_with_noise_GenerateSimulations_output(folder_file_name_real) - _, _, _, residuals_mag_real, residuals_vel_real, _, residual_time_pos_real, residual_height_pos_real = RMSD_calc_diff(data_file_real, fit_funct) +def PhysicalPropPLOT(df_sel_shower_real, df_sim_range, output_dir, file_name, save_log=True, pca_N_comp=0): + sim_range_plot = df_sim_range.copy() + df_sel_shower = df_sel_shower_real.copy() - if total_distribution: - df_sel['solution_id_dist'] = df_obs['solution_id'].iloc[0] - df_obs=df_obs.iloc[[0]] + if save_log: + # check if a file with the name "log"+n_PC_in_PCA+"_"+str(len(df_sel))+"ev.txt" already exist + if os.path.exists(output_dir + os.sep + "log_" + file_name[:15] + "_ConfInterval.txt"): + # remove the file + os.remove(output_dir + os.sep + "log_" + file_name[:15] + "_ConfInterval.txt") + sys.stdout = Logger(output_dir, "log_" + file_name[:15] + "_ConfInterval.txt") # _30var_99perc_13PC - # Get the default color cycle - color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color'] - # Create an infinite cycle of colors - infinite_color_cycle = itertools.cycle(color_cycle) + # concatenate df_sel_shower_real and df_sim_shower_NEW_inter + curr_df_sim_sel = pd.concat([df_sel_shower], ignore_index=True) - for jj in range(len(df_obs)): + # Reset the index to ensure uniqueness + curr_df_sim_sel = curr_df_sim_sel.reset_index(drop=True) - fig, ax = plt.subplots(2, 3, figsize=(14, 6),gridspec_kw={'height_ratios': [ 3, 0.5],'width_ratios': [ 3, 0.5, 3]}) - # fig, ax = plt.subplots(2, 4) - # flat the ax - ax = ax.flatten() - - around_meteor=df_obs.iloc[jj]['solution_id'] - curr_sel = df_sel[df_sel['solution_id_dist'] == around_meteor] - curr_sel['erosion_coeff']=curr_sel['erosion_coeff']*1000000 - curr_sel['sigma']=curr_sel['sigma']*1000000 + # multiply the erosion coeff by 1000000 to have it in km/s + curr_df_sim_sel['erosion_coeff'] = curr_df_sim_sel['erosion_coeff'] * 1000000 + curr_df_sim_sel['sigma'] = curr_df_sim_sel['sigma'] * 1000000 + curr_df_sim_sel['erosion_energy_per_unit_cross_section'] = curr_df_sim_sel['erosion_energy_per_unit_cross_section'] / 1000000 + curr_df_sim_sel['erosion_energy_per_unit_mass'] = curr_df_sim_sel['erosion_energy_per_unit_mass'] / 1000000 + sim_range_plot['erosion_coeff'] = sim_range_plot['erosion_coeff'] * 1000000 + sim_range_plot['sigma'] = sim_range_plot['sigma'] * 1000000 + sim_range_plot['erosion_energy_per_unit_cross_section'] = sim_range_plot['erosion_energy_per_unit_cross_section'] / 1000000 + sim_range_plot['erosion_energy_per_unit_mass'] = sim_range_plot['erosion_energy_per_unit_mass'] / 1000000 - # check if around_meteor is a file in a folder - is_real=False - if os.path.exists(around_meteor): - is_real=True - # 
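For context, TimeLimitedObjective is meant to wrap a cost function handed to an optimizer and abort once the wall-clock budget is spent, so the caller must catch the exception. A usage sketch, assuming the class defined above is in scope; the quadratic objective is illustrative only:

import numpy as np
from scipy.optimize import minimize

wrapped = TimeLimitedObjective(lambda x: np.sum(x**2), time_limit=60)  # 60 s budget
try:
    res = minimize(wrapped, x0=np.ones(5), method='Nelder-Mead')
except TimeoutError:
    res = None  # budget exhausted; fall back to the best solution found elsewhere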
split in file and directory - _, around_meteor = os.path.split(around_meteor) - around_meteor = around_meteor[:15] + group_mapping = { + 'Simulation_sel': 'selected', + 'MetSim': 'selected', + 'Real': 'selected', + 'Simulation': 'selected', + 'Iteration': 'iteration' + } + curr_df_sim_sel['group'] = curr_df_sim_sel['type'].map(group_mapping) - if total_distribution==False: - output_dir=output_dir_OG+os.sep+SAVE_SELECTION_FOLDER+os.sep+around_meteor + curr_df_sim_sel['num_group'] = curr_df_sim_sel.groupby('group')['group'].transform('size') + curr_df_sim_sel['weight'] = 1 / curr_df_sim_sel['num_group'] - plot_side_by_side(data_file_real, fig, ax, 'go', file_name_obs[:15]+'\nRMSDmag '+str(round(mag_noise_real,3))+' RMSDlen '+str(round(len_noise_real/1000,3)), residuals_mag_real, residuals_vel_real, residual_time_pos_real, residual_height_pos_real, fit_funct, mag_noise, vel_noise,'Std.dev. realizations') + curr_df_sim_sel['num_type'] = curr_df_sim_sel.groupby('type')['type'].transform('size') + curr_df_sim_sel['weight_type'] = 1 / curr_df_sim_sel['num_type'] - densest_point = '' + curr_sel = curr_df_sim_sel[curr_df_sim_sel['group'] == 'selected'].copy() - print('Number of selected events:',len(curr_sel)) + to_plot = ['mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max', 'erosion_range', 'erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass', ''] + to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [kg/MJ]', r'$h_{e}$ [km]', r'$\eta$ [kg/MJ]', r'$s$', r'log($m_{l}$)', r'log($m_{u}$)', r'log($m_{u}$)-log($m_{l}$)', r'$E_{S}$ [MJ/m$^2$]', r'$E_{V}$ [MJ/kg]', r''] - if len(curr_sel)<2: - print('Check if the event is below RMSD') - ii=0 - Metsim_flag=False - try: - namefile_sel = curr_sel['solution_id'].iloc[ii] - except IndexError: - # Handle the error - print(f"Index {ii} is out of bounds for 'solution_id' in curr_sel.") - namefile_sel = None - continue - # namefile_sel = curr_sel['solution_id'].iloc[ii] - - # chec if the file exist - if not os.path.isfile(namefile_sel): - print('file '+namefile_sel+' not found') - continue + fig, axs = plt.subplots(3, 4, figsize=(15, 10)) + axs = axs.flatten() + print('\\hline') + # look find the index of the where df_sel_shower_real['type'] == 'Metsim' or 'Real' + if 'MetSim' in df_sel_shower_real['type'].values: + # find the index of the where df_sel_shower_real['type'] == 'Metsim' + idx = df_sel_shower_real.index[df_sel_shower_real['type'] == 'MetSim'] + print('Variables & ' + str(df_sel_shower_real['type'].iloc[idx]) + ' & 95\\%CIlow & Mean & Mode & 95\\%CIup \\\\') + elif 'Real' in df_sel_shower_real['type'].values: + # find the index of the where df_sel_shower_real['type'] == 'Real' + idx = df_sel_shower_real.index[df_sel_shower_real['type'] == 'Real'] + print('Variables & ' + str(df_sel_shower_real['type'].iloc[idx]) + ' & 95\\%CIlow & Mean & Mode & 95\\%CIup \\\\') + else: + print('Variables & ' + str(df_sel_shower_real['type'].iloc[0]) + ' & 95\\%CIlow & Mean & Mode & 95\\%CIup \\\\') + + ii_densest = 0 + for i in range(12): + plotvar = to_plot[i] + + if i == 11: + # Plot only the legend + axs[i].axis('off') # Turn off the axis + + # Create custom legend entries + import matplotlib.patches as mpatches + from matplotlib.lines import Line2D + + # Define the legend elements + # Define the legend elements + # prior_patch = mpatches.Patch(color='blue', label='Priors', alpha=0.5, edgecolor='black') + sel_events_patch = mpatches.Patch(color='darkorange', 
label='Initial results', alpha=0.5, edgecolor='red') + mode_line = Line2D([0], [0], color='red', linestyle='-.', label='Mode') + mean_line = Line2D([0], [0], color='blue', linestyle='--', label='Mean') + if 'Iteration' in curr_df_sim_sel['type'].values: + iter_patch = mpatches.Patch(color='gold', label='Iterative results', alpha=0.5, edgecolor='black') + # if 'MetSim' in curr_df_sim_sel['type'].values: + # metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Metsim Solution') + # legend_elements = [sel_events_patch, iter_patch, metsim_line, mean_line, mode_line] + # el + if 'Real' in curr_df_sim_sel['type'].values: + metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Real Solution') + legend_elements = [sel_events_patch, iter_patch, metsim_line, mean_line, mode_line] + else: + legend_elements = [sel_events_patch, iter_patch, mean_line, mode_line] else: - if namefile_sel.endswith('.pickle'): - data_file = read_pickle_reduction_file(namefile_sel) - pd_datafram_PCA_sim = array_to_pd_dataframe_PCA(data_file) - - elif namefile_sel.endswith('.json'): - # open the json file with the name namefile_sel - f = open(namefile_sel,"r") - data = json.loads(f.read()) - if 'ht_sampled' in data: - data_file = read_GenerateSimulations_output(namefile_sel, data_file_real) - pd_datafram_PCA_sim = array_to_pd_dataframe_PCA(data_file) - - else: - Metsim_flag=True - _, data_file, pd_datafram_PCA_sim = run_simulation(namefile_sel, data_file_real) - - rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(data_file, fit_funct) - - color_line=next(infinite_color_cycle) + # if 'MetSim' in curr_df_sim_sel['type'].values: + # metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Metsim Solution') + # legend_elements = [sel_events_patch, metsim_line, mean_line, mode_line] + # el + if 'Real' in curr_df_sim_sel['type'].values: + metsim_line = Line2D([0], [0], color='black', linewidth=2, label='Real Solution') + legend_elements = [sel_events_patch, metsim_line, mean_line, mode_line] + else: + legend_elements = [sel_events_patch, mean_line, mode_line] - if Metsim_flag: - - # plot_side_by_side(data_file, fig, ax, '-k', ii, residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - plot_side_by_side(data_file, fig, ax, '-k', 'Metsim data event\n\ -RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ - rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ - er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - - - else: - plot_side_by_side(data_file, fig, ax, '-','RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+' \n\ - m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ - rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ - er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' 
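The legend panel above is built from proxy artists (patches and lines) rather than from plotted data, which decouples the legend from whatever was actually drawn. A minimal standalone sketch of the same approach:

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.lines import Line2D

legend_elements = [
    mpatches.Patch(color='darkorange', alpha=0.5, label='Initial results'),
    Line2D([0], [0], color='blue', linestyle='--', label='Mean'),
    Line2D([0], [0], color='red', linestyle='-.', label='Mode'),
]
fig, ax = plt.subplots()
ax.axis('off')  # dedicated legend-only panel, as in the 12th subplot above
ax.legend(handles=legend_elements, loc='center')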
er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - # change first line color - ax[0].lines[1].set_color(color_line) - ax[1].lines[1].set_color(color_line) - ax[2].lines[1].set_color(color_line) - ax[5].lines[1].set_color(color_line) - - # pu the leggend putside the plot and adjust the plot base on the screen size - ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.) - # the legend do not fit in the plot, so adjust the plot - plt.subplots_adjust(right=.7) - plt.subplots_adjust(wspace=0.2) - - # make more space - plt.tight_layout() - - # split in file and directory - _, name_file = os.path.split(curr_sel['solution_id'].iloc[ii]) - if rmsd_mag 8: - try: - # def density_function(x): - # # Insert the logic of your objective function here - # # This example uses a simple sum of squares of x - # # Replace it with the actual function you want to minimize - # return np.sum(np.square(x)) - - # # Objective function for maximization (negative density for minimization) - # def objective_function(x): - # return -density_function(x) - - # # Bounds for optimization within all the sim space - # bounds = [(np.min(curr_sel_data[:, i]), np.max(curr_sel_data[:, i])) for i in range(curr_sel_data.shape[1])] - - # # Perform global optimization using differential evolution - # print('Starting global optimization using differential evolution.') - # result = differential_evolution(objective_function, bounds) - - # if result.success: - # densest_point = result.x - # print(f"Densest point found using differential evolution:\n {densest_point}") - # else: - # print('Optimization was unsuccessful.') - # densest_point = '' - - kde = gaussian_kde(dataset=curr_sel_data.T) # Note the transpose to match the expected input shape - - # Negative of the KDE function for optimization - def neg_density(x): - return -kde(x) - - # Bounds for optimization within all the sim space - # data_sim = df_sim[var_kde].values - bounds = [(np.min(curr_sel_data[:, i]), np.max(curr_sel_data[:, i])) for i in range(curr_sel_data.shape[1])] - - # Initial guesses: curr_sel_data mean, curr_sel_data median, and KMeans centroids - mean_guess = np.mean(curr_sel_data, axis=0) - median_guess = np.median(curr_sel_data, axis=0) - - # KMeans centroids as additional guesses - kmeans = KMeans(n_clusters=5, n_init='auto').fit(curr_sel_data) # Adjust n_clusters based on your understanding of the curr_sel_data - centroids = kmeans.cluster_centers_ - - # Combine all initial guesses - initial_guesses = [mean_guess, median_guess] + centroids.tolist() - - # Perform optimization from each initial guess - results = [minimize(neg_density, x0, method='L-BFGS-B', bounds=bounds) for x0 in initial_guesses] - - # Filter out unsuccessful optimizations and find the best result - successful_results = [res for res in results if res.success] - - if successful_results: - best_result = min(successful_results, key=lambda x: x.fun) - densest_point = best_result.x - print("Densest point using KMeans centroid:\n", densest_point) - else: - # raise ValueError('Optimization was unsuccessful. Consider revising the strategy.') - print('Optimization was unsuccessful. Consider revising the strategy.') - # revise the optimization strategy - print('Primary optimization strategies were unsuccessful. 
Trying fallback strategy (Grid Search).') - # Fallback strategy: Grid Search - grid_size = 5 # Define the grid size for the search - grid_points = [np.linspace(bound[0], bound[1], grid_size) for bound in bounds] - grid_combinations = list(itertools.product(*grid_points)) - - best_grid_point = None - best_grid_density = -np.inf - - for point in grid_combinations: - density = kde(point) - if density > best_grid_density: - best_grid_density = density - best_grid_point = point - - if best_grid_point is not None: - densest_point = np.array(best_grid_point) - print("Densest point found using Grid Search:\n", densest_point) - else: - print("None of the strategy worked no KDE result, change the selected simulations") - except np.linalg.LinAlgError as e: - print(f"LinAlgError: {str(e)}") - else: - print('Not enough data to perform the KDE need more than 8 meteors') - - # if pickle change the extension and the code ################################################################################################## - if Metsim_folderfile_json != '': - # Load the nominal simulation parameters - const_nominal, _ = loadConstants(Metsim_folderfile_json) - else: - const_nominal, _ = loadConstants() - - const_nominal.dens_co = np.array(const_nominal.dens_co) - - dens_co=np.array(const_nominal.dens_co) + sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette={'selected': 'darkorange', 'iteration': 'gold'}, bins=20, binrange=[np.min(sim_range_plot[plotvar]), np.max(sim_range_plot[plotvar])]) + unique_values_count = curr_sel[plotvar].nunique() + if unique_values_count > 1: + # Add the KDE to the plot + sns.histplot(curr_sel, x=curr_sel[plotvar], weights=curr_sel['weight'], bins=20, ax=axs[i], fill=False, edgecolor=False, color='r', kde=True, binrange=[np.min(sim_range_plot[plotvar]), np.max(sim_range_plot[plotvar])]) + kde_line = axs[i].lines[-1] + axs[i].lines[-1].remove() + else: + kde_line = None + + axs[i].axvline(x=np.mean(curr_df_sim_sel[curr_df_sim_sel['group'] == 'selected'][plotvar]), color='blue', linestyle='--', linewidth=3) - # print(const_nominal.__dict__) + # set lim min and max sim_range_plot + axs[i].set_xlim(sim_range_plot[plotvar].min(), sim_range_plot[plotvar].max()) + find_type='' + if 'MetSim' in curr_df_sim_sel['type'].values: + # axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type'] == 'MetSim'][plotvar].values[0], color='k', linewidth=3) + find_type = 'MetSim' + elif 'Real' in curr_df_sim_sel['type'].values: + axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type'] == 'Real'][plotvar].values[0], color='k', linewidth=3) + find_type = 'Real' - ### Calculate atmosphere density coeffs (down to the bottom observed height, limit to 15 km) ### + if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': + # Convert back from log scale + curr_df_sim_sel[plotvar] = 10 ** curr_df_sim_sel[plotvar] + curr_sel[plotvar] = 10 ** curr_sel[plotvar] + sim_range_plot[plotvar] = 10 ** sim_range_plot[plotvar] - # Determine the height range for fitting the density - dens_fit_ht_beg = const_nominal.h_init - # dens_fit_ht_end = const_nominal.h_final + # Calculate percentiles + sigma_95 = np.percentile(curr_sel[plotvar], 95) + sigma_5 = np.percentile(curr_sel[plotvar], 5) - # Assign the density coefficients - const_nominal.dens_co = dens_co + mean_values_sel = np.mean(curr_sel[plotvar]) - # Turn on plotting of LCs of individual fragments - const_nominal.fragmentation_show_individual_lcs = True + if kde_line is not None: + # Get the 
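The kde_line handling above relies on seaborn appending the KDE curve as the last Line2D on the axes, so its sampled x/y values can be kept while the drawn curve itself is removed. A minimal sketch of that trick:

import numpy as np
import seaborn as sns

data = np.random.default_rng(0).normal(size=500)
ax = sns.histplot(x=data, kde=True, bins=20)
kde_line = ax.lines[-1]                 # the KDE is the last Line2D on the axes
x_kde, y_kde = kde_line.get_xdata(), kde_line.get_ydata()
kde_line.remove()                       # keep the samples, drop the drawn curve
mode_estimate = x_kde[np.argmax(y_kde)]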
x and y data from the KDE line + kde_line_Xval = kde_line.get_xdata() + kde_line_Yval = kde_line.get_ydata() - # # change the sigma of the fragmentation - # const_nominal.sigma = 1.0 + # Find the index of the maximum y value (mode) + max_index = np.argmax(kde_line_Yval) + # Plot a vertical line at the mode + axs[i].axvline(x=kde_line_Xval[max_index], color='red', linestyle='-.', linewidth=3) - # 'rho': 209.27575861617834, 'm_init': 1.3339843905562902e-05, 'v_init': 59836.848805126894, 'shape_factor': 1.21, 'sigma': 1.387556841276162e-08, 'zenith_angle': 0.6944268835985749, 'gamma': 1.0, 'rho_grain': 3000, 'lum_eff_type': 5, 'lum_eff': 0.7, 'mu': 3.8180000000000003e-26, 'erosion_on': True, 'erosion_bins_per_10mass': 10, 'erosion_height_start': 117311.48011974395, 'erosion_coeff': 6.356639734390828e-07, 'erosion_height_change': 0, 'erosion_coeff_change': 3.3e-07, 'erosion_rho_change': 3700, 'erosion_sigma_change': 2.3e-08, 'erosion_mass_index': 1.614450928834309, 'erosion_mass_min': 4.773894502090459e-11, 'erosion_mass_max': 7.485333377052805e-10, 'disruption_on': False, 'compressive_strength': 2000, + x_10mode = kde_line_Xval[max_index] + if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': + x_10mode = 10 ** kde_line_Xval[max_index] - # create a copy of the const_nominal - const_nominal_1D_KDE = copy.deepcopy(const_nominal) - const_nominal_allD_KDE = copy.deepcopy(const_nominal) + if i < 12: + print('\\hline') + print(f"{to_plot_unit[i]} & {find_type} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(mean_values_sel)} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(sigma_95)} \\\\") + else: + if i < 12: + print('\\hline') + print(f"{to_plot_unit[i]} & {find_type} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(mean_values_sel)} & {'{:.4g}'.format(sigma_95)} \\\\") - var_cost=['m_init','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max'] - # print for each variable the kde - percent_diff_1D=[] - percent_diff_allD=[] - for i in range(len(var_kde)): + axs[i].set_ylabel('Probability') + axs[i].set_xlabel(to_plot_unit[i]) - x=curr_sel[var_kde[i]] + # Adjust y-axis limit + if axs[i].get_ylim()[1] > 1: + axs[i].set_ylim(0, 1) - # Check if dataset has multiple elements - if len(x) < 2: - # If dataset has fewer than 2 elements, duplicate the single element or skip - print(f"Dataset for {var_kde[i]} has less than 2 elements. Duplicating elements to compute KDE.") - x = np.concatenate([x, x]) # Duplicate elements to have at least two + # Remove individual legends + axs[i].get_legend().remove() - # Compute KDE - kde = gaussian_kde(x) - - # Define the range for which you want to compute KDE values, with more points for higher accuracy - kde_x = np.linspace(x.min(), x.max(), 1000) - kde_values = kde(kde_x) - - # Find the mode (x-value where the KDE curve is at its maximum) - mode_index = np.argmax(kde_values) - mode = kde_x[mode_index] - - real_val=df_sim[var_kde[i]].iloc[0] + if i == 0: + # Adjust x-axis offset text + axs[i].xaxis.get_offset_text().set_x(1.10) - print() - if df_sim['type'].iloc[0]=='MetSim' or df_sim['type'].iloc[0]=='Real': - print(f"MetSim value {var_kde[i]}: {'{:.4g}'.format(real_val)}") - print(f"1D Mode of KDE for {var_kde[i]}: {'{:.4g}'.format(mode)} percent diff: {'{:.4g}'.format(abs((real_val-mode)/(real_val+mode))/2*100)}%") - percent_diff_1D.append(abs((real_val-mode)/(real_val+mode))/2*100) - if densest_point!='': - print(f"Mult.dim. 
KDE densest {var_kde[i]}: {'{:.4g}'.format(densest_point[i])} percent diff: {'{:.4g}'.format(abs((real_val-densest_point[i])/(real_val+densest_point[i]))/2*100)}%") - percent_diff_allD.append(abs((real_val-densest_point[i])/(real_val+densest_point[i]))/2*100) - # print the value of const_nominal - # print(f"const_nominal {var_cost[i]}: {'{:.4g}'.format(const_nominal.__dict__[var_cost[i]])}") - - if var_cost[i] == 'sigma' or var_cost[i] == 'erosion_coeff': - # put it back as it was - const_nominal_1D_KDE.__dict__[var_cost[i]]=mode/1000000 - if densest_point!='': - const_nominal_allD_KDE.__dict__[var_cost[i]]=densest_point[i]/1000000 - elif var_cost[i] == 'erosion_height_start': - # put it back as it was - const_nominal_1D_KDE.__dict__[var_cost[i]]=mode*1000 - if densest_point!='': - const_nominal_allD_KDE.__dict__[var_cost[i]]=densest_point[i]*1000 - else: - # add each to const_nominal_1D_KDE and const_nominal_allD_KDE - const_nominal_1D_KDE.__dict__[var_cost[i]]=mode - if densest_point!='': - const_nominal_allD_KDE.__dict__[var_cost[i]]=densest_point[i] - - # check if the file output_folder+os.sep+file_name+'_sim_sel_optimized.csv' exists then read - if os.path.exists(output_dir+os.sep+file_name_obs+'_sim_sel_optimized.csv'): - df_sel_optimized_check = pd.read_csv(output_dir+os.sep+file_name_obs+'_sim_sel_optimized.csv') - else: - df_sel_optimized_check = pd.DataFrame() - df_sel_optimized_check['solution_id']='' - - # save the const_nominal as a json file saveConstants(const, dir_path, file_name): - if total_distribution: - if output_dir+os.sep+around_meteor+'_mode_TOT.json' not in df_sel_optimized_check['solution_id'].values: - saveConstants(const_nominal_1D_KDE,output_dir,around_meteor+'_mode_TOT.json') - _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode_TOT.json', data_file_real) - else: - print('already optimized') - _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode_TOT.json', data_file_real) + plt.tight_layout() + print('\\hline') - else: - if output_dir+os.sep+around_meteor+'_mode.json' not in df_sel_optimized_check['solution_id'].values: - saveConstants(const_nominal_1D_KDE,output_dir,around_meteor+'_mode.json') - _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode.json', data_file_real) - else: - print('already optimized') - _, gensim_data_sim, pd_datafram_PCA_sim = run_simulation(output_dir+os.sep+around_meteor+'_mode.json', data_file_real) - - if pd_datafram_PCA_sim is None: - return pd_datafram_PCA_selected_mode_min_KDE - if gensim_data_sim is None: - return pd_datafram_PCA_selected_mode_min_KDE - - rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(gensim_data_sim, fit_funct) - - plot_side_by_side(gensim_data_sim, fig, ax, 'r-', 'MODE : RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - m:'+str('{:.2e}'.format(pd_datafram_PCA_sim.iloc[0]['mass'],1))+' F:'+str(round(pd_datafram_PCA_sim.iloc[0]['F'],2))+'\n\ - rho:'+str(round(pd_datafram_PCA_sim.iloc[0]['rho']))+' sigma:'+str(round(pd_datafram_PCA_sim.iloc[0]['sigma']*1000000,4))+'\n\ - er.height:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_height_start'],2))+' er.log:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_coeff']*1000000,3))+' er.index:'+str(round(pd_datafram_PCA_sim.iloc[0]['erosion_mass_index'],2)), 
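Each table row above reduces to a mean, a KDE mode, and the 5th/95th percentiles of the selected events, printed as a LaTeX table line. A minimal sketch of one such row, with illustrative density-like samples (here the mode is taken from scipy's gaussian_kde rather than the seaborn line):

import numpy as np
from scipy.stats import gaussian_kde

vals = np.random.default_rng(4).lognormal(mean=5.5, sigma=0.4, size=300)  # e.g. rho samples
kde = gaussian_kde(vals)
grid = np.linspace(vals.min(), vals.max(), 1000)
mode = grid[np.argmax(kde(grid))]
ci_low, ci_high = np.percentile(vals, [5, 95])
print(f"$\\rho$ [kg/m$^3$] & MetSim & {ci_low:.4g} & {np.mean(vals):.4g} & {mode:.4g} & {ci_high:.4g} \\\\")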
residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - # pd_datafram_PCA_sim['erosion_coeff']=pd_datafram_PCA_sim['erosion_coeff']/1000000 - # pd_datafram_PCA_sim['sigma']=pd_datafram_PCA_sim['sigma']/1000000 - - print('real noise mag', round(mag_noise_real,3),''+str(SIGMA_ERR)+'sig',round(MAG_RMSD*SIGMA_ERR,3),''+str(SIGMA_ERR*2)+'sig',round(MAG_RMSD*SIGMA_ERR*2,3),'|| MODE noise mag', round(rmsd_mag,3), '\nreal noise len', round(len_noise_real/1000,3),''+str(SIGMA_ERR)+'sig',round(LEN_RMSD*SIGMA_ERR,3),''+str(SIGMA_ERR*2)+'sig',round(LEN_RMSD*SIGMA_ERR*2,3),'|| MODE noise len', round(rmsd_lag,3)) - select_mode_print='No' - if rmsd_mag n_confront_sel: + break # Exit the loop if we've reached the desired number of selections + + # Interpolate time positions based on height + interp_ht_time = interp1d( + real_height_km, + real_time, + kind='linear', + bounds_error=False, + fill_value='extrapolate' + ) + residual_time_pos = interp_ht_time(height_km) + + # Plot the selected simulation data + if Metsim_flag: + # For Metsim data, plot in black + line_sel0, = ax0.plot(abs_mag_sim, height_km, color='k') + if vel_lagplot == 'lag': + line, = ax1.plot(residual_time_pos, lag_m, color='k') + else: + line, = ax1.plot(residual_time_pos, vel_kms, color='k') + line_color = 'k' + else: + line_sel0, = ax0.plot(abs_mag_sim, height_km) + line_color = line_sel0.get_color() + if line_color == '#2ca02c': + line_color='m' + # change the color of line_sel0 + line_sel0.set_color('m') + if vel_lagplot == 'lag': + line, = ax1.plot(residual_time_pos, lag_m, color=line_color) + else: + line, = ax1.plot(residual_time_pos, vel_kms, color=line_color) + + # Collect data for the table + curve_data = [ + '', # Placeholder for color, will be replaced later + round(curr_sel.iloc[ii]['rmsd_mag'], 3) if 'rmsd_mag' in curr_sel.columns else 'N/A', + round(curr_sel.iloc[ii]['rmsd_len'] * 1000, 1) if 'rmsd_len' in curr_sel.columns else 'N/A', + '{:.2e}'.format(curr_sel.iloc[ii]['mass']) if 'mass' in curr_sel.columns else 'N/A', + round(curr_sel.iloc[ii]['rho']) if 'rho' in curr_sel.columns else 'N/A', + round(curr_sel.iloc[ii]['sigma'], 4) if 'sigma' in curr_sel.columns else 'N/A', + round(curr_sel.iloc[ii]['erosion_coeff'], 3) if 'erosion_coeff' in curr_sel.columns else 'N/A', + round(curr_sel.iloc[ii]['erosion_height_start'], 1) if 'erosion_height_start' in curr_sel.columns else 'N/A', + round(curr_sel.iloc[ii]['erosion_mass_index'], 2) if 'erosion_mass_index' in curr_sel.columns else 'N/A', + '{:.2e}'.format(curr_sel.iloc[ii]['erosion_mass_min']) if 'erosion_mass_min' in curr_sel.columns else 'N/A', + '{:.2e}'.format(curr_sel.iloc[ii]['erosion_mass_max']) if 'erosion_mass_max' in curr_sel.columns else 'N/A' + ] + + # Append the data and color + row_colors.append(line_color) + table_data.append(curve_data) + + # invert the row_colors and table_data + row_colors = row_colors[::-1] + table_data = table_data[::-1] + + # Check if table_data is empty + if not table_data: + print("No data available to display in the table.") + plt.close() # Close the plot + return # Exit the function or skip table creation + + # Adjust the plot styles and axes + ax0.invert_xaxis() + ax1.grid(linestyle='--', color='lightgray') + ax0.grid(linestyle='--', color='lightgray') + + ax1.set_xlabel('Time [s]') + if vel_lagplot == 'lag': + ax1.set_ylabel('Lag [m]') + else: + ax1.set_ylabel('Velocity [km/s]') + ax0.set_xlabel('Absolute Magnitude') + ax0.set_ylabel('Height [km]') + + # Remove legends from both plots if any + if ax0.get_legend() 
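The height-to-time interpolation above places simulated points on the observed time axis by inverting the observed height profile. A minimal sketch with made-up numbers:

import numpy as np
from scipy.interpolate import interp1d

real_height_km = np.array([100.0, 96.0, 92.0, 88.0])   # observed heights, decreasing
real_time = np.array([0.0, 0.4, 0.8, 1.2])             # seconds since the first frame

interp_ht_time = interp1d(real_height_km, real_time, kind='linear',
                          bounds_error=False, fill_value='extrapolate')
sim_height_km = np.array([99.0, 93.5, 86.0])
sim_time_on_obs_axis = interp_ht_time(sim_height_km)   # -> [0.1, 0.65, 1.4]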
is not None: + ax0.get_legend().remove() + if ax1.get_legend() is not None: + ax1.get_legend().remove() + + # Adjust layout to make room for the table on the right + # plt.subplots_adjust(right=0.75) # Adjust right as needed + + # # Adjust layout to make room for the table on the far right + plt.subplots_adjust(left=0.05, right=0.7) # Increase the 'right' value to detach the table + + # Adjust the GridSpec to create more space between the second plot and the table + gs = GridSpec(1, 3, width_ratios=[1, 1, 0.97]) # Reduce the width of the table column + + # Create a new axis for the table + ax_table = fig.add_subplot(gs[0, 2]) + ax_table.axis('off') # Hide the axis lines and ticks + + # Create the table in ax_table + # Include color patches in the first column + cell_text = [] + for idx, row in enumerate(table_data): + # Replace the placeholder with the color patch + row[0] = '' + cell_text.append(row) + + # Create the table + table = ax_table.table( + cellText=cell_text, + colLabels=headers, + loc='center', + cellLoc='center' + ) + table.auto_set_font_size(False) + table.set_fontsize(8) # Increased font size for better readability + + # Loop through each header cell to set a different font size + for col_idx in range(len(headers)): + header_cell = table[(0, col_idx)] # Access the header row cells + header_cell.set_fontsize(6) # Set a smaller font size for the header + # header_cell.set_fontweight('bold') # Optional: make the header bold + + # Adjust the table column widths to fit labels + n_cols = len(headers) + col_widths = [0.1] + [0.13] * (n_cols - 1) # Increased column widths + for col_idx, width in enumerate(col_widths): + for row_idx in range(len(table_data) + 1): # +1 for header row + cell = table[(row_idx, col_idx)] + cell.set_width(width) + + # Set the cell colors for the first column + for row_idx, color in enumerate(row_colors): + cell = table[row_idx + 1, 0] # +1 to skip header row + cell.set_facecolor(color) + # Optionally, set text color to improve readability + if color == 'k': + cell.get_text().set_color('white') + else: + cell.get_text().set_color('black') + + # Adjust the cell heights to ensure labels fit + n_rows = len(table_data) + 1 # +1 for header row + for row_idx in range(n_rows): + for col_idx in range(n_cols): + cell = table[(row_idx, col_idx)] + cell.set_height(1 / n_rows) + + fig.suptitle( + file_name_only + r' - mag$_{RMSD}$ ' + str(round(curr_sel.iloc[0]['rmsd_mag'], 3)) + + r' lag$_{RMSD}$ ' + str(round(curr_sel.iloc[0]['rmsd_len']*1000, 1)) + ' m', + fontsize=12, # Adjust font size as needed + ha='left', # Align text to the left + x=0.05, # Adjust x to move it to the left (0 is far left, 1 is far right) + y=0.95 # Adjust y to move it up (0 is bottom, 1 is top) + ) + + if pca_N_comp != 0: + plt.savefig(output_dir + os.sep + 'PCA'+str(pca_N_comp)+'PC_'+file_name_obs + '_Heigh_MagVelCoef_'+vel_lagplot+'.png', bbox_inches='tight') + else: + # Save and close the plot + plt.savefig(output_dir + os.sep + file_name_obs + '_Heigh_MagVelCoef_'+vel_lagplot+'.png', bbox_inches='tight') + plt.close() - index_err_RMSD = find_closest_index(obs_time, obs_time_err) # height_km, height_km_err) - residuals_vel = (vel_kms_err-vel_kms[index_err_RMSD]) - residuals_len = (lag_kms_err-lag_residual[index_err_RMSD]) + # # Save the DataFrame with RMSD + # if output_folder_of_csv == '': + # df_sel_shower_real.to_csv(output_dir + os.sep +'PCA_'+file_name_obs + '_sim_sel.csv', index=False) + # else: + # df_sel_shower_real.to_csv(output_folder_of_csv, index=False) - residual_time_pos = 
obs_time_err - residual_height_pos = height_km_err - # calculate the RMSD - rmsd_mag = np.sqrt(np.mean(residuals_mag**2)) - rmsd_vel = np.sqrt(np.mean(residuals_vel**2)) - rmsd_lag = np.sqrt(np.mean(residuals_len**2)) - return rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos +# RMSD ########################################################################################### -def RMSD_calc_diff(data_file, fit_funct): +def RMSD_calc_diff(sim_file_data, real_funct_data): + # copy the data + sim_file = copy.deepcopy(sim_file_data) + real_funct = copy.deepcopy(real_funct_data) + # Check if data_file and fit_funct are not None - if data_file is None or fit_funct is None: + if sim_file is None or real_funct is None: print('Error: data_file or fit_funct is None') - return 9999,9999,9999,9999,9999,9999,0, 100 + return 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 0, 100, 0 # Check if required keys are present in data_file and fit_funct required_keys = ['height', 'absolute_magnitudes', 'time', 'velocities', 'lag'] for key in required_keys: - if key not in data_file or key not in fit_funct: + if key not in sim_file or key not in real_funct: print(f'Error: Missing key {key} in data_file or fit_funct') - return 9999,9999,9999,9999,9999,9999,0, 100 - - # from list to array - height_km_err = np.array(fit_funct['height']) / 1000 - abs_mag_sim_err = np.array(fit_funct['absolute_magnitudes']) - obs_time_err = np.array(fit_funct['time']) - vel_kms_err = np.array(fit_funct['velocities']) / 1000 - lag_kms_err = np.array(fit_funct['lag']) / 1000 - - # from list to array - height_km = np.array(data_file['height']) / 1000 - abs_mag_sim = np.array(data_file['absolute_magnitudes']) - obs_time = np.array(data_file['time']) - vel_kms = np.array(data_file['velocities']) / 1000 - lag_residual = np.array(data_file['lag']) / 1000 - residual_time_pos = np.array(data_file['time']) - residual_height_pos = height_km.copy() - - # Define the range of heights for interpolation - common_height_min = max(min(height_km), min(height_km_err)) - common_height_max = min(max(height_km), max(height_km_err)) - - if common_height_min > common_height_max: # handle the case where there is no overlap in height - print('No overlap in height') - return 9999,9999,9999,9999,9999,9999,obs_time_err[0], height_km_err[0] - - common_heights = np.linspace(common_height_min, common_height_max, num=len(height_km_err)) # Adjust the number of points as needed - - # Interpolate the magnitudes - interp_magnitudes1 = interp1d(height_km, abs_mag_sim, kind='linear', fill_value="extrapolate") - interp_magnitudes2 = interp1d(height_km_err, abs_mag_sim_err, kind='linear', fill_value="extrapolate") + return 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 0, 100, 0 + + # Convert lists to arrays and adjust units + height_km_sim = np.array(sim_file['height']) / 1000 + abs_mag_sim = np.array(sim_file['absolute_magnitudes']) + time_sim= np.array(sim_file['time']) + vel_kms_sim = np.array(sim_file['velocities']) / 1000 + len_km_sim = np.array(sim_file['length']) / 1000 + lag_kms_sim = np.array(sim_file['lag']) / 1000 + + + # Convert lists to arrays and adjust units + height_km_real = np.array(real_funct['height']) / 1000 + abs_mag_real = np.array(real_funct['absolute_magnitudes']) + time_real = np.array(real_funct['time']) + vel_kms_real = np.array(real_funct['velocities']) / 1000 + len_km_real = np.array(real_funct['length']) / 1000 + # lag_kms_real = len_km_real - (vel_kms_sim[0] * 
time_real)
+    # wrong_lag = np.array(real_funct['lag']) / 1000
+    lag_kms_real = np.array(real_funct['lag']) / 1000
+    # # start from 0
+    # lag_kms_real = lag_kms_real - lag_kms_real[0]
+
+    if 'v_init' in sim_file:
+        lag_kms_sim = len_km_sim - (real_funct['v_init']/1000 * time_sim)
+    else:
+        lag_kms_sim = len_km_sim - (vel_kms_real[0] * time_sim)

-    # Get magnitudes at the common heights
-    magnitudes1_common = interp_magnitudes1(common_heights)
-    magnitudes2_common = interp_magnitudes2(common_heights)
+    # Define the overlapping height range
+    common_height_min = max(height_km_sim.min(), height_km_real.min())
+    common_height_max = min(height_km_sim.max(), height_km_real.max())

-    # Calculate the magnitude differences
-    magnitude_differences = magnitudes1_common - magnitudes2_common
+    if common_height_min >= common_height_max:
+        print('No overlap in height')
+        return 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, time_real[0], height_km_real[0], 0

-    # Calculate the RMSD for magnitudes
-    rmsd_mag = np.sqrt(np.mean(magnitude_differences**2))
+    # Restrict the real data to the overlapping height range
+    valid_fit_indices = (height_km_real >= common_height_min) & (height_km_real <= common_height_max)
+    if not np.any(valid_fit_indices):
+        print('No valid data in the overlapping height range')
+        return 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, 9999, time_real[0], height_km_real[0], 0

-    # # Determine the fraction of matching points for magnitudes
-    # total_possible_points_mag = len(common_heights)
-    # matching_points_mag = np.sum((common_heights >= common_height_min) & (common_heights <= common_height_max))
-    # fraction_matching_mag = matching_points_mag / total_possible_points_mag
-    # # Apply a penalty to the RMSD for magnitudes based on the fraction of matching points
-    # penalty_factor_mag = 1 / fraction_matching_mag if fraction_matching_mag > 0 else 9999
-    # adjusted_rmsd_mag = rmsd_mag * penalty_factor_mag
+    # Interpolate the simulation onto the real data's height grid
+    interp_ht_absmag = interp1d(height_km_sim, abs_mag_sim, kind='linear', bounds_error=False, fill_value='extrapolate')
+    interp_ht_time = interp1d(height_km_sim, time_sim, kind='linear', bounds_error=False, fill_value='extrapolate')
+    # Evaluate the interpolants on the real height grid
+    abs_mag_sim_interp = interp_ht_absmag(height_km_real)
+    time_sim_interp = interp_ht_time(height_km_real)

-    # Interpolate the velocities
-    interp_velocities1 = interp1d(obs_time, vel_kms, kind='linear', fill_value="extrapolate")
-    interp_velocities2 = interp1d(obs_time_err, vel_kms_err, kind='linear', fill_value="extrapolate")
+    magnitude_differences = abs_mag_real - abs_mag_sim_interp

-    # Get velocities at the common times
-    common_times_min = max(min(obs_time), min(obs_time_err))
-    common_times_max = min(max(obs_time), max(obs_time_err))
-    common_times = np.linspace(common_times_min, common_times_max, num=len(obs_time_err))
-    velocities1_common = interp_velocities1(common_times)
-    velocities2_common = interp_velocities2(common_times)
+    # Interpolate the simulated velocity and lag onto the matched time grid
+    interp_t_vel = interp1d(time_sim, vel_kms_sim, kind='linear', bounds_error=False, fill_value='extrapolate')
+    interp_t_lag = interp1d(time_sim, lag_kms_sim, kind='linear', bounds_error=False, fill_value='extrapolate')
+    # Evaluate the interpolants on the matched time grid
+    vel_kms_sim_interp = interp_t_vel(time_sim_interp)
+    lag_kms_sim_interp = interp_t_lag(time_sim_interp)

-    # Calculate the velocity differences
-    velocity_differences = velocities1_common - velocities2_common
+    velocity_differences = vel_kms_real -
vel_kms_sim_interp
+    lag_differences = lag_kms_real - lag_kms_sim_interp

-    # Calculate the RMSD for velocities
+    residual_time_pos = time_sim_interp
+    residual_height_pos = height_km_real
+
+    # compute RMSD
+    rmsd_mag = np.sqrt(np.mean(magnitude_differences**2))
     rmsd_vel = np.sqrt(np.mean(velocity_differences**2))
-
-    # # Determine the fraction of matching points for velocities
-    # total_possible_points_vel = len(common_times)
-    # matching_points_vel = np.sum((common_times >= common_times_min) & (common_times <= common_times_max))
-    # fraction_matching_vel = matching_points_vel / total_possible_points_vel
-
-    # # Apply a penalty to the RMSD for velocities based on the fraction of matching points
-    # penalty_factor_vel = 1 / fraction_matching_vel if fraction_matching_vel > 0 else 9999
-    # adjusted_rmsd_vel = rmsd_vel * penalty_factor_vel
-
-    # Interpolate the lag residuals
-    interp_lag1 = interp1d(obs_time, lag_residual, kind='linear', fill_value="extrapolate")
-    interp_lag2 = interp1d(obs_time_err, lag_kms_err, kind='linear', fill_value="extrapolate")
-
-    # Get lags at the common times
-    lags1_common = interp_lag1(common_times)
-    lags2_common = interp_lag2(common_times)
-
-    # Calculate the lag differences
-    lag_differences = lags1_common - lags2_common
-
-    # Calculate the RMSD for lags
     rmsd_lag = np.sqrt(np.mean(lag_differences**2))
+
+    # check if threshold_mag exists
+    if 'rmsd_mag' in real_funct:
+        threshold_mag = real_funct['rmsd_mag']
+    else:
+        threshold_mag = 9999
+    if 'rmsd_vel' in real_funct:
+        threshold_vel = real_funct['rmsd_vel']
+    else:
+        threshold_vel = 9999
+    if 'rmsd_len' in real_funct:
+        threshold_lag = real_funct['rmsd_len']
+        # print('threshold_lag',threshold_lag)
+        # print('lag_differences',lag_differences)
+        # exceeds_threshold = np.abs(lag_differences) > threshold_lag*3
+        # if np.any(exceeds_threshold):
+        #     exceeding_values = lag_differences[exceeds_threshold]
+        #     print(f'Lag differences exceeding {threshold_lag*3} found: {len(exceeding_values)}')
+        #     rmsd_lag = 9999
+    else:
+        threshold_lag = 9999
+    if 'fps' in real_funct:
+        fps = real_funct['fps']
+    else:
+        fps = 32
+
+    # max_diff_threshold = MAX_MAG_DIFF
+    # # Identify which differences exceed the maximum allowed difference
+    # if threshold_mag*4 < MAX_MAG_DIFF:
+    #     max_diff_threshold = threshold_mag*4
+    #     exceeds_threshold = np.abs(magnitude_differences) > max_diff_threshold
+    # else:
+    #     exceeds_threshold = np.abs(magnitude_differences) > max_diff_threshold

-    # # Determine the fraction of matching points for lags
-    # total_possible_points_lag = len(common_times)
-    # matching_points_lag = np.sum((common_times >= min(obs_time)) & (common_times <= max(obs_time)))
-    # fraction_matching_lag = matching_points_lag / total_possible_points_lag
-
-    # # Apply a penalty to the RMSD for lags based on the fraction of matching points
-    # penalty_factor_lag = 1 / fraction_matching_lag if fraction_matching_lag > 0 else 9999
-    # adjusted_rmsd_lag = rmsd_lag * penalty_factor_lag
-
-    residual_time_pos = common_times
-    residual_height_pos = common_heights
+    # if np.any(exceeds_threshold):
+    #     exceeding_values = magnitude_differences[exceeds_threshold]
+    #     print(f'Magnitude differences exceeding {max_diff_threshold} found: {len(exceeding_values)}')
+    #     rmsd_mag = 9999

-    # if rmsd_mag is nan give 9999
+    # Handle NaNs in RMSD calculations
     if np.isnan(rmsd_mag):
         rmsd_mag = 9999
     if np.isnan(rmsd_vel):
@@ -3720,953 +5126,725 @@ def RMSD_calc_diff(data_file, fit_funct):
     if np.isnan(rmsd_lag):
         rmsd_lag = 9999

-    return rmsd_mag, rmsd_vel,
rmsd_lag, magnitude_differences, velocity_differences, lag_differences, residual_time_pos, residual_height_pos
+    # sigma values estimated from the data
+    sigma_abs_mag = threshold_mag # np.std(abs_mag_real - abs_mag_sim_interp)
+    sigma_vel = threshold_vel # np.std(vel_kms_real - vel_kms_sim_interp)
+    sigma_lag = threshold_lag # np.std(lag_kms_real - lag_kms_sim_interp)
+
+    # Compute the chi-squared statistics
+    chi2_mag = np.sum((magnitude_differences / sigma_abs_mag) ** 2)
+    chi2_vel = np.sum((velocity_differences / sigma_vel) ** 2)
+    chi2_lag = np.sum((lag_differences / sigma_lag) ** 2)
+
+    # Degrees of freedom (assuming no parameters estimated from data)
+    dof_mag = len(abs_mag_real) - 0 # Adjust if you have fitted parameters
+    dof_vel = len(vel_kms_real) - 0
+    dof_lag = len(lag_kms_real) - 0
+
+    # Reduced chi-squared
+    chi2_red_mag = chi2_mag / dof_mag
+    chi2_red_vel = chi2_vel / dof_vel
+    chi2_red_lag = chi2_lag / dof_lag
+
+    p_value_mag = 1 - chi2.cdf(chi2_mag, dof_mag)
+    p_value_vel = 1 - chi2.cdf(chi2_vel, dof_vel)
+    p_value_lag = 1 - chi2.cdf(chi2_lag, dof_lag)
+
+    # Define the significance level (alpha)
+    alpha = 0.05 # Corresponds to 95% confidence level
+
+    # Define thresholds
+    chi2_red_threshold_lower = 0.5 # Lower bound for reduced chi-squared
+    chi2_red_threshold_upper = 1.5 # Upper bound for reduced chi-squared
+
+    # check if any is nan and if so substitute that with 9999
+    if np.isnan(chi2_mag):
+        chi2_mag = 9999
+    if np.isnan(chi2_vel):
+        chi2_vel = 9999
+    if np.isnan(chi2_lag):
+        chi2_lag = 9999
+    if np.isnan(chi2_red_mag):
+        chi2_red_mag = 9999
+    if np.isnan(chi2_red_vel):
+        chi2_red_vel = 9999
+    if np.isnan(chi2_red_lag):
+        chi2_red_lag = 9999
+    if np.isnan(p_value_mag):
+        p_value_mag = 9999
+    if np.isnan(p_value_vel):
+        p_value_vel = 9999
+    if np.isnan(p_value_lag):
+        p_value_lag = 9999
+
+    # Initialize results dictionary
+    chi2_results = {
+        'chi2_mag': chi2_mag,
+        'chi2_red_mag': chi2_red_mag,
+        'p_value_mag': p_value_mag,
+        'chi2_vel': chi2_vel,
+        'chi2_red_vel': chi2_red_vel,
+        'p_value_vel': p_value_vel,
+        'chi2_len': chi2_lag,
+        'chi2_red_len': chi2_red_lag,
+        'p_value_len': p_value_lag,
+    }
+
+    return chi2_red_mag, chi2_red_vel, chi2_red_lag, rmsd_mag, rmsd_vel, rmsd_lag, magnitude_differences, velocity_differences, lag_differences, residual_time_pos, residual_height_pos, lag_kms_sim

-def PCA_LightCurveRMSDPLOT_optimize(df_sel_shower, df_obs_shower, output_dir, fit_funct='', gen_Metsim='', mag_noise_real = 0.1, len_noise_real = 20.0, file_name_obs='', number_event_to_optimize=0, run_optimization=True):

-    # merge curr_sel and curr_obs
-    curr_sel = df_sel_shower.copy()
+def compute_chi2_red_thresholds(confidence_level, degrees_of_freedom): # 0.95, len(residuals_mag)
+    # Significance level
+    alpha = 1 - confidence_level # e.g., 0.10 for 90% confidence level
+
+    # Lower and upper percentiles
+    lower_percentile = alpha / 2
+    upper_percentile = 1 - (alpha / 2)
+
+    # Critical chi-squared values
+    chi2_lower = chi2.ppf(lower_percentile, degrees_of_freedom)
+    chi2_upper = chi2.ppf(upper_percentile, degrees_of_freedom)
+
+    # Thresholds for reduced chi-squared
+    chi2_red_threshold_lower = chi2_lower / degrees_of_freedom
+    chi2_red_threshold_upper = chi2_upper / degrees_of_freedom
+
+    return chi2_red_threshold_lower, chi2_red_threshold_upper

-    pd_datafram_PCA_selected_optimized=pd.DataFrame()
+def order_base_on_both_RMSD(pd_datafram_PCA_sim):

-    # sigma5=5
+    # deep copy pd_datafram_PCA_sim
+    pd_datafram_check_RMSD = pd_datafram_PCA_sim.copy(deep=True)

-    # 5 sigma confidence
interval - # five_sigma=False - # mag_noise = MAG_RMSD*SIGMA_ERR - # len_noise = LEN_RMSD*SIGMA_ERR - mag_noise = mag_noise_real.copy() - len_noise = len_noise_real.copy() + # Normalize the columns to bring them to the same scale + pd_datafram_check_RMSD['rmsd_mag_norm'] = pd_datafram_check_RMSD['rmsd_mag'] / pd_datafram_check_RMSD['rmsd_mag'].max() + pd_datafram_check_RMSD['rmsd_len_norm'] = pd_datafram_check_RMSD['rmsd_len'] / pd_datafram_check_RMSD['rmsd_len'].max() - # # Standard deviation of the magnitude Gaussian noise 1 sigma - # # SD of noise in length (m) 1 sigma in km - len_noise= len_noise/1000 - # velocity noise 1 sigma km/s - # vel_noise = (len_noise*np.sqrt(2)/(1/FPS)) - vel_noise = (len_noise/(1/FPS)) + # Compute the combined metric (e.g., sum of absolute normalized values) + pd_datafram_check_RMSD['combined_RMSD_metric'] = abs(pd_datafram_check_RMSD['rmsd_mag_norm']) + abs(pd_datafram_check_RMSD['rmsd_len_norm']) - # # put the first plot in 2 sublots - # fig, ax = plt.subplots(1, 2, figsize=(17, 5)) + # Sort the DataFrame based on the combined metric + pd_datafram_check_RMSD = pd_datafram_check_RMSD.sort_values(by='combined_RMSD_metric') - # # group by solution_id_dist and keep only n_confront_sel from each group - # curr_sel = curr_sel.groupby('solution_id_dist').head(len(number_event_to_optimize)) - # check if distance_meteor is in the columns - no_distance_flag = False - if 'distance_meteor' in curr_sel.columns: - # order by distance_meteor - curr_sel = curr_sel.sort_values('distance_meteor') - else: - no_distance_flag = True + pd_datafram_check_RMSD = pd_datafram_check_RMSD.reset_index(drop=True) - if number_event_to_optimize == 0: - number_event_to_optimize = len(df_sel_shower) + # delete rmsd_mag_norm and rmsd_len_norm and combined_RMSD_metric + pd_datafram_check_RMSD = pd_datafram_check_RMSD.drop(columns=['rmsd_mag_norm', 'rmsd_len_norm', 'combined_RMSD_metric']) - # pick from the first n_confront_sel - curr_sel = curr_sel.head(number_event_to_optimize) + return pd_datafram_check_RMSD - # # count duplicates and add a column for the number of duplicates - # curr_sel['num_duplicates'] = curr_sel.groupby('solution_id')['solution_id'].transform('size') - # curr_sel.drop_duplicates(subset='solution_id', keep='first', inplace=True) - curr_sel['erosion_coeff']=curr_sel['erosion_coeff']*1000000 - curr_sel['sigma']=curr_sel['sigma']*1000000 - - # check if end with pickle - if df_obs_shower.iloc[0]['solution_id'].endswith('.pickle'): - data_file_real = read_pickle_reduction_file(df_obs_shower.iloc[0]['solution_id']) - elif df_obs_shower.iloc[0]['solution_id'].endswith('.json'): - data_file_real = read_with_noise_GenerateSimulations_output(df_obs_shower.iloc[0]['solution_id']) - _, _, _, residuals_mag_real, residuals_vel_real, _, residual_time_pos_real, residual_height_pos_real = RMSD_calc_diff(data_file_real, fit_funct) - # Get the default color cycle - color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color'] - # Create an infinite cycle of colors - infinite_color_cycle = itertools.cycle(color_cycle) - for ii in range(len(curr_sel)): - fig, ax = plt.subplots(2, 3, figsize=(14, 6),gridspec_kw={'height_ratios': [ 3, 0.5],'width_ratios': [ 3, 0.5, 3]}) - # fig, ax = plt.subplots(2, 4) - # flat the ax - ax = ax.flatten() - - # pick the ii element of the solution_id column - namefile_sel=curr_sel.iloc[ii]['solution_id'] - Metsim_flag=False - # chec if the file exist - if not os.path.isfile(namefile_sel): - print('file '+namefile_sel+' not found') - continue - else: - if 
namefile_sel.endswith('.pickle'): - data_file = read_pickle_reduction_file(namefile_sel) - elif namefile_sel.endswith('.json'): - # open the json file with the name namefile_sel - f = open(namefile_sel,"r") - data = json.loads(f.read()) - if 'ht_sampled' in data: - data_file = read_GenerateSimulations_output(namefile_sel, data_file_real) - else: - if gen_Metsim == '': - print('no data for the Metsim file') - continue - else: - # make a copy of gen_Metsim - data_file = gen_Metsim.copy() - # file metsim - Metsim_flag=True - rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(data_file, fit_funct) - print('real noise mag', round(mag_noise_real,3),''+str(SIGMA_ERR)+'sig',round(MAG_RMSD*SIGMA_ERR,3),''+str(SIGMA_ERR*2)+'sig',round(MAG_RMSD*SIGMA_ERR*2,3),'|| Event noise mag', round(rmsd_mag,3), '\nreal noise len', round(len_noise_real/1000,3),''+str(SIGMA_ERR)+'sig',round(LEN_RMSD*SIGMA_ERR,3),''+str(SIGMA_ERR*2)+'sig',round(LEN_RMSD*MAG_RMSD*2,3),'|| Event noise len', round(rmsd_lag,3)) - plot_side_by_side(data_file_real, fig, ax, 'go', file_name_obs[:15]+'\nRMSDmag '+str(round(mag_noise_real,3))+' RMSDlen '+str(round(len_noise_real/1000,3)), residuals_mag_real, residuals_vel_real, residual_time_pos_real, residual_height_pos_real, fit_funct, mag_noise, vel_noise, 'Std.dev. realizations') - - color_line=next(infinite_color_cycle) - if Metsim_flag: - - # plot_side_by_side(data_file, fig, ax, '-k', ii, residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - if no_distance_flag: - plot_side_by_side(data_file, fig, ax, '-k', 'Metsim data event\n\ -RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ - rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ - er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - else: - plot_side_by_side(data_file, fig, ax, '-k', 'Metsim data event\n\ -RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - N°duplic. 
'+str(round(curr_sel.iloc[ii]['num_duplicates']))+' min dist:'+str(round(curr_sel.iloc[ii]['distance_meteor'],2))+'\n\ - m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ - rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ - er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - - - else: - # if color_line == '#2ca02c': - # color_line='m' +# MAIN FUNCTION ################################################################################## - # plot_side_by_side(data_file, fig, ax, '-', ii, residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - if no_distance_flag: - plot_side_by_side(data_file, fig, ax, '-','RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ - rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ - er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) +def main_PhysUncert(trajectory_file, file_name, input_folder, output_folder, trajectory_Metsim_file, cml_args_user): + #copy cml_args_user + cml_args = copy.deepcopy(cml_args_user) - else: - plot_side_by_side(data_file, fig, ax, '-','RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - N°duplic. '+str(round(curr_sel.iloc[ii]['num_duplicates']))+' min dist:'+str(round(curr_sel.iloc[ii]['distance_meteor'],2))+'\n\ - m:'+str('{:.2e}'.format(curr_sel.iloc[ii]['mass'],1))+' F:'+str(round(curr_sel.iloc[ii]['F'],2))+'\n\ - rho:'+str(round(curr_sel.iloc[ii]['rho']))+' sigma:'+str(round(curr_sel.iloc[ii]['sigma'],4))+'\n\ - er.height:'+str(round(curr_sel.iloc[ii]['erosion_height_start'],2))+' er.log:'+str(round(curr_sel.iloc[ii]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(curr_sel.iloc[ii]['erosion_coeff'],3))+' er.index:'+str(round(curr_sel.iloc[ii]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - # change first line color - ax[0].lines[1].set_color(color_line) - ax[1].lines[1].set_color(color_line) - ax[2].lines[1].set_color(color_line) - ax[5].lines[1].set_color(color_line) - - # split the name from the path - _, file_name_title = os.path.split(curr_sel.iloc[ii]['solution_id']) - # suptitle of the plot - fig.suptitle(file_name_title) - - # pu the leggend putside the plot and adjust the plot base on the screen size - ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.) 
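The new main_PhysUncert() entry point added above immediately deep-copies the parsed command-line arguments (cml_args = copy.deepcopy(cml_args_user)). A minimal sketch of why that matters when the same argparse namespace is reused for several events in one run; len_rmsd is one of the options this function uses later, but the default value below is made up for the demo:

    import argparse
    import copy

    def process_event(cml_args_user):
        # work on a private copy, so per-event unit conversions or overrides
        # never leak into the namespace the caller reuses for the next event
        cml_args = copy.deepcopy(cml_args_user)
        cml_args.len_rmsd *= 1000  # e.g., km -> m, for this event only
        return cml_args.len_rmsd

    parser = argparse.ArgumentParser()
    parser.add_argument('--len_rmsd', type=float, default=0.02)
    args = parser.parse_args([])
    process_event(args)
    assert args.len_rmsd == 0.02  # the original namespace is untouched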
- # the legend do not fit in the plot, so adjust the plot - plt.subplots_adjust(right=.7) - plt.subplots_adjust(wspace=0.2) + print('processing file:',file_name) + print(trajectory_file) + print(input_folder) + print(output_folder) + print(trajectory_Metsim_file) - # make more space - plt.tight_layout() - - file_json_save_phys_NOoptimized=output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title - if Metsim_flag: - file_json_save_phys=output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title[:23]+'_fitted.json' - file_json_save_results=output_dir+os.sep+SAVE_RESULTS_FOLDER_EVENTS_PLOTS+os.sep+file_name_title[:23]+'_fitted.json' - const_nominal, _ = loadConstants(namefile_sel) - saveConstants(const_nominal,output_dir,file_name_obs+'_sim_fit.json') - else: - file_json_save_phys=output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_obs[:15]+'_'+file_name_title[:23]+'_fitted.json' - file_json_save_results=output_dir+os.sep+SAVE_RESULTS_FOLDER_EVENTS_PLOTS+os.sep+file_name_obs[:15]+'_'+file_name_title[:23]+'_fitted.json' - # from namefile_sel json file open the json file and save the namefile_sel.const part as file_name_obs+'_sim_fit.json' - with open(namefile_sel) as json_file: - data = json.load(json_file) - const_part = data['const'] - with open(output_dir+os.sep+file_name_obs+'_sim_fit.json', 'w') as outfile: - json.dump(const_part, outfile, indent=4) + # take the name of the last folder in output_folder + folder_name_output = output_folder.split(os.sep)[-1] + print('folder_name_output:',folder_name_output) - shutil.copy(namefile_sel, file_json_save_phys_NOoptimized) + if cml_args.delete_all: + # if presen the output_folder then delete all the files in the folder + if os.path.isdir(output_folder): + # remove all the files in the folder + shutil.rmtree(output_folder) + print('All files in the output folder have been deleted.') - if run_optimization: + print("Cleanup completed!") - # check if file_json_save_phys is present - if not os.path.isfile(file_json_save_phys): - if rmsd_mag<=mag_noise_real and rmsd_lag<=len_noise_real/1000: - print('below sigma noise, SAVED') + flag_manual_metsim=True + # check if it ends with _first_guess.json + if trajectory_Metsim_file.endswith('_first_guess.json'): + flag_manual_metsim=False - shutil.copy(output_dir+os.sep+file_name_obs+'_sim_fit.json', file_json_save_phys) + start_time = time.time() - pd_datafram_PCA_selected_optimized = pd.concat([pd_datafram_PCA_selected_optimized, curr_sel.iloc[ii]], axis=0) + # chek if input_folder+os.sep+file_name+NAME_SUFX_CSV_OBS exist + if os.path.isfile(output_folder+os.sep+file_name+NAME_SUFX_CSV_OBS): + # read the csv file + trajectory_file = output_folder+os.sep+file_name+NAME_SUFX_CSV_OBS - # suptitle of the plot - fig.suptitle(file_name_title+' PERFECT below sigma noise') + # check if the output_folder exists + if not os.path.isdir(output_folder): + mkdirP(output_folder) - # pu the leggend putside the plot and adjust the plot base on the screen size - ax[2].legend(bbox_to_anchor=(1.05, 1.0), loc='upper left', borderaxespad=0.) 
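The removed branch above saved a candidate only when both its magnitude RMSD and its lag RMSD fell at or below the observation noise. A standalone sketch of that acceptance rule (the helper name and the example numbers are ours, not the module's):

    def below_noise_level(rmsd_mag, rmsd_lag_km, mag_noise_real, len_noise_real_m):
        # rmsd_lag_km is in km while the length noise is given in m,
        # hence the /1000 that mirrors the len_noise_real/1000 comparison above
        return rmsd_mag <= mag_noise_real and rmsd_lag_km <= len_noise_real_m / 1000.0

    # 0.08 mag and a 15 m lag residual against 0.1 mag / 20 m noise -> saved
    print(below_noise_level(0.08, 0.015, 0.1, 20.0))  # True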
- # the legend do not fit in the plot, so adjust the plot - plt.subplots_adjust(right=.7) - plt.subplots_adjust(wspace=0.2) - # make more space - plt.tight_layout() - plt.savefig(output_dir+os.sep+SAVE_RESULTS_FOLDER_EVENTS_PLOTS+os.sep+file_name_title[:23]+'_RMSDmag'+str(round(rmsd_mag,2))+'_RMSDlen'+str(round(rmsd_lag,2))+'_Heigh_MagVelCoef.png') - shutil.copy(output_dir+os.sep+file_name_obs+'_sim_fit_fitted.json', file_json_save_results) + # check if the input_folder exists if the csv file has been already created + if trajectory_file.endswith('.csv'): + # read the csv file + pd_dataframe_obs_real = pd.read_csv(trajectory_file) + # check the column name solution_id and see if it matches a file i the folder + if not input_folder in pd_dataframe_obs_real['solution_id'][0]: + # if the solution_id is in the name of the file then the file is the real data + print('The folder of the csv file is different') + # check if the file is present in the folder + if not os.path.isfile(pd_dataframe_obs_real['solution_id'][0]): + print() + print('--- MODIFY OLD CSV FILE PATH ---') + # take the first element pd_dataframe_obs_real['solution_id'][0] and take only the path + old_input_folder = os.path.split(pd_dataframe_obs_real['solution_id'][0])[0] + # run the update_solution_ids function + print('old_input_folder',old_input_folder) + update_solution_ids(old_input_folder, input_folder) + + print() + + ######################### OBSERVATION ############################### + print('--- OBSERVATION ---') + + # add to save_res_fin_folder the file_name + save_results_folder=SAVE_RESULTS_FINAL_FOLDER+file_name #save_res_fin_folder + save_results_folder_events_plots = METEOR_PLOTS_JSON_FILE_FOLDER + + result_dir = output_folder+os.sep+save_results_folder + results_event_dir = output_folder+os.sep+save_results_folder_events_plots + mkdirP(result_dir) + mkdirP(results_event_dir) + + # check the extension of the file if it already present the csv file meas it has been aleady processed + if trajectory_file.endswith('.csv'): + # read the csv file + pd_dataframe_obs_real = pd.read_csv(trajectory_file) + + if pd_dataframe_obs_real['type'][0] != 'Observation' and pd_dataframe_obs_real['type'][0] != 'Observation_sim': + # raise an error saing that the type is wrong and canot be processed by PCA + raise ValueError('Type of the csv file is wrong and canot be processed by script.') + + trajectory_file = pd_dataframe_obs_real['solution_id'][0] + # print the file that is going to be processed + print('file to be processed:',trajectory_file) - plt.savefig(output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title[:23]+'_RMSDmag'+str(round(rmsd_mag,2))+'_RMSDlen'+str(round(rmsd_lag,2))+'_Heigh_MagVelCoef.png') - # close the plot - plt.close() - continue + if trajectory_file.endswith('.pickle'): + # read the pickle file + gensim_data_obs = read_pickle_reduction_file(trajectory_file) #,trajectory_Metsim_file - elif rmsd_mag1: + # set the value of rmsd_t0_lag=rmsd_mag_obs and len_RMSD=rmsd_lag_obs*conf_lvl + rmsd_t0_lag = np.array(cml_args.len_rmsd)/z_score + else: + # keep it in m instead of km + rmsd_t0_lag = np.array(cml_args.len_rmsd*1000) + + if rmsd_pol_mag1: + len_RMSD_real = rmsd_t0_lag/1000*z_score + else: + len_RMSD_real = rmsd_t0_lag*z_score + # ned in m instead of km + rmsd_t0_lag=rmsd_t0_lag*1000 - fig.suptitle(file_name_title+' BAD no optimization and no save') + # # Calculate the cumulative probability for the z-value, the confidence level is the percentage of the area within ±z_value + CONFIDENCE_LEVEL = (2 * 
stats.norm.cdf(z_score) - 1)*100 + print('CONFIDENCE LEVEL required : '+str(np.round(CONFIDENCE_LEVEL,3))+'%') + print('mag_RMSD:',mag_RMSD_real) + print('len_RMSD:',len_RMSD_real,'km') - plt.savefig(output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title[:23]+'_RMSDmag'+str(round(rmsd_mag,2))+'_RMSDlen'+str(round(rmsd_lag,2))+'_Heigh_MagVelCoef.png') + print() - # close the plot - plt.close() - continue - print('runing the optimization...') - # this creates a ew file called output_dir+os.sep+file_name_obs+'_sim_fit_fitted.json' - subprocess.run( - ['python', '-m', 'wmpl.MetSim.AutoRefineFit', - output_dir, 'AutoRefineFit_options.txt', '-x'], - # stdout=subprocess.PIPE, - # stderr=subprocess.PIPE, - text=True - ) + ######################## RANDOM SEARCH ############################### + print('--- RANDOM SEARCH ---') - # save the 20230811_082648_sim_fit_fitted.json as a json file in the output_dir+os.sep+SAVE_SELECTION_FOLDER+os.sep+file_name_title[:23]+'_sim_fit_fitted.json' - shutil.copy(output_dir+os.sep+file_name_obs+'_sim_fit_fitted.json', file_json_save_phys) - else: - print('file '+file_json_save_phys+' already exist, read it...') + # copy the file to the output_folder + shutil.copy(cml_args.ref_opt_path, output_folder+os.sep+'AutoRefineFit_options.txt') - _, gensim_data_optimized, pd_datafram_PCA_sim_optimized = run_simulation(file_json_save_phys, data_file_real) + # the file name from trajectory_Metsim_file so split it form the path + metsim_file_name = os.path.split(trajectory_Metsim_file)[1] + print('File :',metsim_file_name) + print('Run MetSim file:',trajectory_Metsim_file) + simulation_MetSim_object, gensim_data_Metsim, pd_datafram_Metsim = run_simulation(trajectory_Metsim_file, gensim_data_obs, fit_funct) - rmsd_mag, rmsd_vel, rmsd_lag, residuals_mag, residuals_vel, residuals_len, residual_time_pos, residual_height_pos = RMSD_calc_diff(gensim_data_optimized, fit_funct) + # print('metsim',gensim_data_Metsim['dens_co']) + # print('obs',gensim_data_obs['dens_co']) + if flag_manual_metsim: + dens_co = gensim_data_Metsim['dens_co'] + else: + dens_co = gensim_data_obs['dens_co'] + + flag_results_found_metsim = create_json_file_and_optimiz(gensim_data_Metsim, file_name, gensim_data_obs, fit_funct, results_event_dir, output_folder, 'Metsim') + # avoid carry over if a name with gensim_data_Metsim['name'] has not been found in the output_folder + if flag_results_found_metsim == False and flag_manual_metsim: + print('Try other MetSim manual solution:',trajectory_Metsim_file) + # break if requested from user + if cml_args.stop_bad_manual_sol: + print('Break requested from user (to avoid this set the stop_bad_manual_sol to False)') + return + elif flag_results_found_metsim: + # copy the trajectory_Metsim_file to the output_folder + shutil.copy(trajectory_Metsim_file, results_event_dir+os.sep+metsim_file_name) - print('real noise mag', round(mag_noise_real,3),''+str(SIGMA_ERR)+'sig',round(mag_noise_real*SIGMA_ERR,3),''+str(SIGMA_ERR*2)+'sig',round(MAG_RMSD*SIGMA_ERR*2,3),'|| Event noise mag', round(rmsd_mag,3), '\nreal noise len', round(len_noise_real/1000,3),''+str(SIGMA_ERR)+'sig',round(LEN_RMSD*SIGMA_ERR,3),''+str(SIGMA_ERR*2)+'sig',round(LEN_RMSD*SIGMA_ERR*2,3),'|| Event noise len', round(rmsd_lag,3)) + # Init simulation parameters with the given class name + _, pd_dataframe_ranges = range_gen_simulations(pd_dataframe_obs_real,simulation_MetSim_object, fps, dens_co, flag_manual_metsim) - if Metsim_flag: - - plot_side_by_side(gensim_data_optimized, fig, ax, 'k--', 'Optimized 
MetSim RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - m:'+str('{:.2e}'.format(pd_datafram_PCA_sim_optimized.iloc[0]['mass'],1))+' F:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['F'],2))+'\n\ - rho:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['rho']))+' sigma:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['sigma']*1000000,4))+'\n\ - er.height:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_height_start'],2))+' er.log:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_coeff']*1000000,3))+' er.index:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) + all_jsonfiles = get_json_files(results_event_dir) - else: - plot_side_by_side(gensim_data_optimized, fig, ax, '--', 'Optimized RMSDmag '+str(round(rmsd_mag,3))+' RMSDlen '+str(round(rmsd_lag,3))+'\n\ - m:'+str('{:.2e}'.format(pd_datafram_PCA_sim_optimized.iloc[0]['mass'],1))+' F:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['F'],2))+'\n\ - rho:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['rho']))+' sigma:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['sigma']*1000000,4))+'\n\ - er.height:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_height_start'],2))+' er.log:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_range'],1))+'\n\ - er.coeff:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_coeff']*1000000,3))+' er.index:'+str(round(pd_datafram_PCA_sim_optimized.iloc[0]['erosion_mass_index'],2)), residuals_mag, residuals_vel, residual_time_pos, residual_height_pos) - - # change first line color - ax[0].lines[-1].set_color(color_line) - ax[1].lines[-1].set_color(color_line) - ax[2].lines[-1].set_color(color_line) - ax[5].lines[-1].set_color(color_line) - ax[0].lines[-1].set_marker("x") - ax[1].lines[-1].set_marker("x") - ax[2].lines[-1].set_marker("x") - ax[5].lines[-1].set_marker("x") - - - if rmsd_mag10000: # w/o takes forever to plot - # pick randomly 10000 events - df_sim_shower_small=df_sim_shower_small.sample(n=10000) - if 'MetSim' not in df_sim_shower_small['type'].values and 'Real' not in df_sim_shower_small['type'].values: - df_sim_shower_small = pd.concat([df_sim_shower_small.iloc[[0]], df_sim_shower_small]) - - if save_log: - # check if a file with the name "log"+n_PC_in_PCA+"_"+str(len(df_sel))+"ev.txt" already exist - if os.path.exists(output_dir+os.sep+"log_"+file_name[:15]+"_CI"+str(n_PC_in_PCA)+"PC.txt"): - # remove the file - os.remove(output_dir+os.sep+"log_"+file_name[:15]+"_CI"+str(n_PC_in_PCA)+"PC.txt") - sys.stdout = Logger(output_dir,"log_"+file_name[:15]+"_CI"+str(n_PC_in_PCA)+"PC.txt") # _30var_99%_13PC + if flag_manual_metsim and flag_results_found_metsim: + # concatenate the two dataframes + pd_initial_results = pd.concat([pd_datafram_Metsim, pd_initial_results]) + # print(df_sim_shower) + pd_initial_results.reset_index(drop=True, inplace=True) + # check that the pd_datafram_Metsim is in th first row of pd_initial_results if not move it to the first row + if flag_manual_metsim and flag_results_found_metsim: + # look for the row with the same solution_id as pd_datafram_Metsim['solution_id'][0] + index = pd_initial_results[pd_initial_results['solution_id'] == pd_datafram_Metsim['solution_id'][0]].index + # move the row to the first row + pd_initial_results = pd_initial_results.drop(index) + pd_initial_results = pd.concat([pd_datafram_Metsim, 
pd_initial_results]) - curr_df_sim_sel = pd.concat([df_sim_shower_small,df_sel_shower], axis=0) + if pd_dataframe_obs_real['solution_id'].iloc[0].endswith('.json'): + print('REAL json file:',trajectory_Metsim_file) + # change the type column to Real + pd_initial_results['type'].iloc[0] = 'Real' + pd_datafram_Metsim['type'] = 'Real' - # multiply the erosion coeff by 1000000 to have it in km/s - curr_df_sim_sel['erosion_coeff']=curr_df_sim_sel['erosion_coeff']*1000000 - curr_df_sim_sel['sigma']=curr_df_sim_sel['sigma']*1000000 - curr_df_sim_sel['erosion_energy_per_unit_cross_section']=curr_df_sim_sel['erosion_energy_per_unit_cross_section']/1000000 - curr_df_sim_sel['erosion_energy_per_unit_mass']=curr_df_sim_sel['erosion_energy_per_unit_mass']/1000000 + pd_initial_results = order_base_on_both_RMSD(pd_initial_results) - group_mapping = { - 'Simulation_sel': 'selected', - 'MetSim': 'simulated', - 'Real': 'simulated', - 'Simulation': 'simulated' - } - curr_df_sim_sel['group'] = curr_df_sim_sel['type'].map(group_mapping) + pd_initial_results.to_csv(output_folder+os.sep+file_name+NAME_SUFX_CSV_RESULTS, index=False) + # print saved csv file + print('saved sim csv file:',output_folder+os.sep+file_name+NAME_SUFX_CSV_RESULTS) - curr_df_sim_sel['num_group'] = curr_df_sim_sel.groupby('group')['group'].transform('size') - curr_df_sim_sel['weight'] = 1 / curr_df_sim_sel['num_group'] + # save the trajectory_file in the output_folder + shutil.copy(pd_dataframe_obs_real['solution_id'][0], output_folder) - curr_df_sim_sel['num_type'] = curr_df_sim_sel.groupby('type')['type'].transform('size') - curr_df_sim_sel['weight_type'] = 1 / curr_df_sim_sel['num_type'] + # delete any file that end with _good_files.txt in the output_folder + files = [f for f in os.listdir(output_folder) if f.endswith('_good_files.txt')] + for file in files: + os.remove(os.path.join(output_folder, file)) - curr_sel = curr_df_sim_sel[curr_df_sim_sel['group'] == 'selected'].copy() - # curr_sim = curr_df_sim_sel[curr_df_sim_sel['group'] == 'simulated'].copy() + result_number = len(all_jsonfiles) - # with color based on the shower but skip the first 2 columns (shower_code, shower_id) - to_plot=['mass','rho','sigma','erosion_height_start','erosion_coeff','erosion_mass_index','erosion_mass_min','erosion_mass_max','erosion_range','erosion_energy_per_unit_mass','erosion_energy_per_unit_cross_section','erosion_energy_per_unit_cross_section'] - # to_plot_unit=['mass [kg]','rho [kg/m^3]','sigma [s^2/km^2]','erosion height start [km]','erosion coeff [s^2/km^2]','erosion mass index [-]','log eros. mass min [kg]','log eros. mass max [kg]','log eros. mass range [-]','erosion energy per unit mass [MJ/kg]','erosion energy per unit cross section [MJ/m^2]','erosion energy per unit cross section [MJ/m^2]'] - to_plot_unit = [r'$m_0$ [kg]', r'$\rho$ [kg/m$^3$]', r'$\sigma$ [s$^2$/km$^2$]', r'$h_{e}$ [km]', r'$\eta$ [s$^2$/km$^2$]', r'$s$ [-]', r'log($m_{l}$) [-]', r'log($m_{u}$) [-]',r'log($m_{u}$)-log($m_{l}$) [-]'] + print() + - fig, axs = plt.subplots(3, 3) - # from 2 numbers to one numbr for the subplot axs - axs = axs.flatten() + ######################## ITERATIVE SEARCH ############################### + + print('--- ITERATIVE SEARCH ---') + + flag_fail = False + # old_results_number could be deleted... 
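pd_initial_results is ranked above with order_base_on_both_RMSD(), which scores each candidate by the sum of its max-normalized magnitude and length RMSDs. A self-contained sketch of that ranking on made-up numbers:

    import pandas as pd

    df = pd.DataFrame({'rmsd_mag': [0.10, 0.05, 0.08],
                       'rmsd_len': [0.020, 0.030, 0.010]})

    # Normalize each column by its maximum so the two residuals are comparable,
    # then sort by the summed score (smaller means jointly better fits first)
    score = df['rmsd_mag'] / df['rmsd_mag'].max() + df['rmsd_len'] / df['rmsd_len'].max()
    df_ranked = df.loc[score.sort_values().index].reset_index(drop=True)
    print(df_ranked)  # the row with jointly small mag and len RMSD comes first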
+    old_results_number = result_number
+    # save as recursive results
+    pd_results = pd_initial_results.copy(deep=True)
+    ii_repeat = 0
+    # check_change = ['mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max', 'erosion_range', 'erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass']
+    check_change = ['erosion_energy_per_unit_cross_section', 'erosion_energy_per_unit_mass']
+    # while cml_args.min_nresults > result_number:
+    print(cml_args.min_nresults,'results to find:')
+
+    # make dir for result_dir+os.sep+'Physical_characteristics'
+    physChar_dir = result_dir+os.sep+'Physical_characteristics'
+    mkdirP(physChar_dir)
+    # make dir for result_dir+os.sep+'Correlation_matrix'
+    corrMat_dir = result_dir+os.sep+'Correlation_matrix'
+    mkdirP(corrMat_dir)
+
+    # do-while loop: the body runs at least once and breaks in the if condition below
+    while True:
+        print('Number of results:',result_number) # erosion_energy_per_unit_cross_section erosion_energy_per_unit_mass
+        # widen the bounds of every check_change variable whenever the results fall outside the current pd_dataframe_ranges
+        for check in check_change:
+            if check in pd_results.columns:
+                if pd_dataframe_ranges[check].min() > pd_results[check].min():
+                    print('Change the MIN value of',check,'to',pd_results[check].min(),'before:',pd_dataframe_ranges[check].min())
+                    pd_dataframe_ranges[check].iloc[0] = pd_results[check].min()
+                    # if check == 'erosion_energy_per_unit_cross_section' or check == 'erosion_energy_per_unit_mass':
+                    #     pd_dataframe_ranges[check].iloc[0] = 0
+                if pd_dataframe_ranges[check].max() < pd_results[check].max():
+                    print('Change the MAX value of',check,'to',pd_results[check].max(),'before:',pd_dataframe_ranges[check].max())
+                    pd_dataframe_ranges[check].iloc[1] = pd_results[check].max()
+
+        if 'solution_id' in pd_results.columns:
+            print('PLOT: the physical characteristics results')
+            PhysicalPropPLOT(pd_results, pd_dataframe_ranges, physChar_dir, file_name)
+            print('PLOT: correlation matrix of the results (takes a long time)')
+            correlation_selPLOT(pd_dataframe_ranges, pd_results, corrMat_dir)
+            print('PLOT: best 10 results and add the RMSD value to csv selected')
+            # pd_results_ordered = order_base_on_both_RMSD(pd_results)
+            LightCurveCoefPLOT(pd_results, pd_dataframe_obs_real, result_dir, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file,result_dir+os.sep+file_name+'_sim_sel_results.csv', vel_lagplot='lag')
+            LightCurveCoefPLOT(pd_results, pd_dataframe_obs_real, result_dir, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file,result_dir+os.sep+file_name+'_sim_sel_results.csv', vel_lagplot='vel')
+            print('PLOT: the sigma range waterfall plot')
+            sigma_waterfallPLOT(pd_results, pd_dataframe_ranges, gensim_data_obs['rmsd_mag'], gensim_data_obs['rmsd_len'], result_dir, file_name)
+            print()
+            print('SUCCES: the physical characteristics range is in the results folder')
+        else:
+            # print('FAIL: Not found any result below magRMSD',rmsd_pol_mag,'and lenRMSD',rmsd_t0_lag/1000)
+            print('FAIL: Not found any result below magRMSD',mag_RMSD_real,'and lenRMSD',len_RMSD_real)
+            flag_fail = True
+            break
+
+        CI_physical_param = CI_range_gen_sim(pd_results, ii_repeat, old_results_number,pd_dataframe_ranges)
-
-        print('\\hline')
-        if len(Min_KDE_point) > 0:
-            # print('var & $real$ & $1D_{KDE}$ &
$1D_{KDE}\\%_{dif}$ & $allD_{KDE}$ & $allD_{KDE}\\%_{dif}$\\\\') - # print('var & real & mode & min$_{KDE}$ & -1\\sigma/+1\\sigma & -2\\sigma/+2\\sigma \\\\') - print('Variables & '+str(df_sim_shower['type'].iloc[0])+' & Mode & Dens.Point $ & 95\\%CIlow & 95\\%CIup \\\\') - else: - print('Variables & '+str(df_sim_shower['type'].iloc[0])+' & Mode & 95\\%CIlow & 95\\%CIup \\\\') + print('CI_physical_param:',CI_physical_param) - ii_densest=0 - for i in range(9): - # put legendoutside north - plotvar=to_plot[i] + # result_number = len(pd_results) + all_jsonfiles = get_json_files(results_event_dir) + result_number = len(all_jsonfiles) - if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': - # take the log of the erosion_mass_min and erosion_mass_max - curr_df_sim_sel[plotvar]=np.log10(curr_df_sim_sel[plotvar]) - curr_sel[plotvar]=np.log10(curr_sel[plotvar]) - if len(Min_KDE_point) > 0: - Min_KDE_point[ii_densest]=np.log10(Min_KDE_point[ii_densest]) - # Min_KDE_point[ii_densest-1]=np.log10(Min_KDE_point[ii_densest-1]) - # sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'],hue='shower_code', ax=axs[i], kde=True, palette='bright', bins=20) - sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette='bright', bins=20) - unique_values_count = curr_sel[plotvar].nunique() - if unique_values_count > 1: - # # add the kde to the plot probability density function - sns.histplot(curr_sel, x=curr_sel[plotvar], weights=curr_sel['weight'], bins=20, ax=axs[i], fill=False, edgecolor=False, color='r', kde=True, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])]) - kde_line = axs[i].lines[-1] - axs[i].lines[-1].remove() + if cml_args.min_nresults <= result_number: + # print the number of results found + print('SUCCES: Number of results found:',result_number) + break else: - kde_line = None + if old_results_number == result_number: + print('Same number of results found:',result_number) + ii_repeat+=1 + # if ii_repeat==cml_args.ntry: + # print('STOP: After '+str(cml_args.ntry)+' failed attempt') + # print('STOP: No new simulation below magRMSD',mag_RMSD_real,'and lenRMSD',len_RMSD_real) + # print('STOP: Number of results found:',result_number) + # flag_fail = True + # break - # if the only_select_meteors_from is equal to any curr_df_sim_sel plot the observed event value as a vertical red line - # check if curr_df_sim_sel['type']=='MetSim' is in the curr_df_sim_sel['type'].values - if 'MetSim' in curr_df_sim_sel['type'].values: - # get the value of the observed event - axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='MetSim'][plotvar].values[0], color='k', linewidth=2) - elif 'Real' in curr_df_sim_sel['type'].values: - axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='Real'][plotvar].values[0], color='g', linewidth=2, linestyle='--') + old_results_number = result_number - if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': - # put it back as it was - curr_df_sim_sel[plotvar]=10**curr_df_sim_sel[plotvar] - curr_sel[plotvar]=10**curr_sel[plotvar] + # every 10 adjust the CI and recompute + look_for_n_sim=result_number+10 + if look_for_n_sim > cml_args.min_nresults: # look_for_n_sim+5 + look_for_n_sim = cml_args.min_nresults - # get te 97.72nd percentile and the 2.28th percentile of curr_sel[plotvar] and call them sigma_97 and sigma_2 - sigma_95=np.percentile(curr_sel[plotvar], 95) - sigma_84=np.percentile(curr_sel[plotvar], 84.13) - 
sigma_15=np.percentile(curr_sel[plotvar], 15.87) - sigma_5=np.percentile(curr_sel[plotvar], 5) + print('regenerate new simulation in the CI range') + generate_simulations(pd_dataframe_obs_real, simulation_MetSim_object, gensim_data_obs, fit_funct, look_for_n_sim, cml_args.cores, results_event_dir, output_folder, file_name,fps,dens_co, flag_manual_metsim, CI_physical_param) - if kde_line is not None: - # Get the x and y data from the KDE line - kde_line_Xval = kde_line.get_xdata() - kde_line_Yval = kde_line.get_ydata() + all_jsonfiles = get_json_files(results_event_dir) - # Find the index of the maximum y value - max_index = np.argmax(kde_line_Yval) - if i!=8: - # Plot a dot at the maximum point - # axs[i].plot(kde_line_Xval[max_index], kde_line_Yval[max_index], 'ro') # 'ro' for red dot - axs[i].axvline(x=kde_line_Xval[max_index], color='red', linestyle='-.') - - x_10mode=kde_line_Xval[max_index] - if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': - x_10mode=10**kde_line_Xval[max_index] - - if len(Min_KDE_point) > 0: - if len(Min_KDE_point)>ii_densest: - - # Find the index with the closest value to densest_point[ii_dense] to all y values - densest_index = find_closest_index(kde_line_Xval, [Min_KDE_point[ii_densest]]) - - # add also the densest_point[i] as a blue dot - # axs[i].plot(Min_KDE_point[ii_densest], kde_line_Yval[densest_index[0]], 'bo') - axs[i].axvline(x=Min_KDE_point[ii_densest], color='blue', linestyle='-.') - - if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': - Min_KDE_point[ii_densest]=10**(Min_KDE_point[ii_densest]) - - if i<9: - print('\\hline') #df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0] - # print(f"{to_plot_unit[i]} & ${'{:.4g}'.format(df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0])}$ & ${'{:.4g}'.format(x_10mode)}$ & $ {'{:.2g}'.format(percent_diff_1D[i])}$\\% & $ {'{:.4g}'.format(densest_point[i])}$ & $ {'{:.2g}'.format(percent_diff_allD[i])}$\\% \\\\") - # print(to_plot_unit[i]+'& $'+str(x[max_index])+'$ & $'+str(percent_diff_1D[i])+'$\\% & $'+str(densest_point[ii_densest])+'$ & $'+str(percent_diff_allD[i])+'\\% \\\\') - # print(f"{to_plot_unit[i]} & ${'{:.4g}'.format(df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0])}$ & ${'{:.4g}'.format(x_10mode)}$ & $ {'{:.2g}'.format(percent_diff_1D[i])}$\\% & $ {'{:.4g}'.format(densest_point[i])}$ & $ {'{:.2g}'.format(percent_diff_allD[i])}$\\% \\\\") - # print(f"{to_plot_unit[i]} & {'{:.4g}'.format(df_sel_save[df_sel_save['solution_id']==only_select_meteors_from][plotvar].values[0])} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(densest_point[i])} & {'{:.4g}'.format(sigma_15)} / {'{:.4g}'.format(sigma_84)} & {'{:.4g}'.format(sigma_2)} / {'{:.4g}'.format(sigma_97)} \\\\") - print(f"{to_plot_unit[i]} & {'{:.4g}'.format(curr_df_sim_sel[plotvar].iloc[0])} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(Min_KDE_point[i])} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(sigma_95)} \\\\") - ii_densest=ii_densest+1 - else: - if i<9: - print('\\hline') - print(f"{to_plot_unit[i]} & {'{:.4g}'.format(curr_df_sim_sel[plotvar].iloc[0])} & {'{:.4g}'.format(x_10mode)} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(sigma_95)} \\\\") - else: - if i<9: - print('\\hline') - print(f"{to_plot_unit[i]} & {'{:.4g}'.format(curr_df_sim_sel[plotvar].iloc[0])} & {'{:.4g}'.format(sigma_5)} & {'{:.4g}'.format(sigma_95)} \\\\") + print('start reading the json files') - axs[i].set_ylabel('probability') - 
axs[i].set_xlabel(to_plot_unit[i]) + print('Number of simulated files: ',len(all_jsonfiles)) - # check if y axis is above 1 if so set_ylim(0,1) - if axs[i].get_ylim()[1]>1: - axs[i].set_ylim(0,1) - - # # plot the legend outside the plot - # axs[i].legend() - axs[i].get_legend().remove() + input_list = [[all_jsonfiles[ii], 'simulation_'+str(ii+1), fit_funct, gensim_data_obs, True] for ii in range(len(all_jsonfiles))] + results_list = domainParallelizer(input_list, read_GenerateSimulations_output_to_PCA, cores=cml_args.cores) + # if no read the json files in the folder and create a new csv file + pd_results = pd.concat(results_list) + # reset index + pd_results.reset_index(drop=True, inplace=True) + + # give to every row the type Simulation_sel + pd_results['type'] = 'Simulation_sel' + + # # # check if any of them has in the solution_id the same as the pd_datafram_Metsim split by os.sep and take the last element + # if flag_manual_metsim and flag_results_found_metsim: + # for ii in range(len(pd_results)): + # # split the solution_id by os.sep and take the last element + # if metsim_file_name in pd_results['solution_id'].iloc[ii].split(os.sep)[-1]: + # pd_results['type'].iloc[ii] = pd_datafram_Metsim['type'].iloc[0] + # pd_results['solution_id'].iloc[ii] = pd_datafram_Metsim['solution_id'].iloc[0] + + # if flag_manual_metsim and flag_results_found_metsim: + # # Create a boolean mask to identify rows where the file name matches + # mask = pd_results['solution_id'].apply(lambda x: metsim_file_name in x.split(os.sep)[-1]) + + # # Update the 'type' column and reassign the solution_id to itself (unnecessary but explicit) + # pd_results.loc[mask, 'type'] = pd_datafram_Metsim['type'].iloc[0] + # pd_results.loc[mask, 'solution_id'] = pd_results.loc[mask, 'solution_id'] - if i==0: - # place the xaxis exponent in the bottom right corner - axs[i].xaxis.get_offset_text().set_x(1.10) - - # # more space between the subplots erosion_coeff sigma - plt.tight_layout() - - print('\\hline') - - - # save the figure maximized and with the right name - fig.savefig(output_dir+os.sep+file_name+'_PhysicProp'+str(n_PC_in_PCA)+'PC_'+str(len(curr_sel))+'ev.png', dpi=300) # _dist'+str(np.round(np.min(curr_sel['distance_meteor']),2))+'-'+str(np.round(np.max(curr_sel['distance_meteor']),2))+' - - # close the figure - plt.close() - - if save_log: - # Close the Logger to ensure everything is written to the file STOP COPY in TXT file - sys.stdout.close() - - # Reset sys.stdout to its original value if needed - sys.stdout = sys.__stdout__ - ii_densest=0 - if 'solution_id_dist' in df_sel_shower_real.columns: - # the plot can get suck if too many reliazations - if len(df_sel_shower_real['solution_id_dist'].unique())<60: - if len(df_sel_shower_real['solution_id_dist'].unique())>1: - print('plot the distribution of the Realization',len(df_sel_shower_real['solution_id_dist'].unique())) - fig, axs = plt.subplots(3, 3) - # from 2 numbers to one numbr for the subplot axs - axs = axs.flatten() - - # ii_densest=0 - for i in range(9): - # put legendoutside north - plotvar=to_plot[i] - - if plotvar == 'erosion_mass_min' or plotvar == 'erosion_mass_max': - - sns.histplot(curr_df_sim_sel, x=np.log10(curr_df_sim_sel[plotvar]), weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette='bright', bins=20, binrange=[np.log10(np.min(curr_df_sim_sel[plotvar])),np.log10(np.max(curr_df_sim_sel[plotvar]))]) - # sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'],hue='solution_id_dist', ax=axs[i], 
multiple="stack", kde=True, bins=20, binrange=[np.min(df_sel_save[plotvar]),np.max(df_sel_save[plotvar])]) - sns.histplot(curr_df_sim_sel, x=np.log10(curr_df_sim_sel[plotvar]), weights=curr_df_sim_sel['weight'],hue='solution_id_dist', ax=axs[i], multiple="stack", bins=20, binrange=[np.log10(np.min(curr_df_sim_sel[plotvar])),np.log10(np.max(curr_df_sim_sel[plotvar]))]) - # # add the kde to the plot as a probability density function - sns.histplot(curr_sel, x=np.log10(curr_sel[plotvar]), weights=curr_sel['weight'], bins=20, ax=axs[i], multiple="stack", fill=False, edgecolor=False, color='r', kde=True, binrange=[np.log10(np.min(curr_df_sim_sel[plotvar])),np.log10(np.max(curr_df_sim_sel[plotvar]))]) - - kde_line = axs[i].lines[-1] - # delete from the plot the axs[i].lines[-1] - axs[i].lines[-1].remove() - - # if the only_select_meteors_from is equal to any curr_df_sim_sel plot the observed event value as a vertical red line - if 'MetSim' in curr_df_sim_sel['type'].values: - # get the value of the observed event - axs[i].axvline(x=np.log10(curr_df_sim_sel[curr_df_sim_sel['type']=='MetSim'][plotvar].values[0]), color='k', linewidth=2) - elif 'Real' in curr_df_sim_sel['type'].values: - axs[i].axvline(x=np.log10(curr_df_sim_sel[curr_df_sim_sel['type']=='Real'][plotvar].values[0]), color='g', linewidth=2, linestyle='--') - - # if len(Min_KDE_point) > 0: - # Min_KDE_point[ii_densest]=np.log10(Min_KDE_point[ii_densest]) - # # Min_KDE_point[ii_densest-1]=np.log10(Min_KDE_point[ii_densest-1]) - - else: + # change all the 'type' of pd_results to the one that matches the 'solution_id' of the pd_initial_results + if 'solution_id' in pd_results.columns and 'solution_id' in pd_initial_results.columns: + # Create a dictionary mapping 'solution_id' to 'type' from pd_initial_results + solution_type_map = pd_initial_results.set_index('solution_id')['type'].to_dict() - sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='group', ax=axs[i], palette='bright', bins=20, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])]) - # sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'],hue='solution_id_dist', ax=axs[i], multiple="stack", kde=True, bins=20, binrange=[np.min(df_sel_save[plotvar]),np.max(df_sel_save[plotvar])]) - sns.histplot(curr_df_sim_sel, x=curr_df_sim_sel[plotvar], weights=curr_df_sim_sel['weight'], hue='solution_id_dist', ax=axs[i], multiple="stack", bins=20, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])]) - # # add the kde to the plot as a probability density function - sns.histplot(curr_sel, x=curr_sel[plotvar], weights=curr_sel['weight'], bins=20, ax=axs[i], multiple="stack", fill=False, edgecolor=False, color='r', kde=True, binrange=[np.min(curr_df_sim_sel[plotvar]),np.max(curr_df_sim_sel[plotvar])]) - - kde_line = axs[i].lines[-1] - - # delete from the plot the axs[i].lines[-1] - axs[i].lines[-1].remove() - - # if the only_select_meteors_from is equal to any curr_df_sim_sel plot the observed event value as a vertical red line - if 'MetSim' in curr_df_sim_sel['type'].values: - # get the value of the observed event - axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='MetSim'][plotvar].values[0], color='k', linewidth=2) - elif 'Real' in curr_df_sim_sel['type'].values: - axs[i].axvline(x=curr_df_sim_sel[curr_df_sim_sel['type']=='Real'][plotvar].values[0], color='g', linewidth=2, linestyle='--') - # put the value of diff_percent_1d at th upper left of the line 
-
-        axs[i].set_ylabel('probability')
-        axs[i].set_xlabel(to_plot_unit[i])
-        # check if y axis is above 1 if so set_ylim(0,1)
-        if axs[i].get_ylim()[1]>1:
-            axs[i].set_ylim(0,1)
-
-        # # plot the legend outside the plot
-        # axs[i].legend()
-        axs[i].get_legend().remove()
-
-        # # Get the x and y data from the KDE line
-        # kde_line_Xval = kde_line.get_xdata()
-        # kde_line_Yval = kde_line.get_ydata()
-
-        # if i != 8:
-        #     axs[i].plot(kde_line_Xval[max_index], kde_line_Yval[max_index], 'ro')
-
-        # if i==0:
-        #     # place the xaxis exponent in the bottom right corner
-        #     axs[i].xaxis.get_offset_text().set_x(1.10)
-        # if len(Min_KDE_point) > 0:
-        #     if len(Min_KDE_point)>ii_densest:
-
-        #         # Find the index with the closest value to densest_point[ii_dense] to all y values
-        #         densest_index = find_closest_index(kde_line_Xval, [Min_KDE_point[ii_densest]])
-
-        #         # add also the densest_point[i] as a blue dot
-        #         axs[i].plot(Min_KDE_point[ii_densest], kde_line_Yval[densest_index[0]], 'bo')
-        #         ii_densest=ii_densest+1
-    # # more space between the subplots erosion_coeff sigma
-    plt.tight_layout()
-
-    # save the figure maximized and with the right name
-    fig.savefig(output_dir+os.sep+file_name+'_PhysicProp_Reliazations_'+str(n_PC_in_PCA)+'PC_'+str(len(curr_sel))+'ev.png', dpi=300)
-
-
-
-def PCA_LightCurveCoefPLOT(df_sel_shower_real, df_obs_shower, output_dir, fit_funct='', gensim_data_obs='', mag_noise_real= 0.1, len_noise_real = 20.0, file_name_obs='', trajectory_Metsim_file='', output_folder_of_csv=''):
-
-    # number to confront
-    n_confront_obs=1
-    if output_folder_of_csv=='':
-        n_confront_sel=7
-    else:
-        n_confront_sel=9
+    # Update 'type' in pd_results based on the mapping
+    pd_results['type'] = pd_results['solution_id'].map(solution_type_map).fillna(pd_results['type'])
+    print('Updated "type" values in pd_results based on pd_initial_results.')
-    # number of PC in PCA
-    with_noise=True
+    # Identify rows in pd_results whose 'solution_id' is NOT in pd_initial_results
+    no_mapping_mask = ~pd_results['solution_id'].isin(solution_type_map.keys())
+    pd_results.loc[no_mapping_mask, 'type'] = 'Simulation_sel' # 'Iteration'
+    print("Set 'Simulation_sel' for rows in pd_results that have no mapping in pd_initial_results.")
-    # is the input data noisy
-    noise_data_input=False
+    # Identify rows in pd_initial_results whose 'solution_id' is NOT in pd_results
+    missing_rows_mask = ~pd_initial_results['solution_id'].isin(pd_results['solution_id'])
+    missing_rows = pd_initial_results[missing_rows_mask]
-    # activate jachia
-    jacchia_fit=False
+    # If there are any rows with solution_ids not found in pd_results, append them
+    if not missing_rows.empty:
+        pd_results = pd.concat([missing_rows, pd_results], ignore_index=True)
+        print(f"Appended {len(missing_rows)} missing rows from pd_initial_results.")
+    # else:
+    #     print("No missing rows to append.")
-    # activate parabolic fit
-    parabolic_fit=False
+    # re-order all the rows based on the RMSD
+    pd_results = order_base_on_both_RMSD(pd_results)
-    t0_fit=False
+    # save the updated results csv to disk
+    pd_results.to_csv(output_folder+os.sep+file_name+NAME_SUFX_CSV_RESULTS, index=False)
+
-    mag_fit=False
+    print()
-    # 5 sigma confidence interval
-    # five_sigma=False
-    # mag_noise = MAG_RMSD*SIGMA_ERR
-    # len_noise = LEN_RMSD*SIGMA_ERR
-    mag_noise = mag_noise_real.copy()
-    len_noise = len_noise_real.copy()
-    # # Standard deviation of the magnitude Gaussian noise 1 sigma
-    # # SD of noise in length (m) 1 sigma in km
-    len_noise= len_noise/1000
-    # velocity noise 1 sigma km/s
-    # vel_noise = (len_noise*np.sqrt(2)/(1/FPS))
-    vel_noise = (len_noise/(1/FPS))
+    ######################## PLOT SECTION ###############################
-    # put the first plot in 2 sublots
-    fig, ax = plt.subplots(1, 2, figsize=(17, 5))
+    # add to pd_results a column 'multiple_rmsd' holding, for each row, the larger of pd_results['rmsd_mag']/pd_dataframe_obs_real['rmsd_mag'] and pd_results['rmsd_len']/pd_dataframe_obs_real['rmsd_len']
+    calc_multiple = []
-    df_sel_shower = df_sel_shower_real.copy()
+    for ii in range(len(pd_results)):
+        ratio_mag = pd_results['rmsd_mag'].iloc[ii] / gensim_data_obs['rmsd_mag']
+        ratio_len = pd_results['rmsd_len'].iloc[ii] / gensim_data_obs['rmsd_len']
+        calc_multiple.append(max(ratio_mag, ratio_len))
+    # if 'multiple_rmsd' is already a column, update it; otherwise insert it into pd_results
+    if 'multiple_rmsd' in pd_results.columns:
+        pd_results['multiple_rmsd'] = calc_multiple
+    else:
+        pd_results.insert(2, 'multiple_rmsd', calc_multiple)
-    # # group by solution_id_dist and keep only n_confront_sel from each group
-    # df_sel_shower = df_sel_shower.groupby('solution_id_dist').head(len(df_sel_shower))
+    pd_results.to_csv(output_folder+os.sep+file_name+NAME_SUFX_CSV_RESULTS, index=False)
+    # waterfall directory result_dir+os.sep+'waterfall'
+    waterfall_dir = result_dir+os.sep+'waterfall'
+    mkdirP(waterfall_dir)
+    for dist_ii in [2, 1.9, 1.8, 1.7, 1.6, 1.5, 1.4, 1.3, 1.2, 1.1, 1.0]:
+        plot_gray_dist(pd_results, dist_ii, gensim_data_obs['z_score'], 'multiple_rmsd', pd_dataframe_obs_real, waterfall_dir, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, fps, file_name, trajectory_Metsim_file)
-    # check if distance_meteor is in the columns
-    if 'distance_meteor' in df_sel_shower.columns:
-        # order by distance_meteor
-        df_sel_shower = df_sel_shower.sort_values('distance_meteor')
+    ######################## PCA PLOTS ###############################
-    # # count duplicates and add a column for the number of duplicates
-    # df_sel_shower['num_duplicates'] = df_sel_shower.groupby('solution_id')['solution_id'].transform('size')
+    PCA_percent=99
-    # df_sel_shower.drop_duplicates(subset='solution_id', keep='first', inplace=True)
+    YesPCA=[]
-    df_sel_shower['erosion_coeff']=df_sel_shower['erosion_coeff']*1000000
-    df_sel_shower['sigma']=df_sel_shower['sigma']*1000000
+    NoPCA=['chi2_red_mag', 'chi2_red_len', 'rmsd_mag', 'rmsd_len', 'vel_180km','a1_acc_jac','a2_acc_jac','a_acc','b_acc','c_acc','c_mag_init','c_mag_end','a_t0', 'b_t0', 'c_t0']
-    if n_confront_obs result_number:
-        print(cml_args.min_nres,'simulatd to found')
-        while cml_args.min_nres > result_number:
-
-            # reset index
-            pd_datafram_PCA_selected_lowRMSD.reset_index(drop=True, inplace=True)
-
-            pd_datafram_PCA_selected_lowRMSD['type'] = 'Simulation_sel'
-
-            # delete any row from the csv file that has the same value of mass, rho, sigma, erosion_height_start, erosion_coeff, erosion_mass_index, erosion_mass_min, erosion_mass_max, erosion_range, erosion_energy_per_unit_cross_section, erosion_energy_per_unit_mass
-            if 'mass' in pd_datafram_PCA_selected_lowRMSD.columns:
-                # Drop duplicate rows based on the specified columns
-                pd_datafram_PCA_selected_lowRMSD = pd_datafram_PCA_selected_lowRMSD.drop_duplicates(subset=[
-                    'mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff',
-                    'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max',
-                    'erosion_range', 'erosion_energy_per_unit_cross_section',
-                    'erosion_energy_per_unit_mass'
-                ])
-                pd_datafram_PCA_selected_lowRMSD.reset_index(drop=True, inplace=True)
-
-            pd_results =
pd.concat([pd_results, pd_datafram_PCA_selected_lowRMSD]) - - # save and update the disk - pd_results.to_csv(output_folder+os.sep+SAVE_RESULTS_FOLDER+os.sep+file_name+'_sim_sel_results.csv', index=False) - - - if 'solution_id' in pd_results.columns: - print('PLOT: the physical characteristics results') - PCA_PhysicalPropPLOT(pd_results, pd_datafram_PCA_sim, pca_N_comp, output_folder+os.sep+SAVE_RESULTS_FOLDER, file_name) - print('PLOT: correlation matrix of the results') - PCAcorrelation_selPLOT(pd_datafram_PCA_sim, pd_results, pca_N_comp, output_folder+os.sep+SAVE_RESULTS_FOLDER) - print('PLOT: best 9 results and add the RMSD value to csv selected') - PCA_LightCurveCoefPLOT(pd_results, pd_dataframe_PCA_obs_real, output_folder+os.sep+SAVE_RESULTS_FOLDER, fit_funct, gensim_data_obs, rmsd_pol_mag, rmsd_t0_lag, file_name, trajectory_Metsim_file,output_folder+os.sep+SAVE_RESULTS_FOLDER+os.sep+file_name+'_sim_sel_results.csv') - print() - print('SUCCES: the physical characteristics range is in the results folder') - else: - # print('FAIL: Not found any result below magRMSD',rmsd_pol_mag*SIGMA_ERR,'and lenRMSD',rmsd_t0_lag*SIGMA_ERR/1000) - print('FAIL: Not found any result below magRMSD',MAG_RMSD*SIGMA_ERR,'and lenRMSD',LEN_RMSD*SIGMA_ERR) - break - - - # check if only 1 in len break - if len(pd_results) == 1: - print('Only one result found') - # create a dictionary with the physical parameters - CI_physical_param = { - 'v_init_180km': [pd_results['v_init_180km'].values[0], pd_results['v_init_180km'].values[0]], - 'zenith_angle': [pd_results['zenith_angle'].values[0], pd_results['zenith_angle'].values[0]], - 'mass': [pd_results['mass'].values[0], pd_results['mass'].values[0]], - 'rho': [pd_results['rho'].values[0], pd_results['rho'].values[0]], - 'sigma': [pd_results['sigma'].values[0], pd_results['sigma'].values[0]], - 'erosion_height_start': [pd_results['erosion_height_start'].values[0], pd_results['erosion_height_start'].values[0]], - 'erosion_coeff': [pd_results['erosion_coeff'].values[0], pd_results['erosion_coeff'].values[0]], - 'erosion_mass_index': [pd_results['erosion_mass_index'].values[0], pd_results['erosion_mass_index'].values[0]], - 'erosion_mass_min': [pd_results['erosion_mass_min'].values[0], pd_results['erosion_mass_min'].values[0]], - 'erosion_mass_max': [pd_results['erosion_mass_max'].values[0], pd_results['erosion_mass_max'].values[0]] - } - - else: - print('Number of results found:',len(pd_results)) - columns_physpar = ['v_init_180km','zenith_angle','mass', 'rho', 'sigma', 'erosion_height_start', 'erosion_coeff', - 'erosion_mass_index', 'erosion_mass_min', 'erosion_mass_max'] - - ############################################################################### - - # # Calculate the quantiles - # quantiles = pd_results[columns_physpar].quantile([0.05, 0.95]) - - # # Convert the quantiles to a dictionary - # CI_physical_param = {col: quantiles[col].tolist() for col in columns_physpar} - - ############################################################################### - - # Calculate the quantiles - quantiles = pd_results[columns_physpar].quantile([0.1, 0.9]) - - # Get the minimum and maximum values - min_val = pd_results[columns_physpar].min() - max_val = pd_results[columns_physpar].max() - - # Calculate the extended range using the logic provided - extended_min = min_val - (quantiles.loc[0.1] - min_val) - # consider the value extended_min<0 Check each column in extended_min and set to min_val if negative - for col in columns_physpar: - if extended_min[col] < 0: - 
extended_min[col] = min_val[col] - extended_max = max_val + (max_val - quantiles.loc[0.9]) - - # Convert the extended range to a dictionary - CI_physical_param = {col: [extended_min[col], extended_max[col]] for col in columns_physpar} - - ############################################################################### - - - # check if v_init_180km are the same value - if CI_physical_param['v_init_180km'][0] == CI_physical_param['v_init_180km'][1]: - CI_physical_param['v_init_180km'] = [CI_physical_param['v_init_180km'][0] - CI_physical_param['v_init_180km'][0]/1000, CI_physical_param['v_init_180km'][1] + CI_physical_param['v_init_180km'][1]/1000] - if CI_physical_param['zenith_angle'][0] == CI_physical_param['zenith_angle'][1]: - CI_physical_param['zenith_angle'] = [CI_physical_param['zenith_angle'][0] - CI_physical_param['zenith_angle'][0]/10000, CI_physical_param['zenith_angle'][1] + CI_physical_param['zenith_angle'][1]/10000] - if CI_physical_param['mass'][0] == CI_physical_param['mass'][1]: - CI_physical_param['mass'] = [CI_physical_param['mass'][0] - CI_physical_param['mass'][0]/10, CI_physical_param['mass'][1] + CI_physical_param['mass'][1]/10] - if np.round(CI_physical_param['rho'][0]/100) == np.round(CI_physical_param['rho'][1]/100): - CI_physical_param['rho'] = [CI_physical_param['rho'][0] - CI_physical_param['rho'][0]/5, CI_physical_param['rho'][1] + CI_physical_param['rho'][1]/5] - if CI_physical_param['sigma'][0] == CI_physical_param['sigma'][1]: - CI_physical_param['sigma'] = [CI_physical_param['sigma'][0] - CI_physical_param['sigma'][0]/10, CI_physical_param['sigma'][1] + CI_physical_param['sigma'][1]/10] - if CI_physical_param['erosion_height_start'][0] == CI_physical_param['erosion_height_start'][1]: - CI_physical_param['erosion_height_start'] = [CI_physical_param['erosion_height_start'][0] - CI_physical_param['erosion_height_start'][0]/100, CI_physical_param['erosion_height_start'][1] + CI_physical_param['erosion_height_start'][1]/100] - if CI_physical_param['erosion_coeff'][0] == CI_physical_param['erosion_coeff'][1]: - CI_physical_param['erosion_coeff'] = [CI_physical_param['erosion_coeff'][0] - CI_physical_param['erosion_coeff'][0]/10, CI_physical_param['erosion_coeff'][1] + CI_physical_param['erosion_coeff'][1]/10] - if CI_physical_param['erosion_mass_index'][0] == CI_physical_param['erosion_mass_index'][1]: - CI_physical_param['erosion_mass_index'] = [CI_physical_param['erosion_mass_index'][0] - CI_physical_param['erosion_mass_index'][0]/10, CI_physical_param['erosion_mass_index'][1] + CI_physical_param['erosion_mass_index'][1]/10] - if CI_physical_param['erosion_mass_min'][0] == CI_physical_param['erosion_mass_min'][1]: - CI_physical_param['erosion_mass_min'] = [CI_physical_param['erosion_mass_min'][0] - CI_physical_param['erosion_mass_min'][0]/10, CI_physical_param['erosion_mass_min'][1] + CI_physical_param['erosion_mass_min'][1]/10] - if CI_physical_param['erosion_mass_max'][0] == CI_physical_param['erosion_mass_max'][1]: - CI_physical_param['erosion_mass_max'] = [CI_physical_param['erosion_mass_max'][0] - CI_physical_param['erosion_mass_max'][0]/10, CI_physical_param['erosion_mass_max'][1] + CI_physical_param['erosion_mass_max'][1]/10] - - - # Multiply the 'erosion_height_start' values by 1000 - CI_physical_param['erosion_height_start'] = [x * 1000 for x in CI_physical_param['erosion_height_start']] - - print('CI_physical_param:',CI_physical_param) - - result_number = len(pd_results) - - if cml_args.min_nres <= result_number: - # print the number of results found - 
print('SUCCES: Number of results found:',result_number) - break - else: - if old_results_number == result_number: - print('Same number of results found:',result_number) - ii_repeat+=1 - if ii_repeat==3: - print('STOP: After 3 times the same number of results found') - print('STOP: After new simulation within 95%CI no new simulation below magRMSD',MAG_RMSD*SIGMA_ERR,'and lenRMSD',LEN_RMSD*SIGMA_ERR) - print('STOP: Number of results found:',result_number) - break - old_results_number = result_number - print('regenerate new simulation in the CI range') - generate_simulations(pd_dataframe_PCA_obs_real, simulation_MetSim_object, gensim_data_obs, cml_args.min_nres, output_folder, file_name, False, CI_physical_param) - - # look for the good_files = glob.glob(os.path.join(output_folder, '*_good_files.txt')) - good_files = [f for f in os.listdir(output_folder) if f.endswith('_good_files.txt')] - - # Construct the full path to the good file - good_file_path = os.path.join(output_folder, good_files[0]) - - # Read the file, skipping the first line - df_good_files = pd.read_csv(good_file_path, skiprows=1) - - # Rename the columns - df_good_files.columns = ["File name", "lim mag", "lim mag length", "length delay (s)"] - - # Extract the first column into an array - file_names = df_good_files["File name"].to_numpy() - - # Change the file extension to .json - all_jsonfiles = [file_name.replace('.pickle', '.json') for file_name in file_names] - - # open the folder and extract all the json files - os.chdir(input_folder) - - print('Number of simulated files in 95CI : ',len(all_jsonfiles)) - - input_list = [[all_jsonfiles[ii], 'simulation_'+str(ii+1)] for ii in range(len(all_jsonfiles))] - results_list = domainParallelizer(input_list, read_GenerateSimulations_output_to_PCA, cores=cml_args.cores) - - # if no read the json files in the folder and create a new csv file - pd_datafram_NEWsim_good = pd.concat(results_list) - - pd_datafram_NEWsim_good.to_csv(output_folder+os.sep+file_name+NAME_SUFX_CSV_SIM_NEW, index=False) - # print saved csv file - print('saved sim csv file:',output_folder+os.sep+file_name+NAME_SUFX_CSV_SIM_NEW) - - input_list_obs = [[pd_datafram_NEWsim_good.iloc[[ii]].reset_index(drop=True), pd_dataframe_PCA_obs_real, output_folder, fit_funct, gensim_data_Metsim, rmsd_pol_mag, rmsd_t0_lag, file_name, 0, False] for ii in range(len(pd_datafram_NEWsim_good))] - results_list = domainParallelizer(input_list_obs, PCA_LightCurveRMSDPLOT_optimize, cores=cml_args.cores) - - # base on the one selected - pd_datafram_PCA_selected_lowRMSD = pd.concat(results_list) - - # Timing end - end_time = time.time() - - # Compute elapsed time - elapsed_time = end_time - start_time - hours, rem = divmod(elapsed_time, 3600) - minutes, seconds = divmod(rem, 60) - # print('Elapsed time in seconds:',elapsed_time) - print(f"Elapsed time: {int(hours):02}:{int(minutes):02}:{int(seconds):02}") - - print() - + for trajectory_file, file_name, input_folder, output_folder, trajectory_Metsim_file in input_folder_file: + # run the main function + main_PhysUncert(trajectory_file, file_name, input_folder, output_folder, trajectory_Metsim_file, cml_args) diff --git a/wmpl/MetSim/ML/GenerateSimulations.py b/wmpl/MetSim/ML/GenerateSimulations.py index 0a4664fe..2d5f7ca6 100644 --- a/wmpl/MetSim/ML/GenerateSimulations.py +++ b/wmpl/MetSim/ML/GenerateSimulations.py @@ -615,8 +615,14 @@ def runSimulation(self, min_frames_visible=MIN_FRAMES_VISIBLE): # Make the density folder if not os.path.isdir(dens_folder_path): - 
os.makedirs(dens_folder_path)
-
+            # Needed when multiprocessing or some other process creates the directory before this one
+            os.makedirs(dens_folder_path, exist_ok=True)
+            # Also works when multiprocessing, but slower:
+            # try:
+            #     os.makedirs(dens_folder_path)
+            # except FileExistsError:
+            #     # The directory already exists; no action needed
+            #     pass
 ###
@@ -887,6 +893,13 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False,
     ### ADD NOISE ###
+    # lag_sampled=len_sampled[first_length_index:]-(vel_sampled[0]*time_sampled+len_sampled[0]) # +len_sampled[0]
+
+    # lag_sampled+= np.random.normal(loc=0.0, scale=params.len_noise, \
+    #     size=len(len_sampled[first_length_index:]))
+
+    # lag_sampled=lag_sampled-lag_sampled[0]
+
     # Add noise to magnitude data
     mag_sampled[mag_sampled <= lim_mag] += np.random.normal(loc=0.0, scale=params.mag_noise, \
         size=len(mag_sampled[mag_sampled <= lim_mag]))
@@ -910,7 +923,11 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False,
         padOrTruncate(len_normed, params.data_length), \
         padOrTruncate(mag_normed, params.data_length)])
-    lag_sampled=len_sampled-(vel_sampled[0]*time_sampled+len_sampled[0])
+    # vel_sampled+= np.random.normal(loc=0.0, scale=params.len_noise*np.sqrt(2)/(1.0/params.fps), \
+    #     size=len(len_sampled[first_length_index:]))
+    # rmsd_t0_lag/1000*np.sqrt(2)/(1.0/fps)
+
+    lag_sampled=len_sampled[first_length_index:]-(vel_sampled[first_length_index]*time_sampled) # +len_sampled[0]
     # get the new velocity with noise
     for vel_ii in range(1,len(time_sampled)):
@@ -921,6 +938,7 @@ def extractSimData(sim, min_frames_visible=MIN_FRAMES_VISIBLE, check_only=False,
         else:
             vel_sampled[vel_ii]=(len_sampled[vel_ii]-len_sampled[vel_ii-1])/(time_sampled[vel_ii]-time_sampled[vel_ii-1])
+    # vel_sampled[0]=vel_sampled[first_length_index]
     # Return input data and results
     return params, time_sampled, ht_sampled, len_sampled, mag_sampled, vel_sampled, lag_sampled, input_data_normed, simulated_data_normed
@@ -979,8 +997,10 @@ def saveProcessedList(data_path, results_list, param_class_name, min_frames_visi
     """
-    # Reject all None's from the results
-    good_list = [entry for entry in results_list if entry is not None]
+    # # Reject all None's from the results
+    # good_list = [entry for entry in results_list if entry is not None]
+    # Reject all None's from the results and entries where the filename is None
+    good_list = [entry for entry in results_list if entry is not None and entry[0] is not None]
     # Load one simulation to get simulation parameters
     sim = loadPickle(*os.path.split(good_list[0][0]))
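The 'multiple_rmsd' column introduced above is the worse of the two error ratios (magnitude and length RMSD, each normalised by the observed event's RMSD), so a candidate simulation is only rated as good as its weaker fit. The row loop can also be written as a vectorised expression; a sketch with stand-in column values:

    import pandas as pd

    pd_results = pd.DataFrame({'rmsd_mag': [0.12, 0.30], 'rmsd_len': [18.0, 25.0]})
    obs_rmsd_mag, obs_rmsd_len = 0.10, 20.0  # stand-ins for the gensim_data_obs values

    # Row-wise max of the two normalised RMSDs
    pd_results['multiple_rmsd'] = pd.concat(
        [pd_results['rmsd_mag'] / obs_rmsd_mag, pd_results['rmsd_len'] / obs_rmsd_len],
        axis=1).max(axis=1)

    print(pd_results)
    # row 0: max(1.2, 0.9) = 1.2 ; row 1: max(3.0, 1.25) = 3.0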
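The confidence window used when regenerating simulations (the CI_physical_param ranges) is not the raw 10-90 percentile band: each bound is pushed outward by its distance to the sample extreme, extended_min = min - (q10 - min) and extended_max = max + (max - q90), with a clamp back to min when the extension would go negative, and a degenerate interval (min equal to max) widened by a fixed fraction. A compact sketch of that logic for one parameter column; a single widening fraction stands in for the per-parameter factors used in the script:

    import pandas as pd

    def extended_ci(values, frac=0.1):
        # Mirror the 10/90 percentiles about the sample extremes,
        # clamping a negative lower bound and widening degenerate ranges
        v = pd.Series(values, dtype=float)
        q10, q90 = v.quantile(0.1), v.quantile(0.9)
        lo = v.min() - (q10 - v.min())
        hi = v.max() + (v.max() - q90)
        if lo < 0:
            lo = v.min()
        if lo == hi:  # all accepted samples identical: widen by +/- frac
            lo, hi = lo - abs(lo)*frac, hi + abs(hi)*frac
        return [lo, hi]

    # e.g. a handful of accepted 'rho' values (kg/m^3)
    print(extended_ci([300.0, 420.0, 500.0, 510.0, 680.0]))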
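The exist_ok change in runSimulation() matters because many worker processes can race to create the same density folder: between os.path.isdir() and os.makedirs() another process may win, and a bare makedirs() then raises. Either idiom below is safe; exist_ok=True (available since Python 3.2) is the cheaper one, which is why the try/except variant is left commented out:

    import os

    dens_folder_path = os.path.join('simulations', 'dens_1000')  # placeholder path

    # Idiom 1: let makedirs tolerate a directory created by another process
    os.makedirs(dens_folder_path, exist_ok=True)

    # Idiom 2: equivalent, but pays for an exception in the race case
    try:
        os.makedirs(dens_folder_path)
    except FileExistsError:
        pass  # another worker created it first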
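The revised lag definition in extractSimData() measures the deviation from constant-velocity flight, lag(t) = len(t) - v0*t, and the per-frame velocity is then rebuilt by finite differences of the noisy lengths. Differencing doubles the noise variance: if each length has a 1-sigma error sigma_len, the velocity error is sqrt(2)*sigma_len*fps, which is where the sqrt(2) in the commented-out noise lines comes from. A small numeric sketch; the frame rate, speed, and noise values are illustrative:

    import numpy as np

    fps = 32.0                  # assumed frame rate
    len_noise = 20.0 / 1000.0   # 20 m of 1-sigma length noise, in km

    rng = np.random.default_rng(0)
    time_sampled = np.arange(0.0, 1.0, 1.0/fps)
    len_sampled = 40.0*time_sampled + rng.normal(0.0, len_noise, time_sampled.size)

    # Lag relative to constant-velocity motion at the initial speed (40 km/s here)
    lag_sampled = len_sampled - 40.0*time_sampled

    # Velocity rebuilt by first differences of the noisy length
    vel_sampled = np.empty_like(len_sampled)
    vel_sampled[0] = 40.0
    vel_sampled[1:] = np.diff(len_sampled) / np.diff(time_sampled)

    # Observed 1-sigma velocity scatter vs sqrt(2)*sigma_len*fps (~0.91 km/s)
    print(np.std(vel_sampled[1:] - 40.0), len_noise*np.sqrt(2)*fps)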