From a783c95686d8b86e180725e26d7a49385afcdaa1 Mon Sep 17 00:00:00 2001
From: zerolee <464806884@qq.com>
Date: Mon, 28 Oct 2024 21:03:56 +0800
Subject: [PATCH 01/13] feat: demo2
---
package.json | 1 +
pnpm-lock.yaml | 8 +
src/DataInterpreter/index.md | 5 +-
src/components/demo2/datas/credit-g/tree.json | 262 +++++++++++
.../demo2/datas/credit-g/tree_01.json | 24 +
.../demo2/datas/credit-g/tree_02.json | 79 ++++
.../demo2/datas/credit-g/tree_03.json | 134 ++++++
.../demo2/datas/credit-g/tree_04.json | 145 ++++++
.../demo2/datas/credit-g/tree_05.json | 145 ++++++
.../demo2/datas/credit-g/tree_06.json | 200 ++++++++
.../demo2/datas/credit-g/tree_07.json | 211 +++++++++
.../demo2/datas/credit-g/tree_08.json | 222 +++++++++
.../demo2/datas/credit-g/tree_09.json | 277 ++++++++++++
.../demo2/datas/credit-g/tree_10.json | 288 ++++++++++++
src/components/demo2/demo2.vue | 427 ++++++++++++++++++
src/components/demoList.vue | 2 +-
16 files changed, 2427 insertions(+), 3 deletions(-)
create mode 100644 src/components/demo2/datas/credit-g/tree.json
create mode 100644 src/components/demo2/datas/credit-g/tree_01.json
create mode 100644 src/components/demo2/datas/credit-g/tree_02.json
create mode 100644 src/components/demo2/datas/credit-g/tree_03.json
create mode 100644 src/components/demo2/datas/credit-g/tree_04.json
create mode 100644 src/components/demo2/datas/credit-g/tree_05.json
create mode 100644 src/components/demo2/datas/credit-g/tree_06.json
create mode 100644 src/components/demo2/datas/credit-g/tree_07.json
create mode 100644 src/components/demo2/datas/credit-g/tree_08.json
create mode 100644 src/components/demo2/datas/credit-g/tree_09.json
create mode 100644 src/components/demo2/datas/credit-g/tree_10.json
create mode 100644 src/components/demo2/demo2.vue
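Note (illustrative, not part of the applied diff): each tree*.json file above is a flat array of search-tree nodes keyed by "id"/"parent_id", with scores serialized as strings, the executed Python kept in the "code" field, and an "active" flag appearing only in the per-step tree_01.json..tree_10.json snapshots. The TypeScript sketch below shows one way such a list could be folded into a tree and how the newly added highlight.js dependency could render the embedded Python; the interface, helper names, and import paths are assumptions for illustration, not the actual demo2.vue implementation.

// Illustrative only -- not part of this patch. Assumes a bundler that can
// import JSON, plus the highlight.js package added in package.json below.
import hljs from 'highlight.js/lib/core'
import python from 'highlight.js/lib/languages/python'
import nodes from './datas/credit-g/tree.json'

hljs.registerLanguage('python', python)

// Field names mirror the JSON payloads added in this patch.
interface TreeNode {
  id: string
  parent_id: string | null      // null marks the root node "0"
  avg_score: string             // scores arrive as strings, e.g. "73.7"
  dev_score: string
  visits: number
  order: number | string        // unexpanded children carry strings like "6.5"
  instruction: string
  code: string                  // embedded Python source; "" if never executed
  active?: boolean              // present only in the tree_XX.json snapshots
  children?: TreeNode[]
}

// Hypothetical helper: fold the flat parent_id list into a renderable tree.
function buildTree(flat: TreeNode[]): TreeNode | undefined {
  const byId = new Map<string, TreeNode>()
  for (const n of flat) byId.set(n.id, { ...n, children: [] })
  let root: TreeNode | undefined
  for (const node of byId.values()) {
    if (node.parent_id === null) root = node
    else byId.get(node.parent_id)?.children?.push(node)
  }
  return root
}

// Hypothetical helper: syntax-highlight a node's embedded Python for display,
// presumably the reason highlight.js enters the dependency list.
function renderCode(node: TreeNode): string {
  return hljs.highlight(node.code, { language: 'python' }).value
}

const root = buildTree(nodes as TreeNode[])
if (root) console.log(root.id, root.children?.length, renderCode(root).length)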
diff --git a/package.json b/package.json
index ae3a2e92..c0c2774a 100644
--- a/package.json
+++ b/package.json
@@ -23,6 +23,7 @@
"@vueuse/core": "^10.6.1",
"dayjs": "^1.11.10",
"execa": "^8.0.1",
+ "highlight.js": "^11.10.0",
"lint-staged": "^15.0.2",
"prettier": "^3.0.3",
"sass": "^1.71.1",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 30890144..48ac2ccf 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -35,6 +35,9 @@ dependencies:
execa:
specifier: ^8.0.1
version: 8.0.1
+ highlight.js:
+ specifier: ^11.10.0
+ version: 11.10.0
lint-staged:
specifier: ^15.0.2
version: 15.0.2
@@ -2012,6 +2015,11 @@ packages:
function-bind: 1.1.2
dev: false
 
+ /highlight.js@11.10.0:
+ resolution: {integrity: sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==}
+ engines: {node: '>=12.0.0'}
+ dev: false
+
/html-tags@3.3.1:
resolution: {integrity: sha512-ztqyC3kLto0e9WbNp0aeP+M3kTt+nbaIveGmUxAtZa+8iFgKLUOD4YKM5j+f3QD89bra7UeumolZHKuOXnTmeQ==}
engines: {node: '>=8'}
diff --git a/src/DataInterpreter/index.md b/src/DataInterpreter/index.md
index a98a6681..d84a5413 100644
--- a/src/DataInterpreter/index.md
+++ b/src/DataInterpreter/index.md
@@ -4,7 +4,8 @@ footer: false
---
-
+
diff --git a/src/components/demo2/datas/credit-g/tree.json b/src/components/demo2/datas/credit-g/tree.json
new file mode 100644
index 00000000..07f9c134
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree.json
@@ -0,0 +1,262 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n"
+ },
+ {
+ "id": "0-0",
+ "parent_id": "0",
+ "avg_score": "82.5",
+ "dev_score": "83.2",
+ "visits": 2,
+ "order": 6,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.",
+ "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n"
+ },
+ {
+ "id": "0-0-0",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": ""
+ },
+ {
+ "id": "0-0-1",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": ""
+ },
+ {
+ "id": "0-0-2",
+ "parent_id": "0-0",
+ "avg_score": "81.8",
+ "dev_score": "81.8",
+ "visits": 1,
+ "order": 7,
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n"
+ },
+ {
+ "id": "0-0-3",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": ""
+ },
+ {
+ "id": "0-0-4",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": ""
+ },
+ {
+ "id": "0-1",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 1,
+ "order": 8,
+ "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n"
+ },
+ {
+ "id": "0-2",
+ "parent_id": "0",
+ "avg_score": "80.9",
+ "dev_score": "82.9",
+ "visits": 2,
+ "order": 9,
+ "instruction": "Visualize the distribution of numerical features to identify outliers and skewed distributions, then engineer features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_engineered, poly = engineer_features(train_df_processed)\ndev_df_engineered, _ = engineer_features(dev_df_processed, poly)\ntest_df_engineered, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n"
+ },
+ {
+ "id": "0-2-0",
+ "parent_id": "0-2",
+ "avg_score": "78.9",
+ "dev_score": "78.9",
+ "visits": 1,
+ "order": 10,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "import numpy as np\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.feature_selection import SelectFromModel\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler, PolynomialFeatures\n\n# Define the preprocess_data function\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n if y is not None:\n df_copy['class'] = y\n return df_copy, scaler\n\n# Define the engineer_features function\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n df_copy = df_copy.select_dtypes(include=['number'])\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n if y is not None:\n df_poly['class'] = y\n return df_poly, poly\n\n# Load the datasets if they are not already loaded\nif 'train_df_engineered' not in locals():\n train_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\n train_df_engineered = preprocess_data(train_df)[0]\n train_df_engineered, _ = engineer_features(train_df_engineered)\n\nif 'dev_df_engineered' not in locals():\n dev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n dev_df_engineered = preprocess_data(dev_df)[0]\n dev_df_engineered, _ = engineer_features(dev_df_engineered)\n\nif 'test_df_engineered' not in locals():\n test_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\n test_df_engineered = preprocess_data(test_df)[0]\n test_df_engineered, _ = engineer_features(test_df_engineered)\n\n# Function to optimize data types\ndef optimize_dtypes(df):\n for col in df.columns:\n if df[col].dtype == 'float64':\n df[col] = df[col].astype('float32')\n elif df[col].dtype == 'int64':\n df[col] = df[col].astype('int32')\n return df\n\n# Optimize data types for train, dev, and test datasets\ntrain_df_engineered = optimize_dtypes(train_df_engineered)\ndev_df_engineered = optimize_dtypes(dev_df_engineered)\ntest_df_engineered = optimize_dtypes(test_df_engineered)\n\n# Prepare data for modeling\nX_train = train_df_engineered.drop(columns=['class'])\ny_train = train_df_engineered['class']\nX_dev = dev_df_engineered.drop(columns=['class'])\ny_dev = dev_df_engineered['class']\nX_test = test_df_engineered\n\n# Feature selection using SelectFromModel\nbase_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\nselector = SelectFromModel(base_model, threshold='median')\nselector.fit(X_train, y_train)\n\nX_train_selected = selector.transform(X_train)\nX_dev_selected = selector.transform(X_dev)\nX_test_selected = selector.transform(X_test)\n\n# Define the final model\nfinal_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=5)\n\n# Train the final model\nfinal_model.fit(X_train_selected, y_train)\n\n# Predict on dev set\ny_dev_pred = final_model.predict(X_dev_selected)\n\n# Predict on test set\ny_test_pred = final_model.predict(X_test_selected)\n\n# Save predictions\npd.DataFrame({'target': 
y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, final_model.predict(X_train_selected))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n"
+ },
+ {
+ "id": "0-2-1",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Before training, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset.",
+ "code": ""
+ },
+ {
+ "id": "0-2-2",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": ""
+ },
+ {
+ "id": "0-2-3",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": ""
+ },
+ {
+ "id": "0-2-4",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": ""
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n"
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": ""
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": ""
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": ""
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n"
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": ""
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": ""
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n"
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": ""
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": ""
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": ""
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": ""
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/datas/credit-g/tree_01.json b/src/components/demo2/datas/credit-g/tree_01.json
new file mode 100644
index 00000000..66cbf7da
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_01.json
@@ -0,0 +1,24 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/datas/credit-g/tree_02.json b/src/components/demo2/datas/credit-g/tree_02.json
new file mode 100644
index 00000000..0c01ebaf
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_02.json
@@ -0,0 +1,79 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/datas/credit-g/tree_03.json b/src/components/demo2/datas/credit-g/tree_03.json
new file mode 100644
index 00000000..0f0cc77f
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_03.json
@@ -0,0 +1,134 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": true
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/datas/credit-g/tree_04.json b/src/components/demo2/datas/credit-g/tree_04.json
new file mode 100644
index 00000000..85db669d
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_04.json
@@ -0,0 +1,145 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": true
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": true
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
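Several of the unexpanded nodes in this snapshot (0-3-3-1, for instance) call for checking the target distribution and resampling before retraining, but their code fields are still empty. A minimal sketch of what that step could look like, assuming the binary class column and pandas frames used elsewhere in these trees; the helper name is illustrative, not part of the recorded run:

import pandas as pd

def balance_by_oversampling(df, target='class', random_state=42):
    # Naive random oversampling: duplicate minority-class rows until the
    # two classes have equal counts.
    counts = df[target].value_counts()
    minority = df[df[target] == counts.idxmin()]
    extra = minority.sample(n=counts.max() - counts.min(), replace=True,
                            random_state=random_state)
    return pd.concat([df, extra]).sample(frac=1, random_state=random_state)

# Resample the train split only; dev and test keep their true distribution.
# train_df_balanced = balance_by_oversampling(train_df_processed)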
diff --git a/src/components/demo2/datas/credit-g/tree_05.json b/src/components/demo2/datas/credit-g/tree_05.json
new file mode 100644
index 00000000..9d37c727
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_05.json
@@ -0,0 +1,145 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
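Nodes 0-3-4, 0-3-3-4 and 0-4 in this snapshot all propose PCA but are unvisited, so they carry no code. For reference, a sketch of the train-fitted PCA step they describe, reusing the variable names from node 0 (the surrounding frames are assumed to exist as in that node):

from sklearn.decomposition import PCA

# Fit PCA on the train features only, then apply the same projection to
# dev and test so their statistics never influence the transform.
X_train = train_df_processed.drop(columns=['class'])
pca = PCA(n_components=0.95)  # keep enough components for 95% of the variance
X_train_pca = pca.fit_transform(X_train)
X_dev_pca = pca.transform(dev_df_processed.drop(columns=['class']))
X_test_pca = pca.transform(test_df_processed)  # test split has no 'class' column

print('Components kept:', pca.n_components_)
print('Variance explained:', pca.explained_variance_ratio_.sum())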
diff --git a/src/components/demo2/datas/credit-g/tree_06.json b/src/components/demo2/datas/credit-g/tree_06.json
new file mode 100644
index 00000000..95cd19e7
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_06.json
@@ -0,0 +1,200 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-0",
+ "parent_id": "0",
+ "avg_score": "82.5",
+ "dev_score": "83.2",
+ "visits": 2,
+ "order": 6,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.",
+ "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n",
+ "active": true
+ },
+ {
+ "id": "0-0-0",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-1",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-3",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-4",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
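Node 0-0, added in this snapshot, prunes features whose pairwise correlation exceeds 0.95 by masking the correlation matrix with np.triu. A self-contained toy run of the same mask, on a made-up frame, showing why the upper triangle (k=1) keeps one column of each correlated pair instead of dropping both:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 3, 1, 0]})
corr = df.corr()
# k=1 drops the diagonal, so each column is compared only against earlier ones.
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.95).any()]
print(to_drop)  # ['b'] -- 'a' survives even though corr(a, b) == 1.0
# Note that the > 0.95 test, as in the recorded helper, ignores strong
# negative correlations; use upper[col].abs() to catch those as well.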
diff --git a/src/components/demo2/datas/credit-g/tree_07.json b/src/components/demo2/datas/credit-g/tree_07.json
new file mode 100644
index 00000000..80efa243
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_07.json
@@ -0,0 +1,211 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-0",
+ "parent_id": "0",
+ "avg_score": "82.5",
+ "dev_score": "83.2",
+ "visits": 2,
+ "order": 6,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.",
+ "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n",
+ "active": true
+ },
+ {
+ "id": "0-0-0",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-1",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-2",
+ "parent_id": "0-0",
+ "avg_score": "81.8",
+ "dev_score": "81.8",
+ "visits": 1,
+ "order": 7,
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n",
+ "active": true
+ },
+ {
+ "id": "0-0-3",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-4",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
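Node 0-0-2, new in this snapshot, replaces the single XGBoost model with a stacked ensemble. The mechanism that makes this work is StackingClassifier's cv parameter: the meta-model is fit on out-of-fold predictions of the base models rather than on their training-set fits. A self-contained toy version of the same construction, on synthetic data rather than the jasmine split:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42)

stack = StackingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('knn', KNeighborsClassifier(n_neighbors=5)),
    ],
    final_estimator=LogisticRegression(),  # trained on out-of-fold predictions
    cv=5,
)
stack.fit(X_tr, y_tr)
print('F1:', f1_score(y_te, stack.predict(X_te)))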
diff --git a/src/components/demo2/datas/credit-g/tree_08.json b/src/components/demo2/datas/credit-g/tree_08.json
new file mode 100644
index 00000000..fbf0cf7a
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_08.json
@@ -0,0 +1,222 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-0",
+ "parent_id": "0",
+ "avg_score": "82.5",
+ "dev_score": "83.2",
+ "visits": 2,
+ "order": 6,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.",
+ "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n",
+ "active": false
+ },
+ {
+ "id": "0-0-0",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-1",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-2",
+ "parent_id": "0-0",
+ "avg_score": "81.8",
+ "dev_score": "81.8",
+ "visits": 1,
+ "order": 7,
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-0-3",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-4",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-1",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 1,
+ "order": 8,
+ "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/datas/credit-g/tree_09.json b/src/components/demo2/datas/credit-g/tree_09.json
new file mode 100644
index 00000000..817809ac
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_09.json
@@ -0,0 +1,277 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-0",
+ "parent_id": "0",
+ "avg_score": "82.5",
+ "dev_score": "83.2",
+ "visits": 2,
+ "order": 6,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.",
+ "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n",
+ "active": false
+ },
+ {
+ "id": "0-0-0",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-1",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-2",
+ "parent_id": "0-0",
+ "avg_score": "81.8",
+ "dev_score": "81.8",
+ "visits": 1,
+ "order": 7,
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-0-3",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-4",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-1",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 1,
+ "order": 8,
+ "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-2",
+ "parent_id": "0",
+ "avg_score": "80.9",
+ "dev_score": "82.9",
+ "visits": 2,
+ "order": 9,
+ "instruction": "Visualize the distribution of numerical features to identify outliers and skewed distributions, then engineer features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_engineered, poly = engineer_features(train_df_processed)\ndev_df_engineered, _ = engineer_features(dev_df_processed, poly)\ntest_df_engineered, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-2-1",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Before training, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-2-2",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-2-3",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-2-4",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/datas/credit-g/tree_10.json b/src/components/demo2/datas/credit-g/tree_10.json
new file mode 100644
index 00000000..19c970b7
--- /dev/null
+++ b/src/components/demo2/datas/credit-g/tree_10.json
@@ -0,0 +1,288 @@
+[
+ {
+ "id": "0",
+ "parent_id": null,
+ "avg_score": "73.7",
+ "dev_score": "83.4",
+ "visits": 10,
+ "order": 1,
+ "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+ "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-0",
+ "parent_id": "0",
+ "avg_score": "82.5",
+ "dev_score": "83.2",
+ "visits": 2,
+ "order": 6,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.",
+ "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n",
+ "active": false
+ },
+ {
+ "id": "0-0-0",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-1",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-2",
+ "parent_id": "0-0",
+ "avg_score": "81.8",
+ "dev_score": "81.8",
+ "visits": 1,
+ "order": 7,
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-0-3",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-0-4",
+ "parent_id": "0-0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "6.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-1",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 1,
+ "order": 8,
+ "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-2",
+ "parent_id": "0",
+ "avg_score": "80.9",
+ "dev_score": "82.9",
+ "visits": 2,
+ "order": 9,
+ "instruction": "Visualize the distribution of numerical features to identify outliers and skewed distributions, then engineer features by creating new columns or transforming existing ones to improve model performance.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_engineered, poly = engineer_features(train_df_processed)\ndev_df_engineered, _ = engineer_features(dev_df_processed, poly)\ntest_df_engineered, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n",
+ "active": true
+ },
+ {
+ "id": "0-2-0",
+ "parent_id": "0-2",
+ "avg_score": "78.9",
+ "dev_score": "78.9",
+ "visits": 1,
+ "order": 10,
+ "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.",
+ "code": "import numpy as np\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.feature_selection import SelectFromModel\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler, PolynomialFeatures\n\n# Define the preprocess_data function\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n if y is not None:\n df_copy['class'] = y\n return df_copy, scaler\n\n# Define the engineer_features function\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n df_copy = df_copy.select_dtypes(include=['number'])\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n if y is not None:\n df_poly['class'] = y\n return df_poly, poly\n\n# Load the datasets if they are not already loaded\nif 'train_df_engineered' not in locals():\n train_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\n train_df_engineered = preprocess_data(train_df)[0]\n train_df_engineered, _ = engineer_features(train_df_engineered)\n\nif 'dev_df_engineered' not in locals():\n dev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n dev_df_engineered = preprocess_data(dev_df)[0]\n dev_df_engineered, _ = engineer_features(dev_df_engineered)\n\nif 'test_df_engineered' not in locals():\n test_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\n test_df_engineered = preprocess_data(test_df)[0]\n test_df_engineered, _ = engineer_features(test_df_engineered)\n\n# Function to optimize data types\ndef optimize_dtypes(df):\n for col in df.columns:\n if df[col].dtype == 'float64':\n df[col] = df[col].astype('float32')\n elif df[col].dtype == 'int64':\n df[col] = df[col].astype('int32')\n return df\n\n# Optimize data types for train, dev, and test datasets\ntrain_df_engineered = optimize_dtypes(train_df_engineered)\ndev_df_engineered = optimize_dtypes(dev_df_engineered)\ntest_df_engineered = optimize_dtypes(test_df_engineered)\n\n# Prepare data for modeling\nX_train = train_df_engineered.drop(columns=['class'])\ny_train = train_df_engineered['class']\nX_dev = dev_df_engineered.drop(columns=['class'])\ny_dev = dev_df_engineered['class']\nX_test = test_df_engineered\n\n# Feature selection using SelectFromModel\nbase_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\nselector = SelectFromModel(base_model, threshold='median')\nselector.fit(X_train, y_train)\n\nX_train_selected = selector.transform(X_train)\nX_dev_selected = selector.transform(X_dev)\nX_test_selected = selector.transform(X_test)\n\n# Define the final model\nfinal_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=5)\n\n# Train the final model\nfinal_model.fit(X_train_selected, y_train)\n\n# Predict on dev set\ny_dev_pred = final_model.predict(X_dev_selected)\n\n# Predict on test set\ny_test_pred = final_model.predict(X_test_selected)\n\n# Save predictions\npd.DataFrame({'target': 
y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, final_model.predict(X_train_selected))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n",
+ "active": true
+ },
+ {
+ "id": "0-2-1",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Before training, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-2-2",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-2-3",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-2-4",
+ "parent_id": "0-2",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "9.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3",
+ "parent_id": "0",
+ "avg_score": "81.6",
+ "dev_score": "81.0",
+ "visits": 4,
+ "order": 2,
+ "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+ "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+ "active": false
+ },
+ {
+ "id": "0-3-0",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-1",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-2",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3",
+ "parent_id": "0-3",
+ "avg_score": "81.9",
+ "dev_score": "81.0",
+ "visits": 3,
+ "order": 3,
+ "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.",
+ "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-0",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-1",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-2",
+ "parent_id": "0-3-3",
+ "avg_score": "82.3",
+ "dev_score": "82.3",
+ "visits": 2,
+ "order": 4,
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.",
+ "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n",
+ "active": false
+ },
+ {
+ "id": "0-3-3-3",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-3-4",
+ "parent_id": "0-3-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "3.5",
+ "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-3-4",
+ "parent_id": "0-3",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "2.5",
+ "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+ "code": "",
+ "active": false
+ },
+ {
+ "id": "0-4",
+ "parent_id": "0",
+ "avg_score": "0.0",
+ "dev_score": "0.0",
+ "visits": 0,
+ "order": "1.5",
+ "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+ "code": "",
+ "active": false
+ }
+]
\ No newline at end of file
diff --git a/src/components/demo2/demo2.vue b/src/components/demo2/demo2.vue
new file mode 100644
index 00000000..60d98f73
--- /dev/null
+++ b/src/components/demo2/demo2.vue
@@ -0,0 +1,427 @@
+
+
+
+