From a783c95686d8b86e180725e26d7a49385afcdaa1 Mon Sep 17 00:00:00 2001 From: zerolee <464806884@qq.com> Date: Mon, 28 Oct 2024 21:03:56 +0800 Subject: [PATCH 01/13] feat: demo2 --- package.json | 1 + pnpm-lock.yaml | 8 + src/DataInterpreter/index.md | 5 +- src/components/demo2/datas/credit-g/tree.json | 262 +++++++++++ .../demo2/datas/credit-g/tree_01.json | 24 + .../demo2/datas/credit-g/tree_02.json | 79 ++++ .../demo2/datas/credit-g/tree_03.json | 134 ++++++ .../demo2/datas/credit-g/tree_04.json | 145 ++++++ .../demo2/datas/credit-g/tree_05.json | 145 ++++++ .../demo2/datas/credit-g/tree_06.json | 200 ++++++++ .../demo2/datas/credit-g/tree_07.json | 211 +++++++++ .../demo2/datas/credit-g/tree_08.json | 222 +++++++++ .../demo2/datas/credit-g/tree_09.json | 277 ++++++++++++ .../demo2/datas/credit-g/tree_10.json | 288 ++++++++++++ src/components/demo2/demo2.vue | 427 ++++++++++++++++++ src/components/demoList.vue | 2 +- 16 files changed, 2427 insertions(+), 3 deletions(-) create mode 100644 src/components/demo2/datas/credit-g/tree.json create mode 100644 src/components/demo2/datas/credit-g/tree_01.json create mode 100644 src/components/demo2/datas/credit-g/tree_02.json create mode 100644 src/components/demo2/datas/credit-g/tree_03.json create mode 100644 src/components/demo2/datas/credit-g/tree_04.json create mode 100644 src/components/demo2/datas/credit-g/tree_05.json create mode 100644 src/components/demo2/datas/credit-g/tree_06.json create mode 100644 src/components/demo2/datas/credit-g/tree_07.json create mode 100644 src/components/demo2/datas/credit-g/tree_08.json create mode 100644 src/components/demo2/datas/credit-g/tree_09.json create mode 100644 src/components/demo2/datas/credit-g/tree_10.json create mode 100644 src/components/demo2/demo2.vue diff --git a/package.json b/package.json index ae3a2e92..c0c2774a 100644 --- a/package.json +++ b/package.json @@ -23,6 +23,7 @@ "@vueuse/core": "^10.6.1", "dayjs": "^1.11.10", "execa": "^8.0.1", + "highlight.js": "^11.10.0", "lint-staged": "^15.0.2", "prettier": "^3.0.3", "sass": "^1.71.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 30890144..48ac2ccf 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -35,6 +35,9 @@ dependencies: execa: specifier: ^8.0.1 version: 8.0.1 + highlight.js: + specifier: ^11.10.0 + version: 11.10.0 lint-staged: specifier: ^15.0.2 version: 15.0.2 @@ -2012,6 +2015,11 @@ packages: function-bind: 1.1.2 dev: false + /highlight.js@11.10.0: + resolution: {integrity: sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==} + engines: {node: '>=12.0.0'} + dev: false + /html-tags@3.3.1: resolution: {integrity: sha512-ztqyC3kLto0e9WbNp0aeP+M3kTt+nbaIveGmUxAtZa+8iFgKLUOD4YKM5j+f3QD89bra7UeumolZHKuOXnTmeQ==} engines: {node: '>=8'} diff --git a/src/DataInterpreter/index.md b/src/DataInterpreter/index.md index a98a6681..d84a5413 100644 --- a/src/DataInterpreter/index.md +++ b/src/DataInterpreter/index.md @@ -4,7 +4,8 @@ footer: false --- - + diff --git a/src/components/demo2/datas/credit-g/tree.json b/src/components/demo2/datas/credit-g/tree.json new file mode 100644 index 00000000..07f9c134 --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree.json @@ -0,0 +1,262 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling 
missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n" + }, + { + "id": "0-0", + "parent_id": "0", + "avg_score": "82.5", + "dev_score": "83.2", + "visits": 2, + "order": 6, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.", + "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = 
train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n" + }, + { + "id": "0-0-0", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "" + }, + { + "id": "0-0-1", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "" + }, + { + "id": "0-0-2", + "parent_id": "0-0", + "avg_score": "81.8", + "dev_score": "81.8", + "visits": 1, + "order": 7, + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n" 
+ }, + { + "id": "0-0-3", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "" + }, + { + "id": "0-0-4", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "" + }, + { + "id": "0-1", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 1, + "order": 8, + "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model performance.", + "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n" + }, + { + "id": "0-2", + "parent_id": "0", + "avg_score": "80.9", + "dev_score": "82.9", + "visits": 2, + "order": 9, + "instruction": "Visualize the distribution of numerical features to identify outliers and skewed distributions, then engineer features by creating new columns or transforming existing ones to improve model performance.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, 
poly\n\ntrain_df_engineered, poly = engineer_features(train_df_processed)\ndev_df_engineered, _ = engineer_features(dev_df_processed, poly)\ntest_df_engineered, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n" + }, + { + "id": "0-2-0", + "parent_id": "0-2", + "avg_score": "78.9", + "dev_score": "78.9", + "visits": 1, + "order": 10, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "import numpy as np\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.feature_selection import SelectFromModel\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler, PolynomialFeatures\n\n# Define the preprocess_data function\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n if y is not None:\n df_copy['class'] = y\n return df_copy, scaler\n\n# Define the engineer_features function\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n df_copy = df_copy.select_dtypes(include=['number'])\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n if y is not None:\n df_poly['class'] = y\n return df_poly, poly\n\n# Load the datasets if they are not already loaded\nif 'train_df_engineered' not in locals():\n train_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\n train_df_engineered = preprocess_data(train_df)[0]\n train_df_engineered, _ = engineer_features(train_df_engineered)\n\nif 'dev_df_engineered' not in locals():\n dev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n dev_df_engineered = preprocess_data(dev_df)[0]\n dev_df_engineered, _ = engineer_features(dev_df_engineered)\n\nif 'test_df_engineered' not in locals():\n test_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\n test_df_engineered = preprocess_data(test_df)[0]\n test_df_engineered, _ = engineer_features(test_df_engineered)\n\n# Function to optimize data types\ndef optimize_dtypes(df):\n for col in df.columns:\n if df[col].dtype == 'float64':\n df[col] = df[col].astype('float32')\n elif df[col].dtype == 'int64':\n df[col] = df[col].astype('int32')\n return df\n\n# Optimize data types for train, dev, and test datasets\ntrain_df_engineered = optimize_dtypes(train_df_engineered)\ndev_df_engineered = optimize_dtypes(dev_df_engineered)\ntest_df_engineered = optimize_dtypes(test_df_engineered)\n\n# Prepare data for modeling\nX_train = train_df_engineered.drop(columns=['class'])\ny_train = train_df_engineered['class']\nX_dev = dev_df_engineered.drop(columns=['class'])\ny_dev = dev_df_engineered['class']\nX_test = test_df_engineered\n\n# Feature selection using SelectFromModel\nbase_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\nselector = SelectFromModel(base_model, 
threshold='median')\nselector.fit(X_train, y_train)\n\nX_train_selected = selector.transform(X_train)\nX_dev_selected = selector.transform(X_dev)\nX_test_selected = selector.transform(X_test)\n\n# Define the final model\nfinal_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=5)\n\n# Train the final model\nfinal_model.fit(X_train_selected, y_train)\n\n# Predict on dev set\ny_dev_pred = final_model.predict(X_dev_selected)\n\n# Predict on test set\ny_test_pred = final_model.predict(X_test_selected)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, final_model.predict(X_train_selected))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n" + }, + { + "id": "0-2-1", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. Before training, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset.", + "code": "" + }, + { + "id": "0-2-2", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "" + }, + { + "id": "0-2-3", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "" + }, + { + "id": "0-2-4", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "" + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. 
Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n" + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "" + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "" + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "" + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. 
Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n" + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "" + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "" + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n" + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "" + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "" + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "" + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "" + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_01.json b/src/components/demo2/datas/credit-g/tree_01.json new file mode 100644 index 00000000..66cbf7da --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_01.json @@ -0,0 +1,24 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about 
the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_02.json b/src/components/demo2/datas/credit-g/tree_02.json new file mode 100644 index 00000000..0c01ebaf --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_02.json @@ -0,0 +1,79 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev 
Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. 
Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": true + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_03.json b/src/components/demo2/datas/credit-g/tree_03.json new file mode 100644 index 00000000..0f0cc77f --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_03.json @@ -0,0 +1,134 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = 
df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": true + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": true + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_04.json b/src/components/demo2/datas/credit-g/tree_04.json new file mode 100644 index 00000000..85db669d --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_04.json @@ -0,0 +1,145 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train 
Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": true + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": true + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": true + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. 
Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_05.json b/src/components/demo2/datas/credit-g/tree_05.json new file mode 100644 index 00000000..9d37c727 --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_05.json @@ -0,0 +1,145 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and 
test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": false + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": false + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. 
Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": false + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. 
Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_06.json b/src/components/demo2/datas/credit-g/tree_06.json new file mode 100644 index 00000000..95cd19e7 --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_06.json @@ -0,0 +1,200 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and 
test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-0", + "parent_id": "0", + "avg_score": "82.5", + "dev_score": "83.2", + "visits": 2, + "order": 6, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.", + "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n", + "active": true + }, + { + "id": "0-0-0", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "", + "active": false + }, + { + "id": "0-0-1", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. 
If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-0-3", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-0-4", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": false + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": false + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. 
Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_07.json b/src/components/demo2/datas/credit-g/tree_07.json new file mode 100644 index 00000000..80efa243 --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_07.json @@ -0,0 +1,211 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and 
test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-0", + "parent_id": "0", + "avg_score": "82.5", + "dev_score": "83.2", + "visits": 2, + "order": 6, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.", + "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n", + "active": true + }, + { + "id": "0-0-0", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "", + "active": false + }, + { + "id": "0-0-1", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. 
If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-0-2", + "parent_id": "0-0", + "avg_score": "81.8", + "dev_score": "81.8", + "visits": 1, + "order": 7, + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n", + "active": true + }, + { + "id": "0-0-3", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-0-4", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. 
Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": false + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. 
Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": false + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. 
Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_08.json b/src/components/demo2/datas/credit-g/tree_08.json new file mode 100644 index 00000000..fbf0cf7a --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_08.json @@ -0,0 +1,222 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and 
test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-0", + "parent_id": "0", + "avg_score": "82.5", + "dev_score": "83.2", + "visits": 2, + "order": 6, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.", + "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n", + "active": false + }, + { + "id": "0-0-0", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "", + "active": false + }, + { + "id": "0-0-1", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. 
If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-0-2", + "parent_id": "0-0", + "avg_score": "81.8", + "dev_score": "81.8", + "visits": 1, + "order": 7, + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-0-3", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-0-4", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-1", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 1, + "order": 8, + "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model 
performance.", + "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n", + "active": true + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": false + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": false + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. 
Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_09.json b/src/components/demo2/datas/credit-g/tree_09.json new file mode 100644 index 00000000..817809ac --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_09.json @@ -0,0 +1,277 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and 
test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-0", + "parent_id": "0", + "avg_score": "82.5", + "dev_score": "83.2", + "visits": 2, + "order": 6, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.", + "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n", + "active": false + }, + { + "id": "0-0-0", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "", + "active": false + }, + { + "id": "0-0-1", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. 
If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-0-2", + "parent_id": "0-0", + "avg_score": "81.8", + "dev_score": "81.8", + "visits": 1, + "order": 7, + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-0-3", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-0-4", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-1", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 1, + "order": 8, + "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model 
performance.", + "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n", + "active": false + }, + { + "id": "0-2", + "parent_id": "0", + "avg_score": "80.9", + "dev_score": "82.9", + "visits": 2, + "order": 9, + "instruction": "Visualize the distribution of numerical features to identify outliers and skewed distributions, then engineer features by creating new columns or transforming existing ones to improve model performance.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_engineered, poly = engineer_features(train_df_processed)\ndev_df_engineered, _ = engineer_features(dev_df_processed, poly)\ntest_df_engineered, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n", + "active": true + }, + { + "id": "0-2-1", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. Before training, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset.", + "code": "", + "active": false + }, + { + "id": "0-2-2", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-2-3", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-2-4", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": false + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": false + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. 
Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/datas/credit-g/tree_10.json b/src/components/demo2/datas/credit-g/tree_10.json new file mode 100644 index 00000000..19c970b7 --- /dev/null +++ b/src/components/demo2/datas/credit-g/tree_10.json @@ -0,0 +1,288 @@ +[ + { + "id": "0", + "parent_id": null, + "avg_score": "73.7", + "dev_score": "83.4", + "visits": 10, + "order": 1, + "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.", + "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and 
test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n", + "active": true + }, + { + "id": "0-0", + "parent_id": "0", + "avg_score": "82.5", + "dev_score": "83.2", + "visits": 2, + "order": 6, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before engineering new features or transforming existing ones to improve model performance.", + "code": "import numpy as np\n\n# Function to perform correlation analysis and identify highly correlated features\ndef correlation_analysis(df):\n corr_matrix = df.corr()\n upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))\n to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]\n return to_drop\n\n# Perform correlation analysis on the processed train dataset\nto_drop = correlation_analysis(train_df_processed.drop(columns=['class']))\n\n# Print the features to drop due to high correlation\nprint(\"Features to drop due to high correlation:\", to_drop)\n\n# Drop the identified features from the train, dev, and test datasets\ntrain_df_processed = train_df_processed.drop(columns=to_drop)\ndev_df_processed = dev_df_processed.drop(columns=to_drop)\ntest_df_processed = test_df_processed.drop(columns=to_drop)\n\n# Print the shape of the datasets after dropping highly correlated features\nprint(\"Shape of processed train dataset after dropping features:\", train_df_processed.shape)\nprint(\"Shape of processed dev dataset after dropping features:\", dev_df_processed.shape)\nprint(\"Shape of processed test dataset after dropping features:\", test_df_processed.shape)\n", + "active": false + }, + { + "id": "0-0-0", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "", + "active": false + }, + { + "id": "0-0-1", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable 'class' to check for class imbalance. 
If significant class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-0-2", + "parent_id": "0-0", + "avg_score": "81.8", + "dev_score": "81.8", + "visits": 1, + "order": 7, + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.ensemble import StackingClassifier\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier\nfrom sklearn.neighbors import KNeighborsClassifier\n\n# Prepare data for modeling\nX_train = train_df_processed.drop(columns=['class'])\ny_train = train_df_processed['class']\nX_dev = dev_df_processed.drop(columns=['class'])\ny_dev = dev_df_processed['class']\nX_test = test_df_processed\n\n# Define base models\nbase_models = [\n ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, learning_rate=0.1, max_depth=5)),\n ('rf', RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('et', ExtraTreesClassifier(n_estimators=200, max_depth=10, random_state=42)),\n ('knn', KNeighborsClassifier(n_neighbors=5))\n]\n\n# Define meta-model\nmeta_model = LogisticRegression()\n\n# Create stacking ensemble model\nstacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)\n\n# Train the stacking model\nstacking_model.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = stacking_model.predict(X_dev)\n\n# Predict on test set\ny_test_pred = stacking_model.predict(X_test)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, stacking_model.predict(X_train))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-0-3", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set, after analyzing the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-0-4", + "parent_id": "0-0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "6.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-1", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 1, + "order": 8, + "instruction": "Analyze the distribution of the target variable to check for class imbalance and decide on appropriate sampling techniques if necessary, before engineering features by creating new columns or transforming existing ones to improve model 
performance.", + "code": "from sklearn.decomposition import PCA\nfrom sklearn.preprocessing import PolynomialFeatures\n\ndef feature_engineering(df):\n df_copy = df.copy()\n \n # Remove the target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Apply PCA for dimensionality reduction\n pca = PCA(n_components=0.95) # Keep 95% of the variance\n pca_features = pca.fit_transform(df_copy)\n pca_df = pd.DataFrame(pca_features, columns=[f'PCA_{i}' for i in range(pca_features.shape[1])])\n \n # Apply PolynomialFeatures for interaction terms\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n poly_df = pd.DataFrame(poly_features, columns=poly_columns)\n \n # Combine PCA and Polynomial features\n combined_df = pd.concat([pca_df, poly_df], axis=1)\n \n # Re-add the target column if it was removed\n if y is not None:\n combined_df['class'] = y\n \n return combined_df\n\n# Apply feature engineering to train, dev, and test sets\ntrain_df_engineered = feature_engineering(train_df_processed)\ndev_df_engineered = feature_engineering(dev_df_processed)\ntest_df_engineered = feature_engineering(test_df_processed)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n", + "active": false + }, + { + "id": "0-2", + "parent_id": "0", + "avg_score": "80.9", + "dev_score": "82.9", + "visits": 2, + "order": 9, + "instruction": "Visualize the distribution of numerical features to identify outliers and skewed distributions, then engineer features by creating new columns or transforming existing ones to improve model performance.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_engineered, poly = engineer_features(train_df_processed)\ndev_df_engineered, _ = engineer_features(dev_df_processed, poly)\ntest_df_engineered, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Engineered Train Dataset:\")\nprint(train_df_engineered.head())\n", + "active": true + }, + { + "id": "0-2-0", + "parent_id": "0-2", + "avg_score": "78.9", + "dev_score": "78.9", + "visits": 1, + "order": 10, + "instruction": "Perform a correlation analysis to identify highly correlated features and potential multicollinearity issues before training a base model to predict the target column 'class' on the train set.", + "code": "import numpy as np\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\nfrom sklearn.feature_selection import SelectFromModel\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler, PolynomialFeatures\n\n# Define the preprocess_data function\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n if scaler is None:\n scaler 
= StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n if y is not None:\n df_copy['class'] = y\n return df_copy, scaler\n\n# Define the engineer_features function\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n df_copy = df_copy.select_dtypes(include=['number'])\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n if y is not None:\n df_poly['class'] = y\n return df_poly, poly\n\n# Load the datasets if they are not already loaded\nif 'train_df_engineered' not in locals():\n train_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\n train_df_engineered = preprocess_data(train_df)[0]\n train_df_engineered, _ = engineer_features(train_df_engineered)\n\nif 'dev_df_engineered' not in locals():\n dev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n dev_df_engineered = preprocess_data(dev_df)[0]\n dev_df_engineered, _ = engineer_features(dev_df_engineered)\n\nif 'test_df_engineered' not in locals():\n test_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\n test_df_engineered = preprocess_data(test_df)[0]\n test_df_engineered, _ = engineer_features(test_df_engineered)\n\n# Function to optimize data types\ndef optimize_dtypes(df):\n for col in df.columns:\n if df[col].dtype == 'float64':\n df[col] = df[col].astype('float32')\n elif df[col].dtype == 'int64':\n df[col] = df[col].astype('int32')\n return df\n\n# Optimize data types for train, dev, and test datasets\ntrain_df_engineered = optimize_dtypes(train_df_engineered)\ndev_df_engineered = optimize_dtypes(dev_df_engineered)\ntest_df_engineered = optimize_dtypes(test_df_engineered)\n\n# Prepare data for modeling\nX_train = train_df_engineered.drop(columns=['class'])\ny_train = train_df_engineered['class']\nX_dev = dev_df_engineered.drop(columns=['class'])\ny_dev = dev_df_engineered['class']\nX_test = test_df_engineered\n\n# Feature selection using SelectFromModel\nbase_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\nselector = SelectFromModel(base_model, threshold='median')\nselector.fit(X_train, y_train)\n\nX_train_selected = selector.transform(X_train)\nX_dev_selected = selector.transform(X_dev)\nX_test_selected = selector.transform(X_test)\n\n# Define the final model\nfinal_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=5)\n\n# Train the final model\nfinal_model.fit(X_train_selected, y_train)\n\n# Predict on dev set\ny_dev_pred = final_model.predict(X_dev_selected)\n\n# Predict on test set\ny_test_pred = final_model.predict(X_test_selected)\n\n# Save predictions\npd.DataFrame({'target': y_dev_pred}).to_csv('../workspace/jasmine/dev_predictions.csv', index=False)\npd.DataFrame({'target': y_test_pred}).to_csv('../workspace/jasmine/test_predictions.csv', index=False)\n\n# Evaluate performance\ntrain_f1 = f1_score(y_train, final_model.predict(X_train_selected))\ndev_f1 = f1_score(y_dev, y_dev_pred)\n\nprint(f\"Train F1 Score: {train_f1}\")\nprint(f\"Dev F1 Score: {dev_f1}\")\n", + "active": true + }, + { + "id": "0-2-1", + "parent_id": "0-2", + "avg_score": "0.0", + 
"dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. Before training, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset.", + "code": "", + "active": false + }, + { + "id": "0-2-2", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-2-3", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "", + "active": false + }, + { + "id": "0-2-4", + "parent_id": "0-2", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "9.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-3", + "parent_id": "0", + "avg_score": "81.6", + "dev_score": "81.0", + "visits": 4, + "order": 2, + "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. 
Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.", + "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n", + "active": false + }, + { + "id": "0-3-0", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-1", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.", + "code": "", + "active": false + }, + { + "id": "0-3-2", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.", + "code": "", + "active": false + }, + { + "id": "0-3-3", + "parent_id": "0-3", + "avg_score": "81.9", + "dev_score": "81.0", + "visits": 3, + "order": 3, + "instruction": "Train a base model to predict the target column 'class' on the train set. 
Prior to training, analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model.", + "code": "from sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom sklearn.metrics import f1_score\nfrom xgboost import XGBClassifier\n\n# Label encode categorical features if any\nlabel_encoders = {}\nfor col in categorical_columns:\n le = LabelEncoder()\n train_df_poly[col] = le.fit_transform(train_df_poly[col])\n dev_df_poly[col] = le.transform(dev_df_poly[col])\n test_df_poly[col] = le.transform(test_df_poly[col])\n label_encoders[col] = le\n\n# Separate features and target\nX_train = train_df_poly.drop(columns=['class'])\ny_train = train_df_poly['class']\nX_dev = dev_df_poly.drop(columns=['class'])\ny_dev = dev_df_poly['class']\nX_test = test_df_poly\n\n# Train a base XGBoost model\nmodel = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=200, max_depth=7, learning_rate=0.1)\nmodel.fit(X_train, y_train)\n\n# Predict on dev set\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Predict on test set\ny_test_pred = model.predict(X_test)\n\n# Save predictions\nimport os\noutput_dir = '../workspace/jasmine'\nos.makedirs(output_dir, exist_ok=True)\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n", + "active": false + }, + { + "id": "0-3-3-0", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and perform a correlation analysis to identify highly correlated features and potential multicollinearity issues.", + "code": "", + "active": false + }, + { + "id": "0-3-3-1", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the distribution of the target variable to check for class imbalance. 
If class imbalance is detected, apply appropriate sampling techniques before re-evaluating the model.", + "code": "", + "active": false + }, + { + "id": "0-3-3-2", + "parent_id": "0-3-3", + "avg_score": "82.3", + "dev_score": "82.3", + "visits": 2, + "order": 4, + "instruction": "Evaluate the base model on the dev set, report the F1 score, and visualize the distribution of numerical features to identify outliers and skewed distributions.", + "code": "# Evaluate the base model on the dev set and report the F1 score\ny_dev_pred = model.predict(X_dev)\ndev_f1 = f1_score(y_dev, y_dev_pred)\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n\n# Save the dev set predictions\npd.DataFrame(y_dev_pred, columns=['target']).to_csv(os.path.join(output_dir, 'dev_predictions.csv'), index=False)\n\n# Save the test set predictions\ny_test_pred = model.predict(X_test)\npd.DataFrame(y_test_pred, columns=['target']).to_csv(os.path.join(output_dir, 'test_predictions.csv'), index=False)\n\n# Print the train and dev set performance\ntrain_f1 = f1_score(y_train, model.predict(X_train))\nprint(f\"Train Set F1 Score: {train_f1}\")\nprint(f\"Dev Set F1 Score: {dev_f1}\")\n", + "active": false + }, + { + "id": "0-3-3-3", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Evaluate the base model on the dev set, report the F1 score, and analyze the frequency distribution of categorical features to understand their impact on the model performance.", + "code": "", + "active": false + }, + { + "id": "0-3-3-4", + "parent_id": "0-3-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "3.5", + "instruction": "Conduct a PCA (Principal Component Analysis) on the dev set to reduce dimensionality and understand the variance captured by different components. Then, evaluate the base model on the transformed dev set and report the F1 score.", + "code": "", + "active": false + }, + { + "id": "0-3-4", + "parent_id": "0-3", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "2.5", + "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.", + "code": "", + "active": false + }, + { + "id": "0-4", + "parent_id": "0", + "avg_score": "0.0", + "dev_score": "0.0", + "visits": 0, + "order": "1.5", + "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.", + "code": "", + "active": false + } +] \ No newline at end of file diff --git a/src/components/demo2/demo2.vue b/src/components/demo2/demo2.vue new file mode 100644 index 00000000..60d98f73 --- /dev/null +++ b/src/components/demo2/demo2.vue @@ -0,0 +1,427 @@ + + + + + diff --git a/src/components/demoList.vue b/src/components/demoList.vue index 7351a46f..d106d5bc 100644 --- a/src/components/demoList.vue +++ b/src/components/demoList.vue @@ -39,7 +39,7 @@ >
 {{ item.project }}
 </div>
-<div
 {{ item.prompt }}
 </div>
+<div
 {{ item.prompt }}
 </div>
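
Every `tree_*.json` snapshot added above shares one node schema: `id`, `parent_id`, `avg_score`, `dev_score`, `visits`, `order`, `instruction`, `code`, and an `active` flag marking the currently highlighted chain. As a reader aid only — this is an illustrative sketch, not code from demo2.vue (whose script body is not shown in this patch), and the file path and helper names are assumptions — the flat node list can be loaded and the active path recovered like this:

```python
import json

# Hypothetical path; the demo ships several snapshots (tree_01.json ... tree_10.json).
with open("src/components/demo2/datas/credit-g/tree_10.json", encoding="utf-8") as f:
    nodes = json.load(f)

by_id = {node["id"]: node for node in nodes}

# Group children under their parent so the flat list can be walked as a tree.
children = {}
for node in nodes:
    children.setdefault(node["parent_id"], []).append(node)

def active_path(node_id="0"):
    """Follow 'active' flags from the root down, yielding the highlighted chain."""
    node = by_id[node_id]
    path = [node]
    while True:
        nxt = [c for c in children.get(node["id"], []) if c["active"]]
        if not nxt:
            return path
        node = nxt[0]
        path.append(node)

for node in active_path():
    print(node["id"], node["dev_score"], node["instruction"].split(".")[0])
```

For tree_10.json this walks the chain `0 -> 0-2 -> 0-2-0`, matching the three nodes flagged `"active": true` in that file.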
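
Several unexpanded nodes (`visits: 0`, empty `code`) only describe their step: check the target distribution for class imbalance and, if found, oversample the minority class or undersample the majority class before training. The tree never records code for that branch, so the following is a hedged sketch of what such a step typically looks like, assuming a pandas DataFrame with a binary `class` column; it is not taken from the demo data:

```python
import pandas as pd
from sklearn.utils import resample

def oversample_minority(df: pd.DataFrame, target: str = "class", seed: int = 42) -> pd.DataFrame:
    """Naive random oversampling: duplicate minority rows until classes balance."""
    counts = df[target].value_counts()
    majority_label = counts.idxmax()
    majority = df[df[target] == majority_label]
    parts = [majority]
    for label, n in counts.items():
        if label == majority_label:
            continue
        minority = df[df[target] == label]
        # Sample with replacement up to the majority-class size.
        parts.append(resample(minority, replace=True, n_samples=len(majority), random_state=seed))
    return pd.concat(parts).sample(frac=1.0, random_state=seed).reset_index(drop=True)

# Toy usage: a 9:3 imbalance becomes 9:9 after oversampling.
toy = pd.DataFrame({"x": range(12), "class": [0] * 9 + [1] * 3})
print(oversample_minority(toy)["class"].value_counts())
```

If used, this would apply to the train split only; oversampling before the train/dev split would leak duplicated rows into evaluation, inflating the dev F1 scores the tree records.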
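
Likewise, several instructions propose "visualizing the distribution of numerical features to identify and handle outliers and skewed distributions", yet the recorded code for those nodes jumps straight to modeling. A minimal text-based version of that check (plots omitted) might look like the sketch below; the function name and the 1.5×IQR outlier rule are illustrative assumptions, not part of the demo:

```python
import numpy as np
import pandas as pd

def summarize_skew_and_outliers(df: pd.DataFrame, target: str = "class") -> pd.DataFrame:
    """Report skewness and 1.5*IQR outlier counts for each numerical feature."""
    num = df.drop(columns=[target], errors="ignore").select_dtypes(include=[np.number])
    rows = []
    for col in num.columns:
        q1, q3 = num[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        mask = (num[col] < q1 - 1.5 * iqr) | (num[col] > q3 + 1.5 * iqr)
        rows.append({"feature": col, "skew": num[col].skew(), "n_outliers": int(mask.sum())})
    return pd.DataFrame(rows).sort_values("n_outliers", ascending=False)

# Toy usage: the lognormal column surfaces first as skewed and outlier-heavy.
rng = np.random.default_rng(0)
toy = pd.DataFrame({"a": rng.lognormal(size=200), "b": rng.normal(size=200), "class": [0, 1] * 100})
print(summarize_skew_and_outliers(toy))
```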
From 5daf20abebb3a48a4082e373fe37455aa81ae02a Mon Sep 17 00:00:00 2001 From: liuminhui Date: Wed, 30 Oct 2024 15:02:44 +0800 Subject: [PATCH 02/13] user cases add case file link --- src/en/guide/use_cases/agent/interpreter/crawl_webpage.md | 2 ++ .../guide/use_cases/agent/interpreter/data_visualization.md | 2 +- src/en/guide/use_cases/agent/interpreter/email_summary.md | 2 ++ .../guide/use_cases/agent/interpreter/human_interaction.md | 2 ++ src/en/guide/use_cases/agent/interpreter/image_removebg.md | 2 ++ src/en/guide/use_cases/agent/interpreter/imitate_webpage.md | 2 ++ .../guide/use_cases/agent/interpreter/machine_learning.md | 2 ++ .../agent/interpreter/machine_learning_with_tools.md | 2 ++ src/en/guide/use_cases/agent/interpreter/ocr_receipt.md | 2 ++ .../agent/interpreter/solve_mathematical_problems.md | 6 ++++++ src/en/guide/use_cases/agent/interpreter/text2image.md | 2 ++ src/en/guide/use_cases/agent/receipt_assistant.md | 4 ++-- src/en/guide/use_cases/agent/tutorial_assistant.md | 6 ++++-- src/zh/guide/use_cases/agent/interpreter/crawl_webpage.md | 2 ++ .../guide/use_cases/agent/interpreter/data_visualization.md | 2 +- src/zh/guide/use_cases/agent/interpreter/email_summary.md | 2 ++ .../guide/use_cases/agent/interpreter/human_interaction.md | 2 ++ src/zh/guide/use_cases/agent/interpreter/image_removebg.md | 2 ++ src/zh/guide/use_cases/agent/interpreter/imitate_webpage.md | 2 ++ .../guide/use_cases/agent/interpreter/machine_learning.md | 2 ++ .../agent/interpreter/machine_learning_with_tools.md | 2 ++ src/zh/guide/use_cases/agent/interpreter/ocr_receipt.md | 2 ++ .../agent/interpreter/solve_mathematical_problems.md | 2 ++ src/zh/guide/use_cases/agent/interpreter/text2image.md | 2 ++ src/zh/guide/use_cases/agent/receipt_assistant.md | 4 ++-- src/zh/guide/use_cases/agent/tutorial_assistant.md | 6 ++++-- 26 files changed, 58 insertions(+), 10 deletions(-) diff --git a/src/en/guide/use_cases/agent/interpreter/crawl_webpage.md b/src/en/guide/use_cases/agent/interpreter/crawl_webpage.md index 8e3fce78..e8375422 100644 --- a/src/en/guide/use_cases/agent/interpreter/crawl_webpage.md +++ b/src/en/guide/use_cases/agent/interpreter/crawl_webpage.md @@ -12,6 +12,8 @@ Retrieve paper information containing the keywords: `multiagent` and `large lang ### Code +[examples/di/crawl_webpage.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/crawl_webpage.py) + ```bash python examples/di/crawl_webpage.py ``` diff --git a/src/en/guide/use_cases/agent/interpreter/data_visualization.md b/src/en/guide/use_cases/agent/interpreter/data_visualization.md index baeec5d7..308ecbab 100644 --- a/src/en/guide/use_cases/agent/interpreter/data_visualization.md +++ b/src/en/guide/use_cases/agent/interpreter/data_visualization.md @@ -16,7 +16,7 @@ Use `DataInterpreter` to perform a simple data analysis and visualize the sklear python examples/di/data_visualization.py ``` -The code in `examples/di/data_visualization.py` is as follows: +The code in [examples/di/data_visualization.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/data_visualization.py) is as follows: ```python import asyncio diff --git a/src/en/guide/use_cases/agent/interpreter/email_summary.md b/src/en/guide/use_cases/agent/interpreter/email_summary.md index 60fe2bff..04673071 100644 --- a/src/en/guide/use_cases/agent/interpreter/email_summary.md +++ b/src/en/guide/use_cases/agent/interpreter/email_summary.md @@ -12,6 +12,8 @@ After logging into the email, display the sender and the body of the latest 5 em ### Code 
+[examples/di/email_summary.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/email_summary.py) + ```bash python examples/di/email_summary.py ``` diff --git a/src/en/guide/use_cases/agent/interpreter/human_interaction.md b/src/en/guide/use_cases/agent/interpreter/human_interaction.md index 8794bbef..f1c289d0 100644 --- a/src/en/guide/use_cases/agent/interpreter/human_interaction.md +++ b/src/en/guide/use_cases/agent/interpreter/human_interaction.md @@ -12,6 +12,8 @@ We use the same [machine learning scenario](./machine_learning.md) as an example ### Code +[examples/di/machine_learning.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/machine_learning.py) + ``` python examples/di/machine_learning.py --auto_run False ``` diff --git a/src/en/guide/use_cases/agent/interpreter/image_removebg.md b/src/en/guide/use_cases/agent/interpreter/image_removebg.md index cb5cb1c0..e391dd92 100644 --- a/src/en/guide/use_cases/agent/interpreter/image_removebg.md +++ b/src/en/guide/use_cases/agent/interpreter/image_removebg.md @@ -12,6 +12,8 @@ Use `DataInterpreter` to remove background from a picture of a dog. ### Code +[examples/di/rm_image_background.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/rm_image_background.py) + ```bash python examples/di/rm_image_background.py ``` diff --git a/src/en/guide/use_cases/agent/interpreter/imitate_webpage.md b/src/en/guide/use_cases/agent/interpreter/imitate_webpage.md index d1258666..88aa7a7c 100644 --- a/src/en/guide/use_cases/agent/interpreter/imitate_webpage.md +++ b/src/en/guide/use_cases/agent/interpreter/imitate_webpage.md @@ -12,6 +12,8 @@ Given a URL or an image of a webpage, use the MetaGPT tool integrated with GPT-4 ### Code +[examples/di/imitate_webpage.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/imitate_webpage.py) + ```bash python examples/di/imitate_webpage.py ``` diff --git a/src/en/guide/use_cases/agent/interpreter/machine_learning.md b/src/en/guide/use_cases/agent/interpreter/machine_learning.md index f6265985..8ebdd906 100644 --- a/src/en/guide/use_cases/agent/interpreter/machine_learning.md +++ b/src/en/guide/use_cases/agent/interpreter/machine_learning.md @@ -12,6 +12,8 @@ We use the [sklearn wine recognition dataset](https://scikit-learn.org/stable/da ### Code +[examples/di/machine_learning.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/machine_learning.py) + ```bash python examples/di/machine_learning.py ``` diff --git a/src/en/guide/use_cases/agent/interpreter/machine_learning_with_tools.md b/src/en/guide/use_cases/agent/interpreter/machine_learning_with_tools.md index b47bcf91..46df8f26 100644 --- a/src/en/guide/use_cases/agent/interpreter/machine_learning_with_tools.md +++ b/src/en/guide/use_cases/agent/interpreter/machine_learning_with_tools.md @@ -8,6 +8,8 @@ Use `DataInterpreter` to model and predict the [titanic](https://www.kaggle.com/ ### Code +[examples/di/machine_learning_with_tools.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/machine_learning_with_tools.py) + ```bash python examples/di/machine_learning_with_tools.py ``` diff --git a/src/en/guide/use_cases/agent/interpreter/ocr_receipt.md b/src/en/guide/use_cases/agent/interpreter/ocr_receipt.md index 33b3fecb..8cb7184d 100644 --- a/src/en/guide/use_cases/agent/interpreter/ocr_receipt.md +++ b/src/en/guide/use_cases/agent/interpreter/ocr_receipt.md @@ -14,6 +14,8 @@ Use `DataInterpreter` to perform OCR recognition on the following receipt, extra ### Code 
+[examples/di/ocr_receipt.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/ocr_receipt.py) + > Note: You need to install Paddle-related dependencies to run this example, execute > `pip install metagpt[ocr]` diff --git a/src/en/guide/use_cases/agent/interpreter/solve_mathematical_problems.md b/src/en/guide/use_cases/agent/interpreter/solve_mathematical_problems.md index 305dc54c..ea40e0e7 100644 --- a/src/en/guide/use_cases/agent/interpreter/solve_mathematical_problems.md +++ b/src/en/guide/use_cases/agent/interpreter/solve_mathematical_problems.md @@ -12,6 +12,12 @@ At a school, all 60 students play on at least one of three teams: Basketball, So ### Code +[examples/di/solve_math_problems.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/solve_math_problems.py) + +```bash +python examples/di/solve_math_problems.py +``` + ```python import asyncio diff --git a/src/en/guide/use_cases/agent/interpreter/text2image.md b/src/en/guide/use_cases/agent/interpreter/text2image.md index 58b5032b..fc25cd38 100644 --- a/src/en/guide/use_cases/agent/interpreter/text2image.md +++ b/src/en/guide/use_cases/agent/interpreter/text2image.md @@ -12,6 +12,8 @@ Use the text-to-image tool of stable diffusion to generate an image from a given ### Code +[examples/di/sd_tool_usage.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/sd_tool_usage.py) + ```bash python examples/di/sd_tool_usage.py ``` diff --git a/src/en/guide/use_cases/agent/receipt_assistant.md b/src/en/guide/use_cases/agent/receipt_assistant.md index b2f2b9bc..9497728c 100644 --- a/src/en/guide/use_cases/agent/receipt_assistant.md +++ b/src/en/guide/use_cases/agent/receipt_assistant.md @@ -13,7 +13,7 @@ Supports OCR recognition of invoice files in `pdf`, `png`, `jpg`, and `zip` form ### Source Code -[GitHub Source Code](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/invoice_ocr_assistant.py) +[metagpt/roles/invoice_ocr_assistant.py](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/invoice_ocr_assistant.py) ## Role Definition @@ -292,7 +292,7 @@ Supports OCR recognition of invoice files in `pdf`, `png`, `jpg`, and `zip` form ### Execution Command Example -In the project's root directory, execute the command `python3 /examples/invoice_ocr.py`. +In the project's root directory, execute the command `python3 examples/invoice_ocr.py`. ### Execution Results diff --git a/src/en/guide/use_cases/agent/tutorial_assistant.md b/src/en/guide/use_cases/agent/tutorial_assistant.md index f205fd1e..bc769e8e 100644 --- a/src/en/guide/use_cases/agent/tutorial_assistant.md +++ b/src/en/guide/use_cases/agent/tutorial_assistant.md @@ -12,7 +12,7 @@ The design approach involves using the `LLM` (Large Language Model) to initially ### Source Code -[GitHub Source Code](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/tutorial_assistant.py) +[metagpt/roles/tutorial_assistant.py](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/tutorial_assistant.py) ## Role Definition @@ -190,7 +190,9 @@ The design approach involves using the `LLM` (Large Language Model) to initially ### Execution Command Examples -Provide corresponding execution command examples. 
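For context on the command added below: `examples/write_tutorial.py` is a small wrapper around the `TutorialAssistant` role documented above. A minimal sketch of that wrapper, assuming the example's current shape on main; treat the topic string and the `language` argument as illustrative.

```python
import asyncio

from metagpt.roles.tutorial_assistant import TutorialAssistant


async def main():
    # The assistant drafts a directory for the topic first,
    # then generates the content of each section in turn.
    topic = "Write a tutorial about MySQL"
    role = TutorialAssistant(language="Chinese")
    await role.run(topic)


if __name__ == "__main__":
    asyncio.run(main())
```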
+```bash +python examples/write_tutorial.py +``` ### Execution Results diff --git a/src/zh/guide/use_cases/agent/interpreter/crawl_webpage.md b/src/zh/guide/use_cases/agent/interpreter/crawl_webpage.md index 20235962..701d9e11 100644 --- a/src/zh/guide/use_cases/agent/interpreter/crawl_webpage.md +++ b/src/zh/guide/use_cases/agent/interpreter/crawl_webpage.md @@ -12,6 +12,8 @@ ### 代码 +[examples/di/crawl_webpage.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/crawl_webpage.py) + ```bash python examples/di/crawl_webpage.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/data_visualization.md b/src/zh/guide/use_cases/agent/interpreter/data_visualization.md index eda85f14..7adeab88 100644 --- a/src/zh/guide/use_cases/agent/interpreter/data_visualization.md +++ b/src/zh/guide/use_cases/agent/interpreter/data_visualization.md @@ -16,7 +16,7 @@ python examples/di/data_visualization.py ``` -`examples/di/data_visualization.py`文件中的代码具体为: +[examples/di/data_visualization.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/data_visualization.py) 文件中的代码具体为: ```python import asyncio diff --git a/src/zh/guide/use_cases/agent/interpreter/email_summary.md b/src/zh/guide/use_cases/agent/interpreter/email_summary.md index 396e441f..14719c14 100644 --- a/src/zh/guide/use_cases/agent/interpreter/email_summary.md +++ b/src/zh/guide/use_cases/agent/interpreter/email_summary.md @@ -12,6 +12,8 @@ ### 代码 +[examples/di/email_summary.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/email_summary.py) + ```bash python examples/di/email_summary.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/human_interaction.md b/src/zh/guide/use_cases/agent/interpreter/human_interaction.md index cdb22909..2fa38c38 100644 --- a/src/zh/guide/use_cases/agent/interpreter/human_interaction.md +++ b/src/zh/guide/use_cases/agent/interpreter/human_interaction.md @@ -12,6 +12,8 @@ ### 代码 +[examples/di/machine_learning.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/machine_learning.py) + ``` python examples/di/machine_learning.py --auto_run False ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/image_removebg.md b/src/zh/guide/use_cases/agent/interpreter/image_removebg.md index 7f252a5a..908d660d 100644 --- a/src/zh/guide/use_cases/agent/interpreter/image_removebg.md +++ b/src/zh/guide/use_cases/agent/interpreter/image_removebg.md @@ -12,6 +12,8 @@ ### 代码 +[examples/di/rm_image_background.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/rm_image_background.py) + ```bash python examples/di/rm_image_background.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/imitate_webpage.md b/src/zh/guide/use_cases/agent/interpreter/imitate_webpage.md index 7ef1e2c9..f5d60c79 100644 --- a/src/zh/guide/use_cases/agent/interpreter/imitate_webpage.md +++ b/src/zh/guide/use_cases/agent/interpreter/imitate_webpage.md @@ -12,6 +12,8 @@ ### 代码 +[examples/di/imitate_webpage.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/imitate_webpage.py) + ```bash python examples/di/imitate_webpage.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/machine_learning.md b/src/zh/guide/use_cases/agent/interpreter/machine_learning.md index 7a68ff2e..90b88c90 100644 --- a/src/zh/guide/use_cases/agent/interpreter/machine_learning.md +++ b/src/zh/guide/use_cases/agent/interpreter/machine_learning.md @@ -12,6 +12,8 @@ ### 代码 +[examples/di/machine_learning.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/machine_learning.py) + ```bash python 
examples/di/machine_learning.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/machine_learning_with_tools.md b/src/zh/guide/use_cases/agent/interpreter/machine_learning_with_tools.md index a74fd1d5..97b10850 100644 --- a/src/zh/guide/use_cases/agent/interpreter/machine_learning_with_tools.md +++ b/src/zh/guide/use_cases/agent/interpreter/machine_learning_with_tools.md @@ -8,6 +8,8 @@ ### 代码 +[examples/di/machine_learning_with_tools.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/machine_learning_with_tools.py) + ```bash python examples/di/machine_learning_with_tools.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/ocr_receipt.md b/src/zh/guide/use_cases/agent/interpreter/ocr_receipt.md index 1082d77f..7a01aa9d 100644 --- a/src/zh/guide/use_cases/agent/interpreter/ocr_receipt.md +++ b/src/zh/guide/use_cases/agent/interpreter/ocr_receipt.md @@ -14,6 +14,8 @@ OCR,是一种识别图片中文字,得到结构化文本信息的技术手 ### 代码 +[examples/di/ocr_receipt.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/ocr_receipt.py) + > 注意:你需要事先安装Paddle相关依赖以运行此示例,可执行 > `pip install metagpt[ocr]` diff --git a/src/zh/guide/use_cases/agent/interpreter/solve_mathematical_problems.md b/src/zh/guide/use_cases/agent/interpreter/solve_mathematical_problems.md index 5d0fc69e..b62e71ce 100644 --- a/src/zh/guide/use_cases/agent/interpreter/solve_mathematical_problems.md +++ b/src/zh/guide/use_cases/agent/interpreter/solve_mathematical_problems.md @@ -12,6 +12,8 @@ At a school, all 60 students play on at least one of three teams: Basketball, So ### 代码 +[examples/di/solve_math_problems.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/solve_math_problems.py) + ```bash python examples/di/solve_math_problems.py ``` diff --git a/src/zh/guide/use_cases/agent/interpreter/text2image.md b/src/zh/guide/use_cases/agent/interpreter/text2image.md index d86dc1e5..64321ebe 100644 --- a/src/zh/guide/use_cases/agent/interpreter/text2image.md +++ b/src/zh/guide/use_cases/agent/interpreter/text2image.md @@ -12,6 +12,8 @@ Text2Image,指通过文字描述获取图片 ### 代码 +[examples/di/sd_tool_usage.py](https://github.com/geekan/MetaGPT/blob/main/examples/di/sd_tool_usage.py) + ```bash python examples/di/sd_tool_usage.py ``` diff --git a/src/zh/guide/use_cases/agent/receipt_assistant.md b/src/zh/guide/use_cases/agent/receipt_assistant.md index 7c589e30..ea1b54d8 100644 --- a/src/zh/guide/use_cases/agent/receipt_assistant.md +++ b/src/zh/guide/use_cases/agent/receipt_assistant.md @@ -13,7 +13,7 @@ ### 源码 -[GitHub Source Code](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/invoice_ocr_assistant.py) +[metagpt/roles/invoice_ocr_assistant.py](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/invoice_ocr_assistant.py) ## 角色定义 @@ -301,7 +301,7 @@ ### 执行命令样例 -在项目根目录下,执行命令行 `python3 /examples/invoice_ocr.py`。 +在项目根目录下,执行命令行 `python3 examples/invoice_ocr.py`。 ### 执行结果 diff --git a/src/zh/guide/use_cases/agent/tutorial_assistant.md b/src/zh/guide/use_cases/agent/tutorial_assistant.md index edfc44aa..58a9331c 100644 --- a/src/zh/guide/use_cases/agent/tutorial_assistant.md +++ b/src/zh/guide/use_cases/agent/tutorial_assistant.md @@ -12,7 +12,7 @@ ### 源码 -[GitHub Source Code](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/tutorial_assistant.py) +[metagpt/roles/tutorial_assistant.py](https://github.com/geekan/MetaGPT/blob/main/metagpt/roles/tutorial_assistant.py) ## 角色定义 @@ -195,7 +195,9 @@ ### 执行命令样例 -贴对应的执行命令样例 +```bash +python examples/write_tutorial.py +``` ### 执行结果 From 900efa4a660d1e7484f239447c32405d2914bbd1 Mon 
Sep 17 00:00:00 2001 From: liuminhui Date: Wed, 30 Oct 2024 15:52:19 +0800 Subject: [PATCH 03/13] In-Depth Guides Optimize --- src/en/guide/in_depth_guides/breakpoint_recovery.md | 6 +++++- src/en/guide/in_depth_guides/rag_module.md | 2 +- src/zh/guide/in_depth_guides/breakpoint_recovery.md | 6 +++++- src/zh/guide/in_depth_guides/rag_module.md | 2 +- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/en/guide/in_depth_guides/breakpoint_recovery.md b/src/en/guide/in_depth_guides/breakpoint_recovery.md index f64b6fcf..b94cc7f4 100644 --- a/src/en/guide/in_depth_guides/breakpoint_recovery.md +++ b/src/en/guide/in_depth_guides/breakpoint_recovery.md @@ -29,7 +29,9 @@ When the program is interrupted or terminated, the file structure in the storage team.json # Contains information such as team, environment, roles, actions, etc. ``` -Example of data summary corresponding to `team.json`. +<details>
+ +Example of data summary corresponding to team.json ```json { @@ -140,6 +142,8 @@ Example of data summary corresponding to `team.json`. } ``` +
+</details> + ### Execution order during recovery Since MetaGPT is an asynchronous execution framework, there are several typical interception points and recovery sequences as follows. diff --git a/src/en/guide/in_depth_guides/rag_module.md b/src/en/guide/in_depth_guides/rag_module.md index aeddca0a..612f3ef8 100644 --- a/src/en/guide/in_depth_guides/rag_module.md +++ b/src/en/guide/in_depth_guides/rag_module.md @@ -10,7 +10,7 @@ This article focuses on the RAG functions provided by the current MetaGPT: 4. Data update, addition of text and Python objects. 5. Data storage and recovery, vectorization is not required each time. -For more examples, please see [rag_pipeline](https://github.com/geekan/MetaGPT/blob/main/examples/rag_pipeline.py) and [rag_search](https://github.com/geekan/MetaGPT/blob/main/examples/rag_search.py) +For more examples, please see [rag_pipeline](https://github.com/geekan/MetaGPT/blob/main/examples/rag/rag_pipeline.py) and [rag_search](https://github.com/geekan/MetaGPT/blob/main/examples/rag/rag_search.py) ## Prepare diff --git a/src/zh/guide/in_depth_guides/breakpoint_recovery.md b/src/zh/guide/in_depth_guides/breakpoint_recovery.md index 7a6245d6..7cd33e5c 100644 --- a/src/zh/guide/in_depth_guides/breakpoint_recovery.md +++ b/src/zh/guide/in_depth_guides/breakpoint_recovery.md @@ -29,7 +29,9 @@ team.json # 包含团队、环境、角色、动作等信息 ``` -`team.json`对应内容的数据概要示例。 +<details>
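Because `team.json` is plain JSON, the serialized snapshot wrapped in the collapsible blocks added here can be inspected without loading MetaGPT at all. A minimal sketch, assuming the default `storage/team` layout and the top-level structure shown in the docs' example; key names may vary across MetaGPT versions.

```python
import json
from pathlib import Path

# Path assumed from the storage layout described above; adjust to your workspace.
team_file = Path("storage/team/team.json")
state = json.loads(team_file.read_text(encoding="utf-8"))

# The top-level keys summarize what was serialized
# (team context, environment, roles, pending actions, ...).
print(sorted(state))

# If roles are nested under the environment, list them by name.
for role_key in state.get("env", {}).get("roles", {}):
    print(role_key)
```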
+ +team.json 对应内容的数据概要示例。 ```json { @@ -140,6 +142,8 @@ } ``` +
+</details> + ### 恢复时的执行顺序 由于MetaGPT是异步执行框架,对于下述几种典型的中断截点和恢复顺序。 diff --git a/src/zh/guide/in_depth_guides/rag_module.md b/src/zh/guide/in_depth_guides/rag_module.md index 4fb476b7..1f21b253 100644 --- a/src/zh/guide/in_depth_guides/rag_module.md +++ b/src/zh/guide/in_depth_guides/rag_module.md @@ -10,7 +10,7 @@ RAG(Retrieval-Augmented Generation)通过引用外部权威知识库来优 4. 数据更新,增加文本与python对象 5. 数据保存及恢复,不用每次都进行向量化 -更多的例子请查看 [rag_pipeline](https://github.com/geekan/MetaGPT/blob/main/examples/rag_pipeline.py) 和 [rag_search](https://github.com/geekan/MetaGPT/blob/main/examples/rag_search.py) +更多的例子请查看 [rag_pipeline](https://github.com/geekan/MetaGPT/blob/main/examples/rag/rag_pipeline.py) 和 [rag_search](https://github.com/geekan/MetaGPT/blob/main/examples/rag/rag_search.py) ## 前置准备 From 7624b74a7ba202afa11edb14a1329f1d2b8c4779 Mon Sep 17 00:00:00 2001 From: chenshuanglong Date: Wed, 30 Oct 2024 22:29:36 +0800 Subject: [PATCH 04/13] feat: demo2 --- src/components/demo2/demo2.vue | 667 +++++++++++++++++++++++++-------- 1 file changed, 521 insertions(+), 146 deletions(-) diff --git a/src/components/demo2/demo2.vue b/src/components/demo2/demo2.vue index 60d98f73..e22329e6 100644 --- a/src/components/demo2/demo2.vue +++ b/src/components/demo2/demo2.vue @@ -1,23 +1,89 @@ +watch( + currentFolder, + async () => { + stopAutoPlay(); + index.value = 0; + activeTreeNodeId.value = null; + trees.value = await fetchTreeData(currentFolder.value); + + startAutoPlay(); + nextTick(() => { + draw(); + }); + }, + { + immediate: true, + } +); + From e032ef2b0a1ac51e4e3be42dde0a03804d032fb8 Mon Sep 17 00:00:00 2001 From: chenshuanglong Date: Wed, 30 Oct 2024 22:34:37 +0800 Subject: [PATCH 05/13] feat: demo2 --- src/components/demo2/demo2.vue | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/src/components/demo2/demo2.vue b/src/components/demo2/demo2.vue index e22329e6..4202c0d4 100644 --- a/src/components/demo2/demo2.vue +++ b/src/components/demo2/demo2.vue @@ -1,7 +1,6 @@
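Closing note on the `rag_module.md` links corrected in PATCH 03: the referenced `rag_pipeline`/`rag_search` examples build on MetaGPT's `SimpleEngine`. A minimal sketch of that pattern, assuming the engine API on current main and a hypothetical local file `docs/example.txt`; the RAG extra must be installed first (for example, `pip install metagpt[rag]`).

```python
import asyncio

from metagpt.rag.engines import SimpleEngine


async def main():
    # Build an in-memory retrieval engine over local documents,
    # then answer a question grounded in their contents.
    engine = SimpleEngine.from_docs(input_files=["docs/example.txt"])
    answer = await engine.aquery("What does the document say about retrieval?")
    print(answer)


if __name__ == "__main__":
    asyncio.run(main())
```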