diff --git a/raiwidgets/raiwidgets/responsibleai_dashboard_input.py b/raiwidgets/raiwidgets/responsibleai_dashboard_input.py
index 9ae75cb4fe..6c6738da4b 100644
--- a/raiwidgets/raiwidgets/responsibleai_dashboard_input.py
+++ b/raiwidgets/raiwidgets/responsibleai_dashboard_input.py
@@ -178,6 +178,15 @@ def debug_ml(self, data):
             num_leaves = data[4]
             min_child_samples = data[5]
             metric = display_name_to_metric[data[6]]
+            if not hasattr(self._analysis, '_text_column'):
+                text_cols = None
+            else:
+                text_cols = self._analysis._text_column
+            if text_cols is None:
+                text_cols = []
+            elif isinstance(text_cols, str):
+                text_cols = [text_cols]
+            features = [f for f in features if f not in text_cols]
             filtered_data_df = self._prepare_filtered_error_analysis_data(
                 features, filters, composite_filters, metric)
 
diff --git a/responsibleai/responsibleai/managers/error_analysis_manager.py b/responsibleai/responsibleai/managers/error_analysis_manager.py
index 5c5cd929c9..86a4acaf11 100644
--- a/responsibleai/responsibleai/managers/error_analysis_manager.py
+++ b/responsibleai/responsibleai/managers/error_analysis_manager.py
@@ -253,8 +253,12 @@ def __init__(self, model: Any, dataset: pd.DataFrame, target_column: str,
             for evaluating the model.
         :type dropped_features: Optional[List[str]]
         """
-        self._true_y = dataset[target_column]
-        self._dataset = dataset.drop(columns=[target_column])
+        if target_column is None:
+            self._true_y = None
+            self._dataset = dataset.copy()
+        else:
+            self._true_y = dataset[target_column]
+            self._dataset = dataset.drop(columns=[target_column])
         self._feature_names = list(self._dataset.columns)
         self._model_task = model_task
         self._classes = classes
diff --git a/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py b/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
index c1f15c17f2..515f502eb3 100644
--- a/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
+++ b/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
@@ -207,8 +207,8 @@ def __init__(self, model: Any, dataset: pd.DataFrame,
             sup_task_type = ErrorAnalysisTask.REGRESSION
             ext_dataset = ext_dataset.copy()
             del ext_dataset['prompt']
-            ext_dataset['target_score'] = 5
             target_column = 'target_score'
+            ext_dataset[target_column] = 5
         else:
             sup_task_type = ErrorAnalysisTask.CLASSIFICATION
         super(ErrorAnalysisManager, self).__init__(
@@ -244,7 +244,8 @@ def _create_index_predictor(model, dataset, target_column,
     :return: A wrapped predictor that uses index to retrieve text data.
     :rtype: WrappedIndexPredictorModel
     """
-    dataset = dataset.drop(columns=[target_column])
+    if target_column is not None:
+        dataset = dataset.drop(columns=[target_column])
     dataset = get_text_columns(dataset, text_column)
     index_predictor = WrappedIndexPredictorModel(
         model, dataset, is_multilabel, task_type, classes)
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py b/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
index 00285c5cfb..66588b7f26 100644
--- a/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
@@ -190,7 +190,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
         self._ext_test = ext_test
         self._ext_features = ext_features
         self._ext_test_df = pd.DataFrame(ext_test, columns=ext_features)
-        self._ext_test_df[target_column] = test[target_column]
+        if target_column is not None:
+            self._ext_test_df[target_column] = test[target_column]
         self.predict_output = None
 
         super(RAITextInsights, self).__init__(
@@ -273,16 +274,18 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
             an exception will be raised.
         :type text_column: str or list[str]
         """
-        if not isinstance(target_column, list):
-            target_column = [target_column]
-        # Pick one row from test data
-        small_test_data = test.iloc[0:1].drop(
-            target_column, axis=1)
+        small_test_data = test.iloc[0:1]
+        if target_column is not None:
+            if not isinstance(target_column, list):
+                target_column = [target_column]
+            # Pick one row from test data
+            small_test_data = small_test_data.drop(
+                target_column, axis=1)
         small_test_data = get_text_columns(small_test_data, text_column)
         small_test_data = small_test_data.iloc[0]
-        if task_type not in [
-            ModelTask.QUESTION_ANSWERING,
-            ModelTask.GENERATIVE_TEXT]:
+        list_task_outputs = [ModelTask.QUESTION_ANSWERING,
+                             ModelTask.GENERATIVE_TEXT]
+        if task_type not in list_task_outputs:
             small_test_data = small_test_data.tolist()
         # Call the model
         try:
@@ -592,13 +595,16 @@ def _get_dataset(self):
 
         dashboard_dataset.features = self._ext_test
 
-        true_y = self.test[self.target_column]
-        if true_y is not None and len(true_y) == row_length:
-            true_y = convert_to_list(true_y)
-            if is_classification_task:
-                true_y = self._convert_labels(
-                    true_y, dashboard_dataset.class_names)
-        dashboard_dataset.true_y = true_y
+        if self.target_column is None:
+            dashboard_dataset.true_y = None
+        else:
+            true_y = self.test[self.target_column]
+            if true_y is not None and len(true_y) == row_length:
+                true_y = convert_to_list(true_y)
+                if is_classification_task:
+                    true_y = self._convert_labels(
+                        true_y, dashboard_dataset.class_names)
+            dashboard_dataset.true_y = true_y
 
         dashboard_dataset.feature_names = self._ext_features
         dashboard_dataset.target_column = self.target_column
diff --git a/responsibleai_text/responsibleai_text/utils/feature_extractors.py b/responsibleai_text/responsibleai_text/utils/feature_extractors.py
index bcff01c442..35c9be4f25 100644
--- a/responsibleai_text/responsibleai_text/utils/feature_extractors.py
+++ b/responsibleai_text/responsibleai_text/utils/feature_extractors.py
@@ -71,9 +71,13 @@ def extract_features(text_dataset: pd.DataFrame,
         if has_dropped_features and column_names[j] in dropped_features:
             continue
         feature_names.append(column_names[j])
-    if not isinstance(target_column, list):
+
+    if not isinstance(target_column, (list, type(None))):
         target_column = [target_column]
-    text_features = text_dataset.drop(target_column, axis=1)
+
+    text_features = text_dataset.copy()
+    if target_column is not None:
+        text_features = text_features.drop(target_column, axis=1)
 
     if task_type in single_text_col_tasks:
         sentences = text_features.iloc[:, 0].tolist()
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py
index 2a157e7d3b..712bb90d79 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py
@@ -14,3 +14,17 @@
 Your response will be used in automated evaluation of question-answering \
 systems, and must be an integer between 1 and 5, and nothing else.
 """.strip()
+
+_EXAMPLES = """
+This rating value should always be an integer between 1 and 5. So the rating \
+produced should be 1 or 2 or 3 or 4 or 5.
+Some examples of valid responses are:
+1
+2
+5
+Some examples of invalid responses are:
+1/5
+1.5
+3.0
+5 stars
+""".strip()
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py
index 5e20b2f21d..90b6048370 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py
@@ -5,7 +5,8 @@
 
 import pandas as pd
 
-from responsibleai_text.utils.genai_metrics.constants import _SYS_PROMPT
+from responsibleai_text.utils.genai_metrics.constants import (_EXAMPLES,
+                                                              _SYS_PROMPT)
 
 
 def format_str(s, **kwargs):
@@ -21,6 +22,7 @@ def format_str(s, **kwargs):
 
 def _compute_metric(template, logger, wrapper_model, **kwargs):
     m = []
+    template = template % _EXAMPLES
     templated_ques = format_str(template, **kwargs)
 
     inp = pd.DataFrame({
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py
index 5623b70bec..31e650ee06 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py
@@ -44,17 +44,7 @@
 Four stars: the answer is mostly coherent
 Five stars: the answer has perfect coherency
 
-This rating value should always be an integer between 1 and 5. So the rating \
-produced should be 1 or 2 or 3 or 4 or 5.
-Some examples of valid responses are:
-1
-2
-5
-Some examples of invalid responses are:
-1/5
-1.5
-3.0
-5 stars
+%s
 
 QUESTION:
 {question}
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py
index f26bd3467a..72a871bc4c 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py
@@ -46,17 +46,7 @@
 Four stars: the predicted answer is mostly similar to the correct answer
 Five stars: the predicted answer is completely similar to the correct answer
 
-This rating value should always be an integer between 1 and 5. So the rating \
-produced should be 1 or 2 or 3 or 4 or 5.
-Some examples of valid responses are:
-1
-2
-5
-Some examples of invalid responses are:
-1/5
-1.5
-3.0
-5 stars
+%s
 
 QUESTION:
 {question}
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py
index 0531ccd228..9d38e8f0d3 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py
@@ -45,17 +45,7 @@
 Four stars: the answer is mostly fluent
 Five stars: the answer has perfect fluency
 
-This rating value should always be an integer between 1 and 5. So the rating \
-produced should be 1 or 2 or 3 or 4 or 5.
-Some examples of valid responses are:
-1
-2
-5
-Some examples of invalid responses are:
-1/5
-1.5
-3.0
-5 stars
+%s
 
 QUESTION:
 {question}
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py
index 28da15777d..4be9501b5b 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py
@@ -47,17 +47,7 @@
 Note the ANSWER is generated by a computer system, it can contain certain \
 symbols, which should not be a negative factor in the evaluation.
 
-This rating value should always be an integer between 1 and 5. So the rating \
-produced should be 1 or 2 or 3 or 4 or 5.
-Some examples of valid responses are:
-1
-2
-5
-Some examples of invalid responses are:
-1/5
-1.5
-3.0
-5 stars
+%s
 
 CONTEXT:
 {context}
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py
index 01965bfcb2..89c7706d62 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py
@@ -45,17 +45,7 @@
 Four stars: the answer is mostly relevant
 Five stars: the answer has perfect relevance
 
-This rating value should always be an integer between 1 and 5. So the rating \
-produced should be 1 or 2 or 3 or 4 or 5.
-Some examples of valid responses are:
-1
-2
-5
-Some examples of invalid responses are:
-1/5
-1.5
-3.0
-5 stars
+%s
 
 QUESTION AND CONTEXT:
 {question}
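A minimal standalone sketch of the prompt assembly these changes set up: each metric module's template now carries a %s placeholder where the duplicated rating-examples text used to be, and _compute_metric splices the shared _EXAMPLES constant in before the per-row fields are filled. The template text, the example values, and the simplified format_str below are hypothetical stand-ins, not the library code.

# Standalone sketch of the shared-examples substitution (hypothetical names).
_EXAMPLES = """
This rating value should always be an integer between 1 and 5. So the rating
produced should be 1 or 2 or 3 or 4 or 5.
""".strip()

# Illustrative template only; the real metric prompts live in coherence.py etc.
_TEMPLATE = """
Rate the coherency of the answer between one and five stars.

%s

QUESTION:
{question}

ANSWER:
{prediction}
""".strip()


def format_str(s, **kwargs):
    # Simplified stand-in for the real format_str: fill the {question} and
    # {prediction} fields after the examples block has been substituted.
    return s.format(**kwargs)


# Step 1: splice the shared examples block into the %s placeholder,
# mirroring `template = template % _EXAMPLES` in _compute_metric.
template = _TEMPLATE % _EXAMPLES
# Step 2: fill the per-row fields, as format_str does for each example.
prompt = format_str(template, question='What is 2 + 2?', prediction='4')
print(prompt)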