From c8ff9a265eed5545a7fba935e43598ee6185bbcd Mon Sep 17 00:00:00 2001
From: Kartik Choudhary
Date: Thu, 1 Feb 2024 17:26:58 -0500
Subject: [PATCH] Add methods and constants for genai metrics (#2524)

* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* add methods and constants for genai task type

Signed-off-by: Kartik Choudhary

* add missing files for genai metrics

Signed-off-by: Kartik Choudhary

* update copyright information

Signed-off-by: Kartik Choudhary

---------

Signed-off-by: Kartik Choudhary
---
 .../raiwidgets/responsibleai_dashboard.py     | 14 +++
 .../responsibleai_dashboard_input.py          | 33 ++++++-
 .../responsibleai_text/common/constants.py    |  8 ++
 .../managers/error_analysis_manager.py        | 20 +++-
 .../rai_text_insights/rai_text_insights.py    | 97 ++++++++++++++++++-
 .../utils/feature_extractors.py               | 19 +++-
 .../utils/genai_metrics/__init__.py           |  4 +
 .../utils/genai_metrics/scripts/__init__.py   |  4 +
 .../utils/genai_metrics/scripts/_compute.py   |  2 +-
 .../genai_metrics/scripts/equivalence.py      |  9 ++
 .../utils/genai_metrics/scripts/fluency.py    |  9 ++
 .../genai_metrics/scripts/groundedness.py     | 12 +++
 .../utils/genai_metrics/scripts/relevance.py  |  9 ++
 13 files changed, 233 insertions(+), 7 deletions(-)
 create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/__init__.py
 create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/__init__.py

diff --git a/raiwidgets/raiwidgets/responsibleai_dashboard.py b/raiwidgets/raiwidgets/responsibleai_dashboard.py
index 3ac0a11fff..cadff8476b 100644
--- a/raiwidgets/raiwidgets/responsibleai_dashboard.py
+++ b/raiwidgets/raiwidgets/responsibleai_dashboard.py
@@ -122,6 +122,15 @@ def get_question_answering_metrics():
             methods=["POST"]
         )
 
+        def get_generative_text_metrics():
+            data = request.get_json(force=True)
+            return jsonify(self.input.get_generative_text_metrics(data))
+        self.add_url_rule(
+            get_generative_text_metrics,
+            '/get_generative_text_metrics',
+            methods=["POST"]
+        )
+
         if hasattr(self._service, 'socketio'):
             @self._service.socketio.on('handle_object_detection_json')
             def handle_object_detection_json(od_json):
@@ -132,3 +141,8 @@ def handle_object_detection_json(od_json):
             def handle_question_answering_json(qa_json):
                 qa_data = json.loads(qa_json['data'])
                 return self.input.get_question_answering_metrics(qa_data)
+
+            @self._service.socketio.on('handle_generative_text_json')
+            def handle_generative_text_json(gt_json):
+                gt_data = json.loads(gt_json['data'])
+                return self.input.get_generative_text_metrics(gt_data)
diff --git a/raiwidgets/raiwidgets/responsibleai_dashboard_input.py b/raiwidgets/raiwidgets/responsibleai_dashboard_input.py
index 0df2fdf3f2..9ae75cb4fe 100644
--- a/raiwidgets/raiwidgets/responsibleai_dashboard_input.py
+++ b/raiwidgets/raiwidgets/responsibleai_dashboard_input.py
@@ -171,7 +171,7 @@ def _prepare_filtered_error_analysis_data(self, features, filters,
 
     def debug_ml(self, data):
         try:
-            features = data[0]
+            features = data[0]  # TODO: Remove prompt feature
             filters = data[1]
             composite_filters = data[2]
             max_depth = data[3]
@@ -484,3 +484,34 @@ def get_question_answering_metrics(self, post_data):
                     "inner error: {}".format(e_str),
                 WidgetRequestResponseConstants.data: []
             }
+
+    def get_generative_text_metrics(self, post_data):
+        """Flask endpoint function to get Model Overview metrics
+        for the Generative Text scenario.
+
+        :param post_data: List of inputs in the order
+            [selection_indexes, generative_text_cache].
+        :type post_data: List
+
+        :return: JSON/dict data response
+        :rtype: Dict[str, List]
+        """
+        try:
+            selection_indexes = post_data[0]
+            generative_text_cache = post_data[1]
+            exp = self._analysis.compute_genai_metrics(
+                selection_indexes,
+                generative_text_cache
+            )
+            return {
+                WidgetRequestResponseConstants.data: exp
+            }
+        except Exception as e:
+            print(e)
+            traceback.print_exc()
+            e_str = _format_exception(e)
+            return {
+                WidgetRequestResponseConstants.error:
+                    EXP_VIZ_ERR_MSG.format(e_str),
+                WidgetRequestResponseConstants.data: []
+            }
diff --git a/responsibleai_text/responsibleai_text/common/constants.py b/responsibleai_text/responsibleai_text/common/constants.py
index 9efcdaeed5..1486d1d515 100644
--- a/responsibleai_text/responsibleai_text/common/constants.py
+++ b/responsibleai_text/responsibleai_text/common/constants.py
@@ -18,6 +18,8 @@ class ModelTask(str, Enum):
     QUESTION_ANSWERING = 'question_answering'
     ENTAILMENT = 'entailment'
     SUMMARIZATIONS = 'summarizations'
+    GENERATIVE_TEXT = 'generative_text'
+    GENERATIVE_TEXT_CHAT = 'generative_text_chat'
     UNKNOWN = 'unknown'
 
 
@@ -34,3 +36,9 @@ class QuestionAnsweringFields(object):
     QUESTION = "question"
     CONTEXT = "context"
     ANSWERS = "answers"
+
+
+class GenerativeTextFields(object):
+    PROMPT = "prompt"
+    SYS_PROMPT = "sys_prompt"
+    RESPONSE = "response"
diff --git a/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py b/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
index 45d4147ff6..c1f15c17f2 100644
--- a/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
+++ b/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
@@ -12,6 +12,7 @@
 import pandas as pd
 from ml_wrappers import wrap_model
 
+from erroranalysis._internal.constants import ModelTask as ErrorAnalysisTask
 from erroranalysis._internal.error_analyzer import ModelAnalyzer
 from erroranalysis._internal.error_report import as_error_report
 from responsibleai._tools.shared.state_directory_management import \
@@ -22,6 +23,7 @@
 from responsibleai.managers.error_analysis_manager import as_error_config
 from responsibleai_text.common.constants import ModelTask
 from responsibleai_text.utils.feature_extractors import get_text_columns
+from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric
 
 LABELS = 'labels'
 
@@ -83,6 +85,14 @@ def __init__(self, model, dataset, is_multilabel, task_type, classes=None):
             self.predictions = self.model.predict(
                 self.dataset.loc[:, ['context', 'questions']])
             self.predictions = np.array(self.predictions)
+        elif self.task_type == ModelTask.GENERATIVE_TEXT:
+            # TODO: Decide the final metric for error analysis
+            coherence = get_genai_metric(
+                'coherence',
+                predictions=self.model.predict(self.dataset),
+                references=dataset['prompt'],
+                wrapper_model=self.model)
+            self.predictions = np.array(coherence['scores'])
         else:
             raise ValueError("Unknown task type: {}".format(self.task_type))
 
@@ -193,9 +203,17 @@ def __init__(self, model: Any, dataset: pd.DataFrame, task_type,
             index_classes)
         if categorical_features is None:
             categorical_features = []
+        if task_type == ModelTask.GENERATIVE_TEXT:
+            sup_task_type = ErrorAnalysisTask.REGRESSION
+            ext_dataset = ext_dataset.copy()
+            del ext_dataset['prompt']
+            ext_dataset['target_score'] = 5
+            target_column = 'target_score'
+        else:
+            sup_task_type = ErrorAnalysisTask.CLASSIFICATION
         super(ErrorAnalysisManager, self).__init__(
             index_predictor, ext_dataset, target_column,
-            classes, categorical_features)
+            classes, categorical_features, model_task=sup_task_type)
 
     @staticmethod
     def _create_index_predictor(model, dataset, target_column,
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py b/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
index 229b668ad3..00285c5cfb 100644
--- a/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
@@ -30,6 +30,8 @@
 from responsibleai_text.managers.explainer_manager import ExplainerManager
 from responsibleai_text.utils.feature_extractors import (extract_features,
                                                          get_text_columns)
+from responsibleai_text.utils.genai_metrics.metrics import \
+    get_genai_metric_mean
 
 module_logger = logging.getLogger(__name__)
 module_logger.setLevel(logging.INFO)
@@ -116,7 +118,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
                  serializer: Optional[Any] = None,
                  maximum_rows_for_test: int = 5000,
                  feature_metadata: Optional[FeatureMetadata] = None,
-                 text_column: Optional[Union[str, List]] = None):
+                 text_column: Optional[Union[str, List]] = None,
+                 eval_model: Any = None):
         """Creates an RAITextInsights object.
 
         :param model: The model to compute RAI insights for.
@@ -148,6 +151,10 @@ def __init__(self, model: Any, test: pd.DataFrame,
             If not provided, and there is additional feature metadata,
             then an exception will be raised.
         :type text_column: str or list[str]
+        :param eval_model: The model to use for evaluation with AI-assisted
+            metrics. If not provided, then the model passed in the model
+            parameter will be used.
+        :type eval_model: object
         """
         # drop index as this can cause issues later like when copying
         # target column below from test dataset to _ext_test_df
@@ -160,6 +167,10 @@ def __init__(self, model: Any, test: pd.DataFrame,
         self._text_column = text_column
         self._feature_metadata = feature_metadata
         self._wrapped_model = wrap_model(model, test, task_type)
+        if eval_model is None:
+            self._eval_model = self._wrapped_model
+        else:
+            self._eval_model = wrap_model(eval_model, test, task_type)
         self._validate_rai_insights_input_parameters(
             model=self._wrapped_model, test=test,
             target_column=target_column, task_type=task_type,
@@ -269,7 +280,9 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
             target_column, axis=1)
         small_test_data = get_text_columns(small_test_data, text_column)
         small_test_data = small_test_data.iloc[0]
-        if task_type != ModelTask.QUESTION_ANSWERING:
+        if task_type not in [
+                ModelTask.QUESTION_ANSWERING,
+                ModelTask.GENERATIVE_TEXT]:
             small_test_data = small_test_data.tolist()
         # Call the model
         try:
@@ -319,7 +332,8 @@ def _validate_rai_insights_input_parameters(
             ModelTask.SENTIMENT_ANALYSIS.value,
             ModelTask.QUESTION_ANSWERING.value,
             ModelTask.ENTAILMENT.value,
-            ModelTask.SUMMARIZATIONS.value
+            ModelTask.SUMMARIZATIONS.value,
+            ModelTask.GENERATIVE_TEXT.value,
         ]
 
         if task_type not in valid_tasks:
@@ -362,6 +376,10 @@ def _validate_rai_insights_input_parameters(
             if not target_columns_set.issubset(set(test.columns)):
                 raise UserConfigValidationException(
                     'The list of target_column(s) should be in test data')
+        elif (task_type == ModelTask.GENERATIVE_TEXT.value and
+              target_column is None):
+            # target column is optional for generative text
+            pass
         else:
             if target_column not in list(test.columns):
                 raise UserConfigValidationException(
@@ -514,6 +532,11 @@ def _get_test_text_data(self, is_classification_task):
             dataset = self.test.drop(target_column, axis=1)
         elif self.task_type == ModelTask.QUESTION_ANSWERING:
             dataset = self.test.drop([self.target_column], axis=1)
+        elif self.task_type == ModelTask.GENERATIVE_TEXT:
+            if self.target_column is None:
+                dataset = self.test.copy()
+            else:
+                dataset = self.test.drop([self.target_column], axis=1)
         else:
             raise ValueError("Unknown task type: {}".format(self.task_type))
         dataset = get_text_columns(dataset, self._text_column)
@@ -853,3 +876,71 @@ def compute_question_answering_metrics(
             except ValueError:
                 all_cohort_metrics.append([0, 0, 0, 0, 0, 0])
         return all_cohort_metrics
+
+    def compute_genai_metrics(
+        self,
+        selection_indexes,
+        genai_cache
+    ):
+        dashboard_dataset = self.get_data().dataset
+        prompt_idx = dashboard_dataset.feature_names.index('prompt')
+        prompts = [feat[prompt_idx] for feat in dashboard_dataset.features]
+        true_y = dashboard_dataset.true_y
+        predicted_y = dashboard_dataset.predicted_y
+
+        all_cohort_metrics = []
+        for cohort_indices in selection_indexes:
+            cohort_metrics = dict()
+
+            if true_y is None:
+                true_y_cohort = None
+            else:
+                true_y_cohort = [true_y[cohort_index] for cohort_index
+                                 in cohort_indices]
+            predicted_y_cohort = [predicted_y[cohort_index] for cohort_index
+                                  in cohort_indices]
+            prompts_cohort = [prompts[cohort_index] for cohort_index
+                              in cohort_indices]
+            try:
+                if true_y_cohort is not None:
+                    exact_match = evaluate.load('exact_match')
+                    cohort_metrics['exact_match'] = exact_match.compute(
+                        predictions=predicted_y_cohort,
+                        references=true_y_cohort)
+
+                cohort_metrics['coherence'] = get_genai_metric_mean(
+                    'coherence',
+                    predictions=predicted_y_cohort,
+                    references=prompts_cohort,
+                    wrapper_model=self._eval_model)
+
+                if true_y_cohort is not None:
+                    cohort_metrics['equivalence'] = get_genai_metric_mean(
+                        'equivalence',
+                        predictions=predicted_y_cohort,
+                        references=prompts_cohort,
+                        answers=true_y_cohort,
+                        wrapper_model=self._eval_model)
+
+                cohort_metrics['fluency'] = get_genai_metric_mean(
+                    'fluency',
+                    predictions=predicted_y_cohort,
+                    references=prompts_cohort,
+                    wrapper_model=self._eval_model)
+
+                cohort_metrics['groundedness'] = get_genai_metric_mean(
+                    'groundedness',
+                    predictions=predicted_y_cohort,
+                    references=prompts_cohort,
+                    wrapper_model=self._eval_model)
+
+                cohort_metrics['relevance'] = get_genai_metric_mean(
+                    'relevance',
+                    predictions=predicted_y_cohort,
+                    references=prompts_cohort,
+                    wrapper_model=self._eval_model)
+
+                all_cohort_metrics.append(cohort_metrics)
+            except ValueError:
+                all_cohort_metrics.append({})
+        return all_cohort_metrics
diff --git a/responsibleai_text/responsibleai_text/utils/feature_extractors.py b/responsibleai_text/responsibleai_text/utils/feature_extractors.py
index 640415c625..bcff01c442 100644
--- a/responsibleai_text/responsibleai_text/utils/feature_extractors.py
+++ b/responsibleai_text/responsibleai_text/utils/feature_extractors.py
@@ -12,7 +12,8 @@
 from tqdm import tqdm
 
 from nlp_feature_extractors import attribute_extractors as exts
-from responsibleai_text.common.constants import (ModelTask,
+from responsibleai_text.common.constants import (GenerativeTextFields,
+                                                  ModelTask,
                                                   QuestionAnsweringFields)
 
 nlp = None
@@ -60,6 +61,9 @@ def extract_features(text_dataset: pd.DataFrame,
         feature_names.append(prefix + "maximum_parse_tree_depth")
         feature_names.append("question_type")
         feature_names.append("context_overlap")
+    elif task_type == ModelTask.GENERATIVE_TEXT:
+        start_meta_index = 0
+        feature_names = base_feature_names
     else:
         raise ValueError("Unknown task type: {}".format(task_type))
ValueError("Unknown task type: {}".format(task_type)) # copy over the metadata column names @@ -96,6 +100,19 @@ def extract_features(text_dataset: pd.DataFrame, context_overlap = get_context_overlap(context=context, question=question) extracted_features.append(context_overlap) + # append all other metadata features + append_metadata_values(start_meta_index, text_dataset, i, + extracted_features, has_dropped_features, + dropped_features, column_names) + results.append(extracted_features) + elif task_type == ModelTask.GENERATIVE_TEXT: + for i, row in tqdm(text_features.iterrows(), + desc='feature extraction'): + extracted_features = [] + add_extracted_features_for_sentence( + row[GenerativeTextFields.PROMPT], extracted_features, + task_type) + # append all other metadata features append_metadata_values(start_meta_index, text_dataset, i, extracted_features, has_dropped_features, diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/__init__.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/__init__.py new file mode 100644 index 0000000000..692faf31bd --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Contains the GenAI metrics.""" diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/__init__.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/__init__.py new file mode 100644 index 0000000000..461928c132 --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Contains the implementation of various metrics for GenAI.""" diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py index 43ab0fc23a..5e20b2f21d 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py @@ -24,7 +24,7 @@ def _compute_metric(template, logger, wrapper_model, **kwargs): templated_ques = format_str(template, **kwargs) inp = pd.DataFrame({ - 'questions': templated_ques, + 'prompt': templated_ques, 'sys_prompt': _SYS_PROMPT}) responses = wrapper_model.predict(inp) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py index 9e32985407..f26bd3467a 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py @@ -48,6 +48,15 @@ This rating value should always be an integer between 1 and 5. So the rating \ produced should be 1 or 2 or 3 or 4 or 5. +Some examples of valid responses are: +1 +2 +5 +Some examples of invalid responses are: +1/5 +1.5 +3.0 +5 stars QUESTION: {question} diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py index 5fadb1e256..0531ccd228 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py @@ -47,6 +47,15 @@ This rating value should always be an integer between 1 and 5. 
 produced should be 1 or 2 or 3 or 4 or 5.
+Some examples of valid responses are:
+1
+2
+5
+Some examples of invalid responses are:
+1/5
+1.5
+3.0
+5 stars
 
 QUESTION:
 {question}
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py
index 4135ee8102..28da15777d 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py
@@ -47,6 +47,18 @@
 
 Note the ANSWER is generated by a computer system, it can contain certain \
 symbols, which should not be a negative factor in the evaluation.
+This rating value should always be an integer between 1 and 5. So the rating \
+produced should be 1 or 2 or 3 or 4 or 5.
+Some examples of valid responses are:
+1
+2
+5
+Some examples of invalid responses are:
+1/5
+1.5
+3.0
+5 stars
+
 CONTEXT:
 {context}
 
diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py
index ca43ed0f55..01965bfcb2 100644
--- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py
+++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py
@@ -47,6 +47,15 @@
 
 This rating value should always be an integer between 1 and 5. So the rating \
 produced should be 1 or 2 or 3 or 4 or 5.
+Some examples of valid responses are:
+1
+2
+5
+Some examples of invalid responses are:
+1/5
+1.5
+3.0
+5 stars
 
 QUESTION AND CONTEXT:
 {question}
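
Below is a minimal usage sketch, not part of the patch itself, showing how the pieces added above fit together. It assumes the package is installed with these changes applied; `model` and `eval_model` are caller-supplied placeholders (the generator under evaluation and an optional judge model for the AI-assisted metrics), and the test frame must carry the 'prompt' column that the feature extractor and `compute_genai_metrics` expect.

# Usage sketch only -- not part of the patch above.
# `model` and `eval_model` are assumed, caller-supplied text models.
import pandas as pd

from responsibleai_text import RAITextInsights
from responsibleai_text.common.constants import ModelTask


def build_genai_insights(model, eval_model, prompts):
    """Wire up RAITextInsights for the new generative-text task type.

    `prompts` is a list of prompt strings; the 'prompt' column name matches
    GenerativeTextFields.PROMPT introduced in this patch.
    """
    test = pd.DataFrame({'prompt': prompts})
    return RAITextInsights(
        model,
        test,
        target_column=None,  # optional for generative text per this patch
        task_type=ModelTask.GENERATIVE_TEXT,
        eval_model=eval_model,  # judge model used by the AI-assisted metrics
    )


def mean_genai_metrics(rai_insights, cohort_indices):
    """Mean coherence/fluency/groundedness/relevance for one cohort.

    Returns a list with one dict per cohort; exact_match and equivalence are
    included only when true labels are available.
    """
    return rai_insights.compute_genai_metrics(
        selection_indexes=[list(cohort_indices)],
        genai_cache=None,
    )

In the dashboard flow, the same computation is reached through the new /get_generative_text_metrics route and the handle_generative_text_json socket event added in responsibleai_dashboard.py.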