From b77531acb8b9c4c38bbdbcfbf5f6ee66cab1ef66 Mon Sep 17 00:00:00 2001
From: HardiRathod <33089230+HardiRathod@users.noreply.github.com>
Date: Fri, 22 Oct 2021 12:23:31 -0500
Subject: [PATCH 1/6] update for metrics by cell

---
 tl/evaluation/evaluation.py | 105 ++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 47 deletions(-)

diff --git a/tl/evaluation/evaluation.py b/tl/evaluation/evaluation.py
index 5efc575d..2a7bbbf3 100644
--- a/tl/evaluation/evaluation.py
+++ b/tl/evaluation/evaluation.py
@@ -16,15 +16,12 @@ def read_csv(file_path, dtype=object):
 def ground_truth_labeler(gt_file_path, file_path=None, df=None):
     """
     compares each candidate for the input cells with the ground truth value for that cell and adds an evaluation label.
-
     Args:
         gt_file_path: ground truth file path.
         column: column name with ranking scores
         file_path: input file path
         df: or input dataframe
-
     Returns: a dataframe with added column `evaluation_label`
-
     """
     if file_path is None and df is None:
         raise RequiredInputParameterMissingException(
@@ -58,20 +55,59 @@ def assign_evaluation_label(row):
             return 1
     return -1
 
+def calculate_metrics_by_group(cgdf, k, method, col, column, tag):
+    results = []
+    # true positive for precision at 1
+    tp_ps = []
 
-def metrics(column, file_path=None, df=None, k: int = 1, tag=""):
+    # true positive for recall at k
+    tp_rs = defaultdict(list)
+    if method == 'column':
+        grouped = cgdf.groupby(by=['row'])
+    else:
+        grouped = cgdf.groupby(by=['column-id', 'row'])
+    n = len(grouped)
+    for key, gdf in grouped:
+        gdf = gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).reset_index()
+
+        for i, row in gdf.iterrows():
+            if float(row['evaluation_label']) == 1 and row[column] == row['max_score']:
+                tp_ps.append(key)
+
+            # this df is sorted by score, so highest ranked candidate is rank 1 and so on...
+            rank = i + 1
+            if rank <= k and (row['evaluation_label'] == '1' or row['evaluation_label'] == 1.0):
+                tp_rs[k].append(key)
+
+    precision = float(len(tp_ps)) / float(n)
+    recall = {k: float(len(each_tp_rs)) / float(n) for k, each_tp_rs in tp_rs.items()}
+    # sort as k value increasing
+    recall = {k: v for k, v in sorted(recall.items(), key=lambda x: x[0])}
+
+    for _k, each_recall in recall.items():
+        if precision == 0 and each_recall == 0:
+            f1_score = 0.0
+        else:
+            f1_score = (2 * precision * each_recall) / (precision + each_recall)
+        results.append({"k": _k,
+                        'f1': f1_score,
+                        'precision': precision,
+                        'recall': each_recall,
+                        'column': col,
+                        'tag': tag})
+    return results
+
+
+def metrics(column, file_path=None, df=None, k: int = 1, tag="", method='column'):
     """
     computes the precision, recall and f1 score for the tl pipeline.
-
     Args:
         column: column with ranking score
         file_path: input file path
         df: or input dataframe
         k: calculate recall at top k candidates
         tag: a tag to use in the output file to identify the results of running the given pipeline
-
     Returns:
-
     """
     if file_path is None and df is None:
         raise RequiredInputParameterMissingException(
@@ -85,49 +121,24 @@ def metrics(column, file_path=None, df=None, k: int = 1, tag=""):
 
     # replace na to 0.0
     df[column] = df[column].astype(float).fillna(0.0)
-    df['max_score'] = df.groupby(by=['column', 'row'])[column].transform(max)
+    if method == 'column':
+        separating_column = 'column'
+    else:
+        separating_column = 'column-id'
+    df['max_score'] = df.groupby(by=[separating_column, 'row'])[column].transform(max)
 
     # relevant df
     rdf = df[df['evaluation_label'].astype(float) != 0.0]
-
-    col_grouped = rdf.groupby(by=['column'])
-    results = []
-    for col, cgdf in col_grouped:
-        # true positive for precision at 1
-        tp_ps = []
-
-        # true positive for recall at k
-        tp_rs = defaultdict(list)
-        grouped = cgdf.groupby(by=['row'])
-        n = len(grouped)
-        for key, gdf in grouped:
-            gdf = gdf.sort_values(by=[column, 'kg_id'], ascending=[False, True]).reset_index()
-
-            for i, row in gdf.iterrows():
-                if float(row['evaluation_label']) == 1 and row[column] == row['max_score']:
-                    tp_ps.append(key)
-
-                # this df is sorted by score, so highest ranked candidate is rank 1 and so on...
-                rank = i + 1
-                if rank <= k and (row['evaluation_label'] == '1' or row['evaluation_label'] == 1.0):
-                    tp_rs[k].append(key)
-
-        precision = float(len(tp_ps)) / float(n)
-        recall = {k: float(len(each_tp_rs)) / float(n) for k, each_tp_rs in tp_rs.items()}
-        # sort as k value increasing
-        recall = {k: v for k, v in sorted(recall.items(), key=lambda x: x[0])}
-
-        for _k, each_recall in recall.items():
-            if precision == 0 and each_recall == 0:
-                f1_score = 0.0
-            else:
-                f1_score = (2 * precision * each_recall) / (precision + each_recall)
-            results.append({"k": _k,
-                            'f1': f1_score,
-                            'precision': precision,
-                            'recall': each_recall,
-                            'column': col,
-                            'tag': tag})
+    if method == 'column':
+        col_grouped = rdf.groupby(by=['column'])
+        results = []
+        # output_df = calculate_metrics_by_group(col_grouped)
+        for col, cgdf in col_grouped:
+            col_wise_result = calculate_metrics_by_group(cgdf, k, method, col, column, tag)
+            results.extend(col_wise_result)
+
+    else:
+        results = calculate_metrics_by_group(rdf, k, method=method, col="", column=column, tag=tag)
 
     output_df = pd.DataFrame(results)
     return output_df
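
A quick sketch of how the reworked metrics() is meant to be called (illustrative only; the file name and score column below are hypothetical). With the default method='column', candidates are grouped per ('column', 'row') and one precision/recall/f1 row is produced per column; with method='cell', grouping switches to ('column-id', 'row'), so the input must also carry a 'column-id' column:

    import pandas as pd
    from tl.evaluation import evaluation

    df = pd.read_csv('candidates.csv', dtype=object)  # hypothetical input file

    # per-column metrics (previous behaviour, still the default)
    by_column = evaluation.metrics('ranking_score', df=df, k=5, tag='v1')

    # per-cell metrics: requires a 'column-id' column in the input
    by_cell = evaluation.metrics('ranking_score', df=df, k=5, tag='v1', method='cell')
    print(by_cell[['k', 'precision', 'recall', 'f1']])
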
From f31a5713b019d0eb825aeadaff92ec1fff95f066 Mon Sep 17 00:00:00 2001
From: HardiRathod <33089230+HardiRathod@users.noreply.github.com>
Date: Fri, 22 Oct 2021 12:25:56 -0500
Subject: [PATCH 2/6] update to add option for metrics by cell

---
 tl/cli/metrics.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tl/cli/metrics.py b/tl/cli/metrics.py
index 0c4a4406..f7b9b253 100644
--- a/tl/cli/metrics.py
+++ b/tl/cli/metrics.py
@@ -27,9 +27,13 @@ def add_arguments(parser):
     parser.add_argument('--tag', action='store', type=str, dest='tag', default='',
                         help='a tag to use in the output file to identify the results of running the given pipeline')
 
+    parser.add_argument('--method', action='store', dest='method',
+                        default="column",
+                        choices=["column", 'cell'],
+                        help="The method for calculating metrics: per 'column' or per 'cell'.")
+
     parser.add_argument('input_file', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
 
-
 def run(**kwargs):
     from tl.evaluation import evaluation
     import pandas as pd
@@ -36,7 +40,7 @@ def run(**kwargs):
     try:
         df = pd.read_csv(kwargs['input_file'], dtype=object)
         start = time.time()
-        odf = evaluation.metrics(kwargs['column'], k=kwargs['k'], df=df, tag=kwargs['tag'])
+        odf = evaluation.metrics(kwargs['column'], k=kwargs['k'], df=df, tag=kwargs['tag'], method=kwargs['method'])
         end = time.time()
         logger = Logger(kwargs["logfile"])
         logger.write_to_file(args={
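
For reference, the new flag is forwarded straight through to evaluation.metrics(); running the metrics command with --method cell is roughly equivalent to the following sketch (the input file and score column are hypothetical, stdin in the real CLI):

    import sys
    import pandas as pd
    from tl.evaluation import evaluation

    df = pd.read_csv('candidates.csv', dtype=object)
    odf = evaluation.metrics('ranking_score', k=1, df=df, tag='',
                             method='cell')  # --method cell; 'column' is the default
    odf.to_csv(sys.stdout, index=False)
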
""" self.ignore_column = ignore_column + self.pseudo_column = pseudo_column if self.ignore_column: self.prefix_column_name = "ignore_" + elif self.pseudo_column: + self.prefix_column_name = "pseudo_" else: self.prefix_column_name = "" self.row_col_label_dict = {} @@ -180,6 +191,7 @@ def __init__(self, self.ccm_dict = {} self.string_similarity_threshold = string_similarity_threshold self.quantity_similarity_threshold = quantity_similarity_threshold + self.property_relevance_threshold = property_relevance_threshold self.input_df = None if input_path is not None: input_df = pd.read_csv(input_path) @@ -191,24 +203,25 @@ def __init__(self, self.relevant_properties = self.read_relevant_properties() input_df['row'] = input_df['row'].astype('str') input_df['column'] = input_df['column'].astype('str') - self.main_entity_column = self.find_main_entity_column(input_df, label_column) self.initialize(input_df, context_dict, label_column) - if context_matches_path is not None: self.load_from_disk(context_matches_path) - def read_relevant_properties(self) -> dict: # or whatever datastructure makes sense + def read_relevant_properties(self, + relevant_properties_df: pd.DataFrame = None) -> dict: if self.relevant_properties_file is None: raise TLException('Please specify a valid path for relevant properties.') - - relevant_properties_df = pd.read_csv(self.relevant_properties_file) + if relevant_properties_df is None: + relevant_properties_df = pd.read_csv(self.relevant_properties_file) + relevant_properties_df = relevant_properties_df[ + relevant_properties_df['property_score'] >= self.property_relevance_threshold] relevant_properties_group = relevant_properties_df.groupby(['column', 'col2']) relevant_properties_dict = {} for cell, group in relevant_properties_group: column_column_pair = f"{cell[0]}_{cell[1]}" - all_properties = set(group['property_'].unique()) - relevant_properties_dict[column_column_pair] = all_properties - + # all_properties = set(group['property_'].unique()) + # relevant_properties_dict[column_column_pair] = all_properties + relevant_properties_dict[column_column_pair] = dict(zip(group['property_'], group['property_score'])) return relevant_properties_dict def write_relevant_properties(self, relevant_properties_df: pd.DataFrame): @@ -239,17 +252,18 @@ def initialize(self, raw_input_df, context_dict, label_column): raw_input_df['kg_labels'].fillna("", inplace=True) raw_input_df['kg_aliases'].fillna("", inplace=True) - raw_input_df['context'].fillna("", inplace=True) - - if self.ignore_column is not None: - _input_df = raw_input_df[(raw_input_df[self.ignore_column].astype(float) == 0) - & (raw_input_df['column'] == self.main_entity_column)] - not_ignored_rows = set(_input_df['row'].unique()) - _input_df_2 = raw_input_df[ - (raw_input_df['row'].isin(not_ignored_rows)) & (raw_input_df["column"] != self.main_entity_column)] + raw_input_df['context'].fillna("", inplace = True) - self.input_df = pd.concat([_input_df, _input_df_2]) + if self.ignore_column is not None or self.pseudo_column is not None: + if self.ignore_column is not None: + _input_df = raw_input_df[(raw_input_df[self.ignore_column].astype(float) == 0)] + not_ignored_rows = set(_input_df['row'].unique()) + _input_df_2 = raw_input_df[ + (raw_input_df['row'].isin(not_ignored_rows))] + self.input_df = pd.concat([_input_df, _input_df_2]).drop_duplicates() + else: + self.input_df = raw_input_df[(raw_input_df[self.pseudo_column].astype(float) == 1)] not_ignored_indices = self.input_df.index self.other_input_df = 
raw_input_df[~raw_input_df.index.isin(not_ignored_indices)] @@ -266,17 +280,24 @@ def initialize(self, raw_input_df, context_dict, label_column): row_column_pairs.add(key) # row_column_label_dict stores only the row_column pairs that need to be matched for row, col, context in zip(self.input_df['row'], self.input_df['column'], self.input_df['context']): - if col == '0': + if col == min(columns): context_vals = context.split('|') for i, context_val in enumerate(context_vals): context_column = i + 1 row_col_dict_key = f"{row}_{context_column}" if row_col_dict_key not in self.row_col_label_dict: - try: - date = dp.parse(context_val) - context_val = str(date.year) - except: - pass + # Next condition, resolves the error for matching the case where month's name get parsed. + if not context_val.isalpha(): + try: + if len(context_val) >= 4: + if len(context_val) == 4: + date = dp.parse(context_val) + context_val = str(date.year) + else: + date = dp.parse(context_val) + context_val = str(date.year) + "-" + str(date.month) + "-" + str(date.day) + except: + pass self.row_col_label_dict[row_col_dict_key] = context_val columns.add(str(context_column)) for row, col, kg_id, kg_id_label_str, kg_id_alias_str in zip(self.input_df['row'], @@ -298,15 +319,17 @@ def initialize(self, raw_input_df, context_dict, label_column): self.ccm_dict[ccm_key] = CellContextMatches(row, col) if kg_id_context is not None: for col2 in columns: - if (col != col2) and (col == self.main_entity_column or col2 == self.main_entity_column): + if col != col2: ccm_key_2 = f"{row}_{col2}" if ccm_key_2 not in self.ccm_dict: self.ccm_dict[ccm_key_2] = CellContextMatches(row, col2) + context_results = self.compute_context_similarity(kg_id_context, col, col2, self.row_col_label_dict.get(f"{row}_{col2}", - None)) + None), + return_zero_similarity=True) for context_result in context_results: self.add_match(row=row, col1=col, @@ -323,18 +346,20 @@ def initialize(self, raw_input_df, context_dict, label_column): self.input_df = self.process(row_column_pairs, columns) def process(self, row_column_pairs: set, n_context_columns: set): - context_scores, properties, similarities = self.compute_context_scores(n_context_columns, row_column_pairs) + context_scores, properties, similarities, relevant_property_list, incorrectness_scores = self.compute_context_scores( + n_context_columns, row_column_pairs) self.input_df[self.output_column_name] = context_scores self.input_df[self.prefix_column_name + 'context_properties'] = properties self.input_df[self.prefix_column_name + 'context_similarity'] = similarities + self.input_df[self.prefix_column_name + 'relevant_properties'] = relevant_property_list + self.input_df[self.prefix_column_name + 'incorrectness_scores'] = incorrectness_scores out = [self.input_df] if self.other_input_df is not None: out.append(self.other_input_df) return pd.concat(out).fillna(0.0) - def correctness_of_candidate(self): - # Number of matches are the number it matched correctly - pass + def incorrectness_of_candidate(self, sum_of_relevant_properties_not_matched: int) -> float: + return 1 - (1 / (pow(2, sum_of_relevant_properties_not_matched))) def compute_context_scores(self, n_context_columns: set, row_column_pairs: set) -> ( List[int], List[str], List[int]): @@ -342,65 +367,101 @@ def compute_context_scores(self, n_context_columns: set, row_column_pairs: set) context_score_list = [] context_property_list = [] context_similarity_list = [] + relevant_property_list = [] + incorrectness_score_list = [] for row, col, q_node in 
zip(self.input_df['row'], self.input_df['column'], self.input_df['kg_id']): # Handle equal similarity for different properties by looping over and getting # the one with highest similarity. property_matched = [] similarity_matched = [] + relevant_properties = [] sum_of_properties = 0 + relevant_properties_not_matched = [] + incorrectness_score = 0 r_c = f"{row}_{col}" for col2 in n_context_columns: - if col2 != col and (col == self.main_entity_column or col2 == self.main_entity_column): - returned_properties = self.ccm_dict[r_c].get_properties(col2, q_node=q_node) - if not returned_properties: - continue + if col2 != col: + returned_properties = self.ccm_dict[r_c].get_properties(col2, q_node=q_node, + return_zero_scores=True) + relevant_properties_for_pair = self.relevant_properties.get(f"{col}_{col2}", "") best_score = 0 property_ = None + correct_matched = 0 + incorrect_matched = 0 + if not returned_properties: + is_no_match = 1 if correct_matched == 0 and incorrect_matched > 0 else 0 + relevant_properties_not_matched.append(is_no_match) + continue + for properties in returned_properties: if properties[2] > best_score: property_ = properties[0] best_score = properties[2] - # if property_ not in current_relevant_properties: pass - property_matched.append(property_ + "(" + str(best_score) + ")") + if properties[0] in relevant_properties_for_pair: + if properties[2] == 0: + incorrect_matched += 1 + else: + correct_matched += 1 + # if correct_matched == 0 after this point, it implies that none of the relevant properties matched. + # if incorrect_matched == 0, after this point, it implies that either all the properties matched or + # all the properties are missing. + + is_no_match = 1 if (correct_matched == 0) and ( + incorrect_matched > 0) else 0 + relevant_properties_not_matched.append(is_no_match) + relevant_properties.append(":".join(relevant_properties_for_pair)) + if best_score == 0: + continue + property_matched.append(col2 + ":" + property_ + "(" + str(best_score) + ")") similarity_matched.append(best_score) sum_of_properties = sum_of_properties + best_score if sum_of_properties == 0: context_score = 0 else: context_score = (1 - 1 / pow(2, sum_of_properties)) + if len(relevant_properties_not_matched) > 0: + incorrectness_score = self.incorrectness_of_candidate(sum(relevant_properties_not_matched)) context_score_list.append(context_score) + incorrectness_score_list.append(incorrectness_score) context_similarity_list.append(similarity_matched) context_property_list.append(property_matched) - return context_score_list, context_property_list, context_similarity_list + relevant_property_list.append("|".join(relevant_properties)) + return context_score_list, context_property_list, context_similarity_list, relevant_property_list, incorrectness_score_list def compute_property_scores(self, row_column_pairs: set, n_context_columns: set): properties_df_list = [] + num_rows = self.input_df['row'].nunique() for r_c in row_column_pairs: row_col = r_c.split("_") row = row_col[0] col = row_col[1] for col2 in n_context_columns: - if (col2 != col) and (col2 == self.main_entity_column or col == self.main_entity_column): - m = self.ccm_dict[r_c].get_properties(col2) + if col2 != col: + m = self.ccm_dict[r_c].get_properties(col2, return_zero_scores=False) int_prop = pd.DataFrame(m, columns=["property_", "type", "best_score", "avg_score", "n_occurences"]) int_prop['row'] = row int_prop['column'] = col int_prop['col2'] = col2 properties_df_list.append(int_prop) - if len(properties_df_list) > 0: - 
properties_df = pd.concat(properties_df_list) + if len(properties_df_list) > 0: + properties_df = pd.concat(properties_df_list) + else: + properties_df = pd.DataFrame(columns=["property_", "type", "best_score", "avg_score", + "n_occurences", "row", "column", "col2"]) property_value_list = [] grouped_obj = properties_df.groupby(['column', 'col2', 'property_']) for cell, group in grouped_obj: - property_score = (group['avg_score'].sum(axis=0)) + property_score = (group['avg_score'].sum(axis=0)) / num_rows property_value_list.append([cell[2], cell[0], cell[1], property_score]) property_value_df = pd.DataFrame(property_value_list, columns=['property_', 'column', 'col2', 'property_score']) property_value_df = property_value_df.sort_values(by=['column', 'property_score'], ascending=[True, False]) # Saving the top 3 properties for each column column pair that we have. # is equivalent to most_important_property_df = property_value_df.groupby(['column', 'col2']).head(3) + if self.save_relevant_properties: self.write_relevant_properties(most_important_property_df) + self.relevant_properties = self.read_relevant_properties(most_important_property_df) def compute_context_similarity(self, kg_id_context: List[dict], @@ -497,10 +558,23 @@ def computes_similarity(self, col2_num = self.return_a_number(col2_string) if col2_num: current_sim = self.compute_quantity_similarity(float(col2_num), float(context_value)) - elif context_values_type == 'i': + elif context_values_type == 'i' or context_values_type == 'm': current_sim = similarity.hybrid.symmetric_monge_elkan_similarity(self.preprocess(context_value), self.preprocess(col2_string), lower_bound=self.string_similarity_threshold) + # Need to remove this after rltk update + if current_sim < self.string_similarity_threshold: + current_sim = 0 + elif context_values_type == 'd': + # Try match the date completely . -> This already happens in the first if condition for exact match + # Next try match the years. 
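
The incorrectness score added in this patch reuses the 1 - 1/2^n shape of the existing context score: every context column pair contributes a 0/1 flag saying "relevant properties existed for this pair but none of them matched", and incorrectness_of_candidate() folds the summed flags into a single score. A standalone sketch of that arithmetic (the miss counts are made up):

    # copy of incorrectness_of_candidate() from the patch above
    def incorrectness_of_candidate(sum_of_relevant_properties_not_matched: int) -> float:
        return 1 - (1 / (pow(2, sum_of_relevant_properties_not_matched)))

    # 0 misses -> 0.0, 1 -> 0.5, 2 -> 0.75, 3 -> 0.875: each additional
    # unmatched relevant property pushes the score asymptotically towards 1.
    for n in range(4):
        print(n, incorrectness_of_candidate(n))
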
From fbfe98baae21c757ef2e5796f145fb7e85a2f195 Mon Sep 17 00:00:00 2001
From: HardiRathod <33089230+HardiRathod@users.noreply.github.com>
Date: Fri, 22 Oct 2021 12:30:37 -0500
Subject: [PATCH 4/6] update to add property relevance threshold

---
 tl/cli/context-match.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tl/cli/context-match.py b/tl/cli/context-match.py
index a75219a7..1bde2744 100644
--- a/tl/cli/context-match.py
+++ b/tl/cli/context-match.py
@@ -40,6 +40,9 @@ def add_arguments(parser):
     parser.add_argument('--ignore-column-name', action='store', dest='ignore_column_name',
                         default=None,
                         help='This column is used to consider only few rows by setting to 1.')
+    parser.add_argument('--pseudo-gt-column-name', action='store',
+                        dest='pseudo_gt_column_name', default=None,
+                        help='Only rows with this column set to 1 (the pseudo ground truth) are considered.')
     parser.add_argument('--context-properties-path', action='store',
                         dest='context_properties_path', default=None,
                         help="The path where relevant properties will be stored.")
@@ -48,6 +51,9 @@ def add_arguments(parser):
     parser.add_argument('--save-relevant-properties', action='store_true', default=False,
                         dest='save_relevant_properties',
                         help="if set, relevant properties are written a file.")
+    parser.add_argument('--property-relevance-threshold', action='store', type=float,
+                        default=0, dest='property_relevance_threshold',
+                        help='The minimum property score for a property to be considered relevant.')
     # output
     parser.add_argument('-o', '--output-column-name', action='store', dest='output_column',
                         default="context_score",
                         help='The output column is the named column of the score for the matches '
@@ -65,15 +71,20 @@ def run(**kwargs):
     similarity_string_threshold = kwargs.pop("similarity_string_threshold")
     similarity_quantity_threshold = kwargs.pop("similarity_quantity_threshold")
     ignore_column_name = kwargs.pop("ignore_column_name")
+    property_relevance_threshold = kwargs.pop("property_relevance_threshold")
+    pseudo_gt_column_name = kwargs.pop("pseudo_gt_column_name")
 
-    obj = TableContextMatches(context_path=context_file_path, context_dict=None, input_path=input_file_path,
+    obj = TableContextMatches(context_path=context_file_path, context_dict=None,
+                              input_path=input_file_path,
                               context_matches_path=None,
                               label_column='label_clean',
                               ignore_column=ignore_column_name,
+                              pseudo_column=pseudo_gt_column_name,
                               relevant_properties_file=kwargs['context_properties_path'],
                               use_relevant_properties=kwargs['use_relevant_properties'],
                               save_relevant_properties=kwargs['save_relevant_properties'],
                               string_similarity_threshold=similarity_string_threshold,
                               quantity_similarity_threshold=similarity_quantity_threshold,
+                              property_relevance_threshold=property_relevance_threshold,
                               output_column_name=output_column_name)
     start = time.time()
     result_df = obj.input_df
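
Downstream, the threshold lands in read_relevant_properties() from patch 3: rows of the relevant-properties table whose property_score falls below it are dropped before the {column}_{col2} -> {property: score} dictionary is built. A sketch with made-up data (the column names follow the patch):

    import pandas as pd

    property_relevance_threshold = 0.2
    relevant_properties_df = pd.DataFrame({
        'column': ['0', '0', '0'],
        'col2': ['1', '1', '2'],
        'property_': ['P17', 'P131', 'P571'],
        'property_score': [0.9, 0.1, 0.4],
    })
    kept = relevant_properties_df[
        relevant_properties_df['property_score'] >= property_relevance_threshold]
    relevant = {f"{c}_{c2}": dict(zip(g['property_'], g['property_score']))
                for (c, c2), g in kept.groupby(['column', 'col2'])}
    print(relevant)  # {'0_1': {'P17': 0.9}, '0_2': {'P571': 0.4}}; P131 is filtered out
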
From 46d52a0d3efa22ce88b4d90b1403d789eb555ec1 Mon Sep 17 00:00:00 2001
From: HardiRathod <33089230+HardiRathod@users.noreply.github.com>
Date: Tue, 26 Oct 2021 18:18:10 -0500
Subject: [PATCH 5/6] fixed unittest errors

---
 tl/unittests/test_context_match.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tl/unittests/test_context_match.py b/tl/unittests/test_context_match.py
index e6a3c8ec..e2d25063 100644
--- a/tl/unittests/test_context_match.py
+++ b/tl/unittests/test_context_match.py
@@ -115,6 +115,7 @@ def test_for_item_match(self):
     def test_for_date_match(self):
         obj_6 = MatchContext(self.input_file_path, self.similarity_string_threshold,
                              self.similarity_quantity_threshold, self.string_separator,
                              self.missing_property_replacement_factor, self.ignore_column_name,
+                             self.pseudo_gt_column_name,
                              self.output_column_name, context_path=self.context_file_path)
         odf = obj_6.process_data_by_column()
         odf.to_csv('{}/data/result_test_6.csv'.format(parent_path), index=False)

From 6671bafd01a75d38c58529038e322ad3b7a66c3c Mon Sep 17 00:00:00 2001
From: HardiRathod <33089230+HardiRathod@users.noreply.github.com>
Date: Tue, 26 Oct 2021 18:21:36 -0500
Subject: [PATCH 6/6] Bug fixed - matching with the first column

---
 tl/features/cell_context_matches.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tl/features/cell_context_matches.py b/tl/features/cell_context_matches.py
index 00cd8ea4..d1b08562 100644
--- a/tl/features/cell_context_matches.py
+++ b/tl/features/cell_context_matches.py
@@ -252,7 +252,7 @@ def initialize(self, raw_input_df, context_dict, label_column):
 
         raw_input_df['kg_labels'].fillna("", inplace=True)
         raw_input_df['kg_aliases'].fillna("", inplace=True)
-        raw_input_df['context'].fillna("", inplace = True)
+        raw_input_df['context'].fillna("", inplace=True)
 
         if self.ignore_column is not None or self.pseudo_column is not None:
             if self.ignore_column is not None:
@@ -272,15 +272,16 @@ def initialize(self, raw_input_df, context_dict, label_column):
         else:
             self.input_df = raw_input_df
             self.other_input_df = None
-        rows = set(self.input_df['row'].unique())
         columns = set(self.input_df['column'].unique())
         row_column_pairs = set()
         for row, col, label in zip(self.input_df['row'], self.input_df['column'], self.input_df[label_column]):
             key = f"{row}_{col}"
+            self.row_col_label_dict[key] = label
             row_column_pairs.add(key)
         # row_column_label_dict stores only the row_column pairs that need to be matched
+        min_columns = min(columns)
         for row, col, context in zip(self.input_df['row'], self.input_df['column'], self.input_df['context']):
-            if col == min(columns):
+            if col == min_columns:
                 context_vals = context.split('|')
                 for i, context_val in enumerate(context_vals):
                     context_column = i + 1
                     row_col_dict_key = f"{row}_{context_column}"
                     if row_col_dict_key not in self.row_col_label_dict:
@@ -300,6 +301,7 @@ def initialize(self, raw_input_df, context_dict, label_column):
                                 pass
                         self.row_col_label_dict[row_col_dict_key] = context_val
                         columns.add(str(context_column))
+
         for row, col, kg_id, kg_id_label_str, kg_id_alias_str in zip(self.input_df['row'],
                                                                      self.input_df['column'],
                                                                      self.input_df['kg_id'],
@@ -330,6 +332,7 @@ def initialize(self, raw_input_df, context_dict, label_column):
                                                                               self.row_col_label_dict.get(f"{row}_{col2}",
                                                                                                           None),
                                                                               return_zero_similarity=True)
+
                             for context_result in context_results:
                                 self.add_match(row=row,
                                                col1=col,
@@ -486,6 +489,7 @@ def compute_context_similarity(self,
                 if not self.use_relevant_properties or (self.use_relevant_properties and
                                                         self.is_relevant_property(col, col2, property)):
+
                     score, best_str_match = self.computes_similarity(prop_val_dict['v'],
                                                                      col2_string_set,
                                                                      prop_val_dict['t'])
@@ -680,4 +684,3 @@ def read_context_file(context_file: str) -> dict:
             context_dict.update(json.loads(line.strip()))
 
     return context_dict
-
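
The min(columns) fix in patch 6 matters because columns is a set of strings that grows inside the loop: columns.add(str(context_column)) keeps adding context columns '1', '2', ... while rows are being processed. Re-evaluating min(columns) on every row can therefore start pointing at a freshly added context column instead of the table's first column; snapshotting it once keeps the comparison stable. An illustrative sketch (the column ids are made up):

    columns = {'2', '3'}        # table columns as strings; '2' is the first
    min_columns = min(columns)  # snapshot taken before any context column is added

    columns.add('1')            # context columns get added while looping
    print(min(columns))         # '1' -- a context column, not a table column
    print(min_columns)          # '2' -- still the original first column
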