Merge pull request #12 from cmap/upgrade_pandas_etc
Upgrade pandas etc
oena authored Oct 27, 2017
2 parents df84861 + ffa8407 commit 0bc6203
Showing 172 changed files with 1,779 additions and 41,537 deletions.
4 changes: 3 additions & 1 deletion cmapPy/pandasGEXpress/GCToo.py
@@ -42,11 +42,13 @@
import numpy as np
import pandas as pd
import logging
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
import setup_GCToo_logger as setup_logger


__authors__ = 'Oana Enache, Lev Litichevskiy, Dave Lahr'
__email__ = '[email protected]'


class GCToo(object):
"""Class representing parsed gct(x) objects as pandas dataframes.
Contains 3 component dataframes (row_metadata_df, column_metadata_df,
5 changes: 1 addition & 4 deletions cmapPy/pandasGEXpress/__init__.py
@@ -1,4 +1 @@
from .parse import parse
#from .GCToo import GCToo
#from .write_gctx import write_gctx
#from .write_gct import write_gct
from cmapPy.pandasGEXpress.parse import parse
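
As a usage aside (not part of the diff): with the absolute import above, package consumers can call the parser as sketched below. "example.gctx" is a hypothetical path.

    from cmapPy.pandasGEXpress.parse import parse

    # Parse a GCT or GCTX file into a GCToo object (hypothetical path).
    gctoo = parse("example.gctx")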
167 changes: 138 additions & 29 deletions cmapPy/pandasGEXpress/concat_gctoo.py
@@ -38,6 +38,7 @@
import logging
import setup_GCToo_logger as setup_logger
import pandas as pd
import numpy

import GCToo
import parse
@@ -72,6 +73,8 @@ def build_parser():
help="what to name the output file")
parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
help="fields to remove from the common metadata")
parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
help="remove all metadata fields during operation")
parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
help="whether to reset ids (use this flag if ids are not unique)")

@@ -84,13 +87,18 @@ def build_parser():
parser.add_argument("-verbose", "-v", action="store_true", default=False,
help="whether to print a bunch of output")

parser.add_argument("--error_report_output_file", "-erof", type=str, default=None,
help="""destination file for writing out error report - currently information about inconsistent
metadata fields in the common dimension of the concat operation""")

return parser
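
As an illustration (not part of the commit), a sketch of how the new flags surface through build_parser. The names of the pre-existing --concat_direction and --input_filepaths options are assumptions inferred from their use in main() below, and the file paths are invented.

    # Hypothetical invocation; option names for the pre-existing arguments
    # are inferred from main(), and the paths are made up.
    args = build_parser().parse_args([
        "--concat_direction", "horiz",
        "--input_filepaths", "plate1.gct", "plate2.gct",
        "--remove_all_metadata_fields",
        "--error_report_output_file", "mismatch_report.txt"])
    assert args.remove_all_metadata_fields is True
    assert args.error_report_output_file == "mismatch_report.txt"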


def main():
# get args
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
logger.debug("args: {}".format(args))

# Get files directly
if args.input_filepaths is not None:
@@ -120,10 +128,12 @@ def main():

# Create concatenated gctoo object
if args.concat_direction == "horiz":
out_gctoo = hstack(gctoos, args.fields_to_remove, args.reset_ids)
out_gctoo = hstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
args.fields_to_remove, args.reset_ids)

elif args.concat_direction == "vert":
out_gctoo = vstack(gctoos, args.fields_to_remove, args.reset_ids)
out_gctoo = vstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
args.fields_to_remove, args.reset_ids)

# Write out_gctoo to file
logger.info("Writing to output file args.out_name: {}".format(args.out_name))
Expand Down Expand Up @@ -153,7 +163,7 @@ def get_file_list(wildcard):
return files


def hstack(gctoos, fields_to_remove=[], reset_ids=False):
def hstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
""" Horizontally concatenate gctoos.
Args:
@@ -169,18 +179,20 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
row_meta_dfs = []
col_meta_dfs = []
data_dfs = []
srcs = []
for g in gctoos:
row_meta_dfs.append(g.row_metadata_df)
col_meta_dfs.append(g.col_metadata_df)
data_dfs.append(g.data_df)
srcs.append(g.src)

logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))

# Concatenate row metadata
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove)
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)

# Concatenate col metadata
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs)
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)

# Concatenate the data_dfs
all_data_df = assemble_data(data_dfs, "horiz")
@@ -202,7 +214,7 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
return concated
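
A hedged usage sketch for the new hstack signature (the GCT paths and the metadata field name are invented; note that the two new parameters come before the older keyword arguments):

    # Sketch: parse two hypothetical GCT files with the parse module
    # imported at the top of this file, then concatenate horizontally.
    g1 = parse.parse("plate1.gct")
    g2 = parse.parse("plate2.gct")
    combined = hstack([g1, g2],
                      remove_all_metadata_fields=False,
                      error_report_file="row_meta_mismatches.txt",
                      fields_to_remove=["provenance_code"],  # invented field
                      reset_ids=False)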


def vstack(gctoos, fields_to_remove=[], reset_ids=False):
def vstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
""" Vertically concatenate gctoos.
Args:
@@ -218,16 +230,18 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
row_meta_dfs = []
col_meta_dfs = []
data_dfs = []
srcs = []
for g in gctoos:
row_meta_dfs.append(g.row_metadata_df)
col_meta_dfs.append(g.col_metadata_df)
data_dfs.append(g.data_df)
srcs.append(g.src)

# Concatenate col metadata
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove)
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)

# Concatenate row metadata
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs)
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)

# Concatenate the data_dfs
all_data_df = assemble_data(data_dfs, "vert")
@@ -249,7 +263,7 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
return concated


def assemble_common_meta(common_meta_dfs, fields_to_remove):
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_metadata_fields, error_report_file):
""" Assemble the common metadata dfs together. Both indices are sorted.
Fields that are not in all the dfs are dropped.
@@ -262,50 +276,138 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove):
all_meta_df_sorted (pandas df)
"""
# Remove any column headers that are not present in all dfs (and sort)
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
all_meta_df, all_meta_df_with_dups = build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)

if not all_meta_df.index.is_unique:
all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
sources, all_meta_df, all_meta_df_with_dups)

unique_duplicate_ids = all_report_df.index.unique()

if error_report_file is not None:
all_report_df.to_csv(error_report_file, sep="\t")

msg = """There are inconsistencies in common_metadata_df between different files. Try excluding metadata fields
using the fields_to_remove argument. unique_duplicate_ids: {}
all_report_df:
{}""".format(unique_duplicate_ids, all_report_df)
raise MismatchCommonMetadataConcatGctooException(msg)

# Finally, sort the index
all_meta_df_sorted = all_meta_df.sort_index(axis=0)

return all_meta_df_sorted
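
A sketch (not in the commit) of catching the new exception and reading back the error report; "report.txt" and the GCToo inputs g1 and g2 are hypothetical:

    try:
        combined = hstack([g1, g2], False, "report.txt")
    except MismatchCommonMetadataConcatGctooException:
        # The report is tab-separated and lists each conflicting id with its
        # metadata values and the source_file each version came from.
        report = pd.read_csv("report.txt", sep="\t", index_col="index")
        print(report[["orig_rid", "source_file"]])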


def build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
"""
Concatenate the entries in common_meta_dfs, removing columns either selectively (fields_to_remove) or entirely
(remove_all_metadata_fields=True; in that case, this effectively just merges all the indexes in common_meta_dfs).
Returns 2 dataframes (in a tuple): the first has duplicates removed, the second does not.
Args:
common_meta_dfs: collection of pandas DataFrames containing the metadata in the "common" direction of the
concatenation operation
fields_to_remove: columns to be removed (if present) from the common_meta_dfs
remove_all_metadata_fields: boolean indicating that all metadata fields should be removed from the
common_meta_dfs; overrides fields_to_remove if present
Returns:
tuple containing
all_meta_df: pandas dataframe that is the concatenation of the dataframes in common_meta_dfs, with duplicate rows removed
all_meta_df_with_dups: the same concatenation, but with duplicate rows retained
"""

if remove_all_metadata_fields:
trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
else:
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
logger.debug("shared_column_headers: {}".format(shared_column_headers))

# Remove any column headers that will prevent dfs from being identical
for df in trimmed_common_meta_dfs:
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]

# Remove any column headers that will prevent dfs from being identical
for df in trimmed_common_meta_dfs:
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)

# Concatenate all dfs and then remove duplicate rows
all_meta_df_with_dups = pd.concat(trimmed_common_meta_dfs, axis=0)
logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
logger.debug("all_meta_df_with_dups.columns: {}".format(all_meta_df_with_dups.columns))
logger.debug("all_meta_df_with_dups.index: {}".format(all_meta_df_with_dups.index))

# If all metadata dfs were empty, df will be empty
if all_meta_df_with_dups.empty:

# Simply return unique ids
all_meta_df = pd.DataFrame(index=all_meta_df_with_dups.index.unique())

else:
all_meta_df_with_dups["concat_gctoo_column_for_index"] = all_meta_df_with_dups.index
all_meta_df = all_meta_df_with_dups.copy(deep=True).drop_duplicates()
all_meta_df.drop("concat_gctoo_column_for_index", axis=1, inplace=True)
all_meta_df_with_dups.drop("concat_gctoo_column_for_index", axis=1, inplace=True)

logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
logger.debug("all_meta_df.shape: {}".format(all_meta_df.shape))

# If there are still duplicate ids, then their metadata didn't align
# in different gcts
return (all_meta_df, all_meta_df_with_dups)
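
To make the deduplication trick above concrete, a minimal standalone pandas demo with invented ids. Copying the index into a temporary column makes drop_duplicates keep rows whose metadata is identical but whose ids differ:

    import pandas as pd

    df = pd.DataFrame({"pert_type": ["trt", "trt"]}, index=["r1", "r2"])
    with_dups = pd.concat([df, df], axis=0)  # rows: r1, r2, r1, r2 (all "trt")

    with_dups.drop_duplicates()              # collapses to ONE row: ids lost

    with_dups["concat_gctoo_column_for_index"] = with_dups.index
    deduped = with_dups.drop_duplicates()    # two rows survive: r1 and r2
    deduped = deduped.drop("concat_gctoo_column_for_index", axis=1)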


def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups):
"""
Generate a report (dataframe) that indicates, for each entry in the common metadata that does not match across
the input files, which source file had which of the mismatched values.
Args:
common_meta_df_shapes: list of tuples that are the shapes of the common meta dataframes
sources: list of the source files that the dataframes were loaded from
all_meta_df: produced from build_common_all_meta_df
all_meta_df_with_dups: produced from build_common_all_meta_df
Returns:
all_report_df: dataframe indicating the mismatched row metadata values and the corresponding source file
"""
expanded_sources = []
for (i, shape) in enumerate(common_meta_df_shapes):
src = sources[i]
expanded_sources.extend([src for i in xrange(shape[0])])
expanded_sources = numpy.array(expanded_sources)
logger.debug("len(expanded_sources): {}".format(len(expanded_sources)))

duplicate_ids = all_meta_df.index[all_meta_df.index.duplicated(keep=False)]

assert all_meta_df.index.is_unique, (
("There are inconsistencies in common_metadata_df between " +
"different files.\nTry excluding metadata fields " +
"using the fields_to_remove argument.\n"
"duplicate_ids[0]: {id}\n" +
"all_meta_df.loc[{id}, :]:\n{df}").format(id=duplicate_ids[0],
df=all_meta_df.loc[duplicate_ids[0], :]))
unique_duplicate_ids = duplicate_ids.unique()
logger.debug("unique_duplicate_ids: {}".format(unique_duplicate_ids))

# Finally, sort the index
all_meta_df_sorted = all_meta_df.sort_index(axis=0)
duplicate_ids_meta_df = all_meta_df.loc[unique_duplicate_ids]

return all_meta_df_sorted
report_df_list = []
for unique_dup_id in unique_duplicate_ids:
rows = duplicate_ids_meta_df.loc[unique_dup_id]

matching_row_locs = numpy.array([False for i in xrange(all_meta_df_with_dups.shape[0])])
for i in xrange(rows.shape[0]):
r = rows.iloc[i]
row_comparison = r == all_meta_df_with_dups
matching_row_locs = matching_row_locs | row_comparison.all(axis=1).values

report_df = all_meta_df_with_dups.loc[matching_row_locs].copy()
report_df["source_file"] = expanded_sources[matching_row_locs]
logger.debug("report_df.shape: {}".format(report_df.shape))
report_df_list.append(report_df)

def assemble_concatenated_meta(concated_meta_dfs):
all_report_df = pd.concat(report_df_list, axis=0)
all_report_df["orig_rid"] = all_report_df.index
all_report_df.index = pd.Index(xrange(all_report_df.shape[0]), name="index")
logger.debug("all_report_df.shape: {}".format(all_report_df.shape))
logger.debug("all_report_df.index: {}".format(all_report_df.index))
logger.debug("all_report_df.columns: {}".format(all_report_df.columns))

return all_report_df
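
A tiny pandas aside (invented ids) showing the duplicate-id detection that feeds this report:

    import pandas as pd

    idx = pd.Index(["r1", "r2", "r1"])
    mask = idx.duplicated(keep=False)  # [True, False, True]: marks ALL copies
    idx[mask].unique()                 # the unique duplicated ids: ["r1"]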


def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
""" Assemble the concatenated metadata dfs together. For example,
if horizontally concatenating, the concatenated metadata dfs are the
column metadata dfs. Both indices are sorted.
@@ -318,6 +420,10 @@ def assemble_concatenated_meta(concated_meta_dfs):
"""
# Concatenate the concated_meta_dfs
if remove_all_metadata_fields:
for df in concated_meta_dfs:
df.drop(df.columns, axis=1, inplace=True)

all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)

# Sanity check: the number of rows in all_concated_meta_df should correspond
@@ -430,5 +536,8 @@ def reset_ids_in_meta_df(meta_df):
meta_df.index.name = original_index_name


class MismatchCommonMetadataConcatGctooException(Exception):
pass

if __name__ == "__main__":
main()
46 changes: 23 additions & 23 deletions cmapPy/pandasGEXpress/gct2gctx.py
@@ -9,10 +9,9 @@
"""

import logging
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
import setup_GCToo_logger as setup_logger
import argparse
import sys
import GCToo
import parse_gct
import write_gctx

@@ -23,31 +22,32 @@


def build_parser():
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# required
parser.add_argument("-filename",
help=".gct file that you would like converted to .gctx form")
# optional
parser.add_argument("-output_filepath",
help="(optional) out path/name for output gctx file", default=None)
parser.add_argument("-verbose", "-v",
help="Whether to print a bunch of output.", action="store_true", default=False)
return parser
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# required
parser.add_argument("-filename",
help=".gct file that you would like converted to .gctx form")
# optional
parser.add_argument("-output_filepath",
help="(optional) out path/name for output gctx file", default=None)
parser.add_argument("-verbose", "-v",
help="Whether to print a bunch of output.", action="store_true", default=False)
return parser


def main():
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
logger.debug("Original out name: {}".format(in_gctoo.src))
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
logger.debug("Original out name: {}".format(in_gctoo.src))

if args.output_filepath == None:
out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
else:
out_name = args.output_filepath
if args.output_filepath == None:
out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
else:
out_name = args.output_filepath

write_gctx.write(in_gctoo, out_name)
write_gctx.write(in_gctoo, out_name)


if __name__ == "__main__":
main()
main()
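
For reference, a hedged sketch of the programmatic equivalent of this script's main(); "my_data.gct" is an invented input path:

    # Mirrors main() above: parse a .gct, write it back out as .gctx.
    in_gctoo = parse_gct.parse("my_data.gct", convert_neg_666=False)
    write_gctx.write(in_gctoo, "my_data")  # produces my_data.gctx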
