Merge pull request #12 from cmap/upgrade_pandas_etc
Upgrade pandas etc
oena authored Oct 27, 2017
2 parents df84861 + ffa8407 commit 0bc6203
Showing 172 changed files with 1,779 additions and 41,537 deletions.
4 changes: 3 additions & 1 deletion cmapPy/pandasGEXpress/GCToo.py
@@ -42,11 +42,13 @@
import numpy as np
import pandas as pd
import logging
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
import setup_GCToo_logger as setup_logger


__authors__ = 'Oana Enache, Lev Litichevskiy, Dave Lahr'
__email__ = '[email protected]'


class GCToo(object):
"""Class representing parsed gct(x) objects as pandas dataframes.
Contains 3 component dataframes (row_metadata_df, column_metadata_df,
5 changes: 1 addition & 4 deletions cmapPy/pandasGEXpress/__init__.py
@@ -1,4 +1 @@
from .parse import parse
#from .GCToo import GCToo
#from .write_gctx import write_gctx
#from .write_gct import write_gct
from cmapPy.pandasGEXpress.parse import parse
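
As a usage aside (not part of the diff): with the absolute import above, package consumers can call the parser as sketched below. "example.gctx" is a hypothetical path.

    from cmapPy.pandasGEXpress.parse import parse

    # Parse a GCT or GCTX file into a GCToo object (hypothetical path).
    gctoo = parse("example.gctx")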
167 changes: 138 additions & 29 deletions cmapPy/pandasGEXpress/concat_gctoo.py
@@ -38,6 +38,7 @@
import logging
import setup_GCToo_logger as setup_logger
import pandas as pd
import numpy

import GCToo
import parse
@@ -72,6 +73,8 @@ def build_parser():
help="what to name the output file")
parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
help="fields to remove from the common metadata")
parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
help="remove all metadata fields during operation")
parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
help="whether to reset ids (use this flag if ids are not unique)")

@@ -84,13 +87,18 @@ def build_parser():
parser.add_argument("-verbose", "-v", action="store_true", default=False,
help="whether to print a bunch of output")

parser.add_argument("--error_report_output_file", "-erof", type=str, default=None,
help="""destination file for writing out error report - currently information about inconsistent
metadata fields in the common dimension of the concat operation""")

return parser
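
As an illustration (not part of the commit), a sketch of how the new flags surface through build_parser. The names of the pre-existing --concat_direction and --input_filepaths options are assumptions inferred from their use in main() below, and the file paths are invented.

    # Hypothetical invocation; option names for the pre-existing arguments
    # are inferred from main(), and the paths are made up.
    args = build_parser().parse_args([
        "--concat_direction", "horiz",
        "--input_filepaths", "plate1.gct", "plate2.gct",
        "--remove_all_metadata_fields",
        "--error_report_output_file", "mismatch_report.txt"])
    assert args.remove_all_metadata_fields is True
    assert args.error_report_output_file == "mismatch_report.txt"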


def main():
# get args
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
logger.debug("args: {}".format(args))

# Get files directly
if args.input_filepaths is not None:
@@ -120,10 +128,12 @@ def main():

# Create concatenated gctoo object
if args.concat_direction == "horiz":
out_gctoo = hstack(gctoos, args.fields_to_remove, args.reset_ids)
out_gctoo = hstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
args.fields_to_remove, args.reset_ids)

elif args.concat_direction == "vert":
out_gctoo = vstack(gctoos, args.fields_to_remove, args.reset_ids)
out_gctoo = vstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
args.fields_to_remove, args.reset_ids)

# Write out_gctoo to file
logger.info("Writing to output file args.out_name: {}".format(args.out_name))
Expand Down Expand Up @@ -153,7 +163,7 @@ def get_file_list(wildcard):
return files


def hstack(gctoos, fields_to_remove=[], reset_ids=False):
def hstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
""" Horizontally concatenate gctoos.
Args:
@@ -169,18 +179,20 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
row_meta_dfs = []
col_meta_dfs = []
data_dfs = []
srcs = []
for g in gctoos:
row_meta_dfs.append(g.row_metadata_df)
col_meta_dfs.append(g.col_metadata_df)
data_dfs.append(g.data_df)
srcs.append(g.src)

logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))

# Concatenate row metadata
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove)
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)

# Concatenate col metadata
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs)
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)

# Concatenate the data_dfs
all_data_df = assemble_data(data_dfs, "horiz")
@@ -202,7 +214,7 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
return concated
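
A hedged usage sketch for the new hstack signature (the GCT paths and the metadata field name are invented; note that the two new parameters come before the older keyword arguments):

    # Sketch: parse two hypothetical GCT files with the parse module
    # imported at the top of this file, then concatenate horizontally.
    g1 = parse.parse("plate1.gct")
    g2 = parse.parse("plate2.gct")
    combined = hstack([g1, g2],
                      remove_all_metadata_fields=False,
                      error_report_file="row_meta_mismatches.txt",
                      fields_to_remove=["provenance_code"],  # invented field
                      reset_ids=False)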


def vstack(gctoos, fields_to_remove=[], reset_ids=False):
def vstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
""" Vertically concatenate gctoos.
Args:
@@ -218,16 +230,18 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
row_meta_dfs = []
col_meta_dfs = []
data_dfs = []
srcs = []
for g in gctoos:
row_meta_dfs.append(g.row_metadata_df)
col_meta_dfs.append(g.col_metadata_df)
data_dfs.append(g.data_df)
srcs.append(g.src)

# Concatenate col metadata
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove)
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)

# Concatenate row metadata
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs)
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)

# Concatenate the data_dfs
all_data_df = assemble_data(data_dfs, "vert")
@@ -249,7 +263,7 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
return concated


def assemble_common_meta(common_meta_dfs, fields_to_remove):
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_metadata_fields, error_report_file):
""" Assemble the common metadata dfs together. Both indices are sorted.
Fields that are not in all the dfs are dropped.
@@ -262,50 +276,138 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove):
all_meta_df_sorted (pandas df)
"""
# Remove any column headers that are not present in all dfs (and sort)
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
all_meta_df, all_meta_df_with_dups = build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)

if not all_meta_df.index.is_unique:
all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
sources, all_meta_df, all_meta_df_with_dups)

unique_duplicate_ids = all_report_df.index.unique()

if error_report_file is not None:
all_report_df.to_csv(error_report_file, sep="\t")

msg = """There are inconsistencies in common_metadata_df between different files. Try excluding metadata fields
using the fields_to_remove argument. unique_duplicate_ids: {}
all_report_df:
{}""".format(unique_duplicate_ids, all_report_df)
raise MismatchCommonMetadataConcatGctooException(msg)

# Finally, sort the index
all_meta_df_sorted = all_meta_df.sort_index(axis=0)

return all_meta_df_sorted
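
A sketch (not in the commit) of catching the new exception and reading back the error report; "report.txt" and the GCToo inputs g1 and g2 are hypothetical:

    try:
        combined = hstack([g1, g2], False, "report.txt")
    except MismatchCommonMetadataConcatGctooException:
        # The report is tab-separated and lists each conflicting id with its
        # metadata values and the source_file each version came from.
        report = pd.read_csv("report.txt", sep="\t", index_col="index")
        print(report[["orig_rid", "source_file"]])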


def build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
"""
Concatenate the entries in common_meta_dfs, removing columns either selectively (fields_to_remove) or entirely
(remove_all_metadata_fields=True; in that case, this effectively just merges all the indexes in common_meta_dfs).
Returns 2 dataframes (in a tuple): the first has duplicates removed, the second does not.
Args:
common_meta_dfs: collection of pandas DataFrames containing the metadata in the "common" direction of the
concatenation operation
fields_to_remove: columns to be removed (if present) from the common_meta_dfs
remove_all_metadata_fields: boolean indicating that all metadata fields should be removed from the
common_meta_dfs; overrides fields_to_remove if present
Returns:
tuple containing
all_meta_df: pandas dataframe that is the concatenation of the dataframes in common_meta_dfs, with duplicate rows removed
all_meta_df_with_dups: the same concatenation, but with duplicate rows retained
"""

if remove_all_metadata_fields:
trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
else:
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
logger.debug("shared_column_headers: {}".format(shared_column_headers))

# Remove any column headers that will prevent dfs from being identical
for df in trimmed_common_meta_dfs:
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]

# Remove any column headers that will prevent dfs from being identical
for df in trimmed_common_meta_dfs:
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)

# Concatenate all dfs and then remove duplicate rows
all_meta_df_with_dups = pd.concat(trimmed_common_meta_dfs, axis=0)
logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
logger.debug("all_meta_df_with_dups.columns: {}".format(all_meta_df_with_dups.columns))
logger.debug("all_meta_df_with_dups.index: {}".format(all_meta_df_with_dups.index))

# If all metadata dfs were empty, df will be empty
if all_meta_df_with_dups.empty:

# Simply return unique ids
all_meta_df = pd.DataFrame(index=all_meta_df_with_dups.index.unique())

else:
all_meta_df_with_dups["concat_gctoo_column_for_index"] = all_meta_df_with_dups.index
all_meta_df = all_meta_df_with_dups.copy(deep=True).drop_duplicates()
all_meta_df.drop("concat_gctoo_column_for_index", axis=1, inplace=True)
all_meta_df_with_dups.drop("concat_gctoo_column_for_index", axis=1, inplace=True)

logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
logger.debug("all_meta_df.shape: {}".format(all_meta_df.shape))

# If there are still duplicate ids, then their metadata didn't align
# in different gcts
return (all_meta_df, all_meta_df_with_dups)
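
To make the deduplication trick above concrete, a minimal standalone pandas demo with invented ids. Copying the index into a temporary column makes drop_duplicates keep rows whose metadata is identical but whose ids differ:

    import pandas as pd

    df = pd.DataFrame({"pert_type": ["trt", "trt"]}, index=["r1", "r2"])
    with_dups = pd.concat([df, df], axis=0)  # rows: r1, r2, r1, r2 (all "trt")

    with_dups.drop_duplicates()              # collapses to ONE row: ids lost

    with_dups["concat_gctoo_column_for_index"] = with_dups.index
    deduped = with_dups.drop_duplicates()    # two rows survive: r1 and r2
    deduped = deduped.drop("concat_gctoo_column_for_index", axis=1)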


def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups):
"""
Generate a report (dataframe) that indicates, for each entry in the common metadata that does not match across
the input files, which source file had which of the mismatched values.
Args:
common_meta_df_shapes: list of tuples that are the shapes of the common meta dataframes
sources: list of the source files that the dataframes were loaded from
all_meta_df: produced from build_common_all_meta_df
all_meta_df_with_dups: produced from build_common_all_meta_df
Returns:
all_report_df: dataframe indicating the mismatched row metadata values and the corresponding source file
"""
expanded_sources = []
for (i, shape) in enumerate(common_meta_df_shapes):
src = sources[i]
expanded_sources.extend([src for i in xrange(shape[0])])
expanded_sources = numpy.array(expanded_sources)
logger.debug("len(expanded_sources): {}".format(len(expanded_sources)))

duplicate_ids = all_meta_df.index[all_meta_df.index.duplicated(keep=False)]

assert all_meta_df.index.is_unique, (
("There are inconsistencies in common_metadata_df between " +
"different files.\nTry excluding metadata fields " +
"using the fields_to_remove argument.\n"
"duplicate_ids[0]: {id}\n" +
"all_meta_df.loc[{id}, :]:\n{df}").format(id=duplicate_ids[0],
df=all_meta_df.loc[duplicate_ids[0], :]))
unique_duplicate_ids = duplicate_ids.unique()
logger.debug("unique_duplicate_ids: {}".format(unique_duplicate_ids))

# Finally, sort the index
all_meta_df_sorted = all_meta_df.sort_index(axis=0)
duplicate_ids_meta_df = all_meta_df.loc[unique_duplicate_ids]

return all_meta_df_sorted
report_df_list = []
for unique_dup_id in unique_duplicate_ids:
rows = duplicate_ids_meta_df.loc[unique_dup_id]

matching_row_locs = numpy.array([False for i in xrange(all_meta_df_with_dups.shape[0])])
for i in xrange(rows.shape[0]):
r = rows.iloc[i]
row_comparison = r == all_meta_df_with_dups
matching_row_locs = matching_row_locs | row_comparison.all(axis=1).values

report_df = all_meta_df_with_dups.loc[matching_row_locs].copy()
report_df["source_file"] = expanded_sources[matching_row_locs]
logger.debug("report_df.shape: {}".format(report_df.shape))
report_df_list.append(report_df)

def assemble_concatenated_meta(concated_meta_dfs):
all_report_df = pd.concat(report_df_list, axis=0)
all_report_df["orig_rid"] = all_report_df.index
all_report_df.index = pd.Index(xrange(all_report_df.shape[0]), name="index")
logger.debug("all_report_df.shape: {}".format(all_report_df.shape))
logger.debug("all_report_df.index: {}".format(all_report_df.index))
logger.debug("all_report_df.columns: {}".format(all_report_df.columns))

return all_report_df
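
A tiny pandas aside (invented ids) showing the duplicate-id detection that feeds this report:

    import pandas as pd

    idx = pd.Index(["r1", "r2", "r1"])
    mask = idx.duplicated(keep=False)  # [True, False, True]: marks ALL copies
    idx[mask].unique()                 # the unique duplicated ids: ["r1"]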


def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
""" Assemble the concatenated metadata dfs together. For example,
if horizontally concatenating, the concatenated metadata dfs are the
column metadata dfs. Both indices are sorted.
@@ -318,6 +420,10 @@ def assemble_concatenated_meta(concated_meta_dfs):
"""
# Concatenate the concated_meta_dfs
if remove_all_metadata_fields:
for df in concated_meta_dfs:
df.drop(df.columns, axis=1, inplace=True)

all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)

# Sanity check: the number of rows in all_concated_meta_df should correspond
@@ -430,5 +536,8 @@ def reset_ids_in_meta_df(meta_df):
meta_df.index.name = original_index_name


class MismatchCommonMetadataConcatGctooException(Exception):
pass

if __name__ == "__main__":
main()
46 changes: 23 additions & 23 deletions cmapPy/pandasGEXpress/gct2gctx.py
@@ -9,10 +9,9 @@
"""

import logging
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
import setup_GCToo_logger as setup_logger
import argparse
import sys
import GCToo
import parse_gct
import write_gctx

@@ -23,31 +22,32 @@


def build_parser():
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# required
parser.add_argument("-filename",
help=".gct file that you would like converted to .gctx form")
# optional
parser.add_argument("-output_filepath",
help="(optional) out path/name for output gctx file", default=None)
parser.add_argument("-verbose", "-v",
help="Whether to print a bunch of output.", action="store_true", default=False)
return parser
parser = argparse.ArgumentParser(description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# required
parser.add_argument("-filename",
help=".gct file that you would like converted to .gctx form")
# optional
parser.add_argument("-output_filepath",
help="(optional) out path/name for output gctx file", default=None)
parser.add_argument("-verbose", "-v",
help="Whether to print a bunch of output.", action="store_true", default=False)
return parser


def main():
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
logger.debug("Original out name: {}".format(in_gctoo.src))
args = build_parser().parse_args(sys.argv[1:])
setup_logger.setup(verbose=args.verbose)
in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
logger.debug("Original out name: {}".format(in_gctoo.src))

if args.output_filepath == None:
out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
else:
out_name = args.output_filepath
if args.output_filepath == None:
out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
else:
out_name = args.output_filepath

write_gctx.write(in_gctoo, out_name)
write_gctx.write(in_gctoo, out_name)


if __name__ == "__main__":
main()
main()
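
For reference, a hedged sketch of the programmatic equivalent of this script's main(); "my_data.gct" is an invented input path:

    # Mirrors main() above: parse a .gct, write it back out as .gctx.
    in_gctoo = parse_gct.parse("my_data.gct", convert_neg_666=False)
    write_gctx.write(in_gctoo, "my_data")  # produces my_data.gctx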
