Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

find UW star files avoiding hard-coded names #91

Merged
merged 1 commit into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion skycatalogs/catalog_creator.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ def __init__(self, parts, area_partition=None, skycatalog_root=None,
_star_db = '/global/cfs/cdirs/lsst/groups/SSim/DC2/dc2_stellar_healpixel.db'
_sn_db = '/global/cfs/cdirs/lsst/groups/SSim/DC2/cosmoDC2_v1.1.4/sne_cosmoDC2_v1.1.4_MS_DDF_healpix.db'

_star_parquet = '/global/cfs/cdirs/descssim/postDC2/UW_star_catalog'
# _star_parquet = '/global/cfs/cdirs/descssim/postDC2/UW_star_catalog'
_star_parquet = '/sdf/data/rubin/shared/ops-rehearsal-3/imSim_catalogs/UW_stars'

self._galaxy_stride = galaxy_stride
if pkg_root:
Expand Down
76 changes: 55 additions & 21 deletions skycatalogs/utils/star_parquet_input.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,66 @@
# import pyarrow as pa
import os
import glob
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import numpy.ma as ma
import healpy
from lsst.meas.algorithms.htmIndexer import HtmIndexer


def _find_star_files(dirpath, pixel):
'''
Parameters
----------
dirpath string Path to directory containing input parquet files
HTM partitioning
pixel int Healpix pixel
class UWStarFiles:
_files = {} # shared state

Returns
-------
List of parquet file paths for files which might have sources belonging
to specified pixel
'''
# For now, hard-code to 3 files which have everything needed
# for joint simulation region
def __init__(self, input_dir):
self._index_files(input_dir)
self.htm_indexer = HtmIndexer(depth=20)

def _index_files(self, input_dir):
if self._files:
# The files have already been indexed.
return
files = sorted(glob.glob(os.path.join(input_dir,
Copy link
Collaborator

@JoanneBogart JoanneBogart Apr 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A description of the format for input file basenames would be helpful, either in a comment or use a regular expression to pick out fields of interest, which can be named, e.g.
pat = "stars_chunk_(?P<min_htm_id>\d+)_(?P<max_htm_id>\d+).parquet"
Then lines 25-28 below get replaced with
m = re.match(pat, os.path.basename(item))
self._files[(m['min_htm_id'], m['max_htm_id'])] = item

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I dislike regular expressions, but if u really think this is better and generally easier for people to understand (I don't particularly), I can change it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find the regular expression clearer; I don't know whether most people do or not. Adding a comment describing the structure of the basename instead is fine with me.

'stars_chunk_*.parquet')))
for item in files:
tokens = os.path.basename(item).split('_')
imin = int(tokens[2])
imax = int(tokens[3].split('.')[0])
self._files[(imin, imax)] = item

def find_files(self, pixel, nside=32, res_factor=16):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A docstring for this routine would be welcome. Why was 16 chosen as default for res_factor?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I'll add a comment. resfactor 16 is obviously the resfactor used by those files.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does resfactor have to do with HTM depth (apparently 20), if anything?
Healpixels with nside=32 are much bigger than HTM depth 20 regions (or depth 16 regions, for that matter). How did you verify that nside*16 gives a sufficiently fine grid?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another way to do this (which seems safer to me) would be to define a minimum circle around the healpixel, then call HtmIndexer.getShardIds

"""
Find the UW input files that cover a given healpixel by sampling the
interior of the healpixel and computing the htm indexes at those
locations.

fnames = ['stars_chunk_8800000000000_8900000000000.parquet',
'stars_chunk_9200000000000_9300000000000.parquet',
'stars_chunk_9700000000000_9800000000000.parquet']
Parameters
----------
pixel : int
Healpixel id.
nside : int
Resolution parameter of the healpixel map.
res_factor : int
Sampling factor of the healpix to ensure coverage of the healpixel
by the UW files.

return [os.path.join(dirpath, fname) for fname in fnames]
Returns
-------
list : List of filenames.
"""
corners = np.transpose(healpy.boundaries(nside, pixel))
subpixels = healpy.query_polygon(nside*res_factor, corners)
ra, dec = [], []
for subpix in subpixels:
coord = healpy.pix2ang(nside*res_factor, subpix, lonlat=True)
ra.append(coord[0])
dec.append(coord[1])
files = set()
indices = set(self.htm_indexer.indexPoints(ra, dec))
for index in indices:
for (imin, imax), item in self._files.items():
if imin <= index <= imax:
files.add(item)
return files


def _calculate_pixel_mask(ra, dec, pixel, nside=32):
Expand All @@ -49,7 +82,7 @@ def _calculate_pixel_mask(ra, dec, pixel, nside=32):
return mask


def _star_parquet_reader(dirpath, pixel, output_arrow_schema):
def _star_parquet_reader(dirpath, pixel, output_arrow_schema, nside=32):
# Get requisite info from parquet files for sources in pixel.
# Next do renames and calculation for magnorm
to_read = ['simobjid', 'ra', 'decl', 'mura', 'mudecl', 'vrad',
Expand All @@ -59,7 +92,8 @@ def _star_parquet_reader(dirpath, pixel, output_arrow_schema):
out_fields = ['id', 'ra', 'dec', 'mura', 'mudec', 'radial_velocity',
'parallax', 'sed_filepath', 'magnorm', 'ebv']

paths = _find_star_files(dirpath, pixel)
uw_files = UWStarFiles(dirpath)
paths = uw_files.find_files(pixel, nside)
out_dict = {}
for k in out_fields:
out_dict[k] = []
Expand Down