-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
find UW star files avoiding hard-coded names #91
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,66 @@ | ||
# import pyarrow as pa | ||
import os | ||
import glob | ||
import pyarrow.parquet as pq | ||
import pandas as pd | ||
import numpy as np | ||
import numpy.ma as ma | ||
import healpy | ||
from lsst.meas.algorithms.htmIndexer import HtmIndexer | ||
|
||
|
||
def _find_star_files(dirpath, pixel): | ||
''' | ||
Parameters | ||
---------- | ||
dirpath string Path to directory containing input parquet files | ||
HTM partitioning | ||
pixel int Healpix pixel | ||
class UWStarFiles: | ||
_files = {} # shared state | ||
|
||
Returns | ||
------- | ||
List of parquet file paths for files which might have sources belonging | ||
to specified pixel | ||
''' | ||
# For now, hard-code to 3 files which have everything needed | ||
# for joint simulation region | ||
def __init__(self, input_dir): | ||
self._index_files(input_dir) | ||
self.htm_indexer = HtmIndexer(depth=20) | ||
|
||
def _index_files(self, input_dir): | ||
if self._files: | ||
# The files have already been indexed. | ||
return | ||
files = sorted(glob.glob(os.path.join(input_dir, | ||
'stars_chunk_*.parquet'))) | ||
for item in files: | ||
tokens = os.path.basename(item).split('_') | ||
imin = int(tokens[2]) | ||
imax = int(tokens[3].split('.')[0]) | ||
self._files[(imin, imax)] = item | ||
|
||
def find_files(self, pixel, nside=32, res_factor=16): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A docstring for this routine would be welcome. Why was 16 chosen as default for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, I'll add a comment. resfactor 16 is obviously the resfactor used by those files. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What does resfactor have to do with HTM depth (apparently 20), if anything? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another way to do this (which seems safer to me) would be to define a minimum circle around the healpixel, then call HtmIndexer.getShardIds |
||
""" | ||
Find the UW input files that cover a given healpixel by sampling the | ||
interior of the healpixel and computing the htm indexes at those | ||
locations. | ||
|
||
fnames = ['stars_chunk_8800000000000_8900000000000.parquet', | ||
'stars_chunk_9200000000000_9300000000000.parquet', | ||
'stars_chunk_9700000000000_9800000000000.parquet'] | ||
Parameters | ||
---------- | ||
pixel : int | ||
Healpixel id. | ||
nside : int | ||
Resolution parameter of the healpixel map. | ||
res_factor : int | ||
Sampling factor of the healpix to ensure coverage of the healpixel | ||
by the UW files. | ||
|
||
return [os.path.join(dirpath, fname) for fname in fnames] | ||
Returns | ||
------- | ||
list : List of filenames. | ||
""" | ||
corners = np.transpose(healpy.boundaries(nside, pixel)) | ||
subpixels = healpy.query_polygon(nside*res_factor, corners) | ||
ra, dec = [], [] | ||
for subpix in subpixels: | ||
coord = healpy.pix2ang(nside*res_factor, subpix, lonlat=True) | ||
ra.append(coord[0]) | ||
dec.append(coord[1]) | ||
files = set() | ||
indices = set(self.htm_indexer.indexPoints(ra, dec)) | ||
for index in indices: | ||
for (imin, imax), item in self._files.items(): | ||
if imin <= index <= imax: | ||
files.add(item) | ||
return files | ||
|
||
|
||
def _calculate_pixel_mask(ra, dec, pixel, nside=32): | ||
|
@@ -49,7 +82,7 @@ def _calculate_pixel_mask(ra, dec, pixel, nside=32): | |
return mask | ||
|
||
|
||
def _star_parquet_reader(dirpath, pixel, output_arrow_schema): | ||
def _star_parquet_reader(dirpath, pixel, output_arrow_schema, nside=32): | ||
# Get requisite info from parquet files for sources in pixel. | ||
# Next do renames and calculation for magnorm | ||
to_read = ['simobjid', 'ra', 'decl', 'mura', 'mudecl', 'vrad', | ||
|
@@ -59,7 +92,8 @@ def _star_parquet_reader(dirpath, pixel, output_arrow_schema): | |
out_fields = ['id', 'ra', 'dec', 'mura', 'mudec', 'radial_velocity', | ||
'parallax', 'sed_filepath', 'magnorm', 'ebv'] | ||
|
||
paths = _find_star_files(dirpath, pixel) | ||
uw_files = UWStarFiles(dirpath) | ||
paths = uw_files.find_files(pixel, nside) | ||
out_dict = {} | ||
for k in out_fields: | ||
out_dict[k] = [] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A description of the format for input file basenames would be helpful, either in a comment or use a regular expression to pick out fields of interest, which can be named, e.g.
pat = "stars_chunk_(?P<min_htm_id>\d+)_(?P<max_htm_id>\d+).parquet"
Then lines 25-28 below get replaced with
m = re.match(pat, os.path.basename(item))
self._files[(m['min_htm_id'], m['max_htm_id'])] = item
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I dislike regular expressions, but if u really think this is better and generally easier for people to understand (I don't particularly), I can change it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I find the regular expression clearer; I don't know whether most people do or not. Adding a comment describing the structure of the basename instead is fine with me.