Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support HEALpix-indexed AGASC HDF5 files and more #155

Merged
merged 34 commits into from
Oct 3, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
a518a2b
Initial commit to support HEALpix-indexed AGASC HDF5 files
taldcroft Sep 12, 2023
799e19f
Fix typo
taldcroft Sep 12, 2023
f9e2d23
Minimal modernization of create_mini_agasc_h5.py
taldcroft Sep 12, 2023
0be1563
Refactor and modernize
taldcroft Sep 13, 2023
e9fef74
Apply black formatting
taldcroft Sep 13, 2023
f64aebd
Apply black to create_near_neighbor_ids
taldcroft Sep 13, 2023
45b1533
Improvements to miniagasc creation helper scripts
taldcroft Sep 13, 2023
cbccf43
Support for caching and columns in get_agasc_cone
taldcroft Sep 18, 2023
0805567
Working get_agasc_cone for healpix
taldcroft Sep 18, 2023
c90685b
Remove columns as an option since slows performance
taldcroft Sep 18, 2023
cad21fe
Fix a couple of issues in create_near_neighbor_ids
taldcroft Sep 18, 2023
07b069b
Change epoch for computing near-neighbor to 2024.0
taldcroft Sep 18, 2023
fe9b554
Some fixes / improvements in file creation scripts
taldcroft Sep 19, 2023
065ed43
Implement new convention for agasc_file kwarg
taldcroft Sep 19, 2023
601fafa
Doc fixes
taldcroft Sep 19, 2023
3d11988
Fix tests
taldcroft Sep 19, 2023
9744866
Rework the file selection code
taldcroft Sep 22, 2023
238454c
Update filename resolution for latest spec
taldcroft Sep 23, 2023
df155b2
Update conf.py for numpydoc and autodoc typehints
taldcroft Sep 23, 2023
145567c
Flake8
taldcroft Sep 23, 2023
6176521
Add test of get_agasc_filename and fix bug in that function
taldcroft Sep 25, 2023
be7145e
Require either .h5 or * at end of `agasc_file` + other fixes
taldcroft Sep 26, 2023
350d91f
Rename and overhaul create_mini_agasc_h5
taldcroft Sep 26, 2023
a5d38c5
Add option to allow RC files for testing
taldcroft Sep 27, 2023
8ea5a53
Add version kwarg and fix tests
taldcroft Sep 27, 2023
3bfa5a2
Test get_agasc_cone, get_star, get_stars for HEALpix
taldcroft Sep 27, 2023
e314a63
Flake8
taldcroft Sep 28, 2023
743285d
Add new dev script for profiling
taldcroft Sep 29, 2023
ca4613b
Reduce memory from supplement
taldcroft Sep 30, 2023
775e156
Add dev scripts for memory profiling
taldcroft Oct 1, 2023
4a5f473
Fix name
taldcroft Oct 1, 2023
0194306
Add documentation on HEALpix ordering
taldcroft Oct 1, 2023
476e183
Add agasc_file to stars meta
taldcroft Oct 3, 2023
c41fe20
Add dev script to profile caching performance
taldcroft Oct 3, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 132 additions & 16 deletions agasc/agasc.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
# Licensed under a 3-clause BSD style license - see LICENSE.rst
import os
import contextlib
import functools
import os
from pathlib import Path
from typing import Optional

import numpy as np
import numexpr
import numpy as np
import tables

from astropy.table import Column, Table
from Chandra.Time import DateTime
from astropy.table import Table, Column

from .healpix import get_stars_from_healpix_h5, is_healpix
from .paths import default_agasc_dir, default_agasc_file
from .supplement.utils import get_supplement_table

__all__ = ['sphere_dist', 'get_agasc_cone', 'get_star', 'get_stars',
__all__ = ['sphere_dist', 'get_agasc_cone', 'get_star', 'get_stars', 'read_h5_table',
'MAG_CATID_SUPPLEMENT', 'BAD_CLASS_SUPPLEMENT',
'set_supplement_enabled', 'SUPPLEMENT_ENABLED_ENV']

Expand Down Expand Up @@ -133,6 +136,81 @@ def get_ra_decs(agasc_file):
return RA_DECS_CACHE[agasc_file]


def read_h5_table(
h5_file: str | Path | tables.file.File,
row0: Optional[int] = None,
row1: Optional[int] = None,
path="data",
cache=False,
) -> np.ndarray:
"""
Read HDF5 table from group ``path`` in ``h5_file``.

If ``row0`` and ``row1`` are specified then only the rows in that range are read,
e.g. ``data[row0:row1]``.

If ``cache`` is ``True`` then the data for the last read is cached in memory. The
cache key is ``(h5_file, path)`` and only one cache entry is kept. If ``h5_file``
is an HDF5 file object then the filename is used as the cache key.

Parameters
----------
h5_file : str, Path, tables.file.File
Path to the HDF5 file to read or an open HDF5 file from ``tables.open_file``.
row0 : int, optional
First row to read. Default is None (read from first row).
row1 : int, optional
Last row to read. Default is None (read to last row).
path : str, optional
Path to the data table in the HDF5 file. Default is 'data'.
cache : bool, optional
Whether to cache the read data. Default is False.

Returns
-------
out : np.ndarray
The HDF5 data as a numpy structured array
"""
if cache:
if isinstance(h5_file, tables.file.File):
h5_file = h5_file.filename
data = _read_h5_table_cached(h5_file, path)
out = data[row0:row1]
else:
out = _read_h5_table(h5_file, path, row0, row1)

return out


@functools.lru_cache(maxsize=1)
def _read_h5_table_cached(
h5_file: str | Path,
path: str,
) -> np.ndarray:
return _read_h5_table(h5_file, path, row0=None, row1=None)


def _read_h5_table(
h5_file: str | Path | tables.file.File,
path: str,
row0: None | int,
row1: None | int,
) -> np.ndarray:
if isinstance(h5_file, tables.file.File):
out = _read_h5_table_from_open_h5_file(h5_file, path, row0, row1)
else:
with tables.open_file(h5_file) as h5:
out = _read_h5_table_from_open_h5_file(h5, path, row0, row1)

out = np.asarray(out) # Convert to structured ndarray (not recarray)
return out

def _read_h5_table_from_open_h5_file(h5, path, row0, row1):
data = getattr(h5.root, path)
out = data.read(start=row0, stop=row1)
return out


def sphere_dist(ra1, dec1, ra2, dec2):
"""
Haversine formula for angular distance on a sphere: more stable at poles.
Expand Down Expand Up @@ -227,7 +305,8 @@ def add_pmcorr_columns(stars, date):


def get_agasc_cone(ra, dec, radius=1.5, date=None, agasc_file=None,
pm_filter=True, fix_color1=True, use_supplement=None):
pm_filter=True, fix_color1=True, use_supplement=None,
cache=False):
"""
Get AGASC catalog entries within ``radius`` degrees of ``ra``, ``dec``.

Expand All @@ -253,25 +332,22 @@ def get_agasc_cone(ra, dec, radius=1.5, date=None, agasc_file=None,
:param fix_color1: set COLOR1=COLOR2 * 0.85 for stars with V-I color
:param use_supplement: Use estimated mag from AGASC supplement where available
(default=value of AGASC_SUPPLEMENT_ENABLED env var, or True if not defined)
:param cache: Cache the AGASC data in memory (default=False)
Copy link
Contributor

@jeanconn jeanconn Oct 2, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Overall, these agasc changes are very well tested and documented, but I'm not sure if there is a cache kwarg test or use case documented. Given the impact that seems fine -- I think the plan is this would only be used by the advanced user to increase speed at the expense of memory. Though I'm not sure about the magnitude of benefit or cost.

Copy link
Member Author

@taldcroft taldcroft Oct 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance gains are now documented in the description with a new profiling script in the dev directory. One use case is to replace get_agasc_cone_fast in kady (of course that requires the matlab_pm_bug so who knows). In retrospect it probably wasn't worth the effort but it is done now, let's be positive! 😄


:returns: astropy Table of AGASC entries
"""
if agasc_file is None:
agasc_file = default_agasc_file()

get_stars_func = (
get_stars_from_healpix_h5
if is_healpix(agasc_file)
else get_stars_from_dec_sorted_h5
)
# Possibly expand initial radius to allow for slop due proper motion
rad_pm = radius + (0.1 if pm_filter else 0.0)

ra_decs = get_ra_decs(agasc_file)

idx0, idx1 = np.searchsorted(ra_decs.dec, [dec - rad_pm, dec + rad_pm])

dists = sphere_dist(ra, dec, ra_decs.ra[idx0:idx1], ra_decs.dec[idx0:idx1])
ok = dists <= rad_pm

with tables.open_file(agasc_file) as h5:
stars = Table(h5.root.data[idx0:idx1][ok], copy=False)

stars = get_stars_func(ra, dec, rad_pm, agasc_file, cache)
add_pmcorr_columns(stars, date)
if fix_color1:
update_color1_column(stars)
Expand All @@ -287,6 +363,46 @@ def get_agasc_cone(ra, dec, radius=1.5, date=None, agasc_file=None,
return stars


def get_stars_from_dec_sorted_h5(
ra: float,
dec: float,
radius: float,
agasc_file: str | Path,
cache: bool = False,
) -> Table:
"""
Returns a table of stars within a given radius of a given RA and Dec.

Parameters
----------
ra : float
The right ascension of the center of the search radius, in degrees.
dec : float
The declination of the center of the search radius, in degrees.
radius : float
The radius of the search circle, in degrees.
agasc_file : str or Path
The path to the AGASC HDF5 file.
cache : bool, optional
Whether to cache the AGASC data in memory. Default is False.

Returns
-------
stars : astropy.table.Table
A structured ndarray of stars within the search radius, sorted by declination.
"""
ra_decs = get_ra_decs(agasc_file)
idx0, idx1 = np.searchsorted(ra_decs.dec, [dec - radius, dec + radius])

dists = sphere_dist(ra, dec, ra_decs.ra[idx0:idx1], ra_decs.dec[idx0:idx1])
ok = dists <= radius

stars = read_h5_table(agasc_file, row0=idx0, row1=idx1, cache=cache)
stars = Table(stars[ok])

return stars


def get_star(id, agasc_file=None, date=None, fix_color1=True, use_supplement=None):
"""
Get AGASC catalog entry for star with requested id.
Expand Down
141 changes: 141 additions & 0 deletions agasc/healpix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
Provide functions for working with HEALPix-indexed AGASC HDF5 files.

Functions
---------
is_healpix(agasc_file)
Return True if `agasc_file` is a HEALPix file (otherwise dec-sorted).
get_stars_from_healpix_h5(ra, dec, radius, agasc_file)
Return a table of stars within a given radius around a given sky position.
"""

import functools
from pathlib import Path

import astropy.units as u
import astropy_healpix as hpx
import numpy as np
import tables
from astropy.table import Table


__all__ = ["is_healpix", "get_stars_from_healpix_h5"]


@functools.lru_cache()
def is_healpix(agasc_file):
"""Return True if ``agasc_file`` is a healpix file (otherwise dec-sorted)"""
with tables.open_file(agasc_file, mode="r") as h5:
return "healpix_index" in h5.root


@functools.lru_cache(maxsize=12)
def get_healpix(nside):
"""
Returns a HEALPix object with the specified nside and nested order.

Parameters
-----------
nside : int
The nside parameter for the HEALPix object.

Returns:
--------
hpx : HEALPix object
A HEALPix object with the specified nside and order.
"""
return hpx.HEALPix(nside=nside, order="nested")


@functools.lru_cache(maxsize=8)
def get_healpix_info(agasc_file: str | Path) -> tuple[dict[int, tuple[int, int]], int]:
"""
Get the healpix index table for an AGASC file.

The healpix index table is a table with columns ``healpix``, ``idx0`` and ``idx1``.
This corresponds to row ranges in the main ``data`` table in the HDF5 file.

Parameters
----------
agasc_file : str or Path
Path to the AGASC HDF5 file.

Returns
-------
healpix_index : dict
Dictionary of healpix index to row range.
nside : int
HEALPix nside parameter.
"""
with tables.open_file(agasc_file, mode="r") as h5:
tbl = h5.root.healpix_index[:]
nside = h5.root.healpix_index.attrs["nside"]

out = {row["healpix"]: (row["row0"], row["row1"]) for row in tbl}

return out, nside


def get_stars_from_healpix_h5(
ra: float,
dec: float,
radius: float,
agasc_file: str | Path,
cache: bool = False,
) -> Table:
"""
Returns a table of stars within a given radius around a given sky position (RA, Dec),
using the AGASC data stored in a HDF5 file with a HEALPix index.

Parameters
----------
ra : float
Right ascension of the center of the search cone, in degrees.
dec : float
Declination of the center of the search cone, in degrees.
radius : float
Radius of the search cone, in degrees.
agasc_file : str or Path
Path to the HDF5 file containing the AGASC data with a HEALPix index.
cache : bool, optional
Whether to cache the AGASC data in memory. Default is False.

Returns
-------
stars : astropy.table.Table
Table of stars within the search cone, with columns from the AGASC data table.
"""
from agasc import sphere_dist, read_h5_table

# Table of healpix, idx0, idx1 where idx is the index into main AGASC data table
healpix_index_map, nside = get_healpix_info(agasc_file)
hp = get_healpix(nside)

# Get healpix index for ever pixel that intersects the cone.
healpix_indices = hp.cone_search_lonlat(
ra * u.deg, dec * u.deg, radius=radius * u.deg
)

stars_list = []

def make_stars_list(h5_file):
for healpix_index in healpix_indices:
idx0, idx1 = healpix_index_map[healpix_index]
stars = read_h5_table(h5_file, row0=idx0, row1=idx1, cache=cache)
stars_list.append(stars)

if cache:
make_stars_list(agasc_file)
else:
with tables.open_file(agasc_file) as h5:
make_stars_list(h5)

stars = Table(np.concatenate(stars_list))
dists = sphere_dist(ra, dec, stars["RA"], stars["DEC"])
stars = stars[dists <= radius]

# Sort in DEC order for back-compatibility with the AGASC ordering before v1.8.
stars.sort("DEC")

return stars
Loading