Source code for coniferest.datasets.plasticc_gp

from __future__ import annotations  # "|" syntax is not available in Python 3.9

# Text of the ImportError raised when the optional `datasets` package
# cannot be imported while loading the PLAsTiCC dataset.
IMPORT_ERROR_MESSAGE = " ".join(
    (
        "`datasets' library is required to load PLAsTiCC dataset, install it with `pip install datasets`",
        "or reinstall the package with `pip install coniferest[datasets]`",
    )
)



[docs]
def plasticc_gp(**kwargs):
    """
    Load PLAsTiCC dataset of GP-approximated light curves.

    It is a subset of the PLAsTiCC dataset, light curves are approximated
    with vector Gaussian processes (Kornilov et al., 2023). Features
    are these GP approximations and GP parameters, all normalized.

    The dataset is hosted on Hugging Face, the ``datasets`` package is
    required to load it.

    Adapted from Ishida et al. 2021:
    https://ui.adsabs.harvard.edu/abs/2021A%26A...650A.195I/abstract

    Parameters
    ----------
    **kwargs
        Arbitrary keyword arguments passed to `datasets.load_dataset`.

    Returns
    -------
    data : 2-D numpy.ndarray
        2-D array of features, built from every column of the "train"
        split whose name starts with ``feature``. The dtype is whatever
        the dataset stores (expected to be float32).
    metadata : 1-D numpy.ndarray of int
        1-D array of labels: -1 for anomalies and 1 for regular objects
        (the ``Label.ANOMALY`` / ``Label.REGULAR`` convention).

    Raises
    ------
    ImportError
        If the optional ``datasets`` package is not installed.
    """

    # `datasets` is an optional dependency, so it is imported lazily and a
    # missing installation is reported with installation instructions.
    try:
        from datasets import load_dataset
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError(IMPORT_ERROR_MESSAGE) from err

    dataset = load_dataset("snad-space/plasticc-gp", **kwargs)
    df = dataset["train"].to_pandas()
    features_columns = [name for name in df.columns if name.startswith("feature")]
    data = df[features_columns].to_numpy()
    # The source file uses 1 for anomaly and 0 for nominals,
    # while we need Label.ANOMALY == -1 and Label.REGULAR == 1.
    metadata = 1 - 2 * df["answer"].to_numpy(dtype=int)
    return data, metadata
Source code for coniferest.datasets.plasticc_gp

coniferest

Navigation

Related Topics