Source code for coniferest.datasets

import numpy as np

from coniferest.label import Label

from .plasticc_gp import plasticc_gp
from .ztf_m31 import ztf_m31

__all__ = ["ztf_m31", "plasticc_gp", "single_outlier", "non_anomalous_outliers"]


class Dataset:
    def __init__(self, data, labels):
        """
        Dataset is an o-by-f array, where o is objects and f is features.
        """
        self.data = data
        self.labels = labels

    def to_data_metadata(self):
        return np.asarray(self.data), np.asarray(list(map(Label, self.labels)))


class SingleOutlierDataset(Dataset):
    def __init__(self, inliers=10_000, rng=0):
        rng = np.random.default_rng(rng)
        data_inliers = rng.normal(loc=0, scale=1, size=(inliers, 2))
        data_outlier = np.array([[1e6, -1e6]])
        data = np.vstack([data_inliers, data_outlier])
        labels = np.append(np.zeros(inliers), np.ones(1))
        super().__init__(data, labels)



[docs]
def single_outlier(inliers=10_000, rng=0):
    return SingleOutlierDataset(inliers, rng).to_data_metadata()



class MalanchevDataset(Dataset):
    def __init__(self, inliers=1 << 10, outliers=1 << 5, regions=None, rng=0):
        """
        A simple dataset for testing the anomaly detection algorithms. It
        constits of one portion of regular data of `inliers` capacity, and
        three portions of outlier data of `outliers` capacity each. Every
        outlier portion maybe selected either as regular or anomalous. Example:

        ```
                MalanchevDataset(inliers=100, outliers=10, regions=(R,R,A))
             ┌───────────────────────────────────────────────────────────────┐
         1.12┤  .           .                                        .       │
             │        .  .         .                        .  .    .  .     │
         0.88┤.   . .        .                                      .      . │
             │   .                                             .             │
             │                                                               │
         0.64┤                                                               │
             │                .     .                                        │
             │         ... ..  .... ... .....                                │
          0.4┤        ....  .. .. .. .    .                                  │
             │          . ...     ..   ... .                                 │
         0.17┤        .  .  ...  ..... .  ..                   *             │
             │         .    .... .  . .. . .                 *  **           │
             │         .   .      . . . ...                 *     *         *│
        -0.07┤                                                       *       │
             │                                                               │
        -0.31┤                                               *               │
             └┬──────────────┬───────────────┬───────────────┬──────────────┬┘
             -0.2           0.16            0.53            0.89          1.26

        ```

        Here we have a plot of 100 inliers, 20 regular outliers (all plotted as
        dots) and 10 anomalous outliers (plotted as stars).
        """
        self.inliers = inliers
        self.outliers = outliers
        if regions is None:
            regions = np.array([Label.R, Label.R, Label.A])
        self.regions = regions

        rng = np.random.default_rng(rng)
        self.rng = rng

        x = np.concatenate(
            [
                self._generate_inliers(inliers, rng),
                self._generate_outliers(outliers, rng, [1, 1]),
                self._generate_outliers(outliers, rng, [0, 1]),
                self._generate_outliers(outliers, rng, [1, 0]),
            ]
        )

        x_labels = np.concatenate(
            [
                np.ones(inliers),
                self.regions[0] * np.ones(outliers),
                self.regions[1] * np.ones(outliers),
                self.regions[2] * np.ones(outliers),
            ]
        )

        super(MalanchevDataset, self).__init__(data=x, labels=x_labels)

    @staticmethod
    def _generate_inliers(n, rng):
        return rng.uniform([0, 0], [0.5, 0.5], (n, 2))

    @staticmethod
    def _generate_outliers(n, rng, loc=None):
        loc = loc or [1, 1]
        return rng.normal(loc, 0.1, (n, 2))



[docs]
def non_anomalous_outliers(inliers=1 << 10, outliers=1 << 5, regions=None, seed=0):
    return MalanchevDataset(inliers, outliers, regions, seed).to_data_metadata()



class DevNetDataset(Dataset):
    """Deviation Network paper datasets.

    This class constructor would download datasets from the Deviation Network GitHub:
    https://github.com/GuansongPang/deviation-network

    It requires pandas to be installed.

    Arguments
    ---------
    name : str
        Name of the dataset to download. See `.avialble_datasets`.

    Attributes
    ----------
    avialble_datasets : list[str]
        List of available datasets to download.
    """

    _dataset_filenames = {
        "donors": "KDD2014_donors_10feat_nomissing_normalised.csv",
        "census": "census-income-full-mixed-binarized.tar.xz",
        "fraud": "creditcardfraud_normalised.tar.xz",
        "celeba": "celeba_baldvsnonbald_normalised.csv",
        "backdoor": "UNSW_NB15_traintest_backdoor.tar.xz",
        "campaign": "bank-additional-full_normalised.csv",
        "thyroid": "annthyroid_21feat_normalised.csv",
    }
    _dataset_urls = {
        name: f"https://github.com/GuansongPang/deviation-network/raw/master/dataset/{filename}"
        for name, filename in _dataset_filenames.items()
    }
    avialble_datasets = list(_dataset_filenames.keys())

    def __init__(self, name: str):
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "Pandas is required to load DevNet datasets, install it with `pip install pandas` or "
                "reinstall the package with `pip install coniferest[datasets]`"
            )

        if name not in self.avialble_datasets:
            raise ValueError(f"Dataset {name} is not available. Available datasets are: {self.avialble_datasets}")

        df = pd.read_csv(self._dataset_urls[name])

        # Last column is for class, the rest are features
        data = df.iloc[:, :-1].to_numpy(dtype=float)

        # In the original data, the labels are 1 for anomalies and 0 for regular data
        # We need 1 for regular data and -1 for anomalies
        labels = 1 - 2 * df.iloc[:, -1].to_numpy(dtype=int)

        super().__init__(data, labels)


def dev_net_dataset(name: str):
    f"""Download and return metadata and data for the Deviation Network dataset.
   
    This class constructor would download datasets from the Deviation Network GitHub:
    https://github.com/GuansongPang/deviation-network

    It requires pandas to be installed.
    
    Avialable datasets are: {", ".join(DevNetDataset.avialble_datasets)}

    Arguments
    ---------
    name : str
        Name of the dataset to download. See `.avialble_datasets`. 
    
    Returns
    -------
    data : array-like, shape (n_samples, n_features)
        2-D array of data points
    labels : array-like, shape (n_samples,)
        1-D array of `Label` objects for each data point
    """
    return DevNetDataset(name).to_data_metadata()
Source code for coniferest.datasets

coniferest

Navigation

Related Topics