# Source code for coniferest.datasets

import numpy as np

from coniferest.label import Label

from .plasticc_gp import plasticc_gp
from .ztf_m31 import ztf_m31

__all__ = ["ztf_m31", "plasticc_gp", "single_outlier", "non_anomalous_outliers"]


class Dataset:
    """Plain container pairing a feature table with per-object labels."""

    def __init__(self, data, labels):
        """
        Keep references to the feature table and its labels.

        `data` is an o-by-f array, where o is the number of objects and f is
        the number of features. `labels` holds one value per object; each
        value must be accepted by the `Label` constructor when
        `to_data_metadata` is called.
        """
        self.labels = labels
        self.data = data

    def to_data_metadata(self):
        """
        Return the dataset as a `(data, metadata)` pair of numpy arrays.

        The first element is `self.data` converted with `np.asarray`; the
        second is an array built from `Label(value)` for every stored label.
        """
        features = np.asarray(self.data)
        metadata = np.asarray([Label(value) for value in self.labels])
        return features, metadata


class SingleOutlierDataset(Dataset):
    """Two-dimensional standard-normal cloud followed by one far-away point."""

    def __init__(self, inliers=10_000, rng=0):
        """
        Generate `inliers` normal points and append the point (1e6, -1e6).

        Parameters
        ----------
        inliers : int
            Number of points drawn from a normal distribution with zero mean
            and unit scale in each of the two coordinates.
        rng : seed or generator
            Passed to `np.random.default_rng`.

        The stored labels are 0 for every drawn point and 1 for the appended
        point. NOTE(review): how these values map onto `Label` members is
        defined in `coniferest.label`, which is not visible here.
        """
        generator = np.random.default_rng(rng)
        cloud = generator.normal(loc=0, scale=1, size=(inliers, 2))
        far_point = np.array([[1e6, -1e6]])

        points = np.concatenate((cloud, far_point), axis=0)
        flags = np.concatenate([np.zeros(inliers), np.ones(1)])
        super().__init__(points, flags)


def single_outlier(inliers=10_000, rng=0):
    """
    Build a `SingleOutlierDataset` and return it as `(data, metadata)` arrays.

    Parameters
    ----------
    inliers : int
        Forwarded to `SingleOutlierDataset` as the number of inlier points.
    rng : seed or generator
        Forwarded to `SingleOutlierDataset`.

    Returns
    -------
    tuple
        The result of `SingleOutlierDataset.to_data_metadata()`.
    """
    dataset = SingleOutlierDataset(inliers, rng)
    return dataset.to_data_metadata()


class MalanchevDataset(Dataset):
    """Synthetic 2-D dataset: one block of inliers plus three outlier clusters."""

    def __init__(self, inliers=1 << 10, outliers=1 << 5, regions=None, rng=0):
        """
        A simple dataset for testing the anomaly detection algorithms.

        It consists of one portion of regular data of `inliers` capacity, and
        three portions of outlier data of `outliers` capacity each. Every
        outlier portion may be selected either as regular or anomalous.

        Parameters
        ----------
        inliers : int
            Number of inlier points, drawn uniformly from the square
            [0, 0.5] x [0, 0.5].
        outliers : int
            Number of points in each of the three outlier clusters. Clusters
            are normally distributed with scale 0.1 around (1, 1), (0, 1) and
            (1, 0), in that order.
        regions : sequence of three labels, optional
            Label of each outlier cluster, in the same order as the cluster
            centers above. Defaults to `(Label.R, Label.R, Label.A)`.
        rng : seed or generator
            Passed to `np.random.default_rng`; the resulting generator is kept
            as `self.rng`.

        Example (the sketch is illustrative; exact positions depend on the
        random generator):
        ```
        MalanchevDataset(inliers=100, outliers=10, regions=(R,R,A))

             ┌───────────────────────────────────────────────────────────────┐
         1.12┤                                              .     .      .   │
             │     .      .                               .   .  .   .  .    │
         0.88┤.      .  .                                    .     .  .      │
             │         .                                        .            │
             │                                                               │
         0.64┤                                                               │
             │              .          .                                     │
             │         ... .. .... ... .....                                 │
          0.4┤          .... .. .. .. . .                                    │
             │         . ... .. ... .                                        │
         0.17┤         . . ... ..... . ..                     *              │
             │          . .... . . .. . .                   *        **      │
             │         . . . . . ...                           *       *    *│
        -0.07┤                                                    *          │
             │                                                               │
        -0.31┤                                                  *            │
             └┬──────────────┬───────────────┬───────────────┬──────────────┬┘
            -0.2           0.16            0.53            0.89           1.26
        ```
        Here we have a plot of 100 inliers, 20 regular outliers (all plotted
        as dots) and 10 anomalous outliers (plotted as stars).
        """
        self.inliers = inliers
        self.outliers = outliers

        if regions is None:
            regions = np.array([Label.R, Label.R, Label.A])
        self.regions = regions

        rng = np.random.default_rng(rng)
        self.rng = rng

        # The order of the draws fixes the random stream, so it must not change:
        # inliers first, then the clusters at (1, 1), (0, 1) and (1, 0).
        x = np.concatenate(
            [
                self._generate_inliers(inliers, rng),
                self._generate_outliers(outliers, rng, [1, 1]),
                self._generate_outliers(outliers, rng, [0, 1]),
                self._generate_outliers(outliers, rng, [1, 0]),
            ]
        )

        # Inliers are labelled 1; every outlier cluster takes its region label.
        # Multiplying by np.ones(...) yields float label arrays.
        x_labels = np.concatenate(
            [
                np.ones(inliers),
                self.regions[0] * np.ones(outliers),
                self.regions[1] * np.ones(outliers),
                self.regions[2] * np.ones(outliers),
            ]
        )

        super().__init__(data=x, labels=x_labels)

    @staticmethod
    def _generate_inliers(n, rng):
        """Draw `n` points uniformly from the square [0, 0.5] x [0, 0.5]."""
        return rng.uniform([0, 0], [0.5, 0.5], (n, 2))

    @staticmethod
    def _generate_outliers(n, rng, loc=None):
        """
        Draw `n` normally distributed points with scale 0.1 around `loc`.

        `loc` defaults to (1, 1). The default is chosen with an explicit
        `None` check rather than truthiness, so a numpy array is accepted as
        `loc` instead of raising "truth value of an array is ambiguous".
        """
        if loc is None:
            loc = [1, 1]
        return rng.normal(loc, 0.1, (n, 2))


def non_anomalous_outliers(inliers=1 << 10, outliers=1 << 5, regions=None, seed=0):
    """
    Build a `MalanchevDataset` and return it as `(data, metadata)` arrays.

    Parameters
    ----------
    inliers : int
        Forwarded to `MalanchevDataset` as the number of inlier points.
    outliers : int
        Forwarded to `MalanchevDataset` as the size of each outlier portion.
    regions : sequence of labels, optional
        Forwarded to `MalanchevDataset`; `None` selects its default.
    seed : seed or generator
        Forwarded to `MalanchevDataset` as its `rng` argument.

    Returns
    -------
    tuple
        The result of `MalanchevDataset.to_data_metadata()`.
    """
    dataset = MalanchevDataset(inliers, outliers, regions, seed)
    return dataset.to_data_metadata()


class DevNetDataset(Dataset):
    """Deviation Network paper datasets.

    The constructor downloads the requested dataset from the Deviation
    Network GitHub repository:
    https://github.com/GuansongPang/deviation-network

    It requires pandas to be installed.

    Arguments
    ---------
    name : str
        Name of the dataset to download. See `.avialble_datasets`.

    Attributes
    ----------
    avialble_datasets : list[str]
        List of available datasets to download. The misspelled name is kept
        for backward compatibility.
    available_datasets : list[str]
        Correctly spelled alias of `avialble_datasets` (the same list object).

    Raises
    ------
    ImportError
        If pandas cannot be imported.
    ValueError
        If `name` is not one of the available datasets.
    """

    _dataset_filenames = {
        "donors": "KDD2014_donors_10feat_nomissing_normalised.csv",
        "census": "census-income-full-mixed-binarized.tar.xz",
        "fraud": "creditcardfraud_normalised.tar.xz",
        "celeba": "celeba_baldvsnonbald_normalised.csv",
        "backdoor": "UNSW_NB15_traintest_backdoor.tar.xz",
        "campaign": "bank-additional-full_normalised.csv",
        "thyroid": "annthyroid_21feat_normalised.csv",
    }

    _dataset_urls = {
        name: f"https://github.com/GuansongPang/deviation-network/raw/master/dataset/{filename}"
        for name, filename in _dataset_filenames.items()
    }

    avialble_datasets = list(_dataset_filenames.keys())
    # Correctly spelled alias; the original attribute name stays for existing callers.
    available_datasets = avialble_datasets

    def __init__(self, name: str):
        # pandas is imported lazily so that it is only needed when this class is used.
        try:
            import pandas as pd
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "Pandas is required to load DevNet datasets, install it with `pip install pandas` or "
                "reinstall the package with `pip install coniferest[datasets]`"
            ) from exc

        if name not in self.avialble_datasets:
            raise ValueError(f"Dataset {name} is not available. Available datasets are: {self.avialble_datasets}")

        df = pd.read_csv(self._dataset_urls[name])

        # Last column is for class, the rest are features
        data = df.iloc[:, :-1].to_numpy(dtype=float)

        # In the original data, the labels are 1 for anomalies and 0 for regular data
        # We need 1 for regular data and -1 for anomalies
        labels = 1 - 2 * df.iloc[:, -1].to_numpy(dtype=int)

        super().__init__(data, labels)


def dev_net_dataset(name: str):
    return DevNetDataset(name).to_data_metadata()


# A formatted string literal is never treated as a docstring, so the text that
# interpolates the dataset list is attached to the function explicitly.
dev_net_dataset.__doc__ = f"""Download and return data and metadata for the Deviation Network dataset.

    This function downloads datasets from the Deviation Network GitHub:
    https://github.com/GuansongPang/deviation-network

    It requires pandas to be installed.

    Available datasets are:
    {", ".join(DevNetDataset.avialble_datasets)}

    Arguments
    ---------
    name : str
        Name of the dataset to download. See `DevNetDataset.avialble_datasets`.

    Returns
    -------
    data : array-like, shape (n_samples, n_features)
        2-D array of data points
    labels : array-like, shape (n_samples,)
        1-D array of `Label` objects for each data point
    """