Source code for mepylome.dtypes.beads

"""Contains classes and function for processing Illumina methylation arrays.

It includes methods for extracting methylation information, various
preprocessing techniques, normalization, and data handling.
"""

import collections
import logging
import math
import os
import pickle
import threading
from collections.abc import Iterator, Sequence
from dataclasses import dataclass
from functools import reduce
from pathlib import Path
from typing import (
    Any,
    Literal,
)
from uuid import uuid4

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm import tqdm

from mepylome.dtypes.arrays import ArrayType
from mepylome.dtypes.cache import cache_key, input_args_id, memoize
from mepylome.dtypes.chromosome import Chromosome
from mepylome.dtypes.idat import IdatParser
from mepylome.dtypes.manifests import Manifest
from mepylome.dtypes.probes import Channel, ProbeType
from mepylome.dtypes.purity import get_purity_features, predict_purity
from mepylome.utils.varia import MEPYLOME_CACHE_DIR, normexp_get_xs

logger = logging.getLogger(__name__)

GRN_SUFFIX = "_Grn.idat"
RED_SUFFIX = "_Red.idat"
GZ_SUFFIX = ".gz"
ENDING_SUFFIXES = ("_Grn.idat", "_Red.idat", "_Grn.idat.gz", "_Red.idat.gz")

EPSILON = 1e-6
NEUTRAL_BETA = 0.5
NEUTRAL_M_VALUE = 0.0

PrepType = Literal["raw", "illumina", "swan", "noob"]



[docs]
def is_valid_idat_basepath(
    basepath: str | Path | Sequence[str | Path],
) -> bool:
    """Checks if the given basepath(s) point to valid IDAT files."""
    if isinstance(basepath, str | Path):
        basepaths = [str(basepath)]
    else:
        basepaths = [str(x) for x in basepath]

    return all(
        (
            os.path.exists(x + GRN_SUFFIX)
            or os.path.exists(x + GRN_SUFFIX + GZ_SUFFIX)
        )
        and (
            os.path.exists(x + RED_SUFFIX)
            or os.path.exists(x + RED_SUFFIX + GZ_SUFFIX)
        )
        for x in basepaths
    )




[docs]
def idat_basepaths(
    files: str | Path | Sequence[str | Path],
    only_valid: bool = False,
) -> list[Path]:
    """Returns unique basepaths from IDAT files or directory.

    This function processes a list of IDAT files or a directory containing IDAT
    files and returns their basepaths by removing the file endings. The
    function ensures that there are no duplicate basepaths in the returned list
    and maintains the order of the files as they appear in the input.

    Args:
        files: A file or directory path or a list of file paths.

        only_valid: If True, only returns basepaths that point to valid IDAT
            file pairs.

    Returns:
        A list of unique basepaths corresponding to the provided IDAT files. If
        a directory is provided, all IDAT files are recursively considered.

    Example:
        >>> idat_basepaths("/path/to/dir")
        [PosixPath('/path/to/dir/file1'), PosixPath('/path/to/dir/file2')]

        >>> idat_basepaths(["/path1/file1_Grn.idat", "/path2/file2_Red.idat"])
        [PosixPath('/path1/file1'), PosixPath('/path2/file2')]

        >>> idat_basepaths("/path/to/idat/file_Grn.idat.gz")
        [PosixPath('/path/to/idat/file')]
    """

    def get_idat_files(file_or_dir: str | Path) -> Iterator[str]:
        path = os.path.expanduser(file_or_dir)
        # If path is dir take all files in it
        if os.path.isdir(path):
            for dirpath, dirnames, filenames in os.walk(
                path, followlinks=True
            ):
                dirnames.sort()
                filenames.sort()
                for filename in filenames:
                    if filename.endswith(ENDING_SUFFIXES):
                        yield os.path.join(dirpath, filename)
        else:
            yield path

    def strip_suffix(file_path: str) -> str:
        for suffix in ENDING_SUFFIXES:
            if file_path.endswith(suffix):
                return file_path[: -len(suffix)]
        return file_path

    if isinstance(files, str | Path):
        files = [files]

    _files = [
        strip_suffix(idat_file)
        for file_or_dir in files
        for idat_file in get_idat_files(file_or_dir)
    ]
    # Remove duplicates, keep ordering
    unique_basepaths_dict = dict.fromkeys(_files)
    if only_valid:
        return [
            Path(base)
            for base in unique_basepaths_dict
            if is_valid_idat_basepath(base)
        ]
    return [Path(base) for base in unique_basepaths_dict]




[docs]
def idat_paths_from_basenames(
    basenames: Sequence[str | Path],
) -> tuple[np.ndarray, np.ndarray]:
    """Returns paths to green and red IDAT files.

    Args:
        basenames: List of basepaths for IDAT files.

    Returns:
        Paths to green and red IDAT files.

    Raises:
        FileNotFoundError: If any IDAT file is not found.
    """
    grn_idat_files = np.array(
        [Path(str(name) + GRN_SUFFIX) for name in basenames]
    )
    red_idat_files = np.array(
        [Path(str(name) + RED_SUFFIX) for name in basenames]
    )

    def check_and_fix(files: np.ndarray) -> Path | None:
        not_existing = [i for i, path in enumerate(files) if not path.exists()]
        files[not_existing] = [
            x.parent / (x.name + GZ_SUFFIX) for x in files[not_existing]
        ]
        return next((x for x in files[not_existing] if not x.exists()), None)

    not_found = check_and_fix(grn_idat_files)
    not_found = (
        check_and_fix(red_idat_files) if not_found is None else not_found
    )
    if not_found is not None:
        idat_file = str(not_found).replace(GZ_SUFFIX, "")
        msg = f"IDAT file not found: {idat_file}."
        raise FileNotFoundError(msg)
    return grn_idat_files, red_idat_files




[docs]
class RawData:
    """Represents raw intensity data extracted from IDAT files.

    This class initializes with a list of basepaths to IDAT files and parses
    them to extract raw intensity data from the green and red channels.

    If the IDAT files do not all contain the same bead addresses, the data is
    restricted to the addresses common to *all* files (green and red) and the
    intensities are aligned to that common, sorted address list.

    Args:
        basenames: List of basepaths to IDAT files.

        manifest: The manifest associated with the array. If not provided, it
            will be determined from the probe count.

    Attributes:
        array_type: Type of Illumina array.

        sample_ids: List of sample IDs corresponding to the IDAT files.

        bead_addresses: Bead addresses.

        green: Array of raw intensity values from the green channel.

        red: Array of raw intensity values from the red channel.

    Raises:
        ValueError: If the IDAT files do not all map to the same array type
            (this includes the case of no IDAT files at all).
        FileNotFoundError: If a green or red IDAT file is missing.

    Example:
        >>> idat_basepath0 = directory_path / "200925700125_R07C01"
        >>> idat_basepath1 = directory_path / "200925700133_R02C01_Grn.idat"
        >>> raw_data = RawData(idat_basepath0)
        >>> raw_data = RawData([idat_basepath0, idat_basepath1])
    """

    array_type: ArrayType
    bead_addresses: np.ndarray
    manifest: Manifest
    sample_ids: list[str]

    green: np.ndarray
    red: np.ndarray

    def __init__(
        self,
        basenames: str | Path | Sequence[str | Path],
        *,
        manifest: Manifest | None = None,
    ) -> None:
        _basenames = idat_basepaths(basenames)

        # Only a trailing ".gz" is removed; a blanket replace would also eat
        # ".gz" occurring in the middle of a sample name.
        self.sample_ids = [
            path.name.removesuffix(GZ_SUFFIX) for path in _basenames
        ]

        grn_idat_files, red_idat_files = idat_paths_from_basenames(_basenames)

        grn_idat = [
            IdatParser(str(filepath), mode="intensity")
            for filepath in grn_idat_files
        ]
        red_idat = [
            IdatParser(str(filepath), mode="intensity")
            for filepath in red_idat_files
        ]
        all_idat = grn_idat + red_idat

        array_types = [
            ArrayType.from_probe_count(len(idat.illumina_ids))
            for idat in all_idat
        ]

        if len(set(array_types)) != 1:
            msg = "Array types must all be the same."
            raise ValueError(msg)

        self.array_type = array_types[0]

        self.manifest = (
            Manifest(self.array_type) if manifest is None else manifest
        )

        all_illumina_ids = [idat.illumina_ids for idat in all_idat]
        reference_ids = all_illumina_ids[0]

        if all(np.array_equal(reference_ids, ids) for ids in all_illumina_ids):
            # Fast path: every file has identical addresses in identical
            # order, so the intensity vectors can be stacked directly.
            self.bead_addresses = reference_ids
            self.green = np.array([idat.probe_means for idat in grn_idat])
            self.red = np.array([idat.probe_means for idat in red_idat])
        else:
            # Intersect over green AND red files: restricting to the green
            # files would yield ragged rows whenever a red file lacks one of
            # the addresses.
            self.bead_addresses = reduce(np.intersect1d, all_illumina_ids)
            self.green = np.array(
                [
                    self._select_beads(idat, self.bead_addresses)
                    for idat in grn_idat
                ]
            )
            self.red = np.array(
                [
                    self._select_beads(idat, self.bead_addresses)
                    for idat in red_idat
                ]
            )

    @staticmethod
    def _select_beads(
        idat: IdatParser, bead_addresses: np.ndarray
    ) -> np.ndarray:
        """Return the probe means of ``idat`` ordered like ``bead_addresses``.

        ``bead_addresses`` must be a sorted subset of ``idat.illumina_ids``
        (as produced by ``np.intersect1d``). Positions are looked up per
        address, so the result is aligned even if the IDs stored in the file
        are not sorted.
        """
        _, positions, _ = np.intersect1d(
            idat.illumina_ids,
            bead_addresses,
            assume_unique=True,
            return_indices=True,
        )
        return idat.probe_means[positions]

    @property
    def green_df(self) -> pd.DataFrame:
        """DataFrame: Green channel raw intensity indexed by probe IDs."""
        return pd.DataFrame(
            self.green.T,
            index=self.bead_addresses,
            columns=self.sample_ids,
            dtype="int32",
        )

    @property
    def red_df(self) -> pd.DataFrame:
        """DataFrame: Red channel raw intensity indexed by probe IDs."""
        return pd.DataFrame(
            self.red.T,
            index=self.bead_addresses,
            columns=self.sample_ids,
            dtype="int32",
        )

    def __repr__(self) -> str:
        title = "RawData():"
        lines = [
            title + "\n" + "*" * len(title),
            f"array_type: {self.array_type}",
            f"manifest: {self.manifest.array_type}",
            f"sample_ids:\n{self.sample_ids}",
            f"bead_addresses:\n{self.bead_addresses}",
            f"green:\n{self.green}",
            f"red:\n{self.red}",
            f"green_df:\n{self.green_df}",
            f"red_df:\n{self.red_df}",
        ]
        return "\n\n".join(lines)



@memoize
def _overlap_indices(
    left_arr: Sequence | pd.Index,
    right_arr: Sequence | pd.Index,
) -> tuple[np.ndarray, np.ndarray]:
    """Locate the elements shared by two arrays in each of them.

    Both inputs are converted to pandas Index objects (unless they already
    are), their intersection is computed, and the positions of the shared
    elements within the left and within the right input are returned. The
    function is memoized.

    Example:
        >>> left_arr = ['a', 'b', 'c', 'd']
        >>> right_arr = ['b', 'c', 'e']
        >>> left_idx, right_idx = _overlap_indices(left_arr, right_arr)
        >>> print(left_idx)
        [1 2]
        >>> print(right_idx)
        [0 1]
    """
    left = left_arr if isinstance(left_arr, pd.Index) else pd.Index(left_arr)
    right = (
        right_arr if isinstance(right_arr, pd.Index) else pd.Index(right_arr)
    )
    shared = left.intersection(right)
    return left.get_indexer(shared), right.get_indexer(shared)


@memoize
def _get_sex_indices(
    sample_probes: np.ndarray, array_type: ArrayType
) -> tuple[np.ndarray, np.ndarray]:
    """Find the positions of X and Y chromosome probes in a sample array.

    The probe IDs that the manifest of ``array_type`` assigns to chromosome
    X and to chromosome Y are looked up in ``sample_probes``.

    Returns:
        Tuple containing:
        - indices of X-chromosome probes in sample_probes
        - indices of Y-chromosome probes in sample_probes
    """
    manifest_df = Manifest(array_type).data_frame

    def probes_on(chromosome: Chromosome) -> np.ndarray:
        on_chromosome = manifest_df.Chromosome == chromosome
        return manifest_df.loc[on_chromosome, "IlmnID"].values

    # Only the positions within the sample array (left side) are needed.
    x_positions, _ = _overlap_indices(
        sample_probes, probes_on(Chromosome.CHRX)
    )
    y_positions, _ = _overlap_indices(
        sample_probes, probes_on(Chromosome.CHRY)
    )
    return x_positions, y_positions



[docs]
@dataclass(slots=True)
class MethylIndices:
    """Container for the index arrays used by the preprocessing methods.

    Instances are built by ``MethylData._cached_indices``. The mandatory
    fields are always filled; the optional control fields stay ``None``
    unless the chosen preprocessing method needs them ("illumina" fills
    ``bead_at``/``bead_cg``/``bead_neg``, "swan" fills ``bead_neg``, "noob"
    fills ``ctrl_idx``/``ctrl_cg``/``ctrl_at``).
    """

    # Probe IDs and probe index relative to position in manifest dataframe
    probe_idx: np.ndarray
    probe_ids: np.ndarray

    # Positions in the bead intensity arrays (green/red columns) for
    # type I red (t1r), type I green (t1g) and type II (t2) probes;
    # the _a/_b suffix refers to AddressA_ID/AddressB_ID of the manifest.
    bead_t1r_a: np.ndarray
    bead_t1r_b: np.ndarray
    bead_t1g_a: np.ndarray
    bead_t1g_b: np.ndarray
    bead_t2_a: np.ndarray

    # Positions in methylated/unmethylated output arrays
    meth_t1r: np.ndarray
    meth_t1g: np.ndarray
    meth_t2: np.ndarray

    # Positions in the bead intensity arrays for control probes
    # (NORM_A/NORM_T, NORM_C/NORM_G and NEGATIVE controls respectively)
    bead_at: np.ndarray | None = None
    bead_cg: np.ndarray | None = None
    bead_neg: np.ndarray | None = None

    # Positions in control arrays (NOOB)
    ctrl_idx: np.ndarray | None = None
    ctrl_cg: np.ndarray | None = None
    ctrl_at: np.ndarray | None = None




[docs]
class MethylData:
    """Represents methylated and unmethylated intensity data from RawData.

    This class provides methods for preprocessing Illumina methylation data and
    computing beta values from methylated and unmethylated intensities.

    Args:
        data: RawData object containing raw intensity data.

        file: Path to file or dir or list of paths containing raw intensity
            data.

        prep: Preprocessing method. Options: "illumina" (default), "swan",
            "noob", "raw".

        seed: Seed value used for random number generation in the SWAN
            preprocessing method.

    Note:
        Exactly one of 'data' and 'file' must be given. If 'data' is not
        provided, a RawData object is created from the specified 'file'.

    Raises:
        ValueError: If neither or both of 'data' and 'file' are provided.
        ValueError: If 'data' is provided but is not of type 'RawData'.
        ValueError: If 'prep' is not one of the supported methods.

    Examples:
        >>> methyl_data = MethylData(raw_data)
        >>> methyl_data = MethylData(file=file_path, prep="swan")
    """

    array_type: ArrayType
    bead_addresses: np.ndarray
    manifest: Manifest
    methylated: np.ndarray
    probe_ids: np.ndarray
    probe_idx: np.ndarray
    sample_ids: list[str]
    seed: int | None
    unmethylated: np.ndarray

    green: np.ndarray
    red: np.ndarray
    # Lazily computed caches; None until first use.
    _intensity: np.ndarray | None
    _log_intensity_design_matrix_cache: np.ndarray | None

    def __init__(
        self,
        data: RawData | None = None,
        file: str | Path | Sequence[str | Path] | None = None,
        prep: PrepType = "illumina",
        seed: int | None = None,
    ) -> None:
        if (data is None) == (file is None):
            msg = "Exactly one of 'data' or 'file' must be provided."
            raise ValueError(msg)
        if data is None:
            assert file is not None
            data = RawData(file)
        elif not isinstance(data, RawData):
            msg = "'data' is not of type 'RawData'."
            raise ValueError(msg)
        self.seed = seed
        self.green = data.green
        self.red = data.red
        self.array_type = data.array_type
        self.bead_addresses = data.bead_addresses
        self.manifest = data.manifest
        self.sample_ids = data.sample_ids
        self._intensity = None
        self._log_intensity_design_matrix_cache = None
        if prep == "illumina":
            self.preprocess_illumina()
        elif prep == "swan":
            self.preprocess_swan()
        elif prep == "noob":
            self.preprocess_noob()
        elif prep == "raw":
            self.preprocess_raw()
        else:
            msg = f"invalid 'prep' value {prep}"
            raise ValueError(msg)

    @property
    def green_df(self) -> pd.DataFrame:
        """DataFrame: Green intensities (bead addresses x samples)."""
        per_bead = self.green.T
        return pd.DataFrame(
            data=per_bead,
            columns=self.sample_ids,
            index=self.bead_addresses,
            dtype="float32",
        )

    @property
    def red_df(self) -> pd.DataFrame:
        """DataFrame: Red intensities (bead addresses x samples)."""
        per_bead = self.red.T
        return pd.DataFrame(
            data=per_bead,
            columns=self.sample_ids,
            index=self.bead_addresses,
            dtype="float32",
        )

    @property
    def methylated_df(self) -> pd.DataFrame:
        """DataFrame: Methylated intensities (probes x samples) by IlmnID."""
        frame = pd.DataFrame(
            data=self.methylated.T,
            columns=self.sample_ids,
            index=self.probe_ids,
            dtype="float32",
        )
        return frame.rename_axis("IlmnID")

    @property
    def unmethylated_df(self) -> pd.DataFrame:
        """DataFrame: Unmethylated intensities (probes x samples) by IlmnID."""
        frame = pd.DataFrame(
            data=self.unmethylated.T,
            columns=self.sample_ids,
            index=self.probe_ids,
            dtype="float32",
        )
        return frame.rename_axis("IlmnID")

    @property
    def intensity(self) -> np.ndarray:
        """Total intensity (methylated + unmethylated), computed lazily.

        NaN values and values below 1 are set to 1. The result is cached on
        the instance; an info message is logged if the mean intensity of a
        row falls outside the range 5000 to 50000.
        """
        if self._intensity is not None:
            return self._intensity

        logger.debug("Setting intensity for: %s", self.sample_ids)
        total = self.methylated + self.unmethylated

        # NaN -> 1
        is_nan = np.isnan(total)
        if is_nan.any():
            total[is_nan] = 1
            logger.debug(
                "%s: Intensities that are NA set to 1", self.sample_ids
            )

        # Values < 1 -> 1
        below_one = total < 1
        if below_one.any():
            total[below_one] = 1
            logger.debug("%s: Intensities < 1 set to 1", self.sample_ids)

        # Report suspiciously dim or bright rows.
        row_means = total.mean(axis=1)
        if row_means.min() < 5000:
            logger.info(
                "%s: Intensities are abnormally low (< 5000)",
                self.sample_ids,
            )
        if row_means.max() > 50000:
            logger.info(
                "%s: Intensities are abnormally high (> 50000)",
                self.sample_ids,
            )

        self._intensity = total
        return self._intensity

    @property
    def _log_intensity_design_matrix(self) -> np.ndarray:
        """Cached design matrix: log2 intensities plus an intercept column.

        The transposed log2 intensity matrix gets a trailing column of ones,
        giving one row per probe and ``n_samples + 1`` columns.
        """
        cached = self._log_intensity_design_matrix_cache
        if cached is None:
            log_intensity = np.log2(self.intensity)
            n_probes = log_intensity.shape[1]
            intercept = np.ones((n_probes, 1), dtype=log_intensity.dtype)
            cached = np.concatenate([log_intensity.T, intercept], axis=1)
            self._log_intensity_design_matrix_cache = cached
        return cached

    def _load_log_intensity(self) -> None:
        """Fill the log-intensity cache and drop the plain intensity cache.

        Intended to be called before the object is saved to disk.
        """
        # Reading the property populates the design matrix cache.
        self._log_intensity_design_matrix  # noqa: B018
        # The plain intensities are released again.
        self._intensity = None

    @property
    def intensity_df(self) -> pd.DataFrame:
        """DataFrame: Total intensities (probes x samples) by probe ID."""
        per_probe = self.intensity.T
        return pd.DataFrame(
            data=per_probe,
            index=self.probe_ids,
            columns=self.sample_ids,
        )


[docs]
    def preprocess_illumina(self) -> None:
        """Preprocess the data the way Illumina's Genome Studio does.

        Runs control normalization, then background correction, and finally
        derives the methylated/unmethylated signals from the corrected
        channels.

        Details:
            This implementation is adapted from 'minfi'.

        Raises:
            ValueError: If the array type is ILLUMINA_27K.
        """
        if self.array_type == ArrayType.ILLUMINA_27K:
            msg = f"{self.array_type} requires raw mode."
            raise ValueError(msg)

        indices = MethylData._cached_indices(
            self.manifest, self.bead_addresses, "illumina"
        )
        # Order matters: normalize first, then subtract the background.
        self._illumina_control_normalization(ci=indices)
        self._illumina_bg_correction(indices)
        self._preprocess_raw(indices)


    def _illumina_control_normalization(
        self,
        ci: MethylIndices,
        reference: int = 0,
    ) -> None:
        """Scale both channels to a common level using control probes.

        Per sample, the mean green intensity over ``ci.bead_cg`` and the mean
        red intensity over ``ci.bead_at`` are computed. Both channels are
        then rescaled so that these means equal the average of the two means
        of the sample at row ``reference``.
        """
        green_ctrl_mean = self.green[:, ci.bead_cg].mean(axis=1)
        red_ctrl_mean = self.red[:, ci.bead_at].mean(axis=1)

        # Target level: mean of both channels in the reference sample.
        target = (green_ctrl_mean + red_ctrl_mean)[reference] / 2

        green_scale = target / green_ctrl_mean
        red_scale = target / red_ctrl_mean

        # New arrays are created here; the input arrays are not modified.
        self.green = green_scale[:, np.newaxis] * self.green
        self.red = red_scale[:, np.newaxis] * self.red

    def _illumina_bg_correction(self, ci: MethylIndices) -> None:
        """Subtract a background level estimated from negative controls.

        Per sample and channel, the background is the value at sorted
        position 30 (0-based) of the negative control intensities. It is
        subtracted in place and negative results are clipped to zero. Nothing
        happens if there are 30 or fewer negative control beads.
        """
        rank = 30
        if len(ci.bead_neg) <= rank:  # type: ignore[arg-type]
            return

        # Both channels are handled identically and independently.
        for channel in (self.green, self.red):
            background = np.partition(channel[:, ci.bead_neg], rank)[:, rank]
            # In-place subtraction followed by thresholding at zero.
            np.subtract(channel, background[:, np.newaxis], out=channel)
            np.maximum(channel, 0, out=channel)

    @memoize
    def _cached_indices(
        manifest: Manifest,
        bead_addresses: np.ndarray,
        prep: PrepType = "illumina",
    ) -> MethylIndices:
        """Build (memoized) the index arrays needed for preprocessing.

        Args:
            manifest: Manifest object.

            bead_addresses: Array of Illumina bead addresses.

            prep: Preprocessing method. Options: "illumina", "noob", "swan",
                "raw". Determines which control indices are filled in
                addition to the probe indices.

        Returns:
            A MethylIndices object with probe indices, positions of the probe
            addresses within ``bead_addresses`` and, depending on ``prep``,
            control probe positions.
        """
        address_index = pd.Index(bead_addresses)

        def control_positions(control_types: str | list[str]) -> np.ndarray:
            # get_indexer marks addresses that are not present with -1;
            # those are discarded.
            found = address_index.get_indexer(
                manifest.control_address(control_types)
            )
            return found[found != -1]

        type_1 = manifest.probe_info(ProbeType.ONE)
        type_2 = manifest.probe_info(ProbeType.TWO)
        type_1_channel = type_1.Color_Channel.values
        type_1_red = type_1[type_1_channel == Channel.RED.value]
        type_1_grn = type_1[type_1_channel == Channel.GRN.value]

        merged_rows = np.concatenate(
            [type_1.IlmnID.index, type_2.IlmnID.index]
        )
        probe_idx = pd.Index(np.sort(merged_rows))
        probe_ids = manifest.data_frame.IlmnID.values[probe_idx.values]

        ci = MethylIndices(
            probe_idx=probe_idx.values,
            probe_ids=probe_ids,
            bead_t1r_a=address_index.get_indexer(type_1_red["AddressA_ID"]),
            bead_t1r_b=address_index.get_indexer(type_1_red["AddressB_ID"]),
            bead_t1g_a=address_index.get_indexer(type_1_grn["AddressA_ID"]),
            bead_t1g_b=address_index.get_indexer(type_1_grn["AddressB_ID"]),
            bead_t2_a=address_index.get_indexer(type_2["AddressA_ID"]),
            meth_t1r=probe_idx.get_indexer(type_1_red.index),
            meth_t1g=probe_idx.get_indexer(type_1_grn.index),
            meth_t2=probe_idx.get_indexer(type_2.index),
        )

        if prep == "illumina":
            ci.bead_at = control_positions(["NORM_A", "NORM_T"])
            ci.bead_neg = control_positions("NEGATIVE")
            ci.bead_cg = control_positions(["NORM_C", "NORM_G"])
        elif prep == "swan":
            ci.bead_neg = control_positions("NEGATIVE")
        elif prep == "noob":
            controls = manifest.control_data_frame
            # Keep only controls present in the data; the fresh 0..n-1 index
            # is what ctrl_cg/ctrl_at refer to.
            controls = controls[
                controls.Address_ID.isin(address_index)
            ].reset_index(drop=True)
            control_type = controls.Control_Type
            ci.ctrl_cg = controls[
                control_type.isin(["NORM_C", "NORM_G"])
            ].index.values
            ci.ctrl_at = controls[
                control_type.isin(["NORM_A", "NORM_T"])
            ].index.values
            ci.ctrl_idx = address_index.get_indexer(controls["Address_ID"])

        return ci


[docs]
    def preprocess_raw(self) -> None:
        """Derive methylated/unmethylated signals without any normalization.

        The red and green channel intensities are mapped directly onto the
        methylated and unmethylated arrays.
        """
        indices = MethylData._cached_indices(
            self.manifest,
            self.bead_addresses,
            "raw",
        )
        self._preprocess_raw(indices)


    def _preprocess_raw(self, ci: MethylIndices) -> None:
        """Fill the methylated/unmethylated arrays from the two channels.

        Both arrays have one row per sample and one column per probe and
        start out as NaN. Per probe group the source channel and bead
        address are:

        - type I red: methylated = red at address B, unmethylated = red at
          address A
        - type I green: methylated = green at address B, unmethylated =
          green at address A
        - type II: methylated = green at address A, unmethylated = red at
          address A
        """
        shape = (len(self.sample_ids), len(ci.probe_idx))
        self.methylated = methylated = np.full(shape, np.nan)
        self.unmethylated = unmethylated = np.full(shape, np.nan)

        # (output, target columns, source channel, source bead positions)
        assignments = (
            (methylated, ci.meth_t1r, self.red, ci.bead_t1r_b),
            (methylated, ci.meth_t1g, self.green, ci.bead_t1g_b),
            (methylated, ci.meth_t2, self.green, ci.bead_t2_a),
            (unmethylated, ci.meth_t1r, self.red, ci.bead_t1r_a),
            (unmethylated, ci.meth_t1g, self.green, ci.bead_t1g_a),
            (unmethylated, ci.meth_t2, self.red, ci.bead_t2_a),
        )
        for output, columns, channel, beads in assignments:
            output[:, columns] = np.take(channel, beads, axis=1)

        self.probe_idx = ci.probe_idx
        self.probe_ids = ci.probe_ids

    def _swan_bg_intensity(self, ci: MethylIndices) -> np.ndarray:
        """Per-sample background level used by SWAN.

        Returns the average of the green and the red median intensity over
        the negative control beads (``ci.bead_neg``).
        """
        channel_medians = [
            np.median(channel[:, ci.bead_neg], axis=1)
            for channel in (self.green, self.red)
        ]
        return np.mean(channel_medians, axis=0)

    @staticmethod
    def _swan_indices(
        manifest: Manifest,
        probe_idx: np.ndarray,
        seed: int | None = None,
    ) -> tuple[dict, dict]:
        """Select the probe subsets used by SWAN.

        Returns two dicts keyed by probe type (ONE and TWO):

        - the positions (relative to ``probe_idx``) of all probes of that
          type, and
        - a sorted random subset, given as positions *within* that probe
          type, with equally many probes having 1, 2 and 3 CpGs. The common
          subset size is the smallest of the six type/CpG group sizes.

        ``seed`` initializes the random generator.
        """
        rng = np.random.default_rng(seed)
        probe_types = (ProbeType.ONE, ProbeType.TWO)
        cpg_counts = (1, 2, 3)

        ncpg_table = (
            manifest.data_frame[["Probe_Type", "N_CpG"]]
            .loc[probe_idx]
            .reset_index(drop=True)
        )
        group_sizes = ncpg_table.groupby(
            ["Probe_Type", "N_CpG"], dropna=False
        ).size()
        subset_size = min(
            group_sizes.get((probe_type, n_cpg), 0)
            for probe_type in probe_types
            for n_cpg in cpg_counts
        )

        all_indices = {}
        random_subset_indices = {}
        for probe_type in probe_types:
            of_type = ncpg_table[ncpg_table.Probe_Type == probe_type]
            all_indices[probe_type] = of_type.index.values
            # Renumber so that the subset positions are relative to the
            # probes of this type only.
            of_type = of_type.reset_index(drop=True)
            picks = [
                rng.permutation(of_type.index[of_type.N_CpG == n_cpg])[
                    :subset_size
                ]
                for n_cpg in cpg_counts
            ]
            random_subset_indices[probe_type] = np.sort(np.concatenate(picks))
        return all_indices, random_subset_indices


[docs]
    def preprocess_swan(self) -> None:
        """Subset-quantile Within Array Normalization (SWAN).

        Details:
            SWAN works in two steps. First, N Infinium I and N Infinium II
            probes are drawn at random for each of 1, 2 and 3 CpGs in the
            probe body, where N is the size of the smallest of these six
            groups. This gives 3N probes per probe type that are considered
            biologically similar. Each subset is sorted by intensity and the
            two sorted subsets are averaged position by position, which
            yields the target quantile distribution. Second, the intensities
            of all remaining probes are adjusted separately per probe type by
            linear interpolation between the subset probes.

            Implementation adapted from 'minfi'

        Note:
            SWAN uses a random subset of probes for between array
            normalization. To achieve reproducible results, set the seed.

        Raises:
            ValueError: If the array type is ILLUMINA_27K.

        References:
            J Maksimovic, L Gordon and A Oshlack (2012). SWAN: Subset quantile
            Within-Array Normalization for Illumina Infinium
            HumanMethylation450 BeadChips. Genome Biology 13, R44.
        """
        if self.array_type == ArrayType.ILLUMINA_27K:
            msg = f"{self.array_type} requires raw mode."
            raise ValueError(msg)

        indices = MethylData._cached_indices(
            self.manifest, self.bead_addresses, "swan"
        )
        self._preprocess_raw(indices)
        background = self._swan_bg_intensity(indices)
        all_indices, random_subset_indices = MethylData._swan_indices(
            self.manifest, self.probe_idx, self.seed
        )

        # Both signals are normalized with the same subsets and background.
        shared_args = (background, all_indices, random_subset_indices)
        self.methylated = MethylData._preprocess_swan_main(
            self.methylated, *shared_args
        )
        self.unmethylated = MethylData._preprocess_swan_main(
            self.unmethylated, *shared_args
        )


    @staticmethod
    def _preprocess_swan_main(
        intensity: np.ndarray,
        bg_intensity: np.ndarray,
        all_indices: dict,
        random_subset_indices: dict,
    ) -> np.ndarray:
        """Apply the SWAN adjustment to one intensity matrix.

        Per sample (row) and probe type, the intensities are mapped onto the
        averaged sorted subset intensities by interpolating over their
        scaled ranks. Values ranked outside the subset range keep their
        distance to the subset minimum/maximum, and non-positive results are
        replaced by the background intensity of the sample.

        Returns a new array of the same shape as ``intensity``.
        """
        from scipy.stats import rankdata

        probe_types = (ProbeType.ONE, ProbeType.TWO)

        # Columns of the random subset, per probe type.
        subset_columns = {
            probe_type: all_indices[probe_type][
                random_subset_indices[probe_type]
            ]
            for probe_type in probe_types
        }
        # Target distribution: mean of both sorted subsets, per sample.
        target_quantiles = (
            np.sort(intensity[:, subset_columns[ProbeType.ONE]], axis=1)
            + np.sort(intensity[:, subset_columns[ProbeType.TWO]], axis=1)
        ) / 2

        normalized = np.full(intensity.shape, np.nan)
        for sample, row in enumerate(intensity):
            for probe_type in probe_types:
                columns = all_indices[probe_type]
                subset = random_subset_indices[probe_type]

                values = row[columns]
                ranks = rankdata(values) / len(values)
                subset_ranks = np.sort(ranks[subset])
                adjusted = np.interp(
                    x=ranks, xp=subset_ranks, fp=target_quantiles[sample, :]
                )

                # np.interp clamps outside the subset range; shift those
                # values by their offset from the subset extremes.
                subset_values = values[subset]
                lowest = np.min(subset_values)
                highest = np.max(subset_values)
                above = ranks > np.max(subset_ranks)
                below = ranks < np.min(subset_ranks)
                adjusted[above] += values[above] - highest
                adjusted[below] += values[below] - lowest

                adjusted[adjusted <= 0] = bg_intensity[sample]
                normalized[sample, columns] = adjusted
        return normalized


[docs]
    def preprocess_noob(
        self,
        offset: float = 15,
        dye_method: str = "single",
    ) -> None:
        """The Noob preprocessing method.

        Description:
            Noob (normal-exponential out-of-band) is a background correction
            method with dye-bias normalization. The corrected signals are
            stored in ``self.methylated`` and ``self.unmethylated``.

        Args:
            offset: An offset for the normexp background correction.

            dye_method: How should dye bias correction be done: "single" for
                single sample approach, or "reference" for a reference array.

        Raises:
            ValueError: If ``dye_method`` is neither "single" nor
                "reference", if the array type is 27k, or if no valid
                reference array is found in "reference" mode.

        References:
            TJ Triche, DJ Weisenberger, D Van Den Berg, PW Laird and KD
            Siegmund. Low-level processing of Illumina Infinium DNA
            Methylation BeadArrays. Nucleic Acids Res (2013) 41, e90.
            doi:10.1093/nar/gkt090.
        """
        # Validate the arguments first so that an invalid call fails before
        # any intensity data has been modified.
        if dye_method not in ("single", "reference"):
            msg = "dye_method must be 'single' or 'reference'"
            raise ValueError(msg)

        if self.array_type == ArrayType.ILLUMINA_27K:
            raise ValueError(f"{self.array_type} requires raw mode.")

        ci = MethylData._cached_indices(
            self.manifest, self.bead_addresses, "noob"
        )

        self._preprocess_raw(ci)

        # Out-of-band controls: green channel read at the Type I red bead
        # addresses and red channel read at the Type I green bead addresses.
        grn_oob = np.concatenate(
            [
                self.green[:, ci.bead_t1r_a],
                self.green[:, ci.bead_t1r_b],
            ],
            axis=1,
        )
        red_oob = np.concatenate(
            [self.red[:, ci.bead_t1g_a], self.red[:, ci.bead_t1g_b]],
            axis=1,
        )

        # Non-positive intensities are set to 1 (in place) before the
        # background correction.
        methylated = self.methylated
        unmethylated = self.unmethylated
        methylated[methylated <= 0] = 1
        unmethylated[unmethylated <= 0] = 1

        # Green channel: Type I green (M and U) plus Type II methylated.
        grn_m = methylated[:, ci.meth_t1g]
        grn_u = unmethylated[:, ci.meth_t1g]
        grn_2 = methylated[:, ci.meth_t2]

        xf_grn = np.concatenate([grn_m, grn_u, grn_2], axis=1)
        xs_grn = normexp_get_xs(xf_grn, controls=grn_oob, offset=offset)

        # Column ranges of the three parts inside the concatenated matrix.
        cumsum = np.cumsum([0, grn_m.shape[1], grn_u.shape[1], grn_2.shape[1]])
        slice_grn_m = slice(cumsum[0], cumsum[1])
        slice_grn_u = slice(cumsum[1], cumsum[2])
        slice_grn_2 = slice(cumsum[2], cumsum[3])

        # Red channel: Type I red (M and U) plus Type II unmethylated.
        red_m = methylated[:, ci.meth_t1r]
        red_u = unmethylated[:, ci.meth_t1r]
        red_2 = unmethylated[:, ci.meth_t2]

        xf_red = np.concatenate([red_m, red_u, red_2], axis=1)
        xs_red = normexp_get_xs(xf_red, controls=red_oob, offset=offset)

        cumsum = np.cumsum([0, red_m.shape[1], red_u.shape[1], red_2.shape[1]])
        slice_red_m = slice(cumsum[0], cumsum[1])
        slice_red_u = slice(cumsum[1], cumsum[2])
        slice_red_2 = slice(cumsum[2], cumsum[3])

        # Write the background-corrected signals back.
        methylated[:, ci.meth_t1g] = xs_grn["xs"][:, slice_grn_m]
        unmethylated[:, ci.meth_t1g] = xs_grn["xs"][:, slice_grn_u]

        methylated[:, ci.meth_t1r] = xs_red["xs"][:, slice_red_m]
        unmethylated[:, ci.meth_t1r] = xs_red["xs"][:, slice_red_u]

        methylated[:, ci.meth_t2] = xs_grn["xs"][:, slice_grn_2]
        unmethylated[:, ci.meth_t2] = xs_red["xs"][:, slice_red_2]

        # Dye correction: the control probes are corrected with the
        # parameters that were estimated for the respective channel.
        grn_control = self.green[:, ci.ctrl_idx]
        red_control = self.red[:, ci.ctrl_idx]

        xcs_grn = normexp_get_xs(
            grn_control, param=xs_grn["param"], offset=offset
        )
        xcs_red = normexp_get_xs(
            red_control, param=xs_red["param"], offset=offset
        )

        grn_avg = np.mean(xcs_grn["xs"][:, ci.ctrl_cg], axis=1)
        red_avg = np.mean(xcs_red["xs"][:, ci.ctrl_at], axis=1)

        red_grn_ratio = red_avg / grn_avg

        if dye_method == "single":
            # Only the red channel is rescaled; green stays untouched.
            red_factor = 1 / red_grn_ratio
        else:
            # "reference": scale both channels towards the sample whose
            # red/green ratio is closest to 1.
            ref_idx = np.argmin(np.abs(red_grn_ratio - 1))
            ref = (grn_avg + red_avg)[ref_idx] / 2
            if np.isnan(ref):
                msg = "'ref_idx' refers to an array that is not present"
                raise ValueError(msg)
            grn_factor = ref / grn_avg
            red_factor = ref / red_avg

        # One factor per sample (row), broadcast over the probe columns.
        red_factor = red_factor.reshape(-1, 1)
        methylated[:, ci.meth_t1r] *= red_factor
        unmethylated[:, ci.meth_t1r] *= red_factor
        unmethylated[:, ci.meth_t2] *= red_factor

        if dye_method == "reference":
            grn_factor = grn_factor.reshape(-1, 1)
            methylated[:, ci.meth_t1g] *= grn_factor
            unmethylated[:, ci.meth_t1g] *= grn_factor
            methylated[:, ci.meth_t2] *= grn_factor

        self.methylated = methylated
        self.unmethylated = unmethylated



[docs]
    def poobah(
        self,
    ) -> pd.DataFrame:
        """Compute pOOBAH detection p-values for all probes.

        pOOBAH compares each probe intensity with the empirical background
        distribution of its channel. The background is taken from
        out-of-band (OOB) hybridization signals, and the p-value is one minus
        the empirical cumulative distribution function (ECDF) of that
        background, evaluated at the stronger of the methylated and
        unmethylated signals.

        Low p-values indicate reliable detection above background. A probe is
        considered to have failed detection (unreliable) when its pOOBAH
        p-value is greater than a threshold (usually 0.05).

        Returns:
            Detection p-values (0–1), shape (n_probes × n_samples), indexed by
            IlmnID. NaN indicates missing probes.

        Note:
            In SeSAMe, some probes are filtered using `backgroundMask`. This
            step is not implemented here, which may lead to small differences
            in the resulting p-values compared to SeSAMe.

        Reference:
            SeSAMe: reducing artifactual detection of DNA methylation by
            Infinium BeadChips in genomic deletions.
            Wanding Zhou, Timothy J. Triche Jr., Peter W. Laird, Hui Shen.
            Nucleic Acids Research, 46(e123), 2018.
            https://doi.org/10.1093/nar/gky691

        Examples:
            >>> methyl = MethylData(file=idat_basepath)
            >>> pvals = methyl.poobah()
            >>> mask = pvals >= 0.05
        """
        ci = MethylData._cached_indices(
            self.manifest, self.bead_addresses, "noob"
        )

        # NOTE: SeSAMe additionally filters probes with backgroundMask.
        # Out-of-band signal: each channel read at the Type I bead addresses
        # of the other colour.
        grn_oob = np.concatenate(
            [self.green[:, ci.bead_t1r_a], self.green[:, ci.bead_t1r_b]],
            axis=1,
        )
        red_oob = np.concatenate(
            [self.red[:, ci.bead_t1g_a], self.red[:, ci.bead_t1g_b]],
            axis=1,
        )

        n_samples = len(self.sample_ids)
        pvals = np.full((n_samples, len(ci.probe_idx)), np.nan)

        def ecdf(x: np.ndarray, ref: np.ndarray) -> np.ndarray:
            """Fraction of the sorted background ``ref`` that is <= x."""
            return np.searchsorted(ref, x, side="right") / len(ref)

        def detection(
            meth: np.ndarray,
            meth_bg: np.ndarray,
            unmeth: np.ndarray,
            unmeth_bg: np.ndarray,
        ) -> np.ndarray:
            """P-value based on the stronger of the two probe signals."""
            return 1.0 - np.maximum(
                ecdf(meth, meth_bg), ecdf(unmeth, unmeth_bg)
            )

        for s in range(n_samples):
            bg_grn = np.sort(grn_oob[s])
            bg_red = np.sort(red_oob[s])

            # Type I Red probes: both signals come from the red channel.
            pvals[s, ci.meth_t1r] = detection(
                self.red[s, ci.bead_t1r_b],
                bg_red,
                self.red[s, ci.bead_t1r_a],
                bg_red,
            )

            # Type I Green probes: both signals come from the green channel.
            pvals[s, ci.meth_t1g] = detection(
                self.green[s, ci.bead_t1g_b],
                bg_grn,
                self.green[s, ci.bead_t1g_a],
                bg_grn,
            )

            # Type II probes: methylated in green, unmethylated in red.
            pvals[s, ci.meth_t2] = detection(
                self.green[s, ci.bead_t2_a],
                bg_grn,
                self.red[s, ci.bead_t2_a],
                bg_red,
            )

        return pd.DataFrame(
            pvals.T,
            columns=self.sample_ids,
            index=ci.probe_ids,
        )



[docs]
    def quality_metrics(self) -> pd.DataFrame:
        """Return log2 median methylated/unmethylated intensity per sample.

        This reproduces the QC summary used in minfi::getQC. Medians are
        taken along axis 1 of the signal matrices and ignore NaN values.

        Samples with unusually low median intensities may indicate poor DNA
        quality or assay failure.

        Returns:
            DataFrame indexed by sample ID with the columns
            ``log2_median_methylated`` and ``log2_median_unmethylated``.
        """
        signals = {
            "log2_median_methylated": self.methylated,
            "log2_median_unmethylated": self.unmethylated,
        }
        summary = {
            column: np.log2(np.nanmedian(values, axis=1))
            for column, values in signals.items()
        }
        return pd.DataFrame(summary, index=self.sample_ids)



[docs]
    def detection_p(self) -> pd.DataFrame:
        """Detection p-values for probe signal vs background noise.

        Tests whether the total probe signal (M+U) can be told apart from a
        Gaussian background model fitted to the negative control probes. The
        p-value is the right-tail probability of the signal under that model.

        Low values indicate reliable detection above background. Samples with
        many high p-values (failed probes) may be low quality.

        Returns:
            Detection p-values (n_probes × n_samples), indexed by IlmnID.

        Notes:
            - Background is derived from negative control probes.
            - Location is the median, scale is 1.4826 × the median absolute
              deviation.
            - The scale is clipped from below at ``EPSILON`` to avoid a
              degenerate distribution.

        Reference:
            Implements the Illumina detectionP method used in minfi.
        """
        from scipy.stats import norm

        ci = MethylData._cached_indices(
            self.manifest,
            self.bead_addresses,
            prep="illumina",
        )

        def background(channel: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
            """Per-sample median and scaled MAD of the negative controls."""
            neg = channel[:, ci.bead_neg]
            center = np.median(neg, axis=1)
            # Median absolute deviation scaled for normal distribution
            spread = 1.4826 * np.median(
                np.abs(neg - center[:, None]), axis=1
            )
            return center, np.clip(spread, EPSILON, None)

        r_mu, r_sd = background(self.red)
        g_mu, g_sd = background(self.green)

        pvals = np.full((len(self.sample_ids), len(ci.probe_ids)), np.nan)

        # Type I Red: both beads are read in the red channel.
        total = self.red[:, ci.bead_t1r_a] + self.red[:, ci.bead_t1r_b]
        pvals[:, ci.meth_t1r] = norm.sf(
            total,
            loc=2 * r_mu[:, None],
            scale=2 * r_sd[:, None],
        )

        # Type I Green: both beads are read in the green channel.
        total = self.green[:, ci.bead_t1g_a] + self.green[:, ci.bead_t1g_b]
        pvals[:, ci.meth_t1g] = norm.sf(
            total,
            loc=2 * g_mu[:, None],
            scale=2 * g_sd[:, None],
        )

        # Type II: one bead, read in both channels.
        total = self.red[:, ci.bead_t2_a] + self.green[:, ci.bead_t2_a]
        pvals[:, ci.meth_t2] = norm.sf(
            total,
            loc=(r_mu + g_mu)[:, None],
            scale=(r_sd + g_sd)[:, None],
        )

        return pd.DataFrame(
            pvals.T,
            index=ci.probe_ids,
            columns=self.sample_ids,
        )


    @property
    def betas(self) -> pd.DataFrame:
        """Returns beta values."""
        betas = self._get_beta(self.methylated, self.unmethylated)
        return pd.DataFrame(
            betas.T, columns=self.sample_ids, index=self.probe_ids
        )


[docs]
    def betas_at(
        self,
        cpgs: Sequence | np.ndarray | None = None,
        fill: float = NEUTRAL_BETA,
    ) -> pd.DataFrame:
        """Calculates beta values for specified CpG sites.

        Args:
            cpgs: Array of CpG IDs.

            fill: Value to fill for CpGs not found in the used manifest or
                equal to NaN.

        Returns:
            DataFrame containing beta values for specified CpGs (rows: CpGs,
            columns: samples).

        Note:
            If 'cpgs' is None, all CpGs from the used manifest are considered.
        """
        if cpgs is None:
            cpgs = self.manifest.methylation_probes
        betas = self._get_beta(self.methylated, self.unmethylated)
        # Force a float array: without an explicit dtype an integer 'fill'
        # (e.g. 0) creates an integer array and the beta values assigned
        # below would be truncated.
        converted = np.full(
            (len(self.sample_ids), len(cpgs)), fill, dtype=float
        )
        left_idx, right_idx = _overlap_indices(cpgs, self.probe_ids)
        converted[:, left_idx] = betas[:, right_idx]
        converted[np.isnan(converted)] = fill
        return pd.DataFrame(converted.T, columns=self.sample_ids, index=cpgs)


    @staticmethod
    def _get_beta(
        methylated: np.ndarray,
        unmethylated: np.ndarray,
        offset: float = 100,
        beta_threshold: float = 0,
        *,
        min_zero: bool = True,
    ) -> np.ndarray:
        if offset < 0:
            msg = "'offset' must be non-negative"
            raise ValueError(msg)

        if not (0 <= beta_threshold <= 0.5):
            msg = "'beta_threshold' must be between 0 and 0.5"
            raise ValueError(msg)

        if min_zero:
            methylated = np.maximum(methylated, 0)
            unmethylated = np.maximum(unmethylated, 0)

        # Ignore division by zero
        with np.errstate(divide="ignore", invalid="ignore"):
            betas = methylated / (methylated + unmethylated + offset)

        if beta_threshold > 0:
            betas = np.minimum(
                np.maximum(betas, beta_threshold), 1 - beta_threshold
            )

        return betas

    @property
    def mvalues(self) -> pd.DataFrame:
        """Returns M-values."""
        mvals = self._get_m_value(self.methylated, self.unmethylated)
        return pd.DataFrame(
            mvals.T, columns=self.sample_ids, index=self.probe_ids
        )


[docs]
    def mvalues_at(
        self,
        cpgs: Sequence | np.ndarray | None = None,
        fill: float = NEUTRAL_M_VALUE,
    ) -> pd.DataFrame:
        """Calculates m-values for specified CpG sites.

        Args:
            cpgs: Array of CpG IDs.

            fill: Value to fill for CpGs not found in the used manifest or
                equal to NaN.

        Returns:
            DataFrame containing m-values for specified CpGs (rows: CpGs,
            columns: samples).

        Note:
            If 'cpgs' is None, all CpGs from the used manifest are considered.
        """
        if cpgs is None:
            cpgs = self.manifest.methylation_probes

        mvals = self._get_m_value(self.methylated, self.unmethylated)

        # Force a float array: without an explicit dtype an integer 'fill'
        # (e.g. 0) creates an integer array and the m-values assigned below
        # would be truncated.
        converted = np.full(
            (len(self.sample_ids), len(cpgs)), fill, dtype=float
        )
        left_idx, right_idx = _overlap_indices(cpgs, self.probe_ids)
        converted[:, left_idx] = mvals[:, right_idx]

        converted[np.isnan(converted)] = fill

        return pd.DataFrame(converted.T, columns=self.sample_ids, index=cpgs)


    @staticmethod
    def _get_m_value(
        methylated: np.ndarray,
        unmethylated: np.ndarray,
        offset: float = 1.0,
        *,
        min_zero: bool = True,
    ) -> np.ndarray:
        """Compute M-values: log2((M + offset) / (U + offset)).

        Args:
            methylated: methylated intensities

            unmethylated: unmethylated intensities

            offset: small constant to prevent log(0)

            min_zero: clamp negative intensities to zero

        Returns:
            M-values
        """
        if offset < 0:
            raise ValueError("'offset' must be non-negative")

        if min_zero:
            methylated = np.maximum(methylated, 0)
            unmethylated = np.maximum(unmethylated, 0)

        # Ignore division by zero
        with np.errstate(divide="ignore", invalid="ignore"):
            mvals = np.log2((methylated + offset) / (unmethylated + offset))

        return mvals


[docs]
    def predict_sex(self) -> pd.Series:
        """Predict sex from X/Y chromosome methylation intensities.

        Uses the difference between the median log2 intensity of the Y probes
        and that of the X probes. Threshold-based classifier
        trained/validated on ~3k tumor samples, achieving ~95% accuracy.

        Returns:
            Series named ``"sex"`` with the predicted sex (``"male"`` or
            ``"female"``), indexed by sample name.

        Notes:
            This algorithm is experimental and may be refined in future
            releases.
        """
        # TODO: Refine algorithm
        x_idx, y_idx = _get_sex_indices(
            sample_probes=self.probe_ids,
            array_type=self.manifest.array_type,
        )

        # +1 keeps log2 finite for zero intensities.
        x_log = np.log2(self.intensity[:, x_idx] + 1)
        y_log = np.log2(self.intensity[:, y_idx] + 1)

        x_median = np.median(x_log, axis=1)
        y_median = np.median(y_log, axis=1)

        diff_yx = y_median - x_median
        # Decision boundary of the classifier; samples above it are
        # called male.
        threshold = -2.5602695724074866

        return pd.Series(
            np.where(diff_yx > threshold, "male", "female"),
            index=self.sample_ids,
            name="sex",
        )



[docs]
    def predict_purity(
        self,
        method: Literal["absolute", "estimate"] = "absolute",
        fill: float = 0.5,
    ) -> pd.Series:
        """Estimate tumor purity from DNA methylation data.

        Uses RFpurify random forest models to estimate tumor purity from the
        methylation beta values stored in this object.

        Available models:

            ``"absolute"``:
                Model trained against purity estimates from the ABSOLUTE study.

            ``"estimate"``:
                Model trained against purity estimates from the ESTIMATE study.

        Reference:
            Sill et al. (2019)
            https://github.com/mwsill/RFpurify
            https://doi.org/10.1186/s12859-019-3014-z

        Args:
            method: RFpurify model to use. Supported values are ``"absolute"``
                and ``"estimate"``.

            fill: Beta value used for CpG probes required by the model but
                missing from the input data. RFpurify was trained on Illumina
                450k and EPIC arrays; missing probes may occur when applying it
                to newer arrays such as EPIC v2.

        Returns:
            Predicted tumor purity values in the range [0, 1], indexed by
            sample name.

        Raises:
            ValueError: If ``method`` is not ``"absolute"`` or ``"estimate"``.
        """
        cpgs = get_purity_features(method)
        # 'betas_at' already replaces missing/NaN probes, so 'fill' has to be
        # passed to it as well; otherwise its default would always be used
        # and the caller's value would be ignored.
        return predict_purity(
            self.betas_at(cpgs, fill=fill).T,
            method=method,
            fill=fill,
        )



[docs]
    def plot_betas_density(self, bins: int = 256) -> None:
        """Plot beta-value density distributions.

        For every sample a histogram of its beta values on [0, 1] is
        computed, smoothed with a Gaussian filter and drawn as a line.

        Args:
            bins: Number of equally wide histogram bins between 0 and 1.
        """
        from scipy.ndimage import gaussian_filter1d

        fig = go.Figure()

        edges = np.linspace(0, 1, bins + 1)
        centers = (edges[:-1] + edges[1:]) / 2

        # 'betas' is a property that recomputes the full DataFrame on every
        # access, so evaluate it once instead of once per sample.
        betas_df = self.betas

        for sample_id in self.sample_ids:
            values = betas_df[sample_id].to_numpy()

            hist, _ = np.histogram(
                values,
                bins=edges,
                density=True,
            )
            hist = gaussian_filter1d(hist, sigma=2)

            fig.add_trace(
                go.Scatter(
                    x=centers,
                    y=hist,
                    mode="lines",
                    name=sample_id,
                )
            )

        fig.update_layout(
            template="simple_white",
            xaxis_title="Beta value",
            yaxis_title="Density",
            hovermode="x unified",
            showlegend=True,
        )

        fig.update_xaxes(range=[0, 1])

        fig.show()



[docs]
    def plot_intens_vs_betas(
        self,
        sample_id: str | Sequence[str] | None = None,
        n_cols: int = 3,
    ) -> None:
        """Plot total signal intensity vs beta value.

        Each sample gets its own subplot showing a 2D density contour of
        log2 intensity against beta value, with a dashed line at beta = 0.5.

        Args:
            sample_id: Sample(s) to plot. Defaults to all samples (one subplot
                per sample).

            n_cols: Number of subplot columns when plotting multiple samples.
        """
        if isinstance(sample_id, str):
            sample_ids = [sample_id]
        else:
            sample_ids = list(
                self.sample_ids if sample_id is None else sample_id
            )

        intens_df = self.intensity_df
        betas_df = self.betas

        # Grid layout: never more columns than samples, at least one column.
        n_samples = len(sample_ids)
        n_cols = max(1, min(n_cols, n_samples))
        n_rows = math.ceil(n_samples / n_cols)

        fig = make_subplots(
            rows=n_rows,
            cols=n_cols,
            subplot_titles=sample_ids,
            horizontal_spacing=0.08,
            vertical_spacing=0.15,
        )

        for position, sample in enumerate(sample_ids):
            # Subplot coordinates are 1-based.
            grid_row, grid_col = divmod(position, n_cols)
            cell = {"row": grid_row + 1, "col": grid_col + 1}

            log_intens = np.log2(intens_df[sample].to_numpy())
            beta = betas_df[sample].to_numpy()
            # Drop points where either coordinate is NaN or infinite.
            keep = np.isfinite(log_intens) & np.isfinite(beta)

            contour = go.Histogram2dContour(
                x=log_intens[keep],
                y=beta[keep],
                colorscale="Turbo",
                showscale=False,
                ncontours=20,
                line={"width": 0},
                hoverinfo="skip",
            )
            fig.add_trace(contour, **cell)

            fig.add_hline(
                y=0.5,
                line_dash="dash",
                line_color="gray",
                layer="above",
                **cell,
            )
            fig.update_xaxes(title_text="Total Intensity (Log2(M+U))", **cell)
            fig.update_yaxes(range=[0, 1], title_text="Beta", **cell)

        fig.update_layout(
            template="simple_white",
            title="Total Signal Intensity vs Beta Values",
            height=max(700, 380 * n_rows),
            width=max(900, 480 * n_cols),
            showlegend=False,
        )

        fig.show()



[docs]
    def plot_red_green_qq(self, n_points: int = 512) -> None:
        """Red vs Green channel intensity quantile-quantile plot.

        For every sample the red and green intensities are sorted
        independently and sampled at ``n_points`` evenly spaced positions, so
        that matching quantiles of both channels are plotted against each
        other. A dashed identity line is added for orientation.

        Args:
            n_points: Number of quantile-subsampled points per sample.
        """
        fig = go.Figure()

        # Largest plotted value over all samples; end point of the
        # identity line.
        global_max = 0.0

        for sample_id in self.sample_ids:
            red_sorted = np.sort(self.red_df[sample_id].to_numpy())
            grn_sorted = np.sort(self.green_df[sample_id].to_numpy())

            # Downsample: evenly spaced positions in the sorted arrays.
            picks = np.linspace(0, len(red_sorted) - 1, n_points).astype(int)
            red_q = red_sorted[picks]
            grn_q = grn_sorted[picks]

            global_max = max(global_max, red_q[-1], grn_q[-1])

            points = go.Scatter(
                x=red_q,
                y=grn_q,
                mode="markers",
                name=sample_id,
                marker={"size": 5, "opacity": 0.6},
            )
            fig.add_trace(points)

        # identity line using global max
        identity = go.Scatter(
            x=[0, global_max],
            y=[0, global_max],
            mode="lines",
            name="Identity (Red = Green)",
            line={"dash": "dash"},
        )
        fig.add_trace(identity)

        fig.update_layout(
            template="simple_white",
            title="Type-I Red vs Green signal correlation",
            xaxis_title="Type-I Red signal",
            yaxis_title="Type-I Green signal",
            showlegend=True,
        )

        fig.show()


    def __repr__(self) -> str:
        title = "MethylData():"
        lines = [
            title + "\n" + "*" * len(title),
            f"array_type: {self.array_type}",
            f"manifest: {self.manifest.array_type}",
            f"sample_ids:\n{self.sample_ids}",
            f"green:\n{self.green}",
            f"red:\n{self.red}",
            f"green_df:\n{self.green_df}",
            f"red_df:\n{self.red_df}",
            f"intensity_df:\n{self.intensity_df}",
            f"methylated_df:\n{self.methylated_df}",
            f"unmethylated_df:\n{self.unmethylated_df}",
        ]
        return "\n\n".join(lines)




[docs]
class ReferenceMethylData:
    """Stores and manages reference cases for different array types.

    This class categorizes and processes reference IDAT files to create
    MethylData objects for different array types. It is intended for CNV
    neutral reference cases used in CNV calculation.

    Instances are cached per (file, prep) combination: constructing the class
    again with the same arguments returns the already existing object.

    Args:
        file: List of file paths to IDAT files or directory containing IDAT
            files.

        prep: Preprocessing method. Options: "raw", "illumina", "swan",
            "noob".

        save_to_disk: If True, the processed reference is pickled into the
            mepylome cache directory and loaded from there when the file
            already exists.

    Attributes:
        _methyl_data: Internal dictionary to cache MethylData objects
            for each array type.

    Raises:
        ValueError: If no reference files are found for the specified array
            type.

    Examples:
        >>> # 'directory' contains 450k, EPIC and EPICv2 idat files
        >>> reference = ReferenceMethylData(file=directory, prep="illumina")
        >>> sample_450k = MethylData(file=idat_file_450k)
        >>> sample_epic = MethylData(file=idat_file_epic)
        >>> sample_epicv2 = MethylData(file=idat_file_epicv2)
        >>> # reference can be used for all types
        >>> cnv_450k = CNV(sample_450k, reference)
        >>> cnv_epic = CNV(sample_epic, reference)
        >>> cnv_epicv2 = CNV(sample_epicv2, reference)
    """

    # Shared on purpose: maps cache_key(file, prep) to the single instance.
    _cache: dict[Any, "ReferenceMethylData"] = {}
    # Prevents a non-finished instance from being loaded.
    _lock_new = threading.Lock()
    _lock_init = threading.Lock()

    file: str | Path | Sequence[str | Path]
    prep: PrepType
    save_to_disk: bool

    _methyl_data: dict[ArrayType, MethylData]
    _cached: bool

    def __new__(
        cls,
        file: str | Path | Sequence[str | Path],
        prep: PrepType = "illumina",
        save_to_disk: bool = False,
    ) -> "ReferenceMethylData":
        """Return the cached instance for (file, prep) or create a new one."""
        key = cache_key(file, prep)

        with cls._lock_new:
            if key in cls._cache:
                return cls._cache[key]

            instance = super().__new__(cls)

            # Cache the instance
            cls._cache[key] = instance
            return instance

    def __getnewargs__(
        self,
    ) -> tuple[str | Path | Sequence[str | Path], PrepType, bool]:
        """Arguments passed to ``__new__`` when the object is unpickled."""
        # Necessary for pickle
        return self.file, self.prep, self.save_to_disk

    def __init__(
        self,
        file: str | Path | Sequence[str | Path],
        prep: PrepType = "illumina",
        save_to_disk: bool = False,
    ) -> None:
        """Load the reference from disk or build it from the IDAT files."""
        with ReferenceMethylData._lock_init:
            # Don't need to initialize if instance is cached.
            if getattr(self, "_cached", False):
                return

            self.file = file
            self.prep = prep
            self.save_to_disk = save_to_disk
            idat_files = idat_basepaths(self.file)

            # Load data from disk
            filepath = ReferenceMethylData.pickle_filename(
                self.prep, idat_files
            )
            if self.save_to_disk and filepath.exists():
                # NOTE: unpickling can execute arbitrary code. The file is
                # read from mepylome's own cache directory, which therefore
                # must only contain trusted data.
                with filepath.open("rb") as f:
                    saved_instance = pickle.load(f)
                self.__dict__.update(saved_instance.__dict__)
                self._cached = True
                return

            # Group the IDAT files by array type, then build one MethylData
            # object per array type.
            reference_files = collections.defaultdict(list)
            self._methyl_data = {}
            for idat_file in tqdm(
                idat_files, desc="Categorizing reference IDAT files"
            ):
                array_type = ArrayType.from_idat(idat_file)
                reference_files[array_type].append(idat_file)
            for array_type, file_list in tqdm(
                reference_files.items(), desc="Processing reference IDAT files"
            ):
                raw_data = RawData(file_list)
                self._methyl_data[array_type] = MethylData(
                    raw_data, prep=self.prep
                )
                self._methyl_data[array_type]._load_log_intensity()

            if self.save_to_disk:
                # Safe saving to disk: write to a unique temporary file
                # first and move it into place afterwards, so that readers
                # never see a partially written pickle.
                filepath.parent.mkdir(parents=True, exist_ok=True)
                tmp_path = filepath.with_suffix(f".{uuid4()}.tmp")
                try:
                    with tmp_path.open("wb") as f:
                        pickle.dump(self, f)
                    tmp_path.replace(filepath)
                finally:
                    # After a successful replace the temporary file is gone
                    # and this is a no-op; after a failure it removes the
                    # leftover file.
                    tmp_path.unlink(missing_ok=True)

            self._cached = True

    @staticmethod
    def pickle_filename(
        prep: PrepType,
        idat_files: list[Path],
    ) -> Path:
        """Return the cache file path for the given prep and IDAT files.

        The file name is derived from the preprocessing method and the
        sorted IDAT paths, so the order of ``idat_files`` does not matter.
        """
        return MEPYLOME_CACHE_DIR / input_args_id(
            "Ref", prep, sorted(str(x) for x in idat_files)
        )

    def __getitem__(self, array_type: ArrayType) -> MethylData:
        """Return the reference MethylData for ``array_type``.

        Raises:
            ValueError: If no reference files of this array type were found.
        """
        if array_type not in self._methyl_data:
            msg = (
                f"No copy number neutral reference files found for "
                f"array type '{array_type.value}'."
            )
            raise ValueError(msg)
        return self._methyl_data[array_type]