Source code for doubletdetection.plot

import os
import warnings

import matplotlib
import numpy as np
import umap
from sklearn.decomposition import PCA
from sklearn.utils import check_array

# Select the Agg backend when the DISPLAY environment variable is not set.
# This must run before matplotlib.pyplot is imported below.
if "DISPLAY" not in os.environ:
    matplotlib.use("Agg")
import matplotlib.pyplot as plt


def normalize_counts(raw_counts, pseudocount=0.1):
    """Normalize count array. Default normalizer used by BoostClassifier.

    Each cell (row) is rescaled so that its total count equals the median
    total count over all cells, then ``pseudocount`` is added and the result
    is log10-transformed. Cells whose total count is zero are normalized to
    zeros (i.e. they map to ``log10(pseudocount)``) rather than producing a
    row of NaN from a 0/0 division.

    Args:
        raw_counts (array-like): count data, oriented cells by genes
        pseudocount (float, optional): Count to add prior to log transform.

    Returns:
        ndarray: Normalized data.
    """
    raw_counts = np.asarray(raw_counts)

    # Total count per cell (sum over the gene axis)
    cell_sums = np.sum(raw_counts, axis=1)
    per_cell = cell_sums[:, np.newaxis]

    # Multiply by the median cell total and divide each cell by its own total.
    # Rows with a zero total are skipped by `where` and keep the zeros of `out`.
    median = np.median(cell_sums)
    scaled = raw_counts * median
    normed = np.divide(
        scaled,
        per_cell,
        out=np.zeros(scaled.shape, dtype=float),
        where=per_cell != 0,
    )

    return np.log10(normed + pseudocount)


def convergence(clf, show=False, save=None, p_thresh=1e-7, voter_thresh=0.9):
    """Produce a plot showing number of cells called doublet per iter

    For every prefix of the classifier's iterations (1, 2, ..., n_iters), a
    cell counts as a doublet when the fraction of those iterations whose log
    p-value is at or below ``log(p_thresh)`` reaches ``voter_thresh``.
    Iterations with a non-finite log p-value for a cell are excluded from
    that cell's average.

    Args:
        clf (BoostClassifier object): Fitted classifier
        show (bool, optional): If True, runs plt.show()
        save (str or path-like, optional): filename for saved figure,
            figure not saved by default. The figure is written as a PDF.
        p_thresh (float, optional): hypergeometric test p-value threshold
            that determines per iteration doublet calls
        voter_thresh (float, optional): fraction of iterations a cell must
            be called a doublet

    Returns:
        matplotlib figure
    """
    log_p_thresh = np.log(p_thresh)
    doubs_per_run = []
    # Ignore numpy complaining about np.nan comparisons
    with np.errstate(invalid="ignore"):
        # The per-iteration votes do not depend on the prefix length, so mask
        # and threshold once and only slice inside the loop.
        votes = np.ma.masked_invalid(clf.all_log_p_values_) <= log_p_thresh
        for i in range(clf.n_iters):
            cum_vote_average = np.mean(votes[: i + 1], axis=0)
            # Cells with no valid iteration so far are masked; turn them into
            # NaN so that nansum leaves them out of the count.
            cum_doublets = np.ma.filled(
                (cum_vote_average >= voter_thresh).astype(float), np.nan
            )
            doubs_per_run.append(np.nansum(cum_doublets))

    # Ignore matplotlib's tight_layout warning
    with warnings.catch_warnings():
        warnings.filterwarnings(
            action="ignore", module="matplotlib", message="^tight_layout"
        )

        f, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=150)
        # Entry k of doubs_per_run uses k + 1 iterations, so the x axis
        # starts at 1 to match its "Number of Iterations" label.
        ax.plot(np.arange(1, len(doubs_per_run) + 1), doubs_per_run)
        ax.set_xlabel("Number of Iterations")
        ax.set_ylabel("Number of Predicted Doublets")
        ax.set_title("Predicted Doublets per Iteration")

        if show is True:
            plt.show()
        # Accept pathlib-style paths as well as plain strings
        if isinstance(save, (str, os.PathLike)):
            f.savefig(save, format="pdf", bbox_inches="tight")

    return f


def umap_plot(
    raw_counts,
    labels,
    n_components=30,
    show=False,
    save=None,
    normalizer=normalize_counts,
    random_state=None,
):
    """Produce a umap plot of the data with doublets in black.

        Count matrix is normalized and dimension reduced before plotting.

    Args:
        raw_counts (array-like): Count matrix, oriented cells by genes.
            Sparse matrices are densified (with a warning).
        labels (array-like): predicted doublets from predict method; entries
            equal to 1 are drawn as doublets.
        n_components (int, optional): number of PCs to use prior to UMAP
        show (bool, optional): If True, runs plt.show()
        save (str or path-like, optional): filename for saved figure,
            figure not saved by default. The figure is written as a PDF.
        normalizer ((ndarray) -> ndarray, optional): Method to normalize
            raw_counts. Defaults to normalize_counts, included in this package.
            Note: To use normalize_counts with its pseudocount parameter changed
            from the default 0.1 value to some positive float `new_var`, use:
            normalizer=lambda counts: doubletdetection.normalize_counts(counts,
            pseudocount=new_var)
        random_state (int, optional): If provided, passed to PCA and UMAP

    Returns:
        matplotlib figure
        ndarray: umap reduction
    """
    # Local import: scipy is already required by scikit-learn.
    from scipy.sparse import issparse

    # Detect sparse input explicitly instead of catching TypeError from
    # check_array: that handler would also swallow unrelated TypeErrors (for
    # example an unsupported keyword on another scikit-learn version) and
    # then fail on .toarray().
    if issparse(raw_counts):
        warnings.warn("Sparse raw_counts is automatically densified.")
        raw_counts = raw_counts.toarray()
    else:
        # check_array's defaults already reject non-finite and non-2D input,
        # so the version-dependent force_all_finite keyword is not needed.
        raw_counts = check_array(raw_counts)

    norm_counts = normalizer(raw_counts)
    reduced_counts = PCA(
        n_components=n_components, svd_solver="randomized", random_state=random_state
    ).fit_transform(norm_counts)
    umap_dr = umap.UMAP(random_state=random_state, min_dist=0.5).fit_transform(
        reduced_counts
    )
    # Ensure only looking at positively identified doublets. Coercing to an
    # array keeps the comparison element-wise for list input.
    doublets = np.asarray(labels) == 1
    n_doublets = np.sum(doublets)
    n_cells = raw_counts.shape[0]

    fig, axes = plt.subplots(1, 1, figsize=(4, 4), dpi=150)
    # All cells are drawn in grey first; doublets are over-plotted in black.
    # No cmap is passed: with a literal colour it would be ignored.
    axes.scatter(
        umap_dr[:, 0],
        umap_dr[:, 1],
        c="grey",
        s=1,
        label="predicted singlets",
    )
    axes.scatter(
        umap_dr[:, 0][doublets],
        umap_dr[:, 1][doublets],
        s=3,
        c="black",
        label="predicted doublets",
    )
    axes.axis("off")
    axes.legend(frameon=False)
    axes.set_title(
        "{} doublets out of {} cells\n {}% cross-type doublet rate".format(
            n_doublets,
            n_cells,
            np.round(100 * n_doublets / n_cells, 2),
        )
    )

    if show is True:
        plt.show()
    # Accept pathlib-style paths as well as plain strings
    if isinstance(save, (str, os.PathLike)):
        fig.savefig(save, format="pdf", bbox_inches="tight")

    return fig, umap_dr


def threshold(
    clf,
    show=False,
    save=None,
    log10=True,
    log_p_grid=None,
    voter_grid=None,
    v_step=2,
    p_step=5,
):
    """Produce a plot showing number of cells called doublet across
       various thresholds

    For every pair of (voting threshold, log p-value threshold) the number
    of cells that would be called doublets is counted and shown as a heatmap
    with voting thresholds on the y axis and log p-value thresholds on the
    x axis.

    Args:
        clf (BoostClassifier object): Fitted classifier
        show (bool, optional): If True, runs plt.show()
        save (str, optional): If provided, the figure is saved to this
            filepath. The figure is written as a PDF.
        log10 (bool, optional): Use log 10 if true, natural log if false.
        log_p_grid (ndarray, optional): log p-value thresholds to use.
            Defaults to np.arange(-100, -1). log base decided by log10
        voter_grid (ndarray, optional): Voting thresholds to use. Defaults to
            np.arange(0.3, 1.0, 0.05).
        p_step (int, optional): number of xlabels to skip in plot
        v_step (int, optional): number of ylabels to skip in plot


    Returns:
        matplotlib figure
    """
    # Ignore numpy complaining about np.nan comparisons
    with np.errstate(invalid="ignore"):
        # Work on a float copy so the classifier's stored values are untouched
        all_log_p_values_ = np.array(clf.all_log_p_values_, dtype=float)
        if log10:
            # Convert natural-log p-values to base 10
            all_log_p_values_ /= np.log(10)
        if log_p_grid is None:
            log_p_grid = np.arange(-100, -1)
        if voter_grid is None:
            voter_grid = np.arange(0.3, 1.0, 0.05)

        # Mask non-finite values once; they are excluded from voting averages
        masked_log_p = np.ma.masked_invalid(all_log_p_values_)
        doubs_per_t = np.zeros((len(voter_grid), len(log_p_grid)))
        for j, log_p_cutoff in enumerate(log_p_grid):
            # The voting average depends only on the p-value cutoff, so it is
            # computed once per column rather than once per grid cell.
            voting_average = np.mean(masked_log_p <= log_p_cutoff, axis=0)
            for i, voter_cutoff in enumerate(voter_grid):
                # Cells with no valid iteration are masked; turn them into
                # NaN so that nansum leaves them out of the count.
                labels = np.ma.filled(
                    (voting_average >= voter_cutoff).astype(float), np.nan
                )
                doubs_per_t[i, j] = np.nansum(labels)

    # Ignore matplotlib's tight_layout warning
    with warnings.catch_warnings():
        warnings.filterwarnings(
            action="ignore", module="matplotlib", message="^tight_layout"
        )

        f, ax = plt.subplots(1, 1, figsize=(4, 4), dpi=150)
        cax = ax.imshow(doubs_per_t, cmap="hot", aspect="auto")
        ax.set_xticks(np.arange(len(log_p_grid))[::p_step])
        ax.set_xticklabels(np.around(log_p_grid, 1)[::p_step], rotation="vertical")
        ax.set_yticks(np.arange(len(voter_grid))[::v_step])
        ax.set_yticklabels(np.around(voter_grid, 2)[::v_step])
        cbar = f.colorbar(cax)
        cbar.set_label("Predicted Doublets")
        # Use the same truthiness test as the conversion above so the axis
        # label always matches the log base actually applied to the data.
        if log10:
            ax.set_xlabel("Log10 p-value")
        else:
            ax.set_xlabel("Log p-value")
        ax.set_ylabel("Voting Threshold")
        ax.set_title("Threshold Diagnostics")

    if show is True:
        plt.show()
    if save:
        f.savefig(save, format="pdf", bbox_inches="tight")

    return f