# Source code for scvi.dataset.cite_seq

import logging
import os
from collections import namedtuple

import numpy as np
import pandas as pd

from scvi.dataset.dataset import CellMeasurement, DownloadableDataset

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps the short dataset name accepted by ``CiteSeqDataset`` to the sample
# identifier that appears in the GSE100866 supplementary file names.
available_datasets = dict(
    cbmc="CBMC_8K_13AB_10X",
    pbmc="PBMC_vs_flow_10X",
    cd8="CD8_merged",
)

# Local file names of the three tables used by a dataset: RNA UMI counts,
# raw ADT counts and the CLR-transformed ADT counts.
CiteSeqFilenames = namedtuple("CiteSeqFilenames", ["rna", "adt", "adt_centered"])


class CiteSeqDataset(DownloadableDataset):
    """Allows to form 3 different CiteSeq datasets.

    Three gzipped CSV tables from GEO series GSE100866 are registered with the
    parent ``DownloadableDataset``: RNA UMI counts, raw ADT (protein) counts and
    CLR-transformed ADT counts. ``populate`` turns them into a gene expression
    matrix restricted to human genes plus two protein measurements.

    Note that their centered log ratio transformation for ADT counts is different from
    the standard clr transformation: they explain they add pseudocounts (for 0 values),
    but do not make the actual transformation explicit.
    It doesn't seem to be simply adding count 1 to all entries, or only 0 entries.

    Parameters
    ----------
    name
        Name of the CiteSeq dataset to load. Either "cbmc", "pbmc" or "cd8".
    save_path
        Location to use when saving/loading the data.
    delayed_populating
        Switch for delayed populating mechanism.

    Raises
    ------
    KeyError
        If ``name`` is not one of the keys of ``available_datasets``.

    """

    def __init__(
        self,
        name: str = "cbmc",
        save_path: str = "data/citeSeq/",
        delayed_populating: bool = False,
    ):
        try:
            geo_sample = available_datasets[name]
        except KeyError:
            # Same exception type as the plain lookup, but tell the caller
            # which names are actually supported.
            raise KeyError(
                "Unknown CiteSeq dataset {!r}; available datasets are: {}".format(
                    name, ", ".join(sorted(available_datasets))
                )
            ) from None

        # Local file names are keyed by the short dataset name.
        filenames = CiteSeqFilenames(
            rna="%s_rna.csv.gz" % name,
            adt="%s_adt.csv.gz" % name,
            adt_centered="%s_adt_centered.csv.gz" % name,
        )

        # All three remote files share this prefix and differ only by suffix.
        url_prefix = (
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100866/suppl/"
            "GSE100866_%s" % geo_sample
        )
        super().__init__(
            # Order matches the fields of ``filenames``: rna, adt, adt_centered.
            urls=[
                url_prefix + "-RNA_umi.csv.gz",
                url_prefix + "-ADT_umi.csv.gz",
                url_prefix + "-ADT_clr-transformed.csv.gz",
            ],
            filenames=filenames,
            save_path=os.path.join(save_path, name),
            delayed_populating=delayed_populating,
        )

    def _read_citeseq_csv(self, filename: str) -> pd.DataFrame:
        """Read one gzipped CSV from ``self.save_path``, first column as index."""
        return pd.read_csv(
            os.path.join(self.save_path, filename),
            index_col=0,
            compression="gzip",
        )

    def populate(self):
        """Load the three tables and populate the dataset.

        Reads the RNA, raw ADT and centered ADT tables, keeps only the genes
        whose name starts with ``"HUMAN"``, strips everything up to the last
        underscore from the kept gene names, hands the result to
        ``populate_from_data`` and finally calls ``filter_cells_by_count``.

        Raises
        ------
        ValueError
            If the raw and centered ADT tables do not list the same protein
            names in the same order.
        """
        logger.info("Preprocessing data")
        # The table index becomes the columns after transposition; those
        # columns are used as gene names below.
        self.expression = self._read_citeseq_csv(self.filenames.rna).T

        # process protein measurements
        adt = self._read_citeseq_csv(self.filenames.adt)
        # Builtin ``str`` replaces the ``np.str`` alias removed from NumPy.
        protein_names = np.asarray(adt.index).astype(str)
        protein_measurement = CellMeasurement(
            name="protein_expression",
            data=adt.T.values,
            columns_attr_name="protein_names",
            columns=protein_names,
        )

        adt_centered = self._read_citeseq_csv(self.filenames.adt_centered)
        centered_names = np.asarray(adt_centered.index).astype(str)
        if not np.array_equal(centered_names, protein_names):
            raise ValueError(
                "Protein names are not the same for raw and centered counts."
            )
        protein_measurement_centered = CellMeasurement(
            name="protein_expression_clr",
            data=adt_centered.T.values,
            columns_attr_name="protein_names_clr",
            columns=protein_names,
        )

        # keep only human genes (there are also mouse genes)
        gene_names = np.asarray(self.expression.columns, dtype=str)
        human_filter = np.asarray(
            [gene.startswith("HUMAN") for gene in gene_names], dtype=bool
        )
        logger.info(
            "Selecting only HUMAN genes (%d / %d)",
            human_filter.sum(),
            len(human_filter),
        )
        X = self.expression.values[:, human_filter]
        # Keep the part after the last underscore; a name without any
        # underscore is returned unchanged by ``split``.
        gene_names = np.asarray(
            [gene.split("_")[-1] for gene in gene_names[human_filter]],
            dtype=str,
        )

        logger.info("Finish preprocessing data")

        self.populate_from_data(
            X=X,
            gene_names=gene_names,
            Ys=[protein_measurement, protein_measurement_centered],
        )

        self.filter_cells_by_count()


class CbmcDataset(CiteSeqDataset):
    """Loads cbmc dataset.

    This dataset includes 8,617 cord blood mononuclear cells profiled using 10x along
    with, for each cell, 13 well-characterized mononuclear antibodies. We kept the top
    600 genes by variance.

    NOTE(review): no variance-based gene selection is performed in this class; it
    only forwards ``name="cbmc"`` to ``CiteSeqDataset``. Confirm whether the
    "top 600 genes" statement still applies.

    Parameters
    ----------
    save_path
        Save path of raw data file.
    delayed_populating
        Switch for delayed populating mechanism.

    Examples
    --------
    >>> gene_dataset = CbmcDataset()
    """

    def __init__(
        self, save_path: str = "data/citeSeq/", delayed_populating: bool = False
    ):
        # Thin convenience wrapper: fixes the dataset name to "cbmc".
        super().__init__(
            name="cbmc", save_path=save_path, delayed_populating=delayed_populating
        )