Source code for scvi.dataset.brain_large

import logging
import os

import h5py
import numpy as np
import scipy.sparse as sp_sparse

from scvi.dataset.dataset import DownloadableDataset

logger = logging.getLogger(__name__)


class BrainLargeDataset(DownloadableDataset):
    """Loads brain-large dataset.

    This dataset contains 1.3 million brain cells from
    `10x Genomics <https://support.10xgenomics.com/single-cell-gene-expression/datasets>`_.
    We randomly shuffle the data to get a 1M subset of cells and order genes by variance to retain first 10,000 and then 720 sampled variable genes.
    This dataset is then sampled multiple times in cells for the runtime and goodness-of-fit analysis.
    We report imputation scores on the 10k cells and 720 genes samples only.

    Parameters
    ----------
    filename
        File name to use when saving/loading the data.
        Defaults to ``"brain_large.h5"`` when ``None``.
    save_path
        Location to use when saving/loading the data.
    sample_size_gene_var
        Number of cells to use to estimate gene variances.
    max_cells_to_keep
        Maximum number of cells to keep. ``None`` keeps every cell of the file;
        values larger than the number of cells in the file are clamped to it.
    nb_genes_to_keep
        Number of genes to keep, ordered by decreasing variance.
    loading_batch_size
        Number of cells to use for each chunk loaded.
    delayed_populating
        Switch for delayed populating mechanism.

    Notes
    -----
    The loader itself does not shuffle: gene variances are estimated on the
    first ``sample_size_gene_var`` cells of the file and the first
    ``max_cells_to_keep`` cells (in file order) are loaded.

    Examples
    --------
    >>> gene_dataset = BrainLargeDataset()
    """

    def __init__(
        self,
        filename: str = None,
        save_path: str = "data/",
        sample_size_gene_var: int = 10000,
        max_cells_to_keep: int = None,
        nb_genes_to_keep: int = 720,
        loading_batch_size: int = 100000,
        delayed_populating: bool = False,
    ):
        # used in populate, should not be moved after the call to super().__init__()
        self.sample_size_gene_var = sample_size_gene_var
        self.max_cells_to_keep = max_cells_to_keep
        self.nb_genes_to_keep = nb_genes_to_keep
        self.loading_batch_size = loading_batch_size
        super().__init__(
            urls=(
                "http://cf.10xgenomics.com/samples/cell-exp/1.3.0/1M_neurons/"
                "1M_neurons_filtered_gene_bc_matrices_h5.h5"
            ),
            filenames=filename if filename is not None else "brain_large.h5",
            save_path=save_path,
            delayed_populating=delayed_populating,
        )

    def populate(self):
        """Read the HDF5 file, select high-variance genes and load the cells.

        The ``mm10`` group is read as a column-compressed (genes x cells)
        matrix, so ``indptr`` delimits cells. The method

        1. estimates per-gene variance on the first ``sample_size_gene_var``
           cells and stores the indices of the ``nb_genes_to_keep`` most
           variable genes (decreasing variance) in ``self.subset_genes``;
        2. loads the first ``self.n_cells_to_keep`` cells in chunks of at most
           ``loading_batch_size`` cells, restricted to those genes;
        3. passes the resulting (cells x genes) sparse matrix to
           ``populate_from_data`` and then calls ``filter_cells_by_count``.

        Raises
        ------
        ValueError
            If there is no cell to load (empty file or non-positive
            ``max_cells_to_keep``).
        """
        logger.info("Preprocessing Brain Large data")
        with h5py.File(os.path.join(self.save_path, self.filenames[0]), "r") as f:
            data = f["mm10"]
            nb_genes, nb_cells = data["shape"]
            nb_genes, nb_cells = int(nb_genes), int(nb_cells)
            # Clamp to what the file actually holds: asking for more cells
            # would otherwise produce empty indptr slices further down.
            self.n_cells_to_keep = (
                min(self.max_cells_to_keep, nb_cells)
                if self.max_cells_to_keep is not None
                else nb_cells
            )
            if self.n_cells_to_keep <= 0:
                raise ValueError(
                    "No cells to load: the file holds {} cells and "
                    "max_cells_to_keep={}".format(nb_cells, self.max_cells_to_keep)
                )
            index_partitioner = data["indptr"][...]

            # Estimate gene variance using a subset of cells.
            # sample_size_gene_var cells need sample_size_gene_var + 1 pointers.
            index_partitioner_gene_var = index_partitioner[
                : (self.sample_size_gene_var + 1)
            ]
            last_index_gene_var_sample = index_partitioner_gene_var[-1]
            gene_var_sample_matrix = sp_sparse.csc_matrix(
                (
                    data["data"][:last_index_gene_var_sample].astype(np.float32),
                    data["indices"][:last_index_gene_var_sample],
                    index_partitioner_gene_var,
                ),
                shape=(nb_genes, len(index_partitioner_gene_var) - 1),
            )
            # Var[x] = E[x^2] - E[x]^2, computed per gene (row) on sparse data.
            mean = gene_var_sample_matrix.mean(axis=1)
            var = gene_var_sample_matrix.multiply(gene_var_sample_matrix).mean(
                axis=1
            ) - np.multiply(mean, mean)
            # argsort is ascending: take the tail and reverse it to get the
            # most variable genes first.
            self.subset_genes = np.squeeze(np.asarray(var)).argsort()[
                -self.nb_genes_to_keep :
            ][::-1]
            del gene_var_sample_matrix, mean, var

            matrix = None
            for batch_start in range(
                0, self.n_cells_to_keep, self.loading_batch_size
            ):
                # The last chunk may be shorter than loading_batch_size so that
                # exactly n_cells_to_keep cells are loaded overall.
                batch_stop = min(
                    batch_start + self.loading_batch_size, self.n_cells_to_keep
                )
                index_partitioner_batch = index_partitioner[
                    batch_start : batch_stop + 1
                ]
                first_index_batch = index_partitioner_batch[0]
                last_index_batch = index_partitioner_batch[-1]
                # Rebase the pointers so they index into the chunk's own arrays.
                index_partitioner_batch = (
                    index_partitioner_batch - first_index_batch
                ).astype(np.int32)
                n_cells_batch = len(index_partitioner_batch) - 1
                data_batch = data["data"][first_index_batch:last_index_batch].astype(
                    np.float32
                )
                indices_batch = data["indices"][
                    first_index_batch:last_index_batch
                ].astype(np.int32)
                # Reading the CSC (genes x cells) triplets as CSR yields the
                # transposed (cells x genes) matrix without any conversion.
                matrix_batch = sp_sparse.csr_matrix(
                    (data_batch, indices_batch, index_partitioner_batch),
                    shape=(n_cells_batch, nb_genes),
                )[:, self.subset_genes]
                # stack on the fly to limit RAM usage
                if matrix is None:
                    matrix = matrix_batch
                else:
                    matrix = sp_sparse.vstack([matrix, matrix_batch])
                logger.info(
                    "loaded %d / %d cells",
                    batch_start + n_cells_batch,
                    self.n_cells_to_keep,
                )
        logger.info("%d cells subsampled", matrix.shape[0])
        logger.info("%d genes subsampled", matrix.shape[1])
        self.populate_from_data(matrix)
        self.filter_cells_by_count()