Source code for scvi.dataset.csv

import logging
import os
from typing import Iterable, Union

import numpy as np
import pandas as pd

from scvi.dataset.dataset import DownloadableDataset

logger = logging.getLogger(__name__)


class CsvDataset(DownloadableDataset):
    """Loads a `.csv` file.

    Parameters
    ----------
    filename
        File name to use when saving/loading the data.
    save_path
        Location to use when saving/loading the data.
    url
        URL pointing to the data which will be downloaded
        if it's not already in ``save_path``.
    new_n_genes
        Number of subsampled genes.
    subset_genes
        List of genes for subsampling.
    compression
        For on-the-fly decompression of on-disk data. If 'infer' and filepath_or_buffer
        is path-like, then detect compression from the following extensions: '.gz', '.bz2', '.zip', or '.xz'
        (otherwise no decompression). If using 'zip', the ZIP file must contain only one data file to be read in.
        Passed as-is to ``pandas.read_csv``.
    sep
        Field delimiter of the expression file, passed as-is to ``pandas.read_csv``.
    gene_by_cell
        Whether the original dataset is genes by cells. If ``True``, the table
        read from disk is transposed so that its columns become the gene names.
    labels_file
        Name of the `.csv` file with labels, looked up in ``save_path``.
        The first row is read as header and the first column as index;
        the remaining values are used as labels.
    batch_ids_file
        Name of the `.csv` file with batch indices.
        File contains two columns. The first holds cell names and second
        holds batch indices - type int. The first row of the file is header.
    delayed_populating
        Forwarded unchanged to the ``DownloadableDataset`` constructor.

    Examples
    --------
    >>> # Loading a remote dataset
    >>> remote_url = ("https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE100866&format=file&file="
    ... "GSE100866%5FCBMC%5F8K%5F13AB%5F10X%2DRNA%5Fumi%2Ecsv%2Egz")
    >>> remote_csv_dataset = CsvDataset("GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz", save_path='data/',
    ... compression="gzip", url=remote_url)
    >>> # Loading a local dataset
    >>> local_csv_dataset = CsvDataset("GSE100866_CBMC_8K_13AB_10X-RNA_umi.csv.gz",
    ... save_path="data/", compression='gzip')
    """

    def __init__(
        self,
        filename: str,
        save_path: str = "data/",
        url: str = None,
        new_n_genes: int = None,
        subset_genes: Iterable[Union[int, str]] = None,
        compression: str = None,
        sep: str = ",",
        gene_by_cell: bool = True,
        labels_file: str = None,
        batch_ids_file: str = None,
        delayed_populating: bool = False,
    ):
        # These attributes are read by ``populate``; they are assigned before
        # the parent constructor is called, as in the original ordering.
        self.compression = compression
        self.sep = sep
        self.gene_by_cell = (
            gene_by_cell  # Whether the original dataset is genes by cells
        )
        self.labels_file = labels_file
        self.batch_ids_file = batch_ids_file
        super().__init__(
            urls=url,
            filenames=filename,
            save_path=save_path,
            delayed_populating=delayed_populating,
        )
        # Gene subsampling is only requested when at least one criterion is given.
        if (new_n_genes is not None) or (subset_genes is not None):
            self.subsample_genes(new_n_genes=new_n_genes, subset_genes=subset_genes)

    def _read_annotation_values(self, annotation_file: str) -> np.ndarray:
        """Read a per-cell annotation `.csv` from ``save_path`` and return its values.

        The first row is treated as header and the first column as index, so
        only the remaining column(s) end up in the returned array.
        """
        table = pd.read_csv(
            os.path.join(self.save_path, annotation_file), header=0, index_col=0
        )
        return table.values

    def populate(self):
        """Read the expression file (and optional annotation files) and populate the dataset."""
        logger.info("Preprocessing dataset")

        # The file is read the same way in both orientations; only a transpose
        # differs, so that columns always hold the gene names afterwards.
        data = pd.read_csv(
            os.path.join(self.save_path, self.filenames[0]),
            sep=self.sep,
            index_col=0,
            compression=self.compression,
        )
        if self.gene_by_cell:
            data = data.T

        gene_names = np.asarray(data.columns, dtype=str)
        labels, cell_types, batch_indices = None, None, None
        if self.labels_file is not None:
            labels = self._read_annotation_values(self.labels_file)
            # Cell types are the distinct values found in the labels file.
            cell_types = np.unique(labels)

        if self.batch_ids_file is not None:
            batch_indices = self._read_annotation_values(self.batch_ids_file)

        data = data.values
        logger.info("Finished preprocessing dataset")

        self.populate_from_data(
            X=data,
            batch_indices=batch_indices,
            labels=labels,
            gene_names=gene_names,
            cell_types=cell_types,
        )
        self.filter_cells_by_count()


class BreastCancerDataset(CsvDataset):
    """``CsvDataset`` preset for the ``Layer2_BC_count_matrix-1.tsv`` file.

    The file is tab-separated and is declared as not being genes by cells
    (``gene_by_cell=False``), so it is read without transposition.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    delayed_populating
        Forwarded unchanged to the ``CsvDataset`` constructor.
    """

    def __init__(self, save_path: str = "data/", delayed_populating: bool = False):
        tsv_name = "Layer2_BC_count_matrix-1.tsv"
        remote_location = (
            "http://www.spatialtranscriptomicsresearch.org/wp-content/"
            "uploads/2016/07/Layer2_BC_count_matrix-1.tsv"
        )
        super().__init__(
            tsv_name,
            save_path=save_path,
            url=remote_location,
            sep="\t",
            gene_by_cell=False,
            delayed_populating=delayed_populating,
        )


class MouseOBDataset(CsvDataset):
    """``CsvDataset`` preset for the ``Rep11_MOB_count_matrix-1.tsv`` file.

    The file is tab-separated and is declared as not being genes by cells
    (``gene_by_cell=False``), so it is read without transposition.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    delayed_populating
        Forwarded unchanged to the ``CsvDataset`` constructor.
    """

    def __init__(self, save_path: str = "data/", delayed_populating: bool = False):
        tsv_name = "Rep11_MOB_count_matrix-1.tsv"
        remote_location = (
            "http://www.spatialtranscriptomicsresearch.org/wp-content/uploads/"
            "2016/07/Rep11_MOB_count_matrix-1.tsv"
        )
        super().__init__(
            tsv_name,
            save_path=save_path,
            url=remote_location,
            sep="\t",
            gene_by_cell=False,
            delayed_populating=delayed_populating,
        )