Source code for scvi.dataset.dataset10X

import logging
import os
import pickle
import shutil
import tarfile
from typing import Tuple

import numpy as np
import pandas as pd
import scipy.io as sp_io
from scipy.sparse import csr_matrix

from scvi.dataset.dataset import CellMeasurement, DownloadableDataset, _download

logger = logging.getLogger(__name__)

available_datasets = {
    "1.1.0": [
        "frozen_pbmc_donor_a",
        "frozen_pbmc_donor_b",
        "frozen_pbmc_donor_c",
        "fresh_68k_pbmc_donor_a",
        "cd14_monocytes",
        "b_cells",
        "cd34",
        "cd56_nk",
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
    ],
    "2.1.0": ["pbmc8k", "pbmc4k", "t_3k", "t_4k", "neuron_9k"],
    "3.0.0": [
        "pbmc_1k_protein_v3",
        "pbmc_10k_protein_v3",
        "malt_10k_protein_v3",
        "pbmc_1k_v2",
        "pbmc_1k_v3",
        "pbmc_10k_v3",
        "hgmm_1k_v2",
        "hgmm_1k_v3",
        "hgmm_5k_v3",
        "hgmm_10k_v3",
        "neuron_1k_v2",
        "neuron_1k_v3",
        "neuron_10k_v3",
        "heart_1k_v2",
        "heart_1k_v3",
        "heart_10k_v3",
    ],
    "3.1.0": ["5k_pbmc_protein_v3", "5k_pbmc_protein_v3_nextgem"],
}

dataset_to_group = dict(
    [
        (dataset_name, group)
        for group, list_datasets in available_datasets.items()
        for dataset_name in list_datasets
    ]
)

group_to_url_skeleton = {
    "1.1.0": "http://cf.10xgenomics.com/samples/cell-exp/{}/{}/{}_{}_gene_bc_matrices.tar.gz",
    "2.1.0": "http://cf.10xgenomics.com/samples/cell-exp/{}/{}/{}_{}_gene_bc_matrices.tar.gz",
    "3.0.0": "http://cf.10xgenomics.com/samples/cell-exp/{}/{}/{}_{}_feature_bc_matrix.tar.gz",
    "3.1.0": "http://cf.10xgenomics.com/samples/cell-exp/{}/{}/{}_{}_feature_bc_matrix.tar.gz",
}

group_to_filename_skeleton = {
    "1.1.0": "{}_gene_bc_matrices.tar.gz",
    "2.1.0": "{}_gene_bc_matrices.tar.gz",
    "3.0.0": "{}_feature_bc_matrix.tar.gz",
    "3.1.0": "{}_feature_bc_matrix.tar.gz",
}

available_specification = ["filtered", "raw"]
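
# Illustrative sketch (hypothetical helper, not part of the original module):
# how the lookup tables above combine into a download URL. Dataset and type
# values come from ``available_datasets`` and ``available_specification``.
def _example_build_url(dataset_name: str = "pbmc8k", type: str = "filtered") -> str:
    group = dataset_to_group[dataset_name]  # "pbmc8k" -> "2.1.0"
    # expands to:
    # http://cf.10xgenomics.com/samples/cell-exp/2.1.0/pbmc8k/pbmc8k_filtered_gene_bc_matrices.tar.gz
    return group_to_url_skeleton[group].format(group, dataset_name, dataset_name, type)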


class Dataset10X(DownloadableDataset):
    """Loads a file from the `10x <http://cf.10xgenomics.com/>`_ website.

    Parameters
    ----------
    dataset_name :
        Name of the dataset file. Has to be one of:
        "frozen_pbmc_donor_a", "frozen_pbmc_donor_b", "frozen_pbmc_donor_c",
        "fresh_68k_pbmc_donor_a", "cd14_monocytes", "b_cells", "cd34", "cd56_nk",
        "cd4_t_helper", "regulatory_t", "naive_t", "memory_t", "cytotoxic_t",
        "naive_cytotoxic", "pbmc8k", "pbmc4k", "t_3k", "t_4k", "neuron_9k",
        "pbmc_1k_protein_v3", "pbmc_10k_protein_v3", "malt_10k_protein_v3",
        "pbmc_1k_v2", "pbmc_1k_v3", "pbmc_10k_v3", "hgmm_1k_v2", "hgmm_1k_v3",
        "hgmm_5k_v3", "hgmm_10k_v3", "neuron_1k_v2", "neuron_1k_v3",
        "neuron_10k_v3", "heart_1k_v2", "heart_1k_v3", "heart_10k_v3".
    filename :
        Manual override of the filename to write to.
    save_path :
        Location to use when saving/loading the data.
    url :
        Manual override of the download remote location.
        Note that we already provide urls for most 10X datasets,
        which are automatically formed only using the ``dataset_name``.
    type :
        Either ``filtered`` data or ``raw`` data.
    dense :
        Whether to load as dense or sparse.
        If False, data is cast to sparse using ``scipy.sparse.csr_matrix``.
    measurement_names_column :
        Column in which to find measurement names in the corresponding ``.tsv`` file.
    remove_extracted_data :
        Whether to remove extracted archives after populating the dataset.
    delayed_populating :
        Switch for the delayed populating mechanism.

    Examples
    --------
    >>> tenX_dataset = Dataset10X("neuron_9k")
    """

    def __init__(
        self,
        dataset_name: str = None,
        filename: str = None,
        save_path: str = "data/10X",
        url: str = None,
        type: str = "filtered",
        dense: bool = False,
        measurement_names_column: int = 1,
        remove_extracted_data: bool = False,
        delayed_populating: bool = False,
    ):
        self.barcodes = None
        self.dense = dense
        self.measurement_names_column = measurement_names_column
        self.remove_extracted_data = remove_extracted_data

        # form data url and filename unless manually overridden
        if dataset_name is not None:
            if url is not None:
                logger.warning("dataset_name provided, manual url is disregarded.")
            if filename is not None:
                logger.warning("dataset_name provided, manual filename is disregarded.")
            group = dataset_to_group[dataset_name]
            url_skeleton = group_to_url_skeleton[group]
            url = url_skeleton.format(group, dataset_name, dataset_name, type)
            filename_skeleton = group_to_filename_skeleton[group]
            filename = filename_skeleton.format(type)
            save_path = os.path.join(save_path, dataset_name)
        elif filename is not None and url is not None:
            logger.debug("Loading 10X dataset with custom url and filename")
        elif filename is not None and url is None:
            logger.debug("Loading local 10X dataset with custom filename")
        else:
            logger.debug("Loading extracted local 10X dataset")
        super().__init__(
            urls=url,
            filenames=filename,
            save_path=save_path,
            delayed_populating=delayed_populating,
        )
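
    # Illustrative usage of the three loading modes handled by ``__init__`` above
    # (the filenames and url below are hypothetical placeholders):
    #   Dataset10X("pbmc4k")                                       # known 10X dataset, url auto-formed
    #   Dataset10X(filename="archive.tar.gz", url="http://...")    # custom remote archive
    #   Dataset10X(filename="archive.tar.gz", save_path="data/")   # local archive already on disk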

    def populate(self):
        logger.info("Preprocessing dataset")

        was_extracted = False
        if len(self.filenames) > 0:
            file_path = os.path.join(self.save_path, self.filenames[0])
            if not os.path.exists(file_path[:-7]):  # nothing extracted yet
                if tarfile.is_tarfile(file_path):
                    logger.info("Extracting tar file")
                    tar = tarfile.open(file_path, "r:gz")
                    tar.extractall(path=self.save_path)
                    was_extracted = True
                    tar.close()

        # get exact path of the extract, for robustness to changes in the 10X storage logic
        path_to_data, suffix = self.find_path_to_data()

        # get filenames, according to 10X storage logic
        measurements_filename = "genes.tsv" if suffix == "" else "features.tsv.gz"
        barcode_filename = "barcodes.tsv" + suffix
        matrix_filename = "matrix.mtx" + suffix

        expression_data = sp_io.mmread(os.path.join(path_to_data, matrix_filename)).T
        if self.dense:
            expression_data = expression_data.A
        else:
            expression_data = csr_matrix(expression_data)

        # group measurements by type (e.g. gene, protein)
        # in case there are multiple measurements (e.g. protein),
        # they are indicated in the third column
        gene_expression_data = expression_data
        measurements_info = pd.read_csv(
            os.path.join(path_to_data, measurements_filename), sep="\t", header=None
        )
        Ys = None
        if measurements_info.shape[1] < 3:
            gene_names = measurements_info[self.measurement_names_column].astype(str)
        else:
            gene_names = None
            for measurement_type in np.unique(measurements_info[2]):
                # .values required to work with sparse matrices
                measurement_mask = (measurements_info[2] == measurement_type).values
                measurement_data = expression_data[:, measurement_mask]
                measurement_names = measurements_info[self.measurement_names_column][
                    measurement_mask
                ].astype(str)
                if measurement_type == "Gene Expression":
                    gene_expression_data = measurement_data
                    gene_names = measurement_names
                else:
                    Ys = [] if Ys is None else Ys
                    if measurement_type == "Antibody Capture":
                        measurement_type = "protein_expression"
                        columns_attr_name = "protein_names"
                        # protein counts do not have many zeros so always make dense
                        if self.dense is not True:
                            measurement_data = measurement_data.A
                    else:
                        measurement_type = measurement_type.lower().replace(" ", "_")
                        columns_attr_name = measurement_type + "_names"
                    measurement = CellMeasurement(
                        name=measurement_type,
                        data=measurement_data,
                        columns_attr_name=columns_attr_name,
                        columns=measurement_names,
                    )
                    Ys.append(measurement)
            if gene_names is None:
                raise ValueError(
                    "When loading measurements, no 'Gene Expression' category was found."
                )

        batch_indices, cell_attributes_dict = None, None
        if os.path.exists(os.path.join(path_to_data, barcode_filename)):
            barcodes = pd.read_csv(
                os.path.join(path_to_data, barcode_filename), sep="\t", header=None
            )
            cell_attributes_dict = {
                "barcodes": np.squeeze(np.asarray(barcodes, dtype=str))
            }
            # As of 07/01, 10X barcodes have format "%s-%d"
            # where the digit is a batch index starting at 1
            batch_indices = np.asarray(
                [barcode.split("-")[-1] for barcode in cell_attributes_dict["barcodes"]]
            )
            batch_indices = batch_indices.astype(np.int64) - 1

        logger.info("Finished preprocessing dataset")

        self.populate_from_data(
            X=gene_expression_data,
            batch_indices=batch_indices,
            gene_names=gene_names,
            cell_attributes_dict=cell_attributes_dict,
            Ys=Ys,
        )
        self.filter_cells_by_count()

        # cleanup if required
        if was_extracted and self.remove_extracted_data:
            logger.info("Removing extracted data at {}".format(file_path[:-7]))
            shutil.rmtree(file_path[:-7])
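
    # Minimal sketch of the barcode parsing done in ``populate`` above: the
    # trailing "-%d" of a 10X barcode is a 1-based GEM well index, converted
    # to a 0-based batch index, e.g.:
    #   >>> bcs = ["AAACCTGAGAAACCAT-1", "AAACCTGCAGTAACGG-2"]
    #   >>> np.asarray([b.split("-")[-1] for b in bcs]).astype(np.int64) - 1
    #   array([0, 1])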

    def find_path_to_data(self) -> Tuple[str, str]:
        """Returns the exact path for the data in the archive.

        This is required because 10X doesn't have a consistent way of storing their data.
        Additionally, the function returns whether the data is stored in compressed format.

        Returns
        -------
        The path in which the files are contained, and their suffix if compressed.
        """
        for root, subdirs, files in os.walk(self.save_path):
            # do not consider hidden files
            files = [f for f in files if not f[0] == "."]
            contains_mat = [
                filename == "matrix.mtx" or filename == "matrix.mtx.gz"
                for filename in files
            ]
            contains_mat = np.asarray(contains_mat).any()
            if contains_mat:
                # check the matrix file itself for compression,
                # not an arbitrary file in the directory
                suffix = ".gz" if "matrix.mtx.gz" in files else ""
                return root, suffix
        raise FileNotFoundError(
            "No matrix.mtx(.gz) found in path (%s)." % self.save_path
        )
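
# For reference, the two extracted layouts ``find_path_to_data`` has to handle
# (an assumption based on the filename conventions used in ``populate``):
#   CellRanger v2 (groups "1.1.0"/"2.1.0"):
#       .../gene_bc_matrices/<genome>/{matrix.mtx, genes.tsv, barcodes.tsv}
#   CellRanger v3 (groups "3.0.0"/"3.1.0"):
#       .../feature_bc_matrix/{matrix.mtx.gz, features.tsv.gz, barcodes.tsv.gz}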

class BrainSmallDataset(Dataset10X):
    """This dataset consists of 9,128 mouse brain cells profiled using `10x Genomics`.

    It is used as a complement to PBMC for our study of zero abundance
    and quality control metrics correlation with our generative posterior parameters.
    We derived quality control metrics using the cellrangerRkit R package (v.1.1.0).
    Quality metrics were extracted from CellRanger through the molecule-specific information file.
    We kept the top 3000 genes by variance.
    We used the clusters provided by CellRanger for the correlation analysis of zero probabilities.

    Examples
    --------
    >>> gene_dataset = BrainSmallDataset()
    """

    def __init__(
        self,
        save_path: str = "data/",
        save_path_10X: str = "data/",
        delayed_populating: bool = False,
        remove_extracted_data: bool = False,
    ):
        super().__init__(
            dataset_name="neuron_9k",
            save_path=save_path_10X,
            remove_extracted_data=remove_extracted_data,
            delayed_populating=delayed_populating,
        )
        _download(
            filename="brain_small_metadata.pickle",
            url="https://github.com/YosefLab/scVI-data/raw/master/brain_small_metadata.pickle",
            save_path=save_path,
        )
        with open(os.path.join(save_path, "brain_small_metadata.pickle"), "rb") as f:
            metadata = pickle.load(f)
        self.labels = metadata["clusters"].loc[self.barcodes] - 1
        raw_qc = metadata["raw_qc"].loc[self.barcodes]
        self.initialize_cell_attribute("raw_qc", raw_qc)
        self.initialize_cell_attribute("qc", raw_qc.values)
        self.qc_names = raw_qc.columns
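
# Example access to the metadata attached above (attribute names from the
# ``__init__`` body; constructing the dataset triggers the downloads):
#   dataset = BrainSmallDataset()
#   dataset.labels    # CellRanger cluster assignments, shifted to start at 0
#   dataset.qc_names  # quality-control metric names from cellrangerRkit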