Source code for scvi.dataset.loom

import logging
import os
from typing import List, Optional

import numpy as np

from scvi.dataset.dataset import DownloadableDataset

logger = logging.getLogger(__name__)


[docs]class LoomDataset(DownloadableDataset): """Loads a potentially remote `.loom` file. Parameters ---------- filename File name to use when saving/loading the data. save_path Location to use when saving/loading the data. url URL pointing to the data which will be downloaded if it's not already in ``save_path``. batch_indices_attribute_name Name of the attribute containing batch indices. labels_attribute_name Name of the attribute containing labels. gene_names_attribute_name Name of the attribute containing gene names. cell_types_attribute_name Name of the attribute containing cell types. delayed_populating Switch for delayed populating mechanism. Examples -------- >>> # Loading a remote dataset >>> remote_loom_dataset = LoomDataset("osmFISH_SScortex_mouse_all_cell.loom", save_path='data/', ... url='http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom') >>> # Loading a local dataset >>> local_loom_dataset = LoomDataset("osmFISH_SScortex_mouse_all_cell.loom", save_path='data/') """ def __init__( self, filename: str, save_path: str = "data/", url: str = None, batch_indices_attribute_name: str = "BatchID", labels_attribute_name: str = "ClusterID", encode_labels_name_into_int: bool = False, gene_names_attribute_name: str = "Gene", cell_types_attribute_name: str = "CellTypes", delayed_populating: bool = False, ): self.batch_indices_attribute_name = batch_indices_attribute_name self.labels_attribute_name = labels_attribute_name self.encode_labels_name_into_int = encode_labels_name_into_int self.gene_names_attribute_name = gene_names_attribute_name self.cell_types_attribute_name = cell_types_attribute_name self.global_attributes_dict = None super().__init__( urls=url, filenames=filename, save_path=save_path, delayed_populating=delayed_populating, )
[docs] def populate(self): logger.info("Preprocessing dataset") ( gene_names, labels, batch_indices, cell_types, cell_attributes_dict, gene_attributes_dict, global_attributes_dict, ) = (None, None, None, None, None, None, None) try: import loompy except ImportError: raise ImportError( "Please install loompy package via `pip install --user loompy" ) ds = loompy.connect(os.path.join(self.save_path, self.filenames[0])) select = ds[:, :].sum(axis=0) > 0 # Take out cells that don't express any gene if not all(select): logger.warning("Removing non-expressing cells") for row_attribute_name in ds.ra: if row_attribute_name == self.gene_names_attribute_name: gene_names = ds.ra[self.gene_names_attribute_name] else: gene_attributes_dict = ( gene_attributes_dict if gene_attributes_dict is not None else {} ) gene_attributes_dict[row_attribute_name] = ds.ra[row_attribute_name] for column_attribute_name in ds.ca: if column_attribute_name == self.batch_indices_attribute_name: batch_indices = ds.ca[self.batch_indices_attribute_name][select] elif column_attribute_name == self.labels_attribute_name: labels = ds.ca[self.labels_attribute_name][select] else: cell_attributes_dict = ( cell_attributes_dict if cell_attributes_dict is not None else {} ) cell_attributes_dict[column_attribute_name] = ds.ca[ column_attribute_name ][select] for global_attribute_name in ds.attrs: if global_attribute_name == self.cell_types_attribute_name: cell_types = ds.attrs[self.cell_types_attribute_name] else: global_attributes_dict = ( global_attributes_dict if global_attributes_dict is not None else {} ) global_attributes_dict[global_attribute_name] = ds.attrs[ global_attribute_name ] if global_attributes_dict is not None: self.global_attributes_dict = global_attributes_dict if ( self.encode_labels_name_into_int and labels is not None and cell_types is not None ): mapping = dict((v, k) for k, v in enumerate(cell_types)) mapper = np.vectorize(lambda x: mapping[x]) labels = mapper(labels) data = ds[:, select].T # change matrix to cells by genes ds.close() logger.info("Finished preprocessing dataset") self.populate_from_data( X=data, batch_indices=batch_indices, labels=labels, gene_names=gene_names, cell_types=cell_types, cell_attributes_dict=cell_attributes_dict, gene_attributes_dict=gene_attributes_dict, )
[docs]class RetinaDataset(LoomDataset): """Loads retina dataset. The dataset of bipolar cells contains after their original pipeline for filtering 27,499 cells and 13,166 genes coming from two batches. We use the cluster annotation from 15 cell-types from the author. We also extract their normalized data with Combat and use it for benchmarking. Examples -------- >>> gene_dataset = RetinaDataset() """ def __init__(self, save_path: str = "data/", delayed_populating: bool = False): super().__init__( filename="retina.loom", save_path=save_path, url="https://github.com/YosefLab/scVI-data/raw/master/retina.loom", delayed_populating=delayed_populating, ) self.cell_types = [ "RBC", "MG", "BC5A", "BC7", "BC6", "BC5C", "BC1A", "BC3B", "BC1B", "BC2", "BC5D", "BC3A", "BC5B", "BC4", "BC8_9", ]
[docs]class PreFrontalCortexStarmapDataset(LoomDataset): """Loads a starMAP dataset of 3,704 cells and 166 genes from the mouse pre-frontal cortex (Wang et al., 2018)""" def __init__(self, save_path: str = "data/", delayed_populating: bool = False): super().__init__( filename="mpfc-starmap.loom", save_path=save_path, url="https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom", labels_attribute_name="Clusters", encode_labels_name_into_int=True, delayed_populating=delayed_populating, ) self.initialize_cell_attribute("x_coord", self.Spatial_coordinates[:, 0]) self.initialize_cell_attribute("y_coord", self.Spatial_coordinates[:, 1])
[docs]class FrontalCortexDropseqDataset(LoomDataset): """"Load the cells from the mouse frontal cortex sequenced by the Dropseq technology (Saunders et al., 2018) Load the 71639 annotated cells located in the frontal cortex of adult mouses among the 690,000 cells studied by (Saunders et al., 2018) using the Drop-seq method. We have a 71639*7611 gene expression matrix Among the 7611 genes, we offer the user to provide a list of genes to subsample from. If not provided, all genes are kept. """ def __init__( self, save_path: str = "data/", genes_to_keep: Optional[List[str]] = None, delayed_populating: bool = False, ): super().__init__( filename="fc-dropseq.loom", save_path=save_path, url="https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom", labels_attribute_name="Clusters", delayed_populating=delayed_populating, ) if genes_to_keep is not None: self.reorder_genes(genes_to_keep, drop_omitted_genes=True) # reorder labels such that layers of the cortex are in order order_labels = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13] self.reorder_cell_types(self.cell_types[order_labels])