import logging
import os
from typing import List, Optional
import numpy as np
from scvi.dataset.dataset import DownloadableDataset
logger = logging.getLogger(__name__)
class LoomDataset(DownloadableDataset):
    """Loads a potentially remote `.loom` file.

    Parameters
    ----------
    filename
        File name to use when saving/loading the data.
    save_path
        Location to use when saving/loading the data.
    url
        URL pointing to the data which will be downloaded
        if it's not already in ``save_path``.
    batch_indices_attribute_name
        Name of the attribute containing batch indices.
    labels_attribute_name
        Name of the attribute containing labels.
    encode_labels_name_into_int
        If ``True`` and both labels and cell types are found in the file,
        every label is replaced by the position of the matching entry in
        the cell types.
    gene_names_attribute_name
        Name of the attribute containing gene names.
    cell_types_attribute_name
        Name of the attribute containing cell types.
    delayed_populating
        Switch for delayed populating mechanism.

    Examples
    --------
    >>> # Loading a remote dataset
    >>> remote_loom_dataset = LoomDataset("osmFISH_SScortex_mouse_all_cell.loom", save_path='data/',
    ...     url='http://linnarssonlab.org/osmFISH/osmFISH_SScortex_mouse_all_cells.loom')
    >>> # Loading a local dataset
    >>> local_loom_dataset = LoomDataset("osmFISH_SScortex_mouse_all_cell.loom", save_path='data/')
    """

    def __init__(
        self,
        filename: str,
        save_path: str = "data/",
        url: Optional[str] = None,
        batch_indices_attribute_name: str = "BatchID",
        labels_attribute_name: str = "ClusterID",
        encode_labels_name_into_int: bool = False,
        gene_names_attribute_name: str = "Gene",
        cell_types_attribute_name: str = "CellTypes",
        delayed_populating: bool = False,
    ):
        # These are read by ``populate``, so they are stored before the parent
        # constructor runs (presumably it may trigger ``populate`` when
        # ``delayed_populating`` is False -- confirm against the base class).
        self.batch_indices_attribute_name = batch_indices_attribute_name
        self.labels_attribute_name = labels_attribute_name
        self.encode_labels_name_into_int = encode_labels_name_into_int
        self.gene_names_attribute_name = gene_names_attribute_name
        self.cell_types_attribute_name = cell_types_attribute_name
        self.global_attributes_dict = None
        super().__init__(
            urls=url,
            filenames=filename,
            save_path=save_path,
            delayed_populating=delayed_populating,
        )

    def populate(self):
        """Read the loom file and forward its content to ``populate_from_data``.

        Cells (columns of the loom matrix) whose total expression is zero are
        dropped, together with the matching entries of every column attribute.
        Row, column and global attributes that are not one of the configured
        gene-name / batch / label / cell-type attributes are collected into
        dictionaries; global ones are stored on ``self.global_attributes_dict``.

        Raises
        ------
        ImportError
            If the ``loompy`` package is not installed.
        """
        logger.info("Preprocessing dataset")

        try:
            import loompy
        except ImportError as err:
            raise ImportError(
                "Please install loompy package via `pip install --user loompy`"
            ) from err

        gene_names = None
        labels = None
        batch_indices = None
        cell_types = None
        cell_attributes_dict = {}
        gene_attributes_dict = {}
        global_attributes_dict = {}

        ds = loompy.connect(os.path.join(self.save_path, self.filenames[0]))
        try:
            # Take out cells that don't express any gene
            select = ds[:, :].sum(axis=0) > 0
            if not select.all():
                logger.warning("Removing non-expressing cells")

            # Row (gene) attributes are kept whole: only cells are filtered.
            for row_attribute_name in ds.ra:
                if row_attribute_name == self.gene_names_attribute_name:
                    gene_names = ds.ra[row_attribute_name]
                else:
                    gene_attributes_dict[row_attribute_name] = ds.ra[
                        row_attribute_name
                    ]

            # Column (cell) attributes are restricted to the retained cells.
            for column_attribute_name in ds.ca:
                values = ds.ca[column_attribute_name][select]
                if column_attribute_name == self.batch_indices_attribute_name:
                    batch_indices = values
                elif column_attribute_name == self.labels_attribute_name:
                    labels = values
                else:
                    cell_attributes_dict[column_attribute_name] = values

            for global_attribute_name in ds.attrs:
                if global_attribute_name == self.cell_types_attribute_name:
                    cell_types = ds.attrs[global_attribute_name]
                else:
                    global_attributes_dict[global_attribute_name] = ds.attrs[
                        global_attribute_name
                    ]

            data = ds[:, select].T  # change matrix to cells by genes
        finally:
            # Release the file handle even when reading fails part-way.
            ds.close()

        if global_attributes_dict:
            self.global_attributes_dict = global_attributes_dict

        if (
            self.encode_labels_name_into_int
            and labels is not None
            and cell_types is not None
        ):
            # Replace each label by the position of its name in ``cell_types``.
            mapping = {name: index for index, name in enumerate(cell_types)}
            mapper = np.vectorize(lambda x: mapping[x])
            labels = mapper(labels)

        logger.info("Finished preprocessing dataset")
        self.populate_from_data(
            X=data,
            batch_indices=batch_indices,
            labels=labels,
            gene_names=gene_names,
            cell_types=cell_types,
            # ``None`` (not an empty dict) is passed when nothing was collected.
            cell_attributes_dict=cell_attributes_dict or None,
            gene_attributes_dict=gene_attributes_dict or None,
        )
class RetinaDataset(LoomDataset):
    """Loads retina dataset.

    After the authors' original filtering pipeline, the bipolar-cell dataset
    holds 27,499 cells and 13,166 genes coming from two batches. The cluster
    annotation into 15 cell types provided by the authors is used. Their
    Combat-normalized data is also extracted and used for benchmarking.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    delayed_populating
        Switch for delayed populating mechanism.

    Examples
    --------
    >>> gene_dataset = RetinaDataset()
    """

    def __init__(self, save_path: str = "data/", delayed_populating: bool = False):
        super().__init__(
            url="https://github.com/YosefLab/scVI-data/raw/master/retina.loom",
            filename="retina.loom",
            delayed_populating=delayed_populating,
            save_path=save_path,
        )
        # The 15 annotated cell-type names, assigned after the parent
        # constructor has run.
        retina_cell_types = [
            "RBC", "MG", "BC5A", "BC7", "BC6",
            "BC5C", "BC1A", "BC3B", "BC1B", "BC2",
            "BC5D", "BC3A", "BC5B", "BC4", "BC8_9",
        ]
        self.cell_types = retina_cell_types
class PreFrontalCortexStarmapDataset(LoomDataset):
    """Loads a starMAP dataset from the mouse pre-frontal cortex (Wang et al., 2018).

    The dataset contains 3,704 cells and 166 genes. Labels are read from the
    ``Clusters`` attribute and encoded as integers.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    delayed_populating
        Switch for delayed populating mechanism.
    """

    def __init__(self, save_path: str = "data/", delayed_populating: bool = False):
        super().__init__(
            url="https://github.com/YosefLab/scVI-data/raw/master/mpfc-starmap.loom",
            filename="mpfc-starmap.loom",
            save_path=save_path,
            delayed_populating=delayed_populating,
            labels_attribute_name="Clusters",
            encode_labels_name_into_int=True,
        )
        # Expose each column of the spatial coordinates as its own cell attribute.
        # NOTE(review): ``self.Spatial_coordinates`` is read right after
        # construction -- presumably it is set while the data is populated;
        # confirm the behaviour when ``delayed_populating=True``.
        for column, attribute_name in enumerate(("x_coord", "y_coord")):
            self.initialize_cell_attribute(
                attribute_name, self.Spatial_coordinates[:, column]
            )
class FrontalCortexDropseqDataset(LoomDataset):
    """Load the cells from the mouse frontal cortex sequenced by the Dropseq technology (Saunders et al., 2018)

    Load the 71639 annotated cells located in the frontal cortex of adult mice
    among the 690,000 cells studied by (Saunders et al., 2018) using the
    Drop-seq method. The gene expression matrix is 71639*7611. Among the 7611
    genes, the user may provide a list of genes to subsample from. If not
    provided, all genes are kept.

    Parameters
    ----------
    save_path
        Location to use when saving/loading the data.
    genes_to_keep
        Optional list of genes to keep; genes not listed are dropped.
    delayed_populating
        Switch for delayed populating mechanism.
    """

    def __init__(
        self,
        save_path: str = "data/",
        genes_to_keep: Optional[List[str]] = None,
        delayed_populating: bool = False,
    ):
        super().__init__(
            url="https://github.com/YosefLab/scVI-data/raw/master/fc-dropseq.loom",
            filename="fc-dropseq.loom",
            save_path=save_path,
            delayed_populating=delayed_populating,
            labels_attribute_name="Clusters",
        )

        if genes_to_keep is not None:
            self.reorder_genes(genes_to_keep, drop_omitted_genes=True)

        # reorder labels such that layers of the cortex are in order
        layer_order = [5, 6, 3, 2, 4, 0, 1, 8, 7, 9, 10, 11, 12, 13]
        ordered_cell_types = self.cell_types[layer_order]
        self.reorder_cell_types(ordered_cell_types)