Source code for scvi.dataset.cortex

import csv
import logging
import os
from typing import List, Optional

import numpy as np

from scvi.dataset.dataset import DownloadableDataset

logger = logging.getLogger(__name__)


class CortexDataset(DownloadableDataset):
    """Loads cortex dataset.

    The `Mouse Cortex Cells dataset <https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt>`_
    contains 3005 mouse cortex cells and gold-standard labels for seven distinct
    cell types. Each cell type corresponds to a cluster to recover.
    With the default ``total_genes``, the 558 genes with the highest standard
    deviation across cells are retained.

    Parameters
    ----------
    save_path
        Path indicating where to save/load data.
    genes_to_keep
        Gene names to keep. They are always retained, in the given order,
        ahead of any gene selected by variability.
    total_genes
        Total number of genes to keep. When ``genes_to_keep`` holds fewer
        names than this, the remainder is filled with the most variable of the
        other genes. If None and genes_to_keep is empty/None, all genes are
        loaded.
    delayed_populating
        Boolean switch for delayed population mechanism.

    Examples
    --------
    >>> gene_dataset = CortexDataset()
    """

    def __init__(
        self,
        save_path: str = "data/",
        genes_to_keep: Optional[List[str]] = None,
        total_genes: Optional[int] = 558,
        delayed_populating: bool = False,
    ):
        # These attributes are read by ``populate``, so they are assigned
        # before the parent constructor runs.
        # NOTE(review): presumably the parent may call ``populate`` itself
        # when ``delayed_populating`` is False -- confirm in DownloadableDataset.
        self.genes_to_keep = genes_to_keep
        self.total_genes = total_genes
        self.precise_labels = None
        super().__init__(
            urls="https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs"
            "/cortex/expression_mRNA_17-Aug-2014.txt",
            filenames="expression.bin",
            save_path=save_path,
            delayed_populating=delayed_populating,
        )

    def populate(self):
        """Parse the expression file, select genes and populate the dataset.

        The tab-separated file at ``save_path/filenames[0]`` is read row by row:
        row 1 supplies the precise cluster labels and row 8 the cluster labels
        (the first two columns of both are skipped); from row 11 onwards the
        first column is taken as the gene name and the rest as integer values.

        Raises
        ------
        ValueError
            If the file ends before both label rows have been read.
        KeyError
            If a name in ``genes_to_keep`` is not present in the file.
        """
        logger.info("Loading Cortex data")
        rows = []
        gene_names = []
        precise_clusters = None
        clusters = None
        with open(os.path.join(self.save_path, self.filenames[0]), "r") as csvfile:
            data_reader = csv.reader(csvfile, delimiter="\t")
            for i, row in enumerate(data_reader):
                if i == 1:
                    precise_clusters = np.asarray(row, dtype=str)[2:]
                if i == 8:
                    clusters = np.asarray(row, dtype=str)[2:]
                if i >= 11:
                    rows.append(row[1:])
                    gene_names.append(row[0])

        # Fail with an explicit message instead of an UnboundLocalError when
        # the file is truncated.
        if precise_clusters is None or clusters is None:
            raise ValueError(
                "Cortex data file is missing the cluster label rows (rows 1 and 8)."
            )

        cell_types, labels = np.unique(clusters, return_inverse=True)
        _, self.precise_labels = np.unique(precise_clusters, return_inverse=True)

        # Builtin ``int``/``str`` replace the ``np.int``/``np.str`` aliases,
        # which were removed in NumPy 1.24 (they were the very same types).
        # After transposing, the first remaining column of the file is
        # dropped -- presumably a non-count annotation column; verify against
        # the file layout.
        X = np.asarray(rows, dtype=int).T[1:]
        gene_names = np.asarray(gene_names, dtype=str)

        gene_indices = np.array([], dtype=int)
        if self.genes_to_keep is not None:
            look_up = {g: i for i, g in enumerate(gene_names)}
            missing = [g for g in self.genes_to_keep if g not in look_up]
            if missing:
                raise KeyError(
                    "Genes not found in the Cortex data: {}".format(missing)
                )
            gene_indices = np.array(
                [look_up[g] for g in self.genes_to_keep], dtype=int
            )

        nb_gene_indices = len(gene_indices)
        extra_gene_indices = []
        if self.total_genes is not None and nb_gene_indices < self.total_genes:
            # Rank genes by decreasing standard deviation across cells and top
            # up the selection with those not already requested explicitly.
            all_genes_by_var = np.std(X, axis=0).argsort()[::-1]
            # Set built once: O(1) membership instead of scanning the array
            # for every candidate gene.
            already_kept = set(gene_indices.tolist())
            extra_genes_by_var = [
                i for i in all_genes_by_var.tolist() if i not in already_kept
            ]
            extra_gene_indices = extra_genes_by_var[
                : self.total_genes - nb_gene_indices
            ]
        gene_indices = np.concatenate([gene_indices, extra_gene_indices]).astype(
            np.int32
        )
        # Nothing requested and no cap on the gene count: keep every gene.
        if gene_indices.size == 0:
            gene_indices = slice(None)

        X = X[:, gene_indices]
        gene_names = gene_names[gene_indices]

        logger.info("Finished preprocessing Cortex data")
        self.populate_from_data(
            X=X,
            labels=labels,
            gene_names=gene_names,
            cell_types=cell_types,
            cell_attributes_dict={"precise_labels": precise_clusters},
        )