Multi-resolution deconvolution of spatial transcriptomics in R#

In this brief tutorial, we go over how to use scvi-tools functionality in R for analyzing spatial datasets. We will load spatial data following this Seurat tutorial, subsequently analyzing the data using DestVI.

This tutorial requires Reticulate. Please check out our installation guide for instructions on installing Reticulate and scvi-tools.

Loading and processing data with Seurat#

# install.packages("Seurat")
# install.packages("reticulate")
# install.packages("anndata")
# install.packages("devtools")
# devtools::install_github("satijalab/seurat-data")

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

# BiocManager::install(c("LoomExperiment", "SingleCellExperiment"))
# devtools::install_github("cellgeni/sceasy")
library(Seurat)
library(SeuratData)
library(ggplot2)

First, we load the reference SMART-seq2 dataset of mouse brain. This dataset contains about 14,000 cells.

cortex_sc_data <- readRDS(url("https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1"))
InstallData("stxBrain")
brain_st_data <- LoadData("stxBrain", type = "anterior1")

Now, we subset the data in the same way that was done in the Seurat vignette, to match the cortex single-cell reference we are using.

brain_st_data <- SCTransform(brain_st_data, assay = "Spatial", verbose = FALSE)
brain_st_data <- RunPCA(brain_st_data, assay = "SCT", verbose = FALSE)
brain_st_data <- FindNeighbors(brain_st_data, reduction = "pca", dims = 1:30)
brain_st_data <- FindClusters(brain_st_data, verbose = FALSE)
brain_st_data <- RunUMAP(brain_st_data, reduction = "pca", dims = 1:30)

cortex_st_data <- subset(brain_st_data, idents = c(1, 2, 3, 4, 6, 7))
cortex_st_data <- subset(cortex_st_data, anterior1_imagerow > 400 | anterior1_imagecol < 150, invert = TRUE)
cortex_st_data <- subset(cortex_st_data, anterior1_imagerow > 275 & anterior1_imagecol > 370, invert = TRUE)
cortex_st_data <- subset(cortex_st_data, anterior1_imagerow > 250 & anterior1_imagecol > 440, invert = TRUE)
cortex_sc_data
An object of class Seurat 
34617 features across 14249 samples within 1 assay 
Active assay: RNA (34617 features, 0 variable features)
cortex_st_data
An object of class Seurat 
48721 features across 1073 samples within 2 assays 
Active assay: SCT (17668 features, 3000 variable features)
 1 other assay present: Spatial
 2 dimensional reductions calculated: pca, umap
cortex_sc_data <- NormalizeData(cortex_sc_data, normalization.method = "LogNormalize", scale.factor = 10000)
cortex_sc_data <- FindVariableFeatures(cortex_sc_data, selection.method = "vst", nfeatures = 2000)
top2000 <- head(VariableFeatures(cortex_sc_data), 2000)
top2000intersect <- intersect(rownames(cortex_st_data), top2000)
cortex_sc_data <- cortex_sc_data[top2000intersect]
cortex_st_data <- cortex_st_data[top2000intersect]
G <- length(top2000intersect)
G
1658
SpatialFeaturePlot(cortex_st_data, features = "nCount_Spatial") + theme(legend.position = "right")

Data conversion (Seurat -> AnnData)#

We use sceasy for conversion, and load the necessary Python packages for later.

library(reticulate)
library(sceasy)
library(anndata)

sc <- import("scanpy", convert = FALSE)
scvi <- import("scvi", convert = FALSE)

We make two AnnData objects, one for the single-cell reference and one for the spatial transcriptomics data, and then move the measurement coordinates to the appropriate attribute of the spatial AnnData.

cortex_sc_adata <- convertFormat(cortex_sc_data, from="seurat", to="anndata", main_layer="counts", drop_single_values=FALSE)
cortex_st_adata <- convertFormat(cortex_st_data, from="seurat", to="anndata", assay="Spatial", main_layer="counts", drop_single_values=FALSE)

Fit the scLVM#

scvi$model$CondSCVI$setup_anndata(cortex_sc_adata, labels_key="subclass")
None

Here we would like to reweight each measurement by a scalar factor (e.g., the inverse proportion) in the loss of the model so that lowly abundant cell types get better fit by the model.

sclvm <- scvi$model$CondSCVI(cortex_sc_adata, weight_obs=TRUE)
sclvm$train(max_epochs=as.integer(250))
None
# Make plot smaller.
saved <- options(repr.plot.width=6, repr.plot.height=5)

sclvm_elbo <- py_to_r(sclvm$history["elbo_train"]$astype("float64"))
ggplot(data = sclvm_elbo, mapping = aes(x=as.numeric(rownames(sclvm_elbo)), y=elbo_train)) + geom_line() + xlab("Epoch") + ylab("ELBO") + xlim(10, NA)

# Revert plot settings.
options(saved)

Deconvolution with stLVM#

scvi$model$DestVI$setup_anndata(cortex_st_adata)
None
stlvm <- scvi$model$DestVI$from_rna_model(cortex_st_adata, sclvm)
stlvm$train(max_epochs=as.integer(2500))
None
# Make plot smaller.
saved <- options(repr.plot.width=6, repr.plot.height=5)

stlvm_elbo <- py_to_r(stlvm$history["elbo_train"]$astype("float64"))
ggplot(data = stlvm_elbo, mapping = aes(x=as.numeric(rownames(stlvm_elbo)), y=elbo_train)) + geom_line() + xlab("Epoch") + ylab("ELBO") + xlim(10, NA)

# Revert plot settings.
options(saved)

Cell type proportions#

cortex_st_adata$obsm["proportions"] <- stlvm$get_proportions()
head(py_to_r(cortex_st_adata$obsm$get("proportions")))
A data.frame: 6 × 23
AstroCREndoL2/3 ITL4L5 ITL5 PTL6 CTL6 ITL6bNPOligoPeriPvalbSMCSerpinf1SncgSstVLMCVip
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl>
AAACAGAGCGACTCCT-10.1327413620.0087472820.0117183610.2217627910.00704961360.15463854370.0771677490.00056953620.0699254200.1527414320.00078274590.0107291970.0231004580.00391918370.0109463450.01473228910.01475227160.0010433790.0039576380.0138991615
AAACCGGGTAGGTACC-10.1841768320.0037916950.0052545620.2349969600.08008868250.00679606010.0484723490.03266797210.0911209580.0015877140.06355461480.0016043130.0023884390.00158272750.0107131220.00086870670.00909362360.0632553550.0551509220.0461300276
AAACCGTTCGTCCAGG-10.0127691920.0185167960.0845555220.0021213220.06927963350.00436168210.0015948950.07118630410.0032159920.0094841440.00295381640.0235899950.0185943600.04653250050.1996404530.04004147650.00099769530.0510154140.2670243680.0210210774
AAACTCGTGATATAAG-10.0019667030.0013080910.0033878780.0010476740.00067834450.00227718590.0014684800.16490821540.0052632780.6013067360.00176961840.1761766970.0017153370.00020668670.0015422910.00042485950.00039846530.0001951550.0016230750.0003795118
AAAGGGATGTAGCAAG-10.0927003990.0006975230.0200469120.1278780550.18978802860.10999408360.0774127990.00149521350.0060323790.0012977000.00230120360.0170508960.0016935110.14917376640.0189524420.00492217300.00140647650.1128751260.0146817180.0166219342
AAATAACCATACGGGA-10.1475603430.0081294210.0339586400.3914599720.00335716740.00074010440.0167444090.09094707670.1465705480.0040101240.00071795130.0044906110.0047751110.00083066610.0017265510.01039758140.00061365700.0015488950.0059781030.0521079451
cortex_st_data[["predictions"]] <- CreateAssayObject(data = t(py_to_r(cortex_st_adata$obsm$get("proportions"))))
DefaultAssay(cortex_st_data) <- "predictions"
SpatialFeaturePlot(cortex_st_data, features = c("L2/3 IT", "L4"), pt.size.factor = 1.6, ncol = 2, crop = TRUE)

Intra cell type information#

At the heart of DestVI is a multitude of latent variables (5 per cell type per spots). We refer to them as “gamma”, and we may manually examine them for downstream analysis.

Because those values may be hard to examine for end-users, we presented several methods for prioritizing the study of different cell types (based on PCA and Hotspot). If you’d like to use those methods, please refer to our DestVI reproducibility repository. If you have suggestions to improve those, and would like to see them in the main codebase, reach out to us.

In this tutorial, we assume that the user have identified key gene modules that vary within one cell type in the single-cell RNA sequencing data (e.g., using Hotspot). We provide here a code snippet for imputing the spatial pattern of the cell type specific gene expression, using the example of the PLP1 gene in Endothelial cells.

for (cell_type_gamma in iterate(stlvm$get_gamma()$items())) {
    cell_type <- cell_type_gamma[0]
    gamma_df <- cell_type_gamma[1]
    cortex_st_data[[paste(cell_type, "_gamma", sep = "")]] <- CreateAssayObject(data = t(py_to_r(gamma_df)))
}
head(GetAssayData(cortex_st_data))
A matrix: 6 × 1073 of type dbl
AAACAGAGCGACTCCT-1AAACCGGGTAGGTACC-1AAACCGTTCGTCCAGG-1AAACTCGTGATATAAG-1AAAGGGATGTAGCAAG-1AAATAACCATACGGGA-1AAATCGTGTACCACAA-1AAATGATTCGATCAGC-1AAATGGTCAATGTGCC-1AAATTAACGGGTAGCT-1TTGGTCACACTCGTAA-1TTGTAAGGACCTAAGT-1TTGTAAGGCCAGTTGG-1TTGTAATCCGTACTCG-1TTGTCGTTCAGTTACC-1TTGTGGCCCTGACAGT-1TTGTGTTTCCCGAAAG-1TTGTTCAGTGTGCTAC-1TTGTTGTGTGTCAAGA-1TTGTTTCCATACAACT-1
Astro0.1327413620.1841768320.0127691920.00196670350.0927003990.14756034310.00040450090.0310763010.1360706690.01979301870.13650533560.088242190.0875880870.0811930000.19845946130.2810910940.0162153260.0811164900.0615435690.1241387874
CR0.0087472820.0037916950.0185167960.00130809090.0006975230.00812942070.00631879550.0033090710.0031542380.00088585690.00374944510.030619770.0034436740.0019391390.00977118130.0030285260.0266556460.0017106370.0053807750.1895391792
Endo0.0117183610.0052545620.0845555220.00338787820.0200469120.03395863990.00196742900.0396581140.0516554420.01856217720.00502916840.028165910.0013384680.0520131960.04868353530.0263728650.0124661450.0434180830.0460809510.0098529318
L2/3 IT0.2217627910.2349969600.0021213220.00104767350.1278780550.39145997170.00945222280.0010909600.0528951290.05209997670.00529488040.167366360.0037209990.0046981000.13832953570.1697787790.0043061780.2651722130.0022874870.0017337656
L40.0070496140.0800886820.0692796330.00067834450.1897880290.00335716740.00099972720.0971405280.0603982430.01057981700.02243719440.016007940.1464024930.0114648690.12943992020.1234099270.0160239670.0039084780.0017588690.0001608078
L5 IT0.1546385440.0067960600.0043616820.00227718590.1099940840.00074010440.14986507590.0391354560.3854942920.12602023780.00086203490.001325720.2063545730.1030196250.00076945410.0026176760.1228936020.0025579660.2071072910.0006238798
ct_name <- "L6 IT"
gene_name <- "Plp1"

# filter for spots with low abundance
indices <- which(GetAssayData(cortex_st_data$predictions)[ct_name,] > 0.03)

# impute gene
specific_expression <- stlvm$get_scale_for_ct(ct_name, indices = r_to_py(as.integer(indices - 1)))[[gene_name]]
specific_expression <- 1 + 1e4 * py_to_r(specific_expression$to_frame())
filtered_st_data <- cortex_st_data[, indices]
filtered_st_data[["imputation"]] <- CreateAssayObject(data = t(specific_expression))
DefaultAssay(filtered_st_data) <- "imputation"
SpatialFeaturePlot(filtered_st_data, features = gene_name)

Session Info#

sI <- sessionInfo()
sI$loadedOnly <- NULL
print(sI, locale=FALSE)
R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 16.04.6 LTS

Matrix products: default
BLAS/LAPACK: /data/yosef2/users/jhong/miniconda3/envs/r_tutorial/lib/libopenblasp-r0.3.12.so

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] anndata_0.7.5.3           sceasy_0.0.6             
 [3] reticulate_1.22           ggplot2_3.3.5            
 [5] stxBrain.SeuratData_0.1.1 pbmc3k.SeuratData_3.1.4  
 [7] ifnb.SeuratData_3.0.0     SeuratData_0.2.1         
 [9] SeuratObject_4.0.2        Seurat_4.0.4