Introduction to scvi-tools in R#

In this introductory tutorial, we go through the different steps of an scvi-tools workflow. It is the R version of this python tutorial.

While we focus on scVI in this tutorial, the API is consistent across all models.

library(reticulate)
library(anndataR)
library(ggplot2)
library(IRdisplay)

Before we use reticulate, we will need to point it to the correct conda env we use for the analysis

use_condaenv("base", required = TRUE)

Import Python libraries with reticulate#

sc <- import('scanpy', convert = FALSE)
scvi <- import("scvi", convert = FALSE)
<contextlib.ExitStack object at 0x7f8661192910>
scvi$`__version__`
'1.4.0.post1'
scvi$settings$seed=42L

Load a subsampled version of the heart cell atlas dataset directly from scvi, like the python tutorial:

adata = scvi$data$heart_cell_atlas_subsampled()
adata
AnnData object with n_obs × n_vars = 18641 × 26662
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45', 'n_counts'
    uns: 'cell_type_colors'

Apply scanpy preprocessing functions directly:

sc$pp$filter_genes(adata, min_counts=3L)
sc$pp$filter_cells(adata, min_genes = 200L)
adata$layers["counts"] = adata$X$copy()  # preserve counts
sc$pp$normalize_total(adata, target_sum = 1e4)
sc$pp$log1p(adata)
adata$raw = adata  # freeze the state in `.raw`
None
None
None
None

Select highly variable genes

sc$pp$highly_variable_genes(
    adata,
    n_top_genes=r_to_py(1200),
    subset=TRUE,
    layer="counts",
    flavor="seurat_v3",
    batch_key="cell_source",
)
None
adata
AnnData object with n_obs × n_vars = 18641 × 1200
    obs: 'NRP', 'age_group', 'cell_source', 'cell_type', 'donor', 'gender', 'n_counts', 'n_genes', 'percent_mito', 'percent_ribo', 'region', 'sample', 'scrublet_score', 'source', 'type', 'version', 'cell_states', 'Used'
    var: 'gene_ids-Harvard-Nuclei', 'feature_types-Harvard-Nuclei', 'gene_ids-Sanger-Nuclei', 'feature_types-Sanger-Nuclei', 'gene_ids-Sanger-Cells', 'feature_types-Sanger-Cells', 'gene_ids-Sanger-CD45', 'feature_types-Sanger-CD45', 'n_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_type_colors', 'log1p', 'hvg'
    layers: 'counts'

Creating and training a model#

# run setup_anndata
scvi$model$SCVI$setup_anndata(adata,
                            layer="counts",
                            categorical_covariate_keys=c("cell_source", "donor"),
                            continuous_covariate_keys=c("percent_mito", "percent_ribo")
                             )
None
# create the model
model = scvi$model$SCVI(adata)
model
# train the model
model$train(max_epochs = 400L)
None
str(py_to_r(model$registry))
List of 6
 $ scvi_version     : chr "1.4.0.post1"
 $ model_name       : chr "SCVI"
 $ setup_args       :List of 6
  ..$ layer                     : chr "counts"
  ..$ batch_key                 : NULL
  ..$ labels_key                : NULL
  ..$ size_factor_key           : NULL
  ..$ categorical_covariate_keys: chr [1:2] "cell_source" "donor"
  ..$ continuous_covariate_keys : chr [1:2] "percent_mito" "percent_ribo"
 $ field_registries :defaultdict(<class 'dict'>, {'X': {'data_registry': {'attr_name': 'layers', 'attr_key': 'counts'}, 'state_registry': {'n_obs': 18641, 'n_vars': 1200, 'column_names': array(['ISG15', 'TNFRSF18', 'VWA1', ..., 'ITGB2', 'S100B', 'MT-ND4'],
      dtype=object)}, 'summary_stats': {'n_vars': 1200, 'n_cells': 18641}}, 'batch': {'data_registry': {'attr_name': 'obs', 'attr_key': '_scvi_batch'}, 'state_registry': {'categorical_mapping': array([0]), 'original_key': '_scvi_batch'}, 'summary_stats': {'n_batch': 1}}, 'labels': {'data_registry': {'attr_name': 'obs', 'attr_key': '_scvi_labels'}, 'state_registry': {'categorical_mapping': array([0]), 'original_key': '_scvi_labels'}, 'summary_stats': {'n_labels': 1}}, 'size_factor': {'data_registry': {}, 'state_registry': {}, 'summary_stats': {}}, 'extra_categorical_covs': {'data_registry': {'attr_name': 'obsm', 'attr_key': '_scvi_extra_categorical_covs'}, 'state_registry': {'mappings': {'cell_source': array(['Harvard-Nuclei', 'Sanger-CD45', 'Sanger-Cells', 'Sanger-Nuclei'],
      dtype=object), 'donor': array(['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D11', 'H2', 'H3', 'H4',
       'H5', 'H6', 'H7'], dtype=object)}, 'field_keys': ['cell_source', 'donor'], 'n_cats_per_key': [4, 14]}, 'summary_stats': {'n_extra_categorical_covs': 2}}, 'extra_continuous_covs': {'data_registry': {'attr_name': 'obsm', 'attr_key': '_scvi_extra_continuous_covs'}, 'state_registry': {'columns': array(['percent_mito', 'percent_ribo'], dtype=object)}, 'summary_stats': {'n_extra_continuous_covs': 2}}})
 $ setup_method_name: chr "setup_anndata"
 $ _scvi_uuid       : chr "2bb1a520-9757-49c0-a818-ba50ae4761a9"

Show trainning curves#

elbo_train <- py_to_r(model$history[['elbo_train']])
elbo_train$epoch = as.numeric(row.names(elbo_train))
elbo_train$elbo_train <- as.numeric(unlist(elbo_train$elbo_train))
head(elbo_train)
A data.frame: 6 × 2
elbo_trainepoch
<dbl><dbl>
0399.20920
1343.59731
2336.79482
3333.01873
4329.71224
5327.38225
ggplot(elbo_train, aes(x = epoch, y = elbo_train)) +
  geom_line(color = "steelblue") +
  labs(title = "Negative ELBO over training epochs")

Saving and loading#

model_dir = file.path(getwd(), "scvi_model")
model$save(model_dir, overwrite=TRUE)
None
model = scvi$model$SCVI$load(model_dir, adata=adata)
model

Obtaining model outputs#

SCVI_LATENT_KEY = "X_scVI"

latent = model$get_latent_representation()
adata$obsm[SCVI_LATENT_KEY] = latent
latent$shape
(18641, 10)
adata_subset = adata[adata$obs$cell_type == "Fibroblast"]
latent_subset = model$get_latent_representation(adata_subset)
latent_subset$shape
(2446, 10)
denoised = py_to_r(model$get_normalized_expression(adata_subset, library_size=1e4))
denoised[c(1:6),c(1:6)]
A data.frame: 6 × 6
ISG15TNFRSF18VWA1HES5SPSB1ANGPTL7
<dbl><dbl><dbl><dbl><dbl><dbl>
GTCAAGTCATGCCACG-1-HCAHeart77028790.78885080.01194312891.11204040.018676627 4.8609090.07145255
GAGTCATTCTCCGTGT-1-HCAHeart82871284.92577080.00072388262.97201010.01087504323.5054680.05364634
CCTCTGATCGTGACAT-1-HCAHeart77028811.23840060.13487580421.23587640.005328822 2.9954640.21579888
CGCCATTCATCATCTT-1-H0035_apex0.14942950.03666714583.66614820.002467821 4.6625940.23152569
TCGTAGAGTAGGACTG-1-H0015_septum0.48515730.09797424080.34675740.09433391710.3569300.28906620
AAGTCTGCACCGATAT-1-HCAHeart78338541.84485070.11648533493.34158710.002791890 7.8420770.82429951
SCVI_NORMALIZED_KEY = "scvi_normalized"

adata$layers[SCVI_NORMALIZED_KEY] = model$get_normalized_expression(library_size=10e4)

Visualization without batch correction#

# run PCA then generate UMAP plots
sc$tl$pca(adata)
None
sc$pp$neighbors(adata, n_pcs=30L, n_neighbors=20L) #note for the usage of "L" for integer
None
sc$tl$umap(adata, min_dist=0.3)
None
fig1 = sc$pl$umap(
    adata,
    color="cell_type",
    frameon=FALSE,
    return_fig=TRUE,
    show = FALSE
)
#We will use the saved file in order to plot in R notebook (might not directly render from the scanpy umap function)
fig1$savefig("pca_cell_type.png", bbox_inches="tight")
display_png(file = "pca_cell_type.png", width = 800, height = 600)
None
../../../_images/f09f39710916651f9e1361de9798cc62ef0e3f6f491e64ff14be10a4119a93bb.png
fig2 = sc$pl$umap(
    adata,
    color=c("donor", "cell_source"),
    ncols=2L,
    frameon=FALSE,
    return_fig=TRUE,
    show = FALSE
)
fig2$savefig("pca_donor_source.png", bbox_inches="tight")
display_png(file = "pca_donor_source.png", width = 1400, height = 1200)
None
../../../_images/c4afe44e1d375c1d97038530b295e486376e2cc47ccb24551e693a17ddfe5d24.png

Visualization with batch correction (scVI)#

# use scVI latent space for UMAP generation
sc$pp$neighbors(adata, use_rep=SCVI_LATENT_KEY)
sc$tl$umap(adata, min_dist=0.3)
None
None
fig3 = sc$pl$umap(
    adata,
    color="cell_type",
    frameon=FALSE,
    return_fig=TRUE,
    show = FALSE
)
fig3$savefig("scvi_cell_type.png", bbox_inches="tight")
display_png(file = "scvi_cell_type.png", width = 800, height = 600)
None
../../../_images/bd605aedc017b0c235a86dd96596426414ab8b84e576db970b52ebe16d756a74.png
fig4 = sc$pl$umap(
    adata,
    color=c("donor", "cell_source"),
    ncols=2L,
    frameon=FALSE,
    return_fig=TRUE,
    show = FALSE
)
fig4$savefig("scvi_donor_source.png", bbox_inches="tight")
display_png(file = "scvi_donor_source.png", width = 1400, height = 1200)
None
../../../_images/02dca47c87aa06e7042d29a9575a59079ea33842ecb59e3f666ed069e544fc41.png

Clustering on the scVI latent space#

# neighbors were already computed using scVI
SCVI_CLUSTERS_KEY = "leiden_scVI"
sc$tl$leiden(adata, key_added=SCVI_CLUSTERS_KEY, resolution=0.5)
None
fig5 = sc$pl$umap(
    adata,
    color=SCVI_CLUSTERS_KEY,
    frameon=FALSE,
    return_fig=TRUE,
    show = FALSE
)
fig5$savefig("scvi_leiden_cluster.png", bbox_inches="tight")
display_png(file = "scvi_leiden_cluster.png", width = 800, height = 600)
None
../../../_images/11e4597d0a1bd3343ce4d66afb7523a73b140b2562e7c02cab62d7484f23f9c7.png

Differential expression#

de_df = py_to_r(model$differential_expression(
    groupby="cell_type", group1="Endothelial", group2="Fibroblast"
))
head(de_df)
A data.frame: 6 × 14
proba_m1proba_m2bayes_factorscale1scale2raw_mean1raw_mean2non_zeros_proportion1non_zeros_proportion2raw_normalized_mean1raw_normalized_mean2comparisongroup1group2
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><chr><chr><chr>
EGFL70.99800.00206.2126010.0085622703.754362e-042.37677910.0367947670.74154300.025756337 89.509911.1710879Endothelial vs FibroblastEndothelialFibroblast
VWF0.99760.00246.0298800.0164834976.488101e-045.07256320.0543745050.80822580.032297629169.698702.2100585Endothelial vs FibroblastEndothelialFibroblast
PECAM10.99740.00265.9496370.0055709705.923296e-042.06598400.0756336900.65393040.054374489 60.615573.4050260Endothelial vs FibroblastEndothelialFibroblast
SOX170.99560.00445.4217390.0014865442.656643e-050.78437110.0065412920.30761740.004497138 17.128260.1858679Endothelial vs FibroblastEndothelialFibroblast
STC10.99420.00585.1440790.0017633633.538079e-050.78534620.0040883080.19883180.003270646 17.157760.1948121Endothelial vs FibroblastEndothelialFibroblast
CYYR10.99360.00645.0450350.0032647352.135453e-040.92503520.0175797210.46239960.011856092 36.286950.6462995Endothelial vs FibroblastEndothelialFibroblast
de_df = py_to_r(model$differential_expression(groupby="cell_type", mode="change"))
head(de_df)
A data.frame: 6 × 22
proba_deproba_not_debayes_factorscale1scale2pseudocountsdeltalfc_meanlfc_medianlfc_stdraw_mean1raw_mean2non_zeros_proportion1non_zeros_proportion2raw_normalized_mean1raw_normalized_mean2is_de_fdr_0.05comparisongroup1group2
<dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><dbl><lgl><chr><chr><chr>
10.99900.00106.9067450.0282632561.929233e-049.363214e-060.257.6353617.6509732.31079417.3724160.0357913900.89655170.031520329280.350311.5660578TRUEAdipocytes vs RestAdipocytesRest
20.99740.00265.9496370.0114043801.613644e-049.363214e-060.256.5609946.6412382.107703 7.0620680.0250864490.84137930.022166955129.082001.0845219TRUEAdipocytes vs RestAdipocytesRest
30.99640.00365.6232120.0023101562.986398e-059.363214e-060.257.6867157.2571223.371668 1.3862070.0026492220.46206900.002595156 22.769500.1100937TRUEAdipocytes vs RestAdipocytesRest
40.99560.00445.4217390.0040223334.849749e-059.363214e-060.257.3171387.2872692.508050 2.7999990.0043793260.80689660.004325260 52.928540.1964442TRUEAdipocytes vs RestAdipocytesRest
50.99520.00485.3343260.0037810683.304044e-059.363214e-060.258.0071977.6468673.393191 2.0206890.0028114200.53793100.002757353 28.458900.1279225TRUEAdipocytes vs RestAdipocytesRest
60.99480.00525.2538810.0041465283.967208e-059.363214e-060.257.1303447.2134942.497959 2.6827570.0050281140.59310340.004865917 43.614100.1941209TRUEAdipocytes vs RestAdipocytesRest
sc$tl$dendrogram(adata, groupby="cell_type", use_rep="X_scVI")
None

Session Info Summary#

#reticulate::py_last_error()
sI <- sessionInfo()
sI$loadedOnly <- NULL
print(sI, locale=FALSE)
R version 4.3.3 (2024-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 24.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
LAPACK: /opt/anaconda3/lib/libmkl_rt.so.2;  LAPACK version 3.10.1

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] IRdisplay_1.1     ggplot2_4.0.0     anndataR_1.1.0    reticulate_1.44.0