scGOclust
is a package that leverages Gene Ontology to
analyse the functional profile of cells with scRNA-seq data.
# load required libraries
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.4.0 but the current version is
#> 4.4.2; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 'SeuratObject' was built with package 'Matrix' 1.7.1 but the current
#> version is 1.7.2; it is recomended that you reinstall 'SeuratObject' as
#> the ABI for 'Matrix' may have changed
#>
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#>
#> intersect, t
library(pheatmap)
library(httr)
## if (!require("devtools")) install.packages("devtools")
## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)
library(scGOclust)
# get a gene to GO BP terms mapping table
# remove electronically inferred records
# sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE))
#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
# here we load the example data for convenience
data(mmu_tbl)
data(dme_tbl)
## construct a Seurat object with GO BP as features
mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.58 secs
#> removing 0 all zero terms
#> returning GO Seurat object
dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.23 secs
#> removing 0 all zero terms
#> returning GO Seurat object
# specify the column with cell type annotation in [email protected]
mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
#> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
#> Names of identity class contain underscores ('_'), replacing with dashes ('-')
#> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
# heatmap of Pearson's correlation coefficient of cell type average BP profiles within species
mmu_corr = cellTypeGOCorr(cell_type_go = mmu_ct_go, corr_method = 'pearson')
pheatmap(mmu_corr)
# calculate Pearson's correlation coefficient of cell type average BP profiles across species
corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')
# tries to put cells with higher values on the diagonal
# helpful when cross-species cell type similarity signal is less clear
plotCellTypeCorrHeatmap(corr, width = 9, height = 10)
# scale per column or row to see the relative similarity
plotCellTypeCorrHeatmap(corr, scale = 'column', width = 9, height = 10)
# analyze the cell-by-GO BP profile as a count matrix
# Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.
mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 219
#> Number of edges: 9594
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4931
#> Number of communities: 3
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 04:14:20 UMAP embedding parameters a = 5.069 b = 1.003
#> 04:14:20 Read 219 rows and found 50 numeric columns
#> 04:14:20 Using Annoy for neighbor search, n_neighbors = 30
#> 04:14:20 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 04:14:20 Writing NN index file to temp file /tmp/RtmpBmg0FR/file2402ad8148c
#> 04:14:20 Searching Annoy index using 1 thread, search_k = 3000
#> 04:14:20 Annoy recall = 100%
#> 04:14:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 04:14:21 Initializing from normalized Laplacian + noise (using RSpectra)
#> 04:14:21 Commencing optimization for 500 epochs, with 7792 positive edges
#> 04:14:22 Optimization finished
# UMAP plot of the analyzed cell-by-GO BP profile
# labeled by previously specified cell annotation column in meta.data
DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()
dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: min_dist, spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 180
#> Number of edges: 6265
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5638
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The following arguments are not used: min_dist
#> 04:14:23 UMAP embedding parameters a = 3.244 b = 1.447
#> 04:14:23 Read 180 rows and found 50 numeric columns
#> 04:14:23 Using Annoy for neighbor search, n_neighbors = 30
#> 04:14:23 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 04:14:23 Writing NN index file to temp file /tmp/RtmpBmg0FR/file24027d99a3d2
#> 04:14:23 Searching Annoy index using 1 thread, search_k = 3000
#> 04:14:23 Annoy recall = 100%
#> 04:14:24 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 04:14:24 Initializing from normalized Laplacian + noise (using RSpectra)
#> 04:14:24 Commencing optimization for 500 epochs, with 6618 positive edges
#> 04:14:25 Optimization finished
## calculation takes a few minutes due to the Wilcox signed rank test
ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')
# query shared GO terms for specific cell type pairs
getCellTypeSharedTerms(shared_go = ct_shared_go,
cell_type_sp1 = 'intestine_Enteroendocrine cell',
cell_type_sp2 = 'enteroendocrine cell',
return_full = FALSE)
# viola
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] scGOclust_0.2.1 httr_1.4.7 pheatmap_1.0.12 Seurat_5.2.1
#> [5] SeuratObject_5.0.2 sp_2.2-0 rmarkdown_2.29
#>
#> loaded via a namespace (and not attached):
#> [1] RcppAnnoy_0.0.22 splines_4.4.2 later_1.4.1
#> [4] filelock_1.0.3 tibble_3.2.1 polyclip_1.10-7
#> [7] fastDummies_1.7.5 lifecycle_1.0.4 httr2_1.1.0
#> [10] globals_0.16.3 lattice_0.22-6 MASS_7.3-64
#> [13] magrittr_2.0.3 limma_3.63.3 plotly_4.10.4
#> [16] sass_0.4.9 jquerylib_0.1.4 yaml_2.3.10
#> [19] httpuv_1.6.15 sctransform_0.4.1 spam_2.11-1
#> [22] spatstat.sparse_3.1-0 reticulate_1.40.0 cowplot_1.1.3
#> [25] pbapply_1.7-2 DBI_1.2.3 buildtools_1.0.0
#> [28] RColorBrewer_1.1-3 abind_1.4-8 Rtsne_0.17
#> [31] purrr_1.0.4 BiocGenerics_0.53.6 pracma_2.4.4
#> [34] rappdirs_0.3.3 GenomeInfoDbData_1.2.13 IRanges_2.41.3
#> [37] S4Vectors_0.45.4 ggrepel_0.9.6 irlba_2.3.5.1
#> [40] listenv_0.9.1 spatstat.utils_3.1-2 maketools_1.3.2
#> [43] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.3-2
#> [46] fitdistrplus_1.2-2 parallelly_1.42.0 codetools_0.2-20
#> [49] xml2_1.3.6 tidyselect_1.2.1 UCSC.utils_1.3.1
#> [52] farver_2.1.2 matrixStats_1.5.0 stats4_4.4.2
#> [55] BiocFileCache_2.15.1 spatstat.explore_3.3-4 jsonlite_1.8.9
#> [58] progressr_0.15.1 ggridges_0.5.6 survival_3.8-3
#> [61] tools_4.4.2 progress_1.2.3 ica_1.0-3
#> [64] Rcpp_1.0.14 glue_1.8.0 gridExtra_2.3
#> [67] xfun_0.50 GenomeInfoDb_1.43.4 dplyr_1.1.4
#> [70] withr_3.0.2 fastmap_1.2.0 digest_0.6.37
#> [73] R6_2.6.0 mime_0.12 colorspace_2.1-1
#> [76] networkD3_0.4 scattermore_1.2 tensor_1.5
#> [79] spatstat.data_3.1-4 biomaRt_2.63.1 RSQLite_2.3.9
#> [82] tidyr_1.3.1 generics_0.1.3 data.table_1.16.4
#> [85] prettyunits_1.2.0 htmlwidgets_1.6.4 slanter_0.2-0
#> [88] uwot_0.2.2 pkgconfig_2.0.3 gtable_0.3.6
#> [91] blob_1.2.4 lmtest_0.9-40 XVector_0.47.2
#> [94] sys_3.4.3 htmltools_0.5.8.1 dotCall64_1.2
#> [97] scales_1.3.0 Biobase_2.67.0 png_0.1-8
#> [100] spatstat.univar_3.1-1 knitr_1.49 reshape2_1.4.4
#> [103] nlme_3.1-167 curl_6.2.0 cachem_1.1.0
#> [106] zoo_1.8-12 stringr_1.5.1 KernSmooth_2.23-26
#> [109] parallel_4.4.2 miniUI_0.1.1.1 AnnotationDbi_1.69.0
#> [112] pillar_1.10.1 grid_4.4.2 vctrs_0.6.5
#> [115] RANN_2.6.2 promises_1.3.2 dbplyr_2.5.0
#> [118] xtable_1.8-4 cluster_2.1.8 evaluate_1.0.3
#> [121] cli_3.6.4 compiler_4.4.2 rlang_1.1.5
#> [124] crayon_1.5.3 future.apply_1.11.3 labeling_0.4.3
#> [127] plyr_1.8.9 stringi_1.8.4 viridisLite_0.4.2
#> [130] deldir_2.0-4 munsell_0.5.1 Biostrings_2.75.3
#> [133] lazyeval_0.2.2 spatstat.geom_3.3-5 Matrix_1.7-2
#> [136] RcppHNSW_0.6.0 hms_1.1.3 patchwork_1.3.0
#> [139] bit64_4.6.0-1 future_1.34.0 ggplot2_3.5.1
#> [142] KEGGREST_1.47.0 statmod_1.5.0 shiny_1.10.0
#> [145] ROCR_1.0-11 igraph_2.1.4 memoise_2.0.1
#> [148] bslib_0.9.0 bit_4.5.0.1