scGOclust is a package that leverages Gene Ontology to
analyse the functional profile of cells with scRNA-seq data.
# load required libraries
library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.6.0 but the current version is
#> 4.6.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#>
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#>
#> intersect, t
library(pheatmap)
library(httr)
## if (!require("devtools")) install.packages("devtools")
## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)
library(scGOclust)# get a gene to GO BP terms mapping table
# remove electronically inferred records
# sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE))
#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
# here we load the example data for convenience
data(mmu_tbl)
data(dme_tbl)## construct a Seurat object with GO BP as features
mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.62 secs
#> removing 0 GO terms with all 0 in dataset
#> returning GO Seurat object
dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.25 secs
#> removing 0 GO terms with all 0 in dataset
#> returning GO Seurat object# specify the column with cell type annotation in [email protected]
mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
#> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
#> Names of identity class contain underscores ('_'), replacing with dashes ('-')
#> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix# heatmap of Pearson's correlation coefficient of cell type average BP profiles within species
mmu_corr = cellTypeGOCorr(cell_type_go = mmu_ct_go, corr_method = 'pearson')
pheatmap(mmu_corr)
# calculate Pearson's correlation coefficient of cell type average BP profiles across species
corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')
# tries to put cells with higher values on the diagonal
# helpful when cross-species cell type similarity signal is less clear
plotCellTypeCorrHeatmap(corr, width = 9, height = 10)
# scale per column or row to see the relative similarity
plotCellTypeCorrHeatmap(corr, scale = 'column', width = 9, height = 10)
# analyze the cell-by-GO BP profile as a count matrix
# Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.
mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
#> Warning in analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = "cell_type_annotation", : 'seurat_clusters' is presented in the metadata column names, this will be overwritten.
#> If you would like to keep annotations in this column, rename the column.
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 219
#> Number of edges: 9890
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4728
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 06:43:20 UMAP embedding parameters a = 5.069 b = 1.003
#> 06:43:20 Read 219 rows and found 50 numeric columns
#> 06:43:20 Using Annoy for neighbor search, n_neighbors = 30
#> 06:43:20 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 06:43:20 Writing NN index file to temp file /tmp/RtmpfbOYmj/file1ff43e893623
#> 06:43:20 Searching Annoy index using 1 thread, search_k = 3000
#> 06:43:20 Annoy recall = 100%
#> 06:43:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 06:43:21 Initializing from normalized Laplacian + noise (using RSpectra)
#> 06:43:21 Commencing optimization for 500 epochs, with 7820 positive edges
#> 06:43:21 Using rng type: pcg
#> 06:43:22 Optimization finished# UMAP plot of the analyzed cell-by-GO BP profile
# labeled by previously specified cell annotation column in meta.data
DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
#> Warning in analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = "annotation", : 'seurat_clusters' is presented in the metadata column names, this will be overwritten.
#> If you would like to keep annotations in this column, rename the column.
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: min_dist, spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#>
#> Number of nodes: 180
#> Number of edges: 6418
#>
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5467
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The following arguments are not used: min_dist
#> 06:43:23 UMAP embedding parameters a = 3.244 b = 1.447
#> 06:43:23 Read 180 rows and found 50 numeric columns
#> 06:43:23 Using Annoy for neighbor search, n_neighbors = 30
#> 06:43:23 Building Annoy index with metric = cosine, n_trees = 50
#> 0% 10 20 30 40 50 60 70 80 90 100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 06:43:23 Writing NN index file to temp file /tmp/RtmpfbOYmj/file1ff4604f71d
#> 06:43:23 Searching Annoy index using 1 thread, search_k = 3000
#> 06:43:23 Annoy recall = 100%
#> 06:43:24 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 06:43:25 Initializing from normalized Laplacian + noise (using RSpectra)
#> 06:43:25 Commencing optimization for 500 epochs, with 6610 positive edges
#> 06:43:25 Using rng type: pcg
#> 06:43:25 Optimization finished
## calculation takes a few minutes due to the Wilcox signed rank test
ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')
# query shared GO terms for specific cell type pairs
getCellTypeSharedTerms(shared_go = ct_shared_go,
cell_type_sp1 = 'intestine_Enteroendocrine cell',
cell_type_sp2 = 'enteroendocrine cell',
return_full = FALSE)
# viola
sessionInfo()
#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] future_1.70.0 scGOclust_0.2.1 httr_1.4.8 pheatmap_1.0.13
#> [5] Seurat_5.5.1 SeuratObject_5.4.0 sp_2.2-1 rmarkdown_2.31
#>
#> loaded via a namespace (and not attached):
#> [1] RColorBrewer_1.1-3 sys_3.4.3 jsonlite_2.0.0
#> [4] magrittr_2.0.5 spatstat.utils_3.2-3 farver_2.1.2
#> [7] vctrs_0.7.3 ROCR_1.0-12 memoise_2.0.1
#> [10] spatstat.explore_3.8-1 progress_1.2.3 htmltools_0.5.9
#> [13] curl_7.1.0 pracma_2.4.6 sass_0.4.10
#> [16] sctransform_0.4.3 parallelly_1.48.0 KernSmooth_2.23-26
#> [19] bslib_0.11.0 htmlwidgets_1.6.4 ica_1.0-3
#> [22] httr2_1.2.3 plyr_1.8.9 plotly_4.12.0
#> [25] zoo_1.8-15 cachem_1.1.0 networkD3_0.4.1
#> [28] buildtools_1.0.0 igraph_2.3.3 mime_0.13
#> [31] lifecycle_1.0.5 pkgconfig_2.0.3 Matrix_1.7-5
#> [34] R6_2.6.1 fastmap_1.2.0 fitdistrplus_1.2-6
#> [37] shiny_1.14.0 digest_0.6.39 S4Vectors_0.51.5
#> [40] AnnotationDbi_1.75.0 patchwork_1.3.2 tensor_1.5.1
#> [43] RSpectra_0.16-2 irlba_2.3.7 RSQLite_3.53.3
#> [46] labeling_0.4.3 filelock_1.0.3 progressr_0.19.0
#> [49] spatstat.sparse_3.2-0 slanter_0.2-0 polyclip_1.10-7
#> [52] abind_1.4-8 compiler_4.6.1 withr_3.0.3
#> [55] bit64_4.8.2 S7_0.2.2 DBI_1.3.0
#> [58] fastDummies_1.7.6 biomaRt_2.69.0 MASS_7.3-65
#> [61] rappdirs_0.3.4 tools_4.6.1 lmtest_0.9-40
#> [64] otel_0.2.0 httpuv_1.6.17 future.apply_1.20.2
#> [67] goftest_1.2-3 glue_1.8.1 nlme_3.1-169
#> [70] promises_1.5.0 grid_4.6.1 Rtsne_0.17
#> [73] cluster_2.1.8.2 reshape2_1.4.5 generics_0.1.4
#> [76] gtable_0.3.6 spatstat.data_3.1-9 tidyr_1.3.2
#> [79] hms_1.1.4 data.table_1.18.4 XVector_0.53.0
#> [82] BiocGenerics_0.59.8 spatstat.geom_3.8-1 RcppAnnoy_0.0.23
#> [85] ggrepel_0.9.8 RANN_2.6.2 pillar_1.11.1
#> [88] stringr_1.6.0 spam_2.11-4 RcppHNSW_0.7.0
#> [91] limma_3.69.2 later_1.4.8 splines_4.6.1
#> [94] dplyr_1.2.1 BiocFileCache_3.3.0 lattice_0.22-9
#> [97] survival_3.8-6 bit_4.6.0 deldir_2.0-4
#> [100] tidyselect_1.2.1 Biostrings_2.81.3 maketools_1.3.2
#> [103] miniUI_0.1.2 pbapply_1.7-4 knitr_1.51
#> [106] gridExtra_2.3.1 Seqinfo_1.3.0 IRanges_2.47.2
#> [109] scattermore_1.2 stats4_4.6.1 xfun_0.59
#> [112] Biobase_2.73.1 statmod_1.5.2 matrixStats_1.5.0
#> [115] stringi_1.8.7 lazyeval_0.2.3 yaml_2.3.12
#> [118] evaluate_1.0.5 codetools_0.2-20 data.tree_1.2.0
#> [121] tibble_3.3.1 cli_3.6.6 uwot_0.2.4
#> [124] xtable_1.8-8 reticulate_1.46.0 jquerylib_0.1.4
#> [127] Rcpp_1.1.1-1.1 globals_0.19.1 spatstat.random_3.5-0
#> [130] dbplyr_2.6.0 png_0.1-9 spatstat.univar_3.2-0
#> [133] parallel_4.6.1 ggplot2_4.0.3 blob_1.3.0
#> [136] prettyunits_1.2.0 dotCall64_1.2 listenv_1.0.0
#> [139] viridisLite_0.4.3 scales_1.4.0 ggridges_0.5.7
#> [142] crayon_1.5.3 purrr_1.2.2 rlang_1.2.0
#> [145] KEGGREST_1.53.4 cowplot_1.2.0