scGOclust_vignette

Load packages

scGOclust is a package that leverages Gene Ontology to analyse the functional profile of cells with scRNA-seq data.

# load required libraries

library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.6.0 but the current version is
#> 4.6.1; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#> 
#>     intersect, t
library(pheatmap)
library(httr)

## if (!require("devtools")) install.packages("devtools")

## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)

library(scGOclust)

2. Load input data

# get a gene to GO BP terms mapping table
# remove electronically inferred records

# sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE)) 

#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))

# here we load the example data for convenience
data(mmu_tbl)
data(dme_tbl)
# load the gene expression raw count objects
data(mmu_subset)
data(dme_subset)
ls()
#> [1] "dme_subset" "dme_tbl"    "mmu_subset" "mmu_tbl"

3. Build GO BP profile

## construct a Seurat object with GO BP as features

mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.62 secs
#> removing 0 GO terms with all 0 in dataset
#> returning GO Seurat object

dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.25 secs
#> removing 0 GO terms with all 0 in dataset
#> returning GO Seurat object

4. Calculate cell type average GO BP profile

# specify the column with cell type annotation in [email protected]

mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
#> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
#> Names of identity class contain underscores ('_'), replacing with dashes ('-')
#> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix

5. Calculate within-species cell type functional similariy

# heatmap of Pearson's correlation coefficient of cell type average BP profiles within species

mmu_corr = cellTypeGOCorr(cell_type_go = mmu_ct_go, corr_method = 'pearson')
pheatmap(mmu_corr)

dme_corr = cellTypeGOCorr(cell_type_go = dme_ct_go, corr_method = 'pearson')
pheatmap(dme_corr)

5. Calculate cross-species cell type functional similariy


# calculate Pearson's correlation coefficient of cell type average BP profiles across species

corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')

# tries to put cells with higher values on the diagonal
# helpful when cross-species cell type similarity signal is less clear

plotCellTypeCorrHeatmap(corr, width = 9, height = 10)


# scale per column or row to see the relative similarity
plotCellTypeCorrHeatmap(corr, scale = 'column', width = 9, height = 10)

6. Dimensional reduction and UMAP visualization of cells with GO profile


# analyze the cell-by-GO BP profile as a count matrix
# Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.

mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
#> Warning in analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = "cell_type_annotation", : 'seurat_clusters' is presented in the metadata column names, this will be overwritten.
#>             If you would like to keep annotations in this column, rename the column.
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 219
#> Number of edges: 9890
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4728
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 06:43:20 UMAP embedding parameters a = 5.069 b = 1.003
#> 06:43:20 Read 219 rows and found 50 numeric columns
#> 06:43:20 Using Annoy for neighbor search, n_neighbors = 30
#> 06:43:20 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 06:43:20 Writing NN index file to temp file /tmp/RtmpfbOYmj/file1ff43e893623
#> 06:43:20 Searching Annoy index using 1 thread, search_k = 3000
#> 06:43:20 Annoy recall = 100%
#> 06:43:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 06:43:21 Initializing from normalized Laplacian + noise (using RSpectra)
#> 06:43:21 Commencing optimization for 500 epochs, with 7820 positive edges
#> 06:43:21 Using rng type: pcg
#> 06:43:22 Optimization finished
# UMAP plot of the analyzed cell-by-GO BP profile
# labeled by previously specified cell annotation column in meta.data

DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()

dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
#> Warning in analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = "annotation", : 'seurat_clusters' is presented in the metadata column names, this will be overwritten.
#>             If you would like to keep annotations in this column, rename the column.
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: min_dist, spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 180
#> Number of edges: 6418
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5467
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The following arguments are not used: min_dist
#> 06:43:23 UMAP embedding parameters a = 3.244 b = 1.447
#> 06:43:23 Read 180 rows and found 50 numeric columns
#> 06:43:23 Using Annoy for neighbor search, n_neighbors = 30
#> 06:43:23 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 06:43:23 Writing NN index file to temp file /tmp/RtmpfbOYmj/file1ff4604f71d
#> 06:43:23 Searching Annoy index using 1 thread, search_k = 3000
#> 06:43:23 Annoy recall = 100%
#> 06:43:24 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 06:43:25 Initializing from normalized Laplacian + noise (using RSpectra)
#> 06:43:25 Commencing optimization for 500 epochs, with 6610 positive edges
#> 06:43:25 Using rng type: pcg
#> 06:43:25 Optimization finished
DimPlot(dme_go_analyzed, label = TRUE) + NoLegend()

7. Get co-up and co-down regulated terms between pairs of cell types


## calculation takes a few minutes due to the Wilcox signed rank test

ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')
head(ct_shared_go$shared_sig_markers)

# query shared GO terms for specific cell type pairs

getCellTypeSharedTerms(shared_go = ct_shared_go,
                       cell_type_sp1 = 'intestine_Enteroendocrine cell', 
                       cell_type_sp2 = 'enteroendocrine cell',
                       return_full = FALSE)

# viola
sessionInfo()
#> R version 4.6.1 (2026-06-24)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 26.04 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.32.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] future_1.70.0      scGOclust_0.2.1    httr_1.4.8         pheatmap_1.0.13   
#> [5] Seurat_5.5.1       SeuratObject_5.4.0 sp_2.2-1           rmarkdown_2.31    
#> 
#> loaded via a namespace (and not attached):
#>   [1] RColorBrewer_1.1-3     sys_3.4.3              jsonlite_2.0.0        
#>   [4] magrittr_2.0.5         spatstat.utils_3.2-3   farver_2.1.2          
#>   [7] vctrs_0.7.3            ROCR_1.0-12            memoise_2.0.1         
#>  [10] spatstat.explore_3.8-1 progress_1.2.3         htmltools_0.5.9       
#>  [13] curl_7.1.0             pracma_2.4.6           sass_0.4.10           
#>  [16] sctransform_0.4.3      parallelly_1.48.0      KernSmooth_2.23-26    
#>  [19] bslib_0.11.0           htmlwidgets_1.6.4      ica_1.0-3             
#>  [22] httr2_1.2.3            plyr_1.8.9             plotly_4.12.0         
#>  [25] zoo_1.8-15             cachem_1.1.0           networkD3_0.4.1       
#>  [28] buildtools_1.0.0       igraph_2.3.3           mime_0.13             
#>  [31] lifecycle_1.0.5        pkgconfig_2.0.3        Matrix_1.7-5          
#>  [34] R6_2.6.1               fastmap_1.2.0          fitdistrplus_1.2-6    
#>  [37] shiny_1.14.0           digest_0.6.39          S4Vectors_0.51.5      
#>  [40] AnnotationDbi_1.75.0   patchwork_1.3.2        tensor_1.5.1          
#>  [43] RSpectra_0.16-2        irlba_2.3.7            RSQLite_3.53.3        
#>  [46] labeling_0.4.3         filelock_1.0.3         progressr_0.19.0      
#>  [49] spatstat.sparse_3.2-0  slanter_0.2-0          polyclip_1.10-7       
#>  [52] abind_1.4-8            compiler_4.6.1         withr_3.0.3           
#>  [55] bit64_4.8.2            S7_0.2.2               DBI_1.3.0             
#>  [58] fastDummies_1.7.6      biomaRt_2.69.0         MASS_7.3-65           
#>  [61] rappdirs_0.3.4         tools_4.6.1            lmtest_0.9-40         
#>  [64] otel_0.2.0             httpuv_1.6.17          future.apply_1.20.2   
#>  [67] goftest_1.2-3          glue_1.8.1             nlme_3.1-169          
#>  [70] promises_1.5.0         grid_4.6.1             Rtsne_0.17            
#>  [73] cluster_2.1.8.2        reshape2_1.4.5         generics_0.1.4        
#>  [76] gtable_0.3.6           spatstat.data_3.1-9    tidyr_1.3.2           
#>  [79] hms_1.1.4              data.table_1.18.4      XVector_0.53.0        
#>  [82] BiocGenerics_0.59.8    spatstat.geom_3.8-1    RcppAnnoy_0.0.23      
#>  [85] ggrepel_0.9.8          RANN_2.6.2             pillar_1.11.1         
#>  [88] stringr_1.6.0          spam_2.11-4            RcppHNSW_0.7.0        
#>  [91] limma_3.69.2           later_1.4.8            splines_4.6.1         
#>  [94] dplyr_1.2.1            BiocFileCache_3.3.0    lattice_0.22-9        
#>  [97] survival_3.8-6         bit_4.6.0              deldir_2.0-4          
#> [100] tidyselect_1.2.1       Biostrings_2.81.3      maketools_1.3.2       
#> [103] miniUI_0.1.2           pbapply_1.7-4          knitr_1.51            
#> [106] gridExtra_2.3.1        Seqinfo_1.3.0          IRanges_2.47.2        
#> [109] scattermore_1.2        stats4_4.6.1           xfun_0.59             
#> [112] Biobase_2.73.1         statmod_1.5.2          matrixStats_1.5.0     
#> [115] stringi_1.8.7          lazyeval_0.2.3         yaml_2.3.12           
#> [118] evaluate_1.0.5         codetools_0.2-20       data.tree_1.2.0       
#> [121] tibble_3.3.1           cli_3.6.6              uwot_0.2.4            
#> [124] xtable_1.8-8           reticulate_1.46.0      jquerylib_0.1.4       
#> [127] Rcpp_1.1.1-1.1         globals_0.19.1         spatstat.random_3.5-0 
#> [130] dbplyr_2.6.0           png_0.1-9              spatstat.univar_3.2-0 
#> [133] parallel_4.6.1         ggplot2_4.0.3          blob_1.3.0            
#> [136] prettyunits_1.2.0      dotCall64_1.2          listenv_1.0.0         
#> [139] viridisLite_0.4.3      scales_1.4.0           ggridges_0.5.7        
#> [142] crayon_1.5.3           purrr_1.2.2            rlang_1.2.0           
#> [145] KEGGREST_1.53.4        cowplot_1.2.0