scGOclust_vignette

Load packages

scGOclust is a package that leverages Gene Ontology to analyse the functional profile of cells with scRNA-seq data.

# load required libraries

library(Seurat)
#> Loading required package: SeuratObject
#> Loading required package: sp
#> 'SeuratObject' was built under R 4.4.0 but the current version is
#> 4.4.2; it is recomended that you reinstall 'SeuratObject' as the ABI
#> for R may have changed
#> 'SeuratObject' was built with package 'Matrix' 1.7.1 but the current
#> version is 1.7.2; it is recomended that you reinstall 'SeuratObject' as
#> the ABI for 'Matrix' may have changed
#> 
#> Attaching package: 'SeuratObject'
#> The following objects are masked from 'package:base':
#> 
#>     intersect, t
library(pheatmap)
library(httr)

## if (!require("devtools")) install.packages("devtools")

## install latest from source
## for reproducibility we do not update dependencies
# devtools::install_github("Papatheodorou-Group/scGOclust", upgrade = FALSE)

library(scGOclust)

2. Load input data

# get a gene to GO BP terms mapping table
# remove electronically inferred records

# sometimes ensembl complains about ssh certificate has expired, this is a known issue, run this code
httr::set_config(httr::config(ssl_verifypeer = FALSE)) 

#mmu_tbl = ensemblToGo(species = 'mmusculus', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))
#dme_tbl = ensemblToGo(species = 'dmelanogaster', GO_linkage_type = c('experimental', 'phylogenetic', 'computational', 'author', 'curator' ))

# here we load the example data for convenience
data(mmu_tbl)
data(dme_tbl)

# load the gene expression raw count objects
data(mmu_subset)
data(dme_subset)
ls()
#> [1] "dme_subset" "dme_tbl"    "mmu_subset" "mmu_tbl"

3. Build GO BP profile

## construct a Seurat object with GO BP as features

mmu_go_obj <- makeGOSeurat(ensembl_to_GO = mmu_tbl, feature_type = 'external_gene_name', seurat_obj = mmu_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.58 secs
#> removing 0 all zero terms
#> returning GO Seurat object

dme_go_obj <- makeGOSeurat(ensembl_to_GO = dme_tbl, feature_type = 'external_gene_name', seurat_obj = dme_subset)
#> collect data
#> compute GO to cell matrix, might take a few secs
#> Warning: Data is of class matrix. Coercing to dgCMatrix.
#> time used: 0.23 secs
#> removing 0 all zero terms
#> returning GO Seurat object

4. Calculate cell type average GO BP profile

# specify the column with cell type annotation in [email protected]

mmu_ct_go <- getCellTypeGO(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation')
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix
#> As of Seurat v5, we recommend using AggregateExpression to perform pseudo-bulk analysis.
#> Names of identity class contain underscores ('_'), replacing with dashes ('-')
#> This message is displayed once per session.
dme_ct_go <- getCellTypeGO(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation')
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Centering and scaling data matrix

5. Calculate within-species cell type functional similariy

# heatmap of Pearson's correlation coefficient of cell type average BP profiles within species

mmu_corr = cellTypeGOCorr(cell_type_go = mmu_ct_go, corr_method = 'pearson')
pheatmap(mmu_corr)

dme_corr = cellTypeGOCorr(cell_type_go = dme_ct_go, corr_method = 'pearson')
pheatmap(dme_corr)

5. Calculate cross-species cell type functional similariy


# calculate Pearson's correlation coefficient of cell type average BP profiles across species

corr = crossSpeciesCellTypeGOCorr(species_1 = 'mmusculus', species_2 = 'dmelanogaster', cell_type_go_sp1 = mmu_ct_go, cell_type_go_sp2 = dme_ct_go, corr_method = 'pearson')


# tries to put cells with higher values on the diagonal
# helpful when cross-species cell type similarity signal is less clear

plotCellTypeCorrHeatmap(corr, width = 9, height = 10)


# scale per column or row to see the relative similarity
plotCellTypeCorrHeatmap(corr, scale = 'column', width = 9, height = 10)

6. Dimensional reduction and UMAP visualization of cells with GO profile


# analyze the cell-by-GO BP profile as a count matrix
# Note that the example data has very few cells (for reducing data size), so I am using a small UMAP min_dist and small spread here. Default min_dist is 0.3 and spread is 1.

mmu_go_analyzed = analyzeGOSeurat(go_seurat_obj = mmu_go_obj, cell_type_col = 'cell_type_annotation', min.dist = 0.1, spread = 0.5)
#> perform normalization and log1p for mmu_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: spread
#> Warning: The following arguments are not used: spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 219
#> Number of edges: 9594
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.4931
#> Number of communities: 3
#> Elapsed time: 0 seconds
#> Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
#> To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
#> This message will be shown once per session
#> 04:14:20 UMAP embedding parameters a = 5.069 b = 1.003
#> 04:14:20 Read 219 rows and found 50 numeric columns
#> 04:14:20 Using Annoy for neighbor search, n_neighbors = 30
#> 04:14:20 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 04:14:20 Writing NN index file to temp file /tmp/RtmpBmg0FR/file2402ad8148c
#> 04:14:20 Searching Annoy index using 1 thread, search_k = 3000
#> 04:14:20 Annoy recall = 100%
#> 04:14:20 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 04:14:21 Initializing from normalized Laplacian + noise (using RSpectra)
#> 04:14:21 Commencing optimization for 500 epochs, with 7792 positive edges
#> 04:14:22 Optimization finished

# UMAP plot of the analyzed cell-by-GO BP profile
# labeled by previously specified cell annotation column in meta.data

DimPlot(mmu_go_analyzed, label = TRUE) + NoLegend()

dme_go_analyzed = analyzeGOSeurat(go_seurat_obj = dme_go_obj, cell_type_col = 'annotation', min_dist=0.1, spread=0.5)
#> perform normalization and log1p for dme_go_obj
#> Normalizing layer: counts
#> Warning: The following arguments are not used: min_dist, spread
#> Finding variable features for layer counts
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Computing nearest neighbor graph
#> Computing SNN
#> Warning: The following arguments are not used: min_dist, spread
#> Warning: The following arguments are not used: min_dist, spread
#> Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
#> 
#> Number of nodes: 180
#> Number of edges: 6265
#> 
#> Running Louvain algorithm...
#> Maximum modularity in 10 random starts: 0.5638
#> Number of communities: 4
#> Elapsed time: 0 seconds
#> Warning: The following arguments are not used: min_dist
#> 04:14:23 UMAP embedding parameters a = 3.244 b = 1.447
#> 04:14:23 Read 180 rows and found 50 numeric columns
#> 04:14:23 Using Annoy for neighbor search, n_neighbors = 30
#> 04:14:23 Building Annoy index with metric = cosine, n_trees = 50
#> 0%   10   20   30   40   50   60   70   80   90   100%
#> [----|----|----|----|----|----|----|----|----|----|
#> **************************************************|
#> 04:14:23 Writing NN index file to temp file /tmp/RtmpBmg0FR/file24027d99a3d2
#> 04:14:23 Searching Annoy index using 1 thread, search_k = 3000
#> 04:14:23 Annoy recall = 100%
#> 04:14:24 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
#> 04:14:24 Initializing from normalized Laplacian + noise (using RSpectra)
#> 04:14:24 Commencing optimization for 500 epochs, with 6618 positive edges
#> 04:14:25 Optimization finished

DimPlot(dme_go_analyzed, label = TRUE) + NoLegend()

7. Get co-up and co-down regulated terms between pairs of cell types


## calculation takes a few minutes due to the Wilcox signed rank test

ct_shared_go = scGOclust::getCellTypeSharedGO(species_1 = 'mmusculus', species_2 = 'dmelanogaster', analyzed_go_seurat_sp1 = mmu_go_analyzed, analyzed_go_seurat_sp2 = dme_go_analyzed, cell_type_col_sp1 = 'cell_type_annotation', cell_type_col_sp2 = 'annotation', layer_use = 'data')

head(ct_shared_go$shared_sig_markers)


# query shared GO terms for specific cell type pairs

getCellTypeSharedTerms(shared_go = ct_shared_go,
                       cell_type_sp1 = 'intestine_Enteroendocrine cell', 
                       cell_type_sp2 = 'enteroendocrine cell',
                       return_full = FALSE)


# viola
sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] scGOclust_0.2.1    httr_1.4.7         pheatmap_1.0.12    Seurat_5.2.1      
#> [5] SeuratObject_5.0.2 sp_2.2-0           rmarkdown_2.29    
#> 
#> loaded via a namespace (and not attached):
#>   [1] RcppAnnoy_0.0.22        splines_4.4.2           later_1.4.1            
#>   [4] filelock_1.0.3          tibble_3.2.1            polyclip_1.10-7        
#>   [7] fastDummies_1.7.5       lifecycle_1.0.4         httr2_1.1.0            
#>  [10] globals_0.16.3          lattice_0.22-6          MASS_7.3-64            
#>  [13] magrittr_2.0.3          limma_3.63.3            plotly_4.10.4          
#>  [16] sass_0.4.9              jquerylib_0.1.4         yaml_2.3.10            
#>  [19] httpuv_1.6.15           sctransform_0.4.1       spam_2.11-1            
#>  [22] spatstat.sparse_3.1-0   reticulate_1.40.0       cowplot_1.1.3          
#>  [25] pbapply_1.7-2           DBI_1.2.3               buildtools_1.0.0       
#>  [28] RColorBrewer_1.1-3      abind_1.4-8             Rtsne_0.17             
#>  [31] purrr_1.0.4             BiocGenerics_0.53.6     pracma_2.4.4           
#>  [34] rappdirs_0.3.3          GenomeInfoDbData_1.2.13 IRanges_2.41.3         
#>  [37] S4Vectors_0.45.4        ggrepel_0.9.6           irlba_2.3.5.1          
#>  [40] listenv_0.9.1           spatstat.utils_3.1-2    maketools_1.3.2        
#>  [43] goftest_1.2-3           RSpectra_0.16-2         spatstat.random_3.3-2  
#>  [46] fitdistrplus_1.2-2      parallelly_1.42.0       codetools_0.2-20       
#>  [49] xml2_1.3.6              tidyselect_1.2.1        UCSC.utils_1.3.1       
#>  [52] farver_2.1.2            matrixStats_1.5.0       stats4_4.4.2           
#>  [55] BiocFileCache_2.15.1    spatstat.explore_3.3-4  jsonlite_1.8.9         
#>  [58] progressr_0.15.1        ggridges_0.5.6          survival_3.8-3         
#>  [61] tools_4.4.2             progress_1.2.3          ica_1.0-3              
#>  [64] Rcpp_1.0.14             glue_1.8.0              gridExtra_2.3          
#>  [67] xfun_0.50               GenomeInfoDb_1.43.4     dplyr_1.1.4            
#>  [70] withr_3.0.2             fastmap_1.2.0           digest_0.6.37          
#>  [73] R6_2.6.0                mime_0.12               colorspace_2.1-1       
#>  [76] networkD3_0.4           scattermore_1.2         tensor_1.5             
#>  [79] spatstat.data_3.1-4     biomaRt_2.63.1          RSQLite_2.3.9          
#>  [82] tidyr_1.3.1             generics_0.1.3          data.table_1.16.4      
#>  [85] prettyunits_1.2.0       htmlwidgets_1.6.4       slanter_0.2-0          
#>  [88] uwot_0.2.2              pkgconfig_2.0.3         gtable_0.3.6           
#>  [91] blob_1.2.4              lmtest_0.9-40           XVector_0.47.2         
#>  [94] sys_3.4.3               htmltools_0.5.8.1       dotCall64_1.2          
#>  [97] scales_1.3.0            Biobase_2.67.0          png_0.1-8              
#> [100] spatstat.univar_3.1-1   knitr_1.49              reshape2_1.4.4         
#> [103] nlme_3.1-167            curl_6.2.0              cachem_1.1.0           
#> [106] zoo_1.8-12              stringr_1.5.1           KernSmooth_2.23-26     
#> [109] parallel_4.4.2          miniUI_0.1.1.1          AnnotationDbi_1.69.0   
#> [112] pillar_1.10.1           grid_4.4.2              vctrs_0.6.5            
#> [115] RANN_2.6.2              promises_1.3.2          dbplyr_2.5.0           
#> [118] xtable_1.8-4            cluster_2.1.8           evaluate_1.0.3         
#> [121] cli_3.6.4               compiler_4.4.2          rlang_1.1.5            
#> [124] crayon_1.5.3            future.apply_1.11.3     labeling_0.4.3         
#> [127] plyr_1.8.9              stringi_1.8.4           viridisLite_0.4.2      
#> [130] deldir_2.0-4            munsell_0.5.1           Biostrings_2.75.3      
#> [133] lazyeval_0.2.2          spatstat.geom_3.3-5     Matrix_1.7-2           
#> [136] RcppHNSW_0.6.0          hms_1.1.3               patchwork_1.3.0        
#> [139] bit64_4.6.0-1           future_1.34.0           ggplot2_3.5.1          
#> [142] KEGGREST_1.47.0         statmod_1.5.0           shiny_1.10.0           
#> [145] ROCR_1.0-11             igraph_2.1.4            memoise_2.0.1          
#> [148] bslib_0.9.0             bit_4.5.0.1