library(Seurat)
library(Matrix)
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(ggExtra)
library(cowplot)
library(reticulate)
library(wesanderson)
use_python("/usr/bin/python3")
#Set ggplot theme as classic
theme_set(theme_classic())
<- Read10X("outs/filtered_feature_bc_matrix/")
Countdata
<- CreateSeuratObject(counts = Countdata,
Raw.data project = "Pidd1_KO",
min.cells = 3,
min.features = 800)
$Barcodes <- rownames(Raw.data@meta.data)
Raw.data
rm(Countdata)
dim(Raw.data)
## [1] 21491 14705
$percent.mito <- PercentageFeatureSet(Raw.data, pattern = "^mt-")
Raw.data$percent.ribo <- PercentageFeatureSet(Raw.data, pattern = "(^Rpl|^Rps|^Mrp)") Raw.data
VlnPlot(object = Raw.data, features = c("nFeature_RNA","nCount_RNA", "percent.mito", "percent.ribo"), ncol= 2) & NoAxes()
# Inspect cell based on relation between nUMI and nGene detected
# Relation between nUMI and nGene detected
<- Raw.data@meta.data
Cell.QC.Stat
<- ggplot(Cell.QC.Stat, aes(x=nCount_RNA, y=nFeature_RNA)) + geom_point() + geom_smooth(method="lm")
p1 <- ggMarginal(p1, type = "histogram", fill="lightgrey")
p1
<- ggplot(Cell.QC.Stat, aes(x=log10(nCount_RNA), y=log10(nFeature_RNA))) + geom_point() + geom_smooth(method="lm")
p2 <- ggMarginal(p2, type = "histogram", fill="lightgrey")
p2
plot_grid(plotlist = list(p1,p2), ncol=2, align='h', rel_widths = c(1, 1)) ; rm(p1,p2)
<- AddModuleScore(Raw.data,
Raw.data features = list(c("Hbb-bt", "Hbq1a", "Isg20", "Fech", "Snca", "Rec114")),
ctrl = 10,
name = "Erythrocyte.signature")
$Erythrocyte.signature <- Raw.data$Erythrocyte.signature1 Cell.QC.Stat
<- colorRampPalette(brewer.pal(n =11, name = "Spectral"))(100)
gradient
<- ggplot(Cell.QC.Stat, aes(log10(nCount_RNA), y=log10(nFeature_RNA))) +
p1 geom_point(aes(color= Erythrocyte.signature)) +
scale_color_gradientn(colours=rev(gradient), name='Erythrocyte score') + theme(legend.position="none")
<- ggplot(Cell.QC.Stat, aes(log10(nCount_RNA), y=log10(nFeature_RNA))) +
p2 geom_point(aes(color= percent.mito)) +
scale_color_gradientn(colours=rev(gradient), name='Percent mito') + theme(legend.position="none")
<- ggplot(Cell.QC.Stat, aes(log10(nCount_RNA), y=log10(nFeature_RNA))) +
p3 geom_point(aes(color= percent.ribo)) +
scale_color_gradientn(colours=rev(gradient), name='Percent ribo') + theme(legend.position="none")
+ p2 + p3 p1
$Erythrocyte <- ifelse(Cell.QC.Stat$Erythrocyte.signature > 0.1, "Erythrocyte", "Not_Erythrocyte") Cell.QC.Stat
<- ggplot(Cell.QC.Stat, aes(x=log10(nCount_RNA), y=log10(nFeature_RNA))) +
p2 geom_point(aes(colour = Erythrocyte)) +
theme(legend.position="none")
ggMarginal(p2, type = "histogram", fill="lightgrey")
# Filter cells based on these thresholds
<- Cell.QC.Stat %>% filter(Cell.QC.Stat$Erythrocyte.signature < 0.1) Cell.QC.Stat
We applied a high and low median absolute deviation (mad) thresholds to exclude outlier cells
<- median(Cell.QC.Stat$percent.mito) + 4*mad(Cell.QC.Stat$percent.mito)
max.mito.thr <- median(Cell.QC.Stat$percent.mito) - 4*mad(Cell.QC.Stat$percent.mito) min.mito.thr
<- ggplot(Cell.QC.Stat, aes(x=nFeature_RNA, y=percent.mito)) +
p1 geom_point() +
geom_hline(aes(yintercept = max.mito.thr), colour = "red", linetype = 2) +
geom_hline(aes(yintercept = min.mito.thr), colour = "red", linetype = 2) +
annotate(geom = "text", label = paste0(as.numeric(table(Cell.QC.Stat$percent.mito > max.mito.thr | Cell.QC.Stat$percent.mito < min.mito.thr)[2])," cells removed\n",
as.numeric(table(Cell.QC.Stat$percent.mito > max.mito.thr | Cell.QC.Stat$percent.mito < min.mito.thr)[1])," cells remain"),
x = 6000, y = 20)
ggMarginal(p1, type = "histogram", fill="lightgrey", bins=100)
# Filter cells based on these thresholds
<- Cell.QC.Stat %>% filter(percent.mito < max.mito.thr) %>% filter(percent.mito > min.mito.thr) Cell.QC.Stat
We filter cells which are likely to be doublet based on their higher content of transcript detected as well as cell with to few genes/UMI sequenced
# Set low and hight thresholds on the number of detected genes based on the one obtain with the WT dataset
<- log10(2000)
min.Genes.thr <- log10(median(Cell.QC.Stat$nFeature_RNA) + 3*mad(Cell.QC.Stat$nFeature_RNA))
max.Genes.thr
# Set hight threshold on the number of transcripts
<- log10(median(Cell.QC.Stat$nCount_RNA) + 3*mad(Cell.QC.Stat$nCount_RNA)) max.nUMI.thr
# Gene/UMI scatter plot before filtering
<- ggplot(Cell.QC.Stat, aes(x=log10(nCount_RNA), y=log10(nFeature_RNA))) +
p1 geom_point() +
geom_smooth(method="lm") +
geom_hline(aes(yintercept = min.Genes.thr), colour = "green", linetype = 2) +
geom_hline(aes(yintercept = max.Genes.thr), colour = "green", linetype = 2) +
geom_vline(aes(xintercept = max.nUMI.thr), colour = "red", linetype = 2)
ggMarginal(p1, type = "histogram", fill="lightgrey")
# Filter cells base on both metrics
<- Cell.QC.Stat %>% filter(log10(nFeature_RNA) > min.Genes.thr) %>% filter(log10(nCount_RNA) < max.nUMI.thr) Cell.QC.Stat
<- lm(data = Cell.QC.Stat, formula = log10(nFeature_RNA) ~ log10(nCount_RNA))
lm.model
<- ggplot(Cell.QC.Stat, aes(x=log10(nCount_RNA), y=log10(nFeature_RNA))) +
p2 geom_point() +
geom_smooth(method="lm") +
geom_hline(aes(yintercept = min.Genes.thr), colour = "green", linetype = 2) +
geom_hline(aes(yintercept = max.Genes.thr), colour = "green", linetype = 2) +
geom_vline(aes(xintercept = max.nUMI.thr), colour = "red", linetype = 2) +
annotate(geom = "text", label = paste0(dim(Cell.QC.Stat)[1], " QC passed cells"), x = 4, y = 3.8)
ggMarginal(p2, type = "histogram", fill="lightgrey")
<- subset(x = Raw.data, subset = Barcodes %in% Cell.QC.Stat$Barcodes) Raw.data
# Plot final QC metrics
VlnPlot(object = Raw.data, features = c("nFeature_RNA","nCount_RNA", "percent.mito", "percent.ribo"), ncol= 2) & NoAxes()
<- ggplot(Raw.data@meta.data, aes(x=log10(nCount_RNA), y=log10(nFeature_RNA))) + geom_point() + geom_smooth(method="lm")
p1 ggMarginal(p1, type = "histogram", fill="lightgrey")
rm(list = ls()[!ls() %in% "Raw.data"])
Export raw count matrix as input to Scrublet
#Export filtered matrix
dir.create("Scrublet_inputs")
<- Matrix(as.matrix(Raw.data@assays[["RNA"]]@counts), sparse = TRUE)
exprData writeMM(exprData, "Scrublet_inputs/matrix1.mtx")
## NULL
import scrublet as scr
import scipy.io
import numpy as np
import os
#Load raw counts matrix and gene list
= 'Scrublet_inputs'
input_dir = scipy.io.mmread(input_dir + '/matrix1.mtx').T.tocsc()
counts_matrix
#Initialize Scrublet
= scr.Scrublet(counts_matrix, expected_doublet_rate=0.1, sim_doublet_ratio=2, n_neighbors = 8)
scrub
#Run the default pipeline
= scrub.scrub_doublets(min_counts=1, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=25) doublet_scores, predicted_doublets
## Preprocessing...
## Simulating doublets...
## Embedding transcriptomes using PCA...
## Calculating doublet scores...
## Automatically set threshold at doublet score = 0.34
## Detected doublet rate = 4.4%
## Estimated detectable doublet fraction = 35.8%
## Overall doublet rate:
## Expected = 10.0%
## Estimated = 12.3%
## Elapsed time: 22.8 seconds
# Import scrublet's doublet score
$Doubletscore <- py$doublet_scores
Raw.data
# Plot doublet score
ggplot(Raw.data@meta.data, aes(x = Doubletscore, stat(ndensity))) +
geom_histogram(bins = 200, colour ="lightgrey")+
geom_vline(xintercept = 0.2, colour = "red", linetype = 2)
# Manually set threshold at doublet score to 0.2
$Predicted_doublets <- ifelse(py$doublet_scores > 0.2, "Doublet","Singlet")
Raw.datatable(Raw.data$Predicted_doublets)
##
## Doublet Singlet
## 1103 10177
<- subset(x = Raw.data, subset = Predicted_doublets == "Singlet") Raw.data
dir.create("./SpringCoordinates")
# Export raw expression matrix and gene list to regenerate a spring plot
<- Matrix(as.matrix(Raw.data@assays[["RNA"]]@counts), sparse = TRUE)
exprData writeMM(exprData, "./SpringCoordinates/ExprData.mtx")
## NULL
<- row.names(Raw.data@assays[["RNA"]]@counts)
Genelist write.table(Genelist, "./SpringCoordinates/Genelist.csv", sep="\t", col.names = F, row.names = F, quote = F)
#Export metadata
<- c("Scrublet", Raw.data$Predicted_doublets)
Scrublet <- paste(Scrublet, sep=",", collapse=",")
Scrublet
<- Scrublet
Cellgrouping write.table(Cellgrouping, "./SpringCoordinates/Cellgrouping.csv", quote =F, row.names = F, col.names = F)
<- read.table("SpringCoordinates/coordinates.txt", sep = ",", header = F, row.names = 1)
spring.coor colnames(spring.coor) <- c("Spring_1", "Spring_2")
<- function(x){
Spring.Sym = abs(max(spring.coor$Spring_2)-x)
x
}
$Spring_2 <- sapply(spring.coor$Spring_2, function(x) Spring.Sym(x)) spring.coor
$Spring_1 <- spring.coor$Spring_1
Raw.data$Spring_2 <- spring.coor$Spring_2 Raw.data
<- as.matrix(Raw.data@meta.data %>% select("Spring_1", "Spring_2"))
spring
"spring"]] <- CreateDimReducObject(embeddings = spring, key = "Spring_", assay = DefaultAssay(Raw.data)) Raw.data[[
DimPlot(Raw.data,
reduction = "spring",
pt.size = 0.5) & NoAxes()
# Broad clustering
<- c("Mcm5", "Pcna", "Tym5", "Fen1", "Mcm2", "Mcm4", "Rrm1", "Ung", "Gins2", "Mcm6", "Cdca7", "Dtl", "Prim1", "Uhrf1", "Mlf1ip", "Hells", "Rfc2", "Rap2", "Nasp", "Rad51ap1", "Gmnn", "Wdr76", "Slbp", "Ccne2", "Ubr7", "Pold3", "Msh2", "Atad2", "Rad51", "Rrm2", "Cdc45", "Cdc6", "Exo1", "Tipin", "Dscc1", "Blm", " Casp8ap2", "Usp1", "Clspn", "Pola1", "Chaf1b", "Brip1", "E2f8")
s.genes <- c("Hmgb2", "Ddk1","Nusap1", "Ube2c", "Birc5", "Tpx2", "Top2a", "Ndc80", "Cks2", "Nuf2", "Cks1b", "Mki67", "Tmpo", " Cenpk", "Tacc3", "Fam64a", "Smc4", "Ccnb2", "Ckap2l", "Ckap2", "Aurkb", "Bub1", "Kif11", "Anp32e", "Tubb4b", "Gtse1", "kif20b", "Hjurp", "Cdca3", "Hn1", "Cdc20", "Ttk", "Cdc25c", "kif2c", "Rangap1", "Ncapd2", "Dlgap5", "Cdca2", "Cdca8", "Ect2", "Kif23", "Hmmr", "Aurka", "Psrc1", "Anln", "Lbr", "Ckap5", "Cenpe", "Ctcf", "Nek2", "G2e3", "Gas2l3", "Cbx5", "Cenpa")
g2m.genes
<- CellCycleScoring(Raw.data, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
Raw.data $CC.Difference <- Raw.data$S.Score - Raw.data$G2M.Score Raw.data
<- SCTransform(Raw.data,
Raw.data method = "glmGamPoi",
vars.to.regress = c("percent.mito", "CC.Difference", "nCount_RNA"),
verbose = T)
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 2%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 20%
|
|================ | 22%
|
|================== | 25%
|
|=================== | 28%
|
|===================== | 30%
|
|======================= | 32%
|
|======================== | 35%
|
|========================== | 38%
|
|============================ | 40%
|
|============================== | 42%
|
|================================ | 45%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 55%
|
|======================================== | 58%
|
|========================================== | 60%
|
|============================================ | 62%
|
|============================================== | 65%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|=================================================== | 72%
|
|==================================================== | 75%
|
|====================================================== | 78%
|
|======================================================== | 80%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 92%
|
|================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
##
|
| | 0%
|
|== | 2%
|
|==== | 5%
|
|===== | 8%
|
|======= | 10%
|
|========= | 12%
|
|========== | 15%
|
|============ | 18%
|
|============== | 20%
|
|================ | 22%
|
|================== | 25%
|
|=================== | 28%
|
|===================== | 30%
|
|======================= | 32%
|
|======================== | 35%
|
|========================== | 38%
|
|============================ | 40%
|
|============================== | 42%
|
|================================ | 45%
|
|================================= | 48%
|
|=================================== | 50%
|
|===================================== | 52%
|
|====================================== | 55%
|
|======================================== | 58%
|
|========================================== | 60%
|
|============================================ | 62%
|
|============================================== | 65%
|
|=============================================== | 68%
|
|================================================= | 70%
|
|=================================================== | 72%
|
|==================================================== | 75%
|
|====================================================== | 78%
|
|======================================================== | 80%
|
|========================================================== | 82%
|
|============================================================ | 85%
|
|============================================================= | 88%
|
|=============================================================== | 90%
|
|================================================================= | 92%
|
|================================================================== | 95%
|
|==================================================================== | 98%
|
|======================================================================| 100%
<- RunPCA(Raw.data, verbose = FALSE)
Raw.data
<- FindNeighbors(Raw.data,
Raw.data dims = 1:20,
k.param = 8)
<- FindClusters(Raw.data, resolution = 0.3) Raw.data
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 10177
## Number of edges: 206464
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9042
## Number of communities: 9
## Elapsed time: 0 seconds
DimPlot(Raw.data,
reduction = "spring",
cols = c("#ebcb2e", "#9ec22f", "#a9961b", "#cc3a1b", "#d14c8d", "#4cabdc", "#5ab793", "#e7823a", "#046c9a", "#4990c9"),
pt.size = 0.5) & NoAxes()
$Broadclust.ident <- Raw.data$seurat_clusters
Raw.data
<- Raw.data %>% subset(idents = c(0,1,2,3,4,5,6)) Raw.data
DimPlot(Raw.data,
reduction = "spring",
cols = c("#ebcb2e", "#9ec22f", "#a9961b", "#cc3a1b", "#d14c8d", "#4cabdc", "#5ab793", "#e7823a", "#046c9a", "#4990c9"),
pt.size = 0.5) & NoAxes()
ggplot(as.data.frame(t(Raw.data@assays$SCT@scale.data)), aes(x = eYFP, stat(ndensity))) +
geom_histogram(bins = 200, colour ="lightgrey")+
geom_vline(xintercept = -0.8, colour = "red", linetype = 2)
table(Raw.data@assays$SCT@scale.data["eYFP",] > -0.8)
##
## FALSE TRUE
## 6087 3956
$Genotype <- ifelse(Raw.data@assays$SCT@scale.data["eYFP",] > -0.8, "Ctrl.eYFP", "Pidd1.KO") Raw.data
DimPlot(object = Raw.data,
split.by = "Genotype",
group.by = "Genotype",
reduction = "spring",
cols = c("#7293c8", "#cc3a1b"),
pt.size = 0.5) & NoAxes()
saveRDS(Raw.data, "./Pidd1KO.cells.RDS")
#date
format(Sys.time(), "%d %B, %Y, %H,%M")
## [1] "16 janvier, 2023, 11,48"
#Packages used
sessionInfo()
## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /home/matthieu/anaconda3/lib/libmkl_rt.so.1
##
## locale:
## [1] LC_CTYPE=fr_FR.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=fr_FR.UTF-8 LC_COLLATE=fr_FR.UTF-8
## [5] LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=fr_FR.UTF-8
## [7] LC_PAPER=fr_FR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] wesanderson_0.3.6 reticulate_1.22 cowplot_1.1.1 ggExtra_0.9
## [5] ggplot2_3.4.0 RColorBrewer_1.1-2 dplyr_1.0.7 Matrix_1.5-1
## [9] SeuratObject_4.0.4 Seurat_4.0.5
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_2.0-2 deldir_1.0-6
## [4] ellipsis_0.3.2 ggridges_0.5.3 rprojroot_2.0.2
## [7] rstudioapi_0.13 spatstat.data_2.1-0 farver_2.1.0
## [10] leiden_0.3.9 listenv_0.8.0 ggrepel_0.9.1
## [13] fansi_0.5.0 codetools_0.2-18 splines_4.2.2
## [16] knitr_1.36 polyclip_1.10-0 jsonlite_1.7.2
## [19] ica_1.0-2 cluster_2.1.4 png_0.1-7
## [22] uwot_0.1.10 shiny_1.7.1 sctransform_0.3.2
## [25] spatstat.sparse_2.0-0 compiler_4.2.2 httr_1.4.2
## [28] assertthat_0.2.1 fastmap_1.1.0 lazyeval_0.2.2
## [31] cli_3.4.1 later_1.3.0 htmltools_0.5.2
## [34] tools_4.2.2 igraph_1.2.11 gtable_0.3.0
## [37] glue_1.5.1 RANN_2.6.1 reshape2_1.4.4
## [40] Rcpp_1.0.8 scattermore_0.7 jquerylib_0.1.4
## [43] vctrs_0.5.1 nlme_3.1-153 lmtest_0.9-39
## [46] xfun_0.28 stringr_1.4.0 globals_0.14.0
## [49] mime_0.12 miniUI_0.1.1.1 lifecycle_1.0.3
## [52] irlba_2.3.3 goftest_1.2-3 future_1.23.0
## [55] MASS_7.3-58 zoo_1.8-9 scales_1.2.1
## [58] spatstat.core_2.3-1 promises_1.2.0.1 spatstat.utils_2.2-0
## [61] parallel_4.2.2 yaml_2.2.1 pbapply_1.5-0
## [64] gridExtra_2.3 sass_0.4.0 rpart_4.1.19
## [67] stringi_1.7.6 highr_0.9 rlang_1.0.6
## [70] pkgconfig_2.0.3 matrixStats_0.61.0 evaluate_0.14
## [73] lattice_0.20-45 ROCR_1.0-11 purrr_0.3.4
## [76] tensor_1.5 labeling_0.4.2 patchwork_1.1.1
## [79] htmlwidgets_1.5.4 tidyselect_1.1.1 here_1.0.1
## [82] parallelly_1.29.0 RcppAnnoy_0.0.19 plyr_1.8.6
## [85] magrittr_2.0.2 R6_2.5.1 generics_0.1.1
## [88] DBI_1.1.1 withr_2.5.0 mgcv_1.8-41
## [91] pillar_1.6.4 fitdistrplus_1.1-6 survival_3.4-0
## [94] abind_1.4-5 tibble_3.1.6 future.apply_1.8.1
## [97] crayon_1.4.2 KernSmooth_2.23-20 utf8_1.2.2
## [100] spatstat.geom_2.4-0 plotly_4.10.0 rmarkdown_2.11
## [103] grid_4.2.2 data.table_1.14.2 digest_0.6.29
## [106] xtable_1.8-4 tidyr_1.1.4 httpuv_1.6.3
## [109] munsell_0.5.0 viridisLite_0.4.0 bslib_0.3.1
Institute of Psychiatry and Neuroscience of Paris, INSERM U1266, 75014, Paris, France, matthieu.moreau@inserm.fr↩︎