library(CRMetrics) # github.com/khodosevichlab/CRMetrics
library(qs)
library(magrittr)
library(dplyr)
library(ggplot2)
library(scHelper) # github.com/rrydbirk/scHelper
library(conos)
First, let look at metrics for each seq run
crm <- CRMetrics$new(data.path = c("counts_vis3/"), n.cores = 32, pal = ggsci::pal_npg()(9))
crm$plotSummaryMetrics(metrics = crm$selectMetrics(c(1:4,6,17,19)))
## Using 'sample' for 'comp.group'
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
We can plot an embedding of all cells. It’s a mess, we have way too many cells with low seq depth making clustering difficult.
crm$addDetailedMetrics()
crm$doPreprocessing()
crm$createEmbedding()
crm$plotEmbedding()
Let’s label the cells per donor.
spc.donor <- qread("spc_donor.qs")
crm$plotEmbedding(groups = spc.donor, plot.na = F, title = "Donors", mark.groups = F)
In order to remove ambient RNA, we need to prepare some things for CellBender
crm$prepareCellbender()
## 2024-11-18 22:46:38.199599 Started run using 6 cores
## 2024-11-18 22:46:38.202871 Loading HDF5 Cell Ranger outputs
## 2024-11-18 22:46:38.352955 Loading 6 count matrices using 6 cores
## 2024-11-18 22:47:16.472112 Done!
## 2024-11-18 22:47:16.473027 Calculating UMI counts per sample
## 2024-11-18 22:47:31.490998 Plotting
## 2024-11-18 22:47:31.764769 Done!
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
Suggested number of droplets to include
droplets.included <- crm$getTotalDroplets()
droplets.included
## C744 C745 C746 C748 C750 C751
## 58413 46893 94530 55269 69984 95691
Expected number of cells from 10x
crm$getExpectedCells()
## C744 C745 C746 C748 C750 C751
## 19471 15631 31510 18423 23328 31897
Adjusted vector
droplets.included["C744"] <- 5e4
droplets.included["C746"] <- 6e4
droplets.included["C750"] <- 5e4
droplets.included["C751"] <- 6e4
Plot with adjusted vector
crm$prepareCellbender(total.droplets = droplets.included)
## 2024-11-18 22:47:34.067586 Started run using 6 cores
## 2024-11-18 22:47:34.069952 Using stored HDF5 Cell Ranger outputs. To overwrite, set $cms.raw <- NULL
## 2024-11-18 22:47:34.070597 Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL
## 2024-11-18 22:47:34.07115 Plotting
## 2024-11-18 22:47:34.398991 Done!
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
Save bash script
crm$saveCellbenderScript(total.droplets = droplets.included)
crm$plotCbTraining()
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
crm$plotCbCellProbs()
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
Let’s plot top ambient genes identified per sample as proportions of all samples. We find some immune and mitochondrial genes (normal)
crm$plotCbAmbGenes()
## Warning in crm$plotCbAmbGenes(): Palette has 9 colors but there are 13 genes,
## omitting palette.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
We can plot top ambient genes per sample
crm$plotCbAmbExp()
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
Let’s load the corrected count matrices from CellBender and plot metrics for these.
crm$cms <- NULL
crm$detailed.metrics <- NULL
crm$addCms(cellbender = T)
## 2024-11-18 22:48:26.262189 Loading 6 count matrices using 6 cores
## 2024-11-18 22:48:33.831682 Done!
## Warning in crm$addCms(cellbender = T): Overwriting metadata
## Warning in crm$addCms(cellbender = T): Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding().
crm$addDetailedMetrics()
## 2024-11-18 22:48:36.210275 Counting using 6 cores
## 2024-11-18 22:49:02.52048 Creating table
## 2024-11-18 22:49:02.628244 Done!
crm$addSummaryFromCms()
## Warning in crm$addSummaryFromCms(): Overwriting existing summary metrics
## 2024-11-18 22:49:02.632638 Calculating 6 summaries using 6 cores
## 2024-11-18 22:49:26.867723 Done!
crm$plotSummaryMetrics()
## Using 'sample' for 'comp.group'
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
crm$plotDetailedMetrics()
## Using 'sample' for 'comp.group'
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
These are the results for demultiplexed cell barcodes.
donor_cb <- c("demux_poolA/donor_ids.tsv", "demux_poolB/donor_ids.tsv") %>%
lapply(read.delim, sep = "\t") %>%
`names<-`(c("PoolA","PoolB")) %>%
{lapply(seq_len(length(.)), \(x) mutate(.[[x]], pool = names(.)[x]))} %>%
bind_rows() %>%
filter(donor_id == "unassigned") %>%
mutate(donor_pool = paste0(donor_id,"_",pool))
We can plot best guess for all unassigned cell barcodes (CBs)
donor_cb %>%
mutate(singlet_pool = paste0(best_singlet,"_",pool)) %$%
table(singlet_pool) %>%
data.frame() %>%
ggplot(aes(singlet_pool, Freq, fill = singlet_pool)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(y = "Counts", x = "Donor", title = "Unassigned, best guess")
Next, we plot possible doublets for unassigned CBs - needs to be adjusted for pools
list(
donor_cb %$%
table(best_doublet) %>%
data.frame() %>%
mutate(donor = best_doublet %>%
as("character") %>%
strsplit(",") %>%
sapply('[[', 1) %>%
strsplit("_") %>%
sapply('[[', 2) %>%
factor()),
donor_cb %$%
table(best_doublet) %>%
data.frame() %>%
mutate(donor = best_doublet %>%
as("character") %>%
strsplit(",") %>%
sapply('[[', 2) %>%
strsplit("_") %>%
sapply('[[', 2) %>%
factor())
) %>%
bind_rows() %>%
group_by(donor) %>%
summarize(total = sum(Freq)) %>%
data.frame() %>%
ggplot(aes(donor, total, fill = donor)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(legend.position = "none") +
labs(x = "Donor", y = "Counts", title = "Unassigned, possible doublet")
donor_cb <- c("demux_poolA/donor_ids.tsv", "demux_poolB/donor_ids.tsv") %>%
lapply(read.delim, sep = "\t") %>%
lapply(filter, !donor_id %in% c("doublet","unassigned")) %>%
`names<-`(c("poolA","poolB")) %>%
{lapply(seq_len(length(.)), \(x) mutate(.[[x]], donor_id = sapply(donor_id, \(y) if (y %in% c("Donor_29","Donor_25")) paste0(y,"_",names(.)[x]) else y)))} %>%
bind_rows() %>%
select(cell, donor_id) %>%
mutate(cell = gsub("_", "!!", cell) %>% sapply(\(x) paste0(x,"-1")))
cm.merge <- crm$cms %>%
lapply(\(cm) cm[,colnames(cm) %in% donor_cb$cell]) %>%
conos:::mergeCountMatrices()
cb.per.donor <- donor_cb %$%
split(cell, donor_id)
cms <- cb.per.donor %>%
lapply(\(donor) cm.merge[,colnames(cm.merge) %in% donor])
Let’s create an embedding of all cells per donor instead of seq run
crm <- CRMetrics$new(cms = cms, n.cores = 32, pal = ggsci::pal_npg()(9))
crm$addSummaryFromCms()
crm$addDetailedMetrics()
crm$doPreprocessing()
crm$createEmbedding()
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
crm$plotEmbedding(title = "Clusters")
We can plot cells per donor. They don’t integrate well, not important at this step.
spc <- crm$con$getDatasetPerCell()
crm$plotEmbedding(groups = spc, mark.groups = F, title = "Donors")
spc.donor <- crm$con$getDatasetPerCell()
spc.donor %<>%
names() %>%
strsplit("!!") %>%
sapply(\(x) paste0(x[2],"!!",x[3])) %>%
{`names<-`(spc.donor, .)}
We’ll highlight all cells with UMI count < 1,000
crm$plotEmbedding(depth = T, depth.cutoff = 1e3)
Similarly, we plot cells per donor with UMI count < 1,000
crm$plotDepth(cutoff = 1e3)
## Warning: Removed 337 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 336 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 235 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 234 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 235 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 141 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 141 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 141 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 297 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 297 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 368 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 367 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 368 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 156 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 156 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 156 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 314 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 313 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 314 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 277 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 278 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 256 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 255 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 256 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 324 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 322 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 324 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 160 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 232 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 231 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 232 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 139 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 138 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 139 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 431 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 430 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 431 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 312 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 312 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 312 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 334 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 332 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 334 rows containing missing values or values outside the scale range
## (`geom_line()`).
Let’s plot cells with high (>5%) mitochondrial fraction. There’s not a lot, valiDrops caught them during demultiplexing
crm$plotEmbedding(mito.frac = T, mito.cutoff = 0.05)
Let’s look at doublets. We already removed some doublets during demultiplexing, but there may be more.
crm$detectDoublets(method = "doubletdetection",
export = T,
data.path = "doubletdetection/visit3/")
## Found 16 results for doubletdetection
## Detected 5501 possible doublets out of 81564 cells.
crm$addDoublets(method = "doubletdetection",
data.path = "doubletdetection/visit3/")
We highlight possible doublets. We see a lot of doublets between major clusters, and also some within a specific cluster to the right (adipocytes)
crm$plotEmbedding(doublet.method = "doubletdetection")
We plot all cells to be filtered
crm$plotFilteredCells(type = "embedding", depth = T, depth.cutoff = 1e3, mito.frac = T, mito.cutoff = 0.05, doublet.method = "doubletdetection")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.
We show proportion of cells to be filtered per donor
crm$plotFilteredCells(type = "bar", depth = T, depth.cutoff = 1e3, mito.frac = T, mito.cutoff = 0.05, doublet.method = "doubletdetection")
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
Finally, an overview of specific filters per donor. A few samples are alarming, but nothing catastrophic.
crm$plotFilteredCells(type = "tile", depth = T, depth.cutoff = 1e3, mito.frac = T, mito.cutoff = 0.05, doublet.method = "doubletdetection")
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
crm$filterCms(depth.cutoff = 1e3, mito.cutoff = 0.05, doublets = "doubletdetection")
## Filtering based on: depth.cutoff = 1000; mito.cutoff = 0.05 and species = human; mito.cutoff = 0.05 and species = mouse; doublet method = doubletdetection
## 2024-11-18 22:50:46.891794 Preparing filter
## 2024-11-18 22:50:49.618516 Removing 10589 of 81564 cells ( 13%)
Now, we remove mito. and ribo. genes since we’ve seen they skew several downstream analyses.
genes.to.omit <- crm$cms.filtered %>%
lapply(rownames) %>%
Reduce(union, .) %>%
.[grepl("MT-|RPS|RPL", .)] %>%
.[!. %in% c("TRPS1","PURPL")]
crm$cms.filtered %<>% lapply(\(x) x[!rownames(x) %in% genes.to.omit, ])