library(CRMetrics) # github.com/khodosevichlab/CRMetrics
library(qs)
library(magrittr)
library(dplyr)
library(ggplot2)
library(scHelper) # github.com/rrydbirk/scHelper
library(conos)

QC per seq run

First, let look at metrics for each seq run

crm <- CRMetrics$new(data.path = c("counts_vis3/"), n.cores = 32, pal = ggsci::pal_npg()(9))
crm$plotSummaryMetrics(metrics = crm$selectMetrics(c(1:4,6,17,19)))
## Using 'sample' for 'comp.group'
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Plot initial embedding

We can plot an embedding of all cells. It’s a mess, we have way too many cells with low seq depth making clustering difficult.

crm$addDetailedMetrics()
crm$doPreprocessing()
crm$createEmbedding()
crm$plotEmbedding()

Per donor

Let’s label the cells per donor.

spc.donor <- qread("spc_donor.qs")
crm$plotEmbedding(groups = spc.donor, plot.na = F, title = "Donors", mark.groups = F)

Prepare Cellbender

In order to remove ambient RNA, we need to prepare some things for CellBender

crm$prepareCellbender()
## 2024-11-18 22:46:38.199599 Started run using 6 cores
## 2024-11-18 22:46:38.202871 Loading HDF5 Cell Ranger outputs
## 2024-11-18 22:46:38.352955 Loading 6 count matrices using 6 cores
## 2024-11-18 22:47:16.472112 Done!
## 2024-11-18 22:47:16.473027 Calculating UMI counts per sample
## 2024-11-18 22:47:31.490998 Plotting
## 2024-11-18 22:47:31.764769 Done!
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Suggested number of droplets to include

droplets.included <- crm$getTotalDroplets()
droplets.included
##  C744  C745  C746  C748  C750  C751 
## 58413 46893 94530 55269 69984 95691

Expected number of cells from 10x

crm$getExpectedCells()
##  C744  C745  C746  C748  C750  C751 
## 19471 15631 31510 18423 23328 31897

Adjusted vector

droplets.included["C744"] <- 5e4
droplets.included["C746"] <- 6e4
droplets.included["C750"] <- 5e4
droplets.included["C751"] <- 6e4

Plot with adjusted vector

crm$prepareCellbender(total.droplets = droplets.included)
## 2024-11-18 22:47:34.067586 Started run using 6 cores
## 2024-11-18 22:47:34.069952 Using stored HDF5 Cell Ranger outputs. To overwrite, set $cms.raw <- NULL
## 2024-11-18 22:47:34.070597 Using stored UMI counts calculations. To overwrite, set $cellbender$umi.counts <- NULL
## 2024-11-18 22:47:34.07115 Plotting
## 2024-11-18 22:47:34.398991 Done!
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Save bash script

crm$saveCellbenderScript(total.droplets = droplets.included)

Check results

crm$plotCbTraining()
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

crm$plotCbCellProbs()
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Let’s plot top ambient genes identified per sample as proportions of all samples. We find some immune and mitochondrial genes (normal)

crm$plotCbAmbGenes()
## Warning in crm$plotCbAmbGenes(): Palette has 9 colors but there are 13 genes,
## omitting palette.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

We can plot top ambient genes per sample

crm$plotCbAmbExp()
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Load and plot cleaned CMs

Let’s load the corrected count matrices from CellBender and plot metrics for these.

crm$cms <- NULL
crm$detailed.metrics <- NULL
crm$addCms(cellbender = T)
## 2024-11-18 22:48:26.262189 Loading 6 count matrices using 6 cores
## 2024-11-18 22:48:33.831682 Done!
## Warning in crm$addCms(cellbender = T): Overwriting metadata
## Warning in crm$addCms(cellbender = T): Consider updating embedding by setting $cms.preprocessed <- NULL and $con <- NULL, and running $doPreprocessing() and $createEmbedding().
crm$addDetailedMetrics()
## 2024-11-18 22:48:36.210275 Counting using 6 cores
## 2024-11-18 22:49:02.52048 Creating table
## 2024-11-18 22:49:02.628244 Done!
crm$addSummaryFromCms()
## Warning in crm$addSummaryFromCms(): Overwriting existing summary metrics
## 2024-11-18 22:49:02.632638 Calculating 6 summaries using 6 cores
## 2024-11-18 22:49:26.867723 Done!
crm$plotSummaryMetrics()
## Using 'sample' for 'comp.group'
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

crm$plotDetailedMetrics()
## Using 'sample' for 'comp.group'
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

QC per donor

These are the results for demultiplexed cell barcodes.

donor_cb <- c("demux_poolA/donor_ids.tsv", "demux_poolB/donor_ids.tsv") %>% 
  lapply(read.delim, sep = "\t") %>% 
  `names<-`(c("PoolA","PoolB")) %>% 
  {lapply(seq_len(length(.)), \(x) mutate(.[[x]], pool = names(.)[x]))} %>% 
  bind_rows() %>% 
  filter(donor_id == "unassigned") %>% 
  mutate(donor_pool = paste0(donor_id,"_",pool))

We can plot best guess for all unassigned cell barcodes (CBs)

donor_cb %>% 
  mutate(singlet_pool = paste0(best_singlet,"_",pool)) %$% 
  table(singlet_pool) %>% 
  data.frame() %>%
  ggplot(aes(singlet_pool, Freq, fill = singlet_pool)) + 
  geom_bar(stat = "identity") + 
  theme_bw() + 
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(y = "Counts", x = "Donor", title = "Unassigned, best guess")

Next, we plot possible doublets for unassigned CBs - needs to be adjusted for pools

list(
  donor_cb %$%
    table(best_doublet) %>% 
    data.frame() %>% 
    mutate(donor = best_doublet %>% 
             as("character") %>% 
             strsplit(",") %>% 
             sapply('[[', 1) %>% 
             strsplit("_") %>% 
             sapply('[[', 2) %>% 
             factor()),
  donor_cb %$%
    table(best_doublet) %>% 
    data.frame() %>% 
    mutate(donor = best_doublet %>% 
             as("character") %>% 
             strsplit(",") %>% 
             sapply('[[', 2) %>% 
             strsplit("_") %>% 
             sapply('[[', 2) %>% 
             factor())
) %>% 
  bind_rows() %>% 
  group_by(donor) %>%
  summarize(total = sum(Freq)) %>% 
  data.frame() %>% 
  ggplot(aes(donor, total, fill = donor)) + 
  geom_bar(stat = "identity") + 
  theme_bw() + 
  theme(legend.position = "none") +
  labs(x = "Donor", y = "Counts", title = "Unassigned, possible doublet")

Get demultiplexed CBs

donor_cb <- c("demux_poolA/donor_ids.tsv", "demux_poolB/donor_ids.tsv") %>% 
  lapply(read.delim, sep = "\t") %>% 
  lapply(filter, !donor_id %in% c("doublet","unassigned")) %>% 
  `names<-`(c("poolA","poolB")) %>% 
  {lapply(seq_len(length(.)), \(x) mutate(.[[x]], donor_id = sapply(donor_id, \(y) if (y %in% c("Donor_29","Donor_25")) paste0(y,"_",names(.)[x]) else y)))} %>% 
  bind_rows() %>%
  select(cell, donor_id) %>% 
  mutate(cell = gsub("_", "!!", cell) %>% sapply(\(x) paste0(x,"-1")))
cm.merge <- crm$cms %>% 
  lapply(\(cm) cm[,colnames(cm) %in% donor_cb$cell]) %>% 
  conos:::mergeCountMatrices()

cb.per.donor <- donor_cb %$%
  split(cell, donor_id)

cms <- cb.per.donor %>% 
  lapply(\(donor) cm.merge[,colnames(cm.merge) %in% donor])

Create embedding

Let’s create an embedding of all cells per donor instead of seq run

crm <- CRMetrics$new(cms = cms, n.cores = 32, pal = ggsci::pal_npg()(9))
crm$addSummaryFromCms()
crm$addDetailedMetrics()
crm$doPreprocessing()
crm$createEmbedding()
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
## Warning in qread("/work/02_data/01_QC/vis3_crm_per_donor.qs", nthreads = 10):
## PROMSXP detected, replacing with NULL (see
## https://github.com/qsbase/qs/issues/93)
crm$plotEmbedding(title = "Clusters")

We can plot cells per donor. They don’t integrate well, not important at this step.

spc <- crm$con$getDatasetPerCell()
crm$plotEmbedding(groups = spc, mark.groups = F, title = "Donors")

spc.donor <- crm$con$getDatasetPerCell() 
spc.donor %<>% 
  names() %>% 
  strsplit("!!") %>% 
  sapply(\(x) paste0(x[2],"!!",x[3])) %>% 
  {`names<-`(spc.donor, .)}

Depth

We’ll highlight all cells with UMI count < 1,000

crm$plotEmbedding(depth = T, depth.cutoff = 1e3)

Similarly, we plot cells per donor with UMI count < 1,000

crm$plotDepth(cutoff = 1e3)
## Warning: Removed 337 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 336 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 337 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 235 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 234 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 235 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 141 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 141 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 141 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 297 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 295 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 297 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 368 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 367 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 368 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 156 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 156 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 156 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 314 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 313 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 314 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 278 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 277 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 278 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 256 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 255 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 256 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 324 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 322 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 324 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 162 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 160 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 162 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 232 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 231 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 232 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 139 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 138 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 139 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 431 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 430 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 431 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 312 rows containing non-finite outside the scale range
## (`stat_align()`).
## Removed 312 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 312 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 334 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning: Removed 332 rows containing non-finite outside the scale range
## (`stat_align()`).
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.
## Warning: Removed 334 rows containing missing values or values outside the scale range
## (`geom_line()`).

Mitochondrial fraction

Let’s plot cells with high (>5%) mitochondrial fraction. There’s not a lot, valiDrops caught them during demultiplexing

crm$plotEmbedding(mito.frac = T, mito.cutoff = 0.05)

Doublets

Let’s look at doublets. We already removed some doublets during demultiplexing, but there may be more.

crm$detectDoublets(method = "doubletdetection", 
                   export = T,
                   data.path = "doubletdetection/visit3/")
## Found 16 results for doubletdetection
## Detected 5501 possible doublets out of 81564 cells.
crm$addDoublets(method = "doubletdetection", 
                data.path = "doubletdetection/visit3/")

We highlight possible doublets. We see a lot of doublets between major clusters, and also some within a specific cluster to the right (adipocytes)

crm$plotEmbedding(doublet.method = "doubletdetection")

We plot all cells to be filtered

crm$plotFilteredCells(type = "embedding", depth = T, depth.cutoff = 1e3, mito.frac = T, mito.cutoff = 0.05, doublet.method = "doubletdetection")
## Scale for colour is already present.
## Adding another scale for colour, which will replace the existing scale.

We show proportion of cells to be filtered per donor

crm$plotFilteredCells(type = "bar", depth = T, depth.cutoff = 1e3, mito.frac = T, mito.cutoff = 0.05, doublet.method = "doubletdetection")
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Finally, an overview of specific filters per donor. A few samples are alarming, but nothing catastrophic.

crm$plotFilteredCells(type = "tile", depth = T, depth.cutoff = 1e3, mito.frac = T, mito.cutoff = 0.05, doublet.method = "doubletdetection")
## Warning in plot_theme(plot): The `legend.text.align` theme element is not
## defined in the element hierarchy.
## Warning in plot_theme(plot): The `legend.title.align` theme element is not
## defined in the element hierarchy.

Filter cells

crm$filterCms(depth.cutoff = 1e3, mito.cutoff = 0.05, doublets = "doubletdetection")
## Filtering based on: depth.cutoff = 1000; mito.cutoff = 0.05 and species = human; mito.cutoff = 0.05 and species = mouse; doublet method = doubletdetection
## 2024-11-18 22:50:46.891794 Preparing filter
## 2024-11-18 22:50:49.618516 Removing 10589 of 81564 cells (  13%)

Remove mito. and ribo. genes

Now, we remove mito. and ribo. genes since we’ve seen they skew several downstream analyses.

genes.to.omit <- crm$cms.filtered %>% 
  lapply(rownames) %>% 
  Reduce(union, .) %>% 
  .[grepl("MT-|RPS|RPL", .)] %>% 
  .[!. %in% c("TRPS1","PURPL")]

crm$cms.filtered %<>% lapply(\(x) x[!rownames(x) %in% genes.to.omit, ])