View on GitHub

Seurat_Integration_Introduction

Based on https://satijalab.org/seurat/articles/integration_introduction.html

suppressWarnings(library(Seurat))
## Registered S3 method overwritten by 'spatstat.geom':
##   method     from
##   print.boxx cli

## Attaching SeuratObject
suppressWarnings(library(SeuratData))
## -- Installed datasets ------------------------------------- SeuratData v0.2.1 --

## v ifnb   3.1.0                          v pbmc3k 3.1.4

## -------------------------------------- Key -------------------------------------

## v Dataset loaded successfully
## > Dataset built with a newer version of Seurat than installed
## (?) Unknown version of Seurat installed
suppressWarnings(library(patchwork))

Chapter 1 - Build an merged Seurat Object using own data

You can also load your own data using the read10x function Make sure you have all three file in the correct directory

  1. matrix.mtx
  2. genes.tsv
  3. barcodes.tsv
URL <- "D:/Documents/GitHub/Introduction-to-Seurat-Package/pbmc3k_filtered_gene_bc_matrices/filtered_gene_bc_matrices/hg19"

# Load the PBMC dataset1
pbmc.data1 <- Read10X(data.dir = URL)
# Initialize the Seurat object with the raw (non-normalized data).
ctrl <- CreateSeuratObject(counts = pbmc.data1, project = "pbmc3k", min.cells = 300, min.features = 200)

# Load the PBMC dataset2
pbmc.data2 <- Read10X(data.dir = URL)
# Initialize the Seurat object with the raw (non-normalized data).
stim <- CreateSeuratObject(counts = pbmc.data2, project = "pbmc3k", min.cells = 600, min.features = 200)

pmbc.list <- list(ctrl,stim)
merged.pbmc <- merge(x = ctrl, y = stim, add.cell.ids = c("3K", "4K"))

#more than one sample
#merged.pbmc <- merge(x = ctrl, y = c(stim1,stim2,stim3))

#Example from seurat
#https://satijalab.org/seurat/articles/merge_vignette.html
#pbmc.big <- merge(pbmc3k, y = c(pbmc4k, pbmc8k), add.cell.ids = c("3K", "4K", "8K"), project = "PBMC15K")
#pbmc.big

Chapter 1 - Adding MetaData

library(stringr)
sample <- names(merged.pbmc@active.ident)
sample_detect <- ifelse(str_detect(sample,"3K"),"3K","4K")

merged.pbmc@meta.data$sample <- sample_detect

Idents(object = merged.pbmc) <- "sample"

Chapter 1 - Normalized Data and Intergrate the two dataset

pmbc.list <-  SplitObject(merged.pbmc, split.by = "sample")

# perform standard preprocessing on each object
for (i in 1:length(pmbc.list)) {
  pmbc.list[[i]] <- NormalizeData(pmbc.list[[i]], verbose = FALSE)
  pmbc.list[[i]] <- subset(pmbc.list[[i]], downsample = 1000)
  pmbc.list[[i]] <- FindVariableFeatures(
    pmbc.list[[i]], selection.method = "vst",
    nfeatures = 2000, verbose = FALSE
  )
}

# select features that are repeatedly variable across datasets for integration run PCA on each
# dataset using these features
features <- SelectIntegrationFeatures(object.list = pmbc.list)
pmbc.list <- lapply(X = pmbc.list, FUN = function(x) {
    x <- ScaleData(x, features = features, verbose = FALSE)
    x <- RunPCA(x, features = features, verbose = FALSE)
})

Chapter 1 - Intergrate Data

#####################
# find anchors
anchors <- FindIntegrationAnchors(object.list = pmbc.list)
## Computing 2000 integration features

## Scaling features for provided objects

## Finding all pairwise anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 3803 anchors

## Filtering anchors

##  Retained 3018 anchors
# integrate data
merged.pbmc <- IntegrateData(anchorset = anchors)
## Merging dataset 2 into 1

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

Chapter 1.2 - Data Cleaning and Dimensional reduction for visualization

# Run the standard workflow for visualization and clustering
merged.pbmc <- ScaleData(merged.pbmc, verbose = FALSE)
merged.pbmc <- FindVariableFeatures(merged.pbmc, 
                                    selection.method = "vst",
                                    nfeatures = 2000, 
                                    verbose = FALSE)
## Warning in FindVariableFeatures.Assay(object = assay.data, selection.method =
## selection.method, : selection.method set to 'vst' but count slot is empty; will
## use data slot instead
merged.pbmc <- RunPCA(merged.pbmc, npcs = 30, verbose = FALSE)
merged.pbmc <- RunUMAP(merged.pbmc, reduction = "pca", dims = 1:30)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

## 13:14:12 UMAP embedding parameters a = 0.9922 b = 1.112

## 13:14:12 Read 2000 rows and found 30 numeric columns

## 13:14:12 Using Annoy for neighbor search, n_neighbors = 30

## 13:14:12 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 13:14:13 Writing NN index file to temp file C:\Users\harpa\AppData\Local\Temp\RtmpQV6obQ\file3b945fe1326e
## 13:14:13 Searching Annoy index using 1 thread, search_k = 3000
## 13:14:13 Annoy recall = 100%
## 13:14:14 Commencing smooth kNN distance calibration using 1 thread
## 13:14:14 Initializing from normalized Laplacian + noise
## 13:14:14 Commencing optimization for 500 epochs, with 90866 positive edges
## 13:14:23 Optimization finished
merged.pbmc <- FindNeighbors(merged.pbmc, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
merged.pbmc <- FindClusters(merged.pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2000
## Number of edges: 113355
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8722
## Number of communities: 9
## Elapsed time: 0 seconds
gc()
##            used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells  2768169 147.9    5367582 286.7  5367582 286.7
## Vcells 37097706 283.1   84887931 647.7 82907491 632.6
# Visualization

p1 <- DimPlot(merged.pbmc, reduction = "umap")
p1

Idents(object = merged.pbmc) <- "sample"
p2 <- DimPlot(merged.pbmc, reduction = "umap")
p2

p1+p2

# Find differentially expressed features between CD14+ Monocytes and all other cells, only
# search for positive markers
Idents(object = merged.pbmc) <- "sample"

sammple.markers <- FindMarkers(merged.pbmc, ident.1 = "3K", ident.2 = "4K")
# view results
head(sammple.markers)
##                  p_val avg_log2FC pct.1 pct.2     p_val_adj
## EBP      1.504473e-238  0.3406139 0.094 0.999 2.846463e-235
## MLX      1.389999e-231  0.3645216 0.100 1.000 2.629877e-228
## CARD8    1.389999e-231  0.3642012 0.100 1.000 2.629878e-228
## RFXANK   1.300650e-228  0.3218905 0.101 0.996 2.460830e-225
## CYB561D2 3.550510e-228  0.3677264 0.101 0.995 6.717564e-225
## HAGH     6.276764e-228  0.4230465 0.103 1.000 1.187564e-224
# There shouldn't be any, but.........

library(ComplexHeatmap)
## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.11.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
heatmapdf <- sammple.markers[1:25,]
row_ha = rowAnnotation("CD14+ Mono" = anno_barplot(heatmapdf$pct.1),
                       "Others"= anno_barplot(heatmapdf$pct.2),
                       width = unit(10, "cm"))

ht0 <- Heatmap(heatmapdf$avg_log2FC,
                name = "Log2FC",
                cluster_rows = TRUE, 
                row_labels = rownames(heatmapdf), 
                right_annotation = row_ha,
                width = unit(1, "cm"))

ht0

# Find differentially expressed features between CD14+ Monocytes and all other cells, only
# search for positive markers
Idents(object = merged.pbmc) <- "seurat_clusters"

cluster1.de.markers <- FindMarkers(merged.pbmc, ident.1 = "0", ident.2 = NULL, only.pos = TRUE)
# view results
head(cluster1.de.markers)
##             p_val avg_log2FC pct.1 pct.2    p_val_adj
## LTB  2.122043e-87  1.4506039 0.983 0.700 4.014906e-84
## LDHB 5.833112e-72  1.0527473 0.961 0.664 1.103625e-68
## IL32 1.055335e-70  1.0788059 0.939 0.601 1.996693e-67
## TPT1 2.409722e-52  0.5544837 1.000 0.985 4.559194e-49
## AQP3 2.515788e-52  1.1517965 0.678 0.463 4.759870e-49
## IL7R 3.722894e-51  1.4124211 0.763 0.442 7.043715e-48
library(ComplexHeatmap)

heatmapdf <- cluster1.de.markers[1:25,]
row_ha = rowAnnotation("CD14+ Mono" = anno_barplot(heatmapdf$pct.1),
                       "Others"= anno_barplot(heatmapdf$pct.2),
                       width = unit(10, "cm"))

ht1 <- Heatmap(heatmapdf$avg_log2FC,
                name = "Log2FC",
                cluster_rows = TRUE, 
                row_labels = rownames(heatmapdf), 
                right_annotation = row_ha,
                width = unit(1, "cm"))

ht1

####################################################################

Chapter 2 - Using ifnb Dataset (The proper way)

# install dataset for the first time
#InstallData("ifnb")

# load dataset
suppressWarnings(suppressMessages(LoadData("ifnb")))
## An object of class Seurat 
## 14053 features across 13999 samples within 1 assay 
## Active assay: RNA (14053 features, 0 variable features)
ifnb <- ifnb

#?ifnb
# split the dataset into a list of two seurat objects (stim and CTRL)
ifnb.list <- SplitObject(ifnb, split.by = "stim")
ifnb.list <- lapply(X = ifnb.list, FUN = NormalizeData)

# normalize and identify variable features for each dataset independently
ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
    x <- subset(x, downsample = 1000)
    x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
    
})

Chapter 2 - Feature selection

# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = ifnb.list)
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features)
## Scaling features for provided objects

## Finding all pairwise anchors

## Running CCA

## Merging objects

## Finding neighborhoods

## Finding anchors

##  Found 3188 anchors

## Filtering anchors

##  Retained 2460 anchors
# this command creates an 'integrated' data assay
immune.combined <- IntegrateData(anchorset = immune.anchors)
## Merging dataset 2 into 1

## Extracting anchors for merged samples

## Finding integration vectors

## Finding integration vector weights

## Integrating data

Chapter 2 - Data Procecssing

# specify that we will perform downstream analysis on the corrected data note that the
# original unmodified data still resides in the 'RNA' assay
DefaultAssay(immune.combined) <- "integrated"

# Run the standard workflow for visualization and clustering
immune.combined <- ScaleData(immune.combined, verbose = FALSE)
immune.combined <- RunPCA(immune.combined, npcs = 30, verbose = FALSE)
immune.combined <- RunUMAP(immune.combined, reduction = "pca", dims = 1:30)
## 13:15:24 UMAP embedding parameters a = 0.9922 b = 1.112

## 13:15:24 Read 2000 rows and found 30 numeric columns

## 13:15:24 Using Annoy for neighbor search, n_neighbors = 30

## 13:15:24 Building Annoy index with metric = cosine, n_trees = 50

## 0%   10   20   30   40   50   60   70   80   90   100%

## [----|----|----|----|----|----|----|----|----|----|

## **************************************************|
## 13:15:25 Writing NN index file to temp file C:\Users\harpa\AppData\Local\Temp\RtmpQV6obQ\file3b94734c3976
## 13:15:25 Searching Annoy index using 1 thread, search_k = 3000
## 13:15:25 Annoy recall = 100%
## 13:15:26 Commencing smooth kNN distance calibration using 1 thread
## 13:15:27 Initializing from normalized Laplacian + noise
## 13:15:27 Commencing optimization for 500 epochs, with 84996 positive edges
## 13:15:36 Optimization finished
immune.combined <- FindNeighbors(immune.combined, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
immune.combined <- FindClusters(immune.combined, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2000
## Number of edges: 102425
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8758
## Number of communities: 10
## Elapsed time: 0 seconds
gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  4634480 247.6    9451438  504.8   9451438  504.8
## Vcells 89072648 679.6  184431107 1407.1 184430469 1407.1

Chapter 2 - Visualise Data

# Visualization
p1 <- DimPlot(immune.combined, reduction = "umap", group.by = "stim")
p1

p2 <- DimPlot(immune.combined, reduction = "umap", label = TRUE, repel = TRUE)
p2

p1 + p2

Chapter 2 - Identify conserved cell type markers

# For performing differential expression after integration, we switch back to the original
# data
DefaultAssay(immune.combined) <- "RNA"
nk.markers <- FindConservedMarkers(immune.combined, ident.1 = 6, grouping.var = "stim", verbose = FALSE)

#https://www.biostars.org/p/409790/

#FindMarkers will find markers between two different identity groups - you have to specify both #identity groups. This is useful for comparing the differences between two specific groups.

#FindAllMarkers will find markers differentially expressed in each identity group by comparing #it to all of the others - you don't have to manually define anything. Note that markers may #bleed over between closely-related groups - they are not forced to be specific to only one #group. This is what most people use (and likely what you want).

#FindConservedMarkers will find markers that are conserved between two groups - this can be #useful if you want to find markers that are conserved between a treated and untreated condition #for a specific cell type or group of cells. It means they are differentially expressed compared #to other groups, but have similar expression between the two groups you're actually comparing.

head(nk.markers)
##          CTRL_p_val CTRL_avg_log2FC CTRL_pct.1 CTRL_pct.2 CTRL_p_val_adj
## SNHG12 1.612457e-52        2.255337      0.463      0.017   2.265985e-48
## HSPH1  6.420468e-31        2.951868      0.634      0.088   9.022683e-27
## SCML1  1.689201e-29        2.154189      0.341      0.021   2.373834e-25
## SRSF2  3.068016e-29        2.743712      0.854      0.217   4.311482e-25
## CLK1   4.662395e-27        2.311505      0.585      0.083   6.552063e-23
## NOP58  9.024730e-25        2.290002      0.561      0.081   1.268245e-20
##          STIM_p_val STIM_avg_log2FC STIM_pct.1 STIM_pct.2 STIM_p_val_adj
## SNHG12 9.064908e-32        1.776018      0.292      0.007   1.273892e-27
## HSPH1  5.851049e-11        2.792827      0.542      0.127   8.222480e-07
## SCML1  9.315024e-21        2.278457      0.417      0.034   1.309040e-16
## SRSF2  2.620323e-11        2.535208      0.667      0.186   3.682340e-07
## CLK1   1.253029e-15        2.535546      0.542      0.083   1.760882e-11
## NOP58  1.869061e-15        2.361140      0.583      0.096   2.626592e-11
##            max_pval minimump_p_val
## SNHG12 9.064908e-32   3.224913e-52
## HSPH1  5.851049e-11   1.284094e-30
## SCML1  9.315024e-21   3.378401e-29
## SRSF2  2.620323e-11   6.136031e-29
## CLK1   1.253029e-15   9.324790e-27
## NOP58  1.869061e-15   1.804946e-24
FeaturePlot(immune.combined, features = c("CD3D", "SELL", "CREM", "CD8A", "GNLY", "CD79A", "FCGR3A", "CCL2", "PPBP"), min.cutoff = "q9",pt.size = 2)

# Chapter 2 - Renaming clusters

levels(immune.combined)
##  [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9"
renamed.immune.combined <- RenameIdents(immune.combined, "1"= "new 1") 
levels(renamed.immune.combined)
##  [1] "new 1" "0"     "2"     "3"     "4"     "5"     "6"     "7"     "8"    
## [10] "9"

Chapter 2 - Dimplot with new cluster names

immune.combined <- RenameIdents(immune.combined, "0" = "CD14 Mono", "1" = "CD4 Naive T", "2" = "CD4 Memory T","3" = "CD16 Mono", "4" = "B", "5" = "CD8 T", "6" = "NK", "7" = "T activated", "8" = "DC", "9" = "B Activated", "10" = "Mk", "11" = "pDC", "12" = "Eryth", "13" = "Mono/Mk Doublets", "14" = "HSPC")
## Warning: Cannot find identity 14

## Warning: Cannot find identity 13

## Warning: Cannot find identity 12

## Warning: Cannot find identity 11

## Warning: Cannot find identity 10
levels(immune.combined)
##  [1] "CD14 Mono"    "CD4 Naive T"  "CD4 Memory T" "CD16 Mono"    "B"           
##  [6] "CD8 T"        "NK"           "T activated"  "DC"           "B Activated"
table(Idents(object = immune.combined))
## 
##    CD14 Mono  CD4 Naive T CD4 Memory T    CD16 Mono            B        CD8 T 
##          640          386          356          179          151          111 
##           NK  T activated           DC  B Activated 
##           65           64           34           14
DimPlot(immune.combined, label = TRUE)

Idents(immune.combined) <- factor(Idents(immune.combined), levels = c("HSPC", "Mono/Mk Doublets","pDC", "Eryth", "Mk", "DC", "CD14 Mono", "CD16 Mono", "B Activated", "B", "CD8 T", "NK", "T activated","CD4 Naive T", "CD4 Memory T"))
markers.to.plot <- c("CD3D", "CREM", "HSPH1", "SELL", "GIMAP5", "CACYBP", "GNLY", "NKG7", "CCL5", "CD8A", "MS4A1", "CD79A", "MIR155HG", "NME1", "FCGR3A", "VMO1", "CCL2", "S100A9", "HLA-DQA1","GPR183", "PPBP", "GNG11", "HBA2", "HBB", "TSPAN13", "IL3RA", "IGJ", "PRSS57")

Chapter 2 - Dotplot

DotPlot(immune.combined, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8, split.by = "stim") + RotatedAxis()

Chapter 2 - Identify differential expressed genes across conditions

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
library(cowplot)
## 
## Attaching package: 'cowplot'

## The following object is masked from 'package:patchwork':
## 
##     align_plots
theme_set(theme_cowplot())
t.cells <- subset(immune.combined, idents = "CD4 Naive T")
Idents(t.cells) <- "stim"
avg.t.cells <- as.data.frame(log1p(AverageExpression(t.cells, verbose = FALSE)$RNA))
avg.t.cells$gene <- rownames(avg.t.cells)

cd14.mono <- subset(immune.combined, idents = "CD14 Mono")
Idents(cd14.mono) <- "stim"
avg.cd14.mono <- as.data.frame(log1p(AverageExpression(cd14.mono, verbose = FALSE)$RNA))
avg.cd14.mono$gene <- rownames(avg.cd14.mono)

genes.to.label = c("ISG15", "LY6E", "IFI6", "ISG20", "MX1", "IFIT2", "IFIT1", "CXCL10", "CCL8")
p1 <- ggplot(avg.t.cells, aes(CTRL, STIM)) + geom_point() + ggtitle("CD4 Naive T Cells")
p1 <- LabelPoints(plot = p1, points = genes.to.label, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
p2 <- ggplot(avg.cd14.mono, aes(CTRL, STIM)) + geom_point() + ggtitle("CD14 Monocytes")
p2 <- LabelPoints(plot = p2, points = genes.to.label, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
p1 + p2

# Chapter 2 - Inteferon response

immune.combined$celltype.stim <- paste(Idents(immune.combined), immune.combined$stim, sep = "_")
immune.combined$celltype <- Idents(immune.combined)
Idents(immune.combined) <- "celltype.stim"

b.interferon.response <- FindMarkers(immune.combined, ident.1 = "B_STIM", ident.2 = "B_CTRL", verbose = FALSE)

head(b.interferon.response, n = 15)
##                 p_val avg_log2FC pct.1 pct.2    p_val_adj
## IFIT1    1.166508e-28   4.521337 1.000 0.071 1.639293e-24
## IFIT3    3.708120e-27   3.986884 1.000 0.298 5.211021e-23
## IFITM3   4.715986e-26   2.958064 1.000 0.690 6.627375e-22
## ISG15    4.854345e-26   4.917321 1.000 0.500 6.821811e-22
## ISG20    8.180154e-26   3.478169 1.000 0.464 1.149557e-21
## IFIT2    9.153945e-26   3.319417 1.000 0.214 1.286404e-21
## IFI6     1.108080e-25   3.122246 1.000 0.357 1.557184e-21
## LY6E     1.882624e-25   2.822519 1.000 0.440 2.645651e-21
## APOBEC3A 1.514663e-24   3.352339 1.000 0.452 2.128555e-20
## OASL     1.793719e-24   3.087877 0.955 0.179 2.520713e-20
## CXCL10   7.132351e-24   3.939777 1.000 0.333 1.002309e-19
## MX1      7.954234e-24   3.088767 0.955 0.131 1.117809e-19
## TNFSF10  9.491629e-24   2.687823 1.000 0.512 1.333859e-19
## RSAD2    9.691537e-24   3.354510 0.925 0.119 1.361952e-19
## CXCL11   1.453433e-22   3.924294 0.851 0.036 2.042510e-18
FeaturePlot(immune.combined, features = c("CD3D", "GNLY", "IFI6"), split.by = "stim", max.cutoff = 2, cols = c("grey", "red"))

# Chapter 2 - Violin plot on different conditions

plots <- VlnPlot(immune.combined, features = c("ISG15", "CXCL10"), split.by = "stim", group.by = "celltype", pt.size = 0, combine = FALSE)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##       
## This message will be shown once per session.
wrap_plots(plots = plots, ncol = 1)