Table of Contents |
---|
Aim:
Identify statistically significant (FDR < 0.05) differentially expressed genes.
...
RNAseq gene expression feature counts
Go to the/sandpit/demo/run3_full_pipeline/ folder that contains the results for running the complete RNAseq pipeline. The output folders from task 3 look like this:
...
Now let’s find the full path to the ‘salmon.merged.gene_counts.tsv’ file:
|
---|
...
R script - run the script using Rstudio
A copy of the script is at /demo/run3_full_pipeline/
RNAseq_DESeq2.R script:
Code Block |
---|
Code Block |
##STEP1: import gene counts
countData <- read.table('salmon.merge.gene_counts.tsv', header = TRUE, sep = "\t")
head(countData)
##STEP2: import metaData
metaData <- read.csv('metadata.tsv*', header = TRUE, sep = ",")
metaData
##STEP3: Construct DESEQDataSet Object
dds <- DESeqDataSetFromMatrix(countData=countData,
colData=metaData,
design=~celltype, tidy = TRUE)
#define the reference group set as the first mentioned in levels
dds$celltype <- factor(dds$celltype, levels = c("CON","TB"))
##STEP4: Run DESEQ function
dds <- DESeq(dds)
##STPE5: Take a look at the results table
res <- results(dds)
head(results(dds, tidy=TRUE)) #let's look at the results table
##STEP6: Summary of differential gene expression
summary(res) #summary of results
##STEP7: Sort summary list by p-value
res <- res[order(res$padj),]
head(res)
head(res, 9)
##STEP8: save DEGs to file
write.table(res, file = "NoHeadInjury_CON__vs__HeadInjury_TB__DESeq2_toptags_DEGs.txt")
##STEP9: save normalised counts to file
foo <- counts(dds, normalized = TRUE)
write.csv(foo, file="NoHeadInjury_CON__vs__HeadInjury_TB__DESeq2_toptags_DEGs__Normalised_Counts.csv")
##STEP10: PCA
#version 1
pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__PCA.pdf")
vsdata <- vst(dds, blind=TRUE, nsub = 100)
plotPCA(vsdata, intgroup="dex")
dev.off()
#version 2:
pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB_label.pdf")
p <- plotPCA(vsdata, intgroup="dex")
p + geom_text_repel(aes(label=substr(name, start = 1, stop = 5)),size = 2, min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = 20)
dev.off()
## jpeq
library(Cairo)
Cairo(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__PCA_label.jpeg",
type="png",
units="px",
width=2500,
height=2000,
pointsize=12,
dpi=300)
p <- plotPCA(vsdata, intgroup="dex")
p + geom_text_repel(aes(label=substr(name, start = 1, stop = 5)),size = 2, min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = 20)
dev.off()
#STEP11: VOLCANO PLOTS
#basic volcano plot
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue')
#save enhanced volcano plot to file
pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p1e-2.pdf")
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
title = 'NoHeadInjury_CON__vs__HeadInjury_TB',
pCutoff = 1e-2,
FCcutoff = 1.5,
pointSize = 1.0,
labSize = 3.5)
dev.off()
pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p10e-3.pdf")
EnhancedVolcano(res,
lab = rownames(res),
x = "log2FoldChange",
y = "pvalue",
pCutoff = 10e-3,
FCcutoff = 2,
ylim = c(0, -log10(10e-12)),
pointSize = c(ifelse(res$log2FoldChange>2, 2, 2)),
labSize = 3.0,
#shape = c(6, 6, 19, 16),
title = "DESeq2: NoHeadInjury_CON vs. HeadInjury_TB",
subtitle = "Differential expression",
caption = bquote(~Log[2]~ "fold change cutoff, 2; p-value cutoff, 10e-3"),
#legendPosition = "right",
legendLabSize = 8,
col = c("grey30", "forestgreen", "royalblue", "red2"),
colAlpha = 0.9,
drawConnectors = TRUE,
hline = c(10e-3),
widthConnectors = 0.5)
dev.off()
#save as high resolution jpeg
library(Cairo)
## Initialize the device for saving first plot.
## Plot will have length=500 pixel, width=500 pixel
Cairo(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p1e-3.jpeg",
type="png",
units="px",
width=2800,
height=2500,
pointsize=12,
dpi=300)
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
title = 'NoHeadInjury_CON__vs__HeadInjury_TB',
pCutoff = 1e-3,
FCcutoff = 2.0,
pointSize = 1.0,
labSize = 3.0)
dev.off()
#option2:
Cairo(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p10e-3_v2.jpeg",
type="png",
units="px",
width=2500,
height=2200,
pointsize=14,
dpi=300)
EnhancedVolcano(res,
lab = rownames(res),
x = "log2FoldChange",
y = "pvalue",
pCutoff = 10e-3,
FCcutoff = 2,
ylim = c(0, -log10(10e-12)),
pointSize = c(ifelse(res$log2FoldChange>2, 2, 2)),
labSize = 3.5,
#shape = c(6, 6, 19, 16),
title = "DESeq2: NoHeadInjury_CON vs. HeadInjury_TB",
subtitle = "Differential expression",
caption = bquote(~Log[2]~ "fold change cutoff, 2; p-value cutoff, 10e-3"),
#legendPosition = "right",
legendLabSize = 8,
col = c("grey30", "forestgreen", "royalblue", "red2"),
colAlpha = 0.9,
drawConnectors = TRUE,
hline = c(10e-3),
widthConnectors = 0.5)
dev.off()
#save as high res tiff
tiff(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p1e-5.tiff",
width=8, height=8, units="in", res=150)
EnhancedVolcano(res,
lab = rownames(res),
x = 'log2FoldChange',
y = 'pvalue',
title = 'NoHeadInjury_CON__vs__HeadInjury_TB',
pCutoff = 1e-5,
FCcutoff = 2.0,
pointSize = 1.5,
labSize = 3.5)
dev.off() |
A copy of the script is at /demo/run3_full_pipeline/
RNAseq_DESeq2.R script:
Code Block |
---|
# Differential expression analysis
# When you see '## USER INPUT', this means you have to modify the code for your computer or dataset. All other code can be run as-is (i.e. you don't need to understand the code, just run it)
#### Installing required packages ####
# **NOTE: this section only needs to be run once (or occassionally to update the packages)
# Install devtools
install.packages("devtools", repos = "http://cran.us.r-project.org")
# Install R packages. This only needs to be run once.
bioconductor_packages <- c("DESeq2", "EnhancedVolcano")
cran_packages <- c("ggrepel", "ggplot2", "plyr", "reshape2", "readxl", "FactoMineR", "factoextra")
# Compares installed packages to above packages and returns a vector of missing packages
new_packages <- bioconductor_packages[!(bioconductor_packages %in% installed.packages()[,"Package"])]
new_cran_packages <- cran_packages[!(cran_packages %in% installed.packages()[,"Package"])]
# Install missing bioconductor packages
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install(new_packages)
# Install missing cran packages
if (length(new_cran_packages)) install.packages(new_cran_packages, repos = "http://cran.us.r-project.org")
# Update all installed packages are the latest version
update.packages(bioconductor_packages, ask = FALSE)
update.packages(cran_packages, ask = FALSE, repos = "http://cran.us.r-project.org")
#### Loading required packages ####
# This section needs to be run very time
# Load packages
bioconductor_packages <- c("DESeq2", "EnhancedVolcano")
cran_packages <- c("ggrepel", "ggplot2", "plyr", "reshape2", "readxl", "FactoMineR", "factoextra")
lapply(cran_packages, require, character.only = TRUE)
lapply(bioconductor_packages, require, character.only = TRUE)
#### Import your count data ####
# Make sure you have: a) your count table (salmon.merged.gene_counts.tsv file, if you used Nextflow nfcore/rnaseq to analyse your data). Copy this to a subdirectory called 'data'. b) your metadata file. This should be either an Excel file called 'metadata.xlsx' or a tab-separated text file called 'metadata.txt'. It needs 3 columns called 'sample_name', 'sample_ID' and 'group'. The sample names should be EXACTLY the same as the names in the count table. These names are often uninformative and long, so the 'sample_ID' is the sample labels you want to put on your plots. E.g. if you have a 'high fat' group, you might want to rename the samples HF1, HF2, HF3, etc)
## USER INPUT
# Set working directory.
# Change this to your working directory (In the RStudio menu: Session -> Set working directory -> Choose working directory)
setwd("C:/Users/whatmorp/OneDrive - Queensland University of Technology/Desktop/Projects/RNA-Seq downstream analysis")
# Import your count data. make sure you've created a 'data' subdirectory and put the count table file there.
metacountdata <- read.table("./data/salmon.merged.gene_counts.tsv", header = TRUE, row.names = 1)
# Import metadata (option 1: text file). Again, need a metadata.txt file in the data subdirectory.
meta <- read.table("./data/metadata.txt", header = T)
# Import metadata (option 1: Excel file))
#meta <- read_excel("./data/metadata.xlsx")
# Remove 1st columns of metadata (gene_name)
counts <- metacountdata[ ,2:ncol(metacountdata)]
# Rename sample names to new sample IDs
counts <- counts[as.character(meta$sample_name)]
colnames(counts) <- meta$sample_ID
# Counts need to be rounded to integers
counts <- ceiling(counts)
#### Outliers and batch effects ####
# This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap
# Normalise counts by library size ##
# Using DeSeq2's estimateSizeFactors(). Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.
# Set up the initial DeSeq2 experimental parameters. This is the length of the no' of samples being analysed. In full DeSeq2 analysis this is to separate groups as factors, but here a simple count is fine.
condition <- factor(1:length(counts))
# Set up the column data. A data frame of sample ID's and conditions
coldata <- data.frame(row.names=colnames(counts), condition)
# Set up the DeSeq2 data set structure
f <- DESeqDataSetFromMatrix(countData = counts, colData = coldata, design= ~ condition)
# Estimate the size factors. See DeSeq2 manual for details
f <- estimateSizeFactors(f)
# Size factors can be viewed by: sizeFactors(f)
# Multiply each row (sample) by the corresponding size factor
subcount_norm <- as.matrix(counts) %*% diag(sizeFactors(f))
# Re-add column names
colnames(subcount_norm) <- colnames(counts)
## Remove low coverage transcripts (mean count < 10) ##
# Find the mean of each row (and output as a data frame object)
means <- as.data.frame(rowMeans(subcount_norm))
# Then join the means data with the counts
means <- cbind(means, subcount_norm)
# Then subset out only genes with mean > 10
data <- subset(means, means[ , 1] > 10)
# Remove the means column
data <- data[,-1]
# Transform data
data_log <- vst(round(as.matrix(data)))
# Transformation can create some infinite values. Can't generate PCA data on these. Can see how many by: sum(sapply(data_log, is.infinite))
# To remove infinite rows, use 'is.finte' or '!is.infinite'
data_log <- data_log[is.finite(rowSums(data_log)),]
colnames(data_log) <- colnames(counts)
### Set up the PCA plot base data ###
# We're using the FactoMineR package to generate PCA plots (http://factominer.free.fr/index.html)
# Need to transpose the data first
data_log_t <- t(data_log)
# Add the group data
data_log_t_vars <- data.frame(meta$group, data_log_t)
# Run the FactoMineR PCR. qual_sup and quant_sup need to be defined in the initial setup code chunk (at the top of the plotting code chunks)
res.pca <- PCA(data_log_t_vars, quanti.sup = quant_sup, quali.sup = qual_sup, graph=FALSE)
res.pca <- PCA(data_log_t_vars, quali.sup = 1, graph=FALSE)
## Set up the dendogram/heatmaps base data ##
# Calculate the distance matrix:
distance_matrix <- as.matrix(dist(t(data_log)))
#### PCA plot ####
p <- fviz_pca_ind(res.pca,
geom.ind = "point", # show points only (but not "text")
col.ind = meta$group, # color by groups
pointsize = 5, title = "",
addEllipses = TRUE, ellipse.type = "t", ellipse.level = 0.95) # Concentration ellipses
+ theme(legend.text = element_text(size = 12), legend.title = element_text(size = 14), axis.title=element_text(size=16), axis.text=element_text(size=14))
p
## USER INPUT. Change "pca.pdf" and "pca.tiff" to something a bit more relevant to your data. NOTE: if you create a new PCA plot with the same filename, it will overwrite the previous file if it has the same name.
pdf_filename <- "pca.pdf"
tiff_filename <- "pca.tiff"
# Output as publication quality (300dpi) tiff and pdf (Note: this will create a 'figures subdirectory where all figures will be output)
dir.create("figures", showWarnings = FALSE)
ggsave(file = paste0("./figures/", tiff_filename), dpi = 300, compression = "lzw", device = "tiff", width = 10, height = 8, plot = p)
ggsave(file = paste0("./figures/", pdf_filename), device = "pdf", width = 10, height = 8, plot = p)
#### Heatmap and dendrogram ####
|
Code Block |
---|
##STEP1: import gene counts countData <- read.csv('mirtop_rawData.mod.rename.SUMs.sorted.TOPCOUNT.mod2.csv', header = TRUE, sep = ",") head(countData) ##STEP2: import metaData metaData <- read.csv('metadata_headInjury.csv', header = TRUE, sep = ",") metaData ##STEP3: Construct DESEQDataSet Object dds <- DESeqDataSetFromMatrix(countData=countData, colData=metaData, design=~celltype, tidy = TRUE) #define the reference group set as the first mentioned in levels dds$celltype <- factor(dds$celltype, levels = c("CON","TB")) ##STEP4: Run DESEQ function dds <- DESeq(dds) ##STPE5: Take a look at the results table res <- results(dds) head(results(dds, tidy=TRUE)) #let's look at the results table ##STEP6: Summary of differential gene expression summary(res) #summary of results ##STEP7: Sort summary list by p-value res <- res[order(res$padj),] head(res) head(res, 9) ##STEP8: save DEGs to file write.table(res, file = "NoHeadInjury_CON__vs__HeadInjury_TB__DESeq2_toptags_DEGs.txt") ##STEP9: save normalised counts to file foo <- counts(dds, normalized = TRUE) write.csv(foo, file="NoHeadInjury_CON__vs__HeadInjury_TB__DESeq2_toptags_DEGs__Normalised_Counts.csv") ##STEP10: PCA #version 1 pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__PCA.pdf") vsdata <- vst(dds, blind=TRUE, nsub = 100) plotPCA(vsdata, intgroup="dex") dev.off() #version 2: pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB_label.pdf") p <- plotPCA(vsdata, intgroup="dex") p + geom_text_repel(aes(label=substr(name, start = 1, stop = 5)),size = 2, min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = 20) dev.off() ## jpeq library(Cairo) Cairo(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__PCA_label.jpeg", type="png", units="px", width=2500, height=2000, pointsize=12, dpi=300) p <- plotPCA(vsdata, intgroup="dex") p + geom_text_repel(aes(label=substr(name, start = 1, stop = 5)),size = 2, min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = 20) dev.off() #STEP11: VOLCANO PLOTS #basic volcano plot EnhancedVolcano(res, lab = rownames(res), x = 'log2FoldChange', y = 'pvalue') #save enhanced volcano plot to file pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p1e-2.pdf") EnhancedVolcano(res, lab = rownames(res), x = 'log2FoldChange', y = 'pvalue', title = 'NoHeadInjury_CON__vs__HeadInjury_TB', pCutoff = 1e-2, FCcutoff = 1.5, pointSize = 1.0, labSize = 3.5) dev.off() pdf(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p10e-3.pdf") EnhancedVolcano(res, lab = rownames(res), x = "log2FoldChange", y = "pvalue", pCutoff = 10e-3, FCcutoff = 2, ylim = c(0, -log10(10e-12)), pointSize = c(ifelse(res$log2FoldChange>2, 2, 2)), labSize = 3.0, #shape = c(6, 6, 19, 16), title = "DESeq2: NoHeadInjury_CON vs. HeadInjury_TB", subtitle = "Differential expression", caption = bquote(~Log[2]~ "fold change cutoff, 2; p-value cutoff, 10e-3"), #legendPosition = "right", legendLabSize = 8, col = c("grey30", "forestgreen", "royalblue", "red2"), colAlpha = 0.9, drawConnectors = TRUE, hline = c(10e-3), widthConnectors = 0.5) dev.off() #save as high resolution jpeg library(Cairo) ## Initialize the device for saving first plot. ## Plot will have length=500 pixel, width=500 pixel Cairo(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p1e-3.jpeg", type="png", units="px", width=2800, height=2500, pointsize=12, dpi=300) EnhancedVolcano(res, lab = rownames(res), x = 'log2FoldChange', y = 'pvalue', title = 'NoHeadInjury_CON__vs__HeadInjury_TB', pCutoff = 1e-3, FCcutoff = 2.0, pointSize = 1.0, labSize = 3.0) dev.off() #option2: Cairo(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p10e-3_v2.jpeg", type="png", units="px", width=2500, height=2200, pointsize=14, dpi=300) EnhancedVolcano(res, lab = rownames(res), x = "log2FoldChange", y = "pvalue", pCutoff = 10e-3, FCcutoff = 2, ylim = c(0, -log10(10e-12)), pointSize = c(ifelse(res$log2FoldChange>2, 2, 2)), labSize = 3.5, #shape = c(6, 6, 19, 16), title = "DESeq2: NoHeadInjury_CON vs. HeadInjury_TB", subtitle = "Differential expression", caption = bquote(~Log[2]~ "fold change cutoff, 2; p-value cutoff, 10e-3"), #legendPosition = "right", legendLabSize = 8, |
...
|
...
|
...
|
...
|
...
|
...
col |
...
= |
...
c("grey30", "forestgreen", "royalblue", "red2"), colAlpha = |
...
0. |
...
9, |
...
|
...
|
...
|
...
|
...
|
...
drawConnectors = TRUE, |
...
|
...
|
...
|
...
|
...
|
...
hline = c(10e-3), |
...
|
...
|
...
|
...
|
...
widthConnectors = 0. |
...
5) |
...
dev.off() |
...
|
...
#save as high res tiff tiff(file="Figure_NoHeadInjury_CON__vs__HeadInjury_TB__enhanced_volcano_plot_2.0FC_p1e-5.tiff", width=8, |
...
height=8, units="in", res=150) EnhancedVolcano(res, lab = rownames(res), x = 'log2FoldChange', y = 'pvalue', title = 'NoHeadInjury_CON__vs__HeadInjury_TB', pCutoff = 1e-5, FCcutoff = 2.0, pointSize = 1.5, labSize = 3.5) dev.off() |