DESeq2 - Differential expression analysis

Pre-requisites

Run your samples (FASTQ) using the nextflow nf-core/RNA-seq pipeline using ‘star_salmon’ or an alternative pipeline that generates feature counts. For example:
xaccession gene M.209.3D.C M.647.C M.711.C M.8.C M.996.C M.996.T M.209.3D.T M.647.T M.711.T M.8.T NM_000014.4 A2M 78132 51570 32120 73355 106630 111500 93300 51695 106295 120380 NM_000015.2 NAT2 0 0 0 0 0 0 2 0 0 0 NM_000016.5 ACADM 1241 1614 1436 1192 2962 2127 1579 1471 2084 1123 NM_000017.2 ACADS 799 987 984 1010 2407 1460 1050 879 1462 816 NM_000018.3 ACADVL 6025 5078 4570 3725 8332 8693 8144 6549 10236 6501 NM_000019.3 ACAT1 1020 1178 1272 1109 2781 1400 1266 1011 1575 824 NM_000020.2 ACVRL1 0 51 137 0 132 220 0 237 0 22 NM_000021.3 PSEN1 367 911 355 766 1469 1287 1201 1248 1400 994 NM_000022.2 ADA 77 177 124 62 261 234 104 141 232 58 NM_000023.2 SGCA 0 0 0 0 0 0 0 0 0 0 NM_000024.5 ADRB2 671 1002 336 171 426 166 692 847 277 79 NM_000025.2 ADRB3 0 0 0 0 0 0 1 0 0 0 NM_000026.2 ADSL 1243 1597 1214 1441 2420 1842 1665 1424 1964 1281 NM_000027.3 AGA 535 712 412 358 1104 1017 727 716 915 391

Prepare INPUT files

INPUT 1: Prepare a feature count table in CSV format by either using gene symbols or accession numbers as row names:

gene,M.209.3D.C,M.647.C,M.711.C,M.8.C,M.996.C,M.996.T,M.209.3D.T,M.647.T,M.711.T,M.8.T
A2M,78132,51570,32120,73355,106630,111500,93300,51695,106295,120380
NAT2,0,0,0,0,0,0,2,0,0,0
ACADM,1241,1614,1436,1192,2962,2127,1579,1471,2084,1123
ACADS,799,987,984,1010,2407,1460,1050,879,1462,816
ACADVL,6025,5078,4570,3725,8332,8693,8144,6549,10236,6501
ACAT1,1020,1178,1272,1109,2781,1400,1266,1011,1575,824
ACVRL1,0,51,137,0,132,220,0,237,0,22
PSEN1,367,911,355,766,1469,1287,1201,1248,1400,994
ADA,77,177,124,62,261,234,104,141,232,58
SGCA,0,0,0,0,0,0,0,0,0,0
ADRB2,671,1002,336,171,426,166,692,847,277,79
ADRB3,0,0,0,0,0,0,1,0,0,0
ADSL,1243,1597,1214,1441,2420,1842,1665,1424,1964,1281
AGA,535,712,412,358,1104,1017,727,716,915,391

INPUT 2: Sample metadata:

id,dex,celltype,geoid
M.209.3D.C,melanocytes_control,melanocytes_control,none
M.647.C,melanocytes_control,melanocytes_control,none
M.711.C,melanocytes_control,melanocytes_control,none
M.8.C,melanocytes_control,melanocytes_control,none
M.996.C,melanocytes_control,melanocytes_control,none
M.209.3D.T,melanocytes_treatment,melanocytes_treatment,none
M.647.T,melanocytes_treatment,melanocytes_treatment,none
M.711.T,melanocytes_treatment,melanocytes_treatment,none
M.8.T,melanocytes_treatment,melanocytes_treatment,none
M.996.T,melanocytes_treatment,melanocytes_treatment,none

R script - run the script using Rstudio

#set the working directory
setwd("/path/to/input/files")

#install the following packages if not yet installed
library("ggrepel")
library("DESeq2")
library("ggplot2")
library("EnhancedVolcano")
library("reshape2")
library("plyr")
require(reshape2)

##STEP1: import gene counts
countData <- read.csv('t1_melanocytes_control__vs__melanocytes_treatment__RawCounts_integers_GeneSymbols.csv', header = TRUE, sep = ",")
head(countData)

##STEP2: import metaData
metaData <- read.csv('metadata_t1.v2.csv', header = TRUE, sep = ",")
metaData

##STEP3: Construct DESEQDataSet Object
dds <- DESeqDataSetFromMatrix(countData=countData,                               
                              colData=metaData,                               
                              design=~celltype, tidy = TRUE)
#define the reference group
dds$celltype <- factor(dds$celltype, levels = c("control","cancer"))

##STEP4: Run DESEQ function
dds <- DESeq(dds)

##STPE5: Take a look at the results table
res <- results(dds)
head(results(dds, tidy=TRUE)) #let's look at the results table

##STEP6: Summary of differential gene expression
summary(res) 

##STEP7: Sort summary list by p-value
res <- res[order(res$padj),]
head(res)

##STEP8: save DEGs to file
write.table(res, file = "t1_melanocytes_control__vs__melanocytes_treatment__DESeq2_toptags_DEGs.txt")

##STEP9: save normalised counts to file
foo <- counts(dds, normalized = TRUE)
write.csv(foo, file="t1_melanocytes_control__vs__melanocytes_treatment__DESeq2_toptags_DEGs__Normalised_Counts.csv")

##STEP10: PCA
#version 1
pdf(file="Figure_t1_melanocytes_control__vs__melanocytes_treatment__PCA.pdf")
vsdata <- vst(dds, blind=TRUE, nsub = 1000)
plotPCA(vsdata, intgroup="dex") 
dev.off()

#version 2:
pdf(file="Figure_t1_melanocytes_control__vs__melanocytes_treatment_label.pdf")
p <- plotPCA(vsdata, intgroup="dex") 
p + geom_text_repel(aes(label=substr(name, start = 1, stop = 5)),size = 2, min.segment.length = 0, seed = 42, box.padding = 0.5, max.overlaps = 20)
dev.off()

#STEP11: VOLCANO PLOTS
#basic volcano plot 
EnhancedVolcano(res,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'pvalue')

#save enhanced volcano plot to file
pdf(file="Figure_t1_melanocytes_control__vs__melanocytes_treatment__enhanced_volcano_plot_2.0FC_p1e-5.pdf")
EnhancedVolcano(res,
                lab = rownames(res),
                x = 'log2FoldChange',
                y = 'pvalue',
                title = 'melanocytes control vs melanocytes treatment',
                pCutoff = 1e-5,
                FCcutoff = 2.0,
                pointSize = 1.0,
                labSize = 3.0)
dev.off()

#save as high resolution jpeg
library(Cairo)
Cairo(file="Figure_t1_melanocytes_control__vs__melanocytes_treatment__enhanced_volcano_plot_2.0FC_p1e-5.jpeg",
      type="png",
      units="px",
      width=2500,
      height=2500,
      pointsize=12,
      dpi=300)
EnhancedVolcano(res,
      lab = rownames(res),
      x = 'log2FoldChange',
      y = 'pvalue',
      title = 'melanocytes control vs melanocytes treatment',
      pCutoff = 1e-5,
      FCcutoff = 2.0,
      pointSize = 1.0,
      labSize = 3.0)
dev.off()