...

Monitor the progress:

Code Block
qjobs

DESeq2

Code Block
#### 3. Loading required

...

This section needs to be run every time

Load packages

bioconductor_packages <-

 packages ####

# This section needs to be run every time
# Load packages
bioconductor_packages <- c("DESeq2", "EnhancedVolcano")


cran_packages <- c("ggrepel", "ggplot2", "plyr", "reshape2", "FactoMineR", "factoextra", "pheatmap")


lapply(cran_packages, require, character.only = TRUE)


lapply(bioconductor_packages, require, character.only = TRUE)





#### 4. Import your count

...

Set working directory.

Change this to your working directory

Your home wd

 data ####

# Set working directory. 
# Change this to your working directory
# Your home wd
setwd("C:/Users/whatmorp/OneDrive - Queensland University of Technology/Desktop/Teaching/HPC_training_workshops/smRNA_seq")


# HPC wd

...


#setwd("/work/training/smallRNAseq/")



# Import your count data. make sure you've created a 'data' subdirectory and put the count table file there.

...


metacounts <- read.table("./data/mature_counts.txt", header = TRUE, row.names = 1)



# Import metadata. Again, need a metadata_microRNA.txt file in the data subdirectory.

...


meta <- read.table("./data/metadata_microRNA.txt", header = TRUE)



# Rename sample names to new sample IDs

...


counts <- metacounts[as.character(meta$sample_name)]


colnames(counts) <- meta$sample_ID








#### 5. Outliers and batch

...

This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap

USER INPUT

Choose the groups you want to plot in a PCA/Heatmap. You can select any 2 or more of the groups (or all of the groups) you have in your 'groups' column of your metadata table

To see what groups are present, run the following:

unique(meta$group)

Now add which groups you want to plot (i.e. replace the groupnames below, and add more, separated by a comma and in "quotes", as needed). NOTE: R is case-sensitive, so these group names must be named EXACTLY the same as in the metadata table.

plotgroups <- c("normal", "Huntingtons_disease")

Pull out only the counts from the above groups

groupcounts <- counts[meta$group %in% plotgroups]

Normalise counts by library size, using DeSeq2's estimateSizeFactors() function. Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.

Set up the initial DeSeq2 experimental parameters.

condition <-

 effects ####

# This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap

## USER INPUT
# Choose the groups you want to plot in a PCA/Heatmap. You can select any 2 or more of the groups (or all of the groups) you have in your 'groups' column of your metadata table
# To see what groups are present, run the following:
unique(meta$group)
# Now add which groups you want to plot (i.e. replace the groupnames below, and add more, separated by a comma and in "quotes", as needed). NOTE: R is case-sensitive, so these group names must be named EXACTLY the same as in the metadata table.
plotgroups <- c("normal", "Huntingtons_disease")

# Pull out only the counts from the above groups
groupcounts <- counts[meta$group %in% plotgroups]

# Normalise counts by library size, using DeSeq2's estimateSizeFactors() function. Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.
# Set up the initial DeSeq2 experimental parameters.
condition <- factor(1:length(groupcounts))


# Set up the column data. A data frame of sample ID's and conditions

...


coldata <- data.frame(row.names=colnames(groupcounts), condition)


# Set up the DeSeq2 data set structure

...


f <- DESeqDataSetFromMatrix(countData = groupcounts, colData = coldata, design= ~ condition)


# Estimate the size factors. See DeSeq2 manual for details

...


f <- estimateSizeFactors(f)


# Size factors can be viewed by: sizeFactors(f)

...



# Multiply each row (sample) by the corresponding size factor

...


subcount_norm <- as.matrix(groupcounts) %*% diag(sizeFactors(f))


# Re-add column names

...


colnames(subcount_norm) <- colnames(groupcounts)



## Remove low coverage transcripts (mean count < 10)

...

Find the mean of each row (and output as a data frame object)

means <-

 ##

# Find the mean of each row (and output as a data frame object)
means <- as.data.frame(rowMeans(subcount_norm))


# Then join the means data with the counts

...


means <- cbind(means, subcount_norm)


# Then subset out only genes with mean > 10

...


data <- subset(means, means[ , 1] > 10)


# Remove the means column

...


data <- data[,-1]



# Transform data

...


data_log <- vst(round(as.matrix(data)), nsub = nrow(data)-20)


# Transformation can create some infinite values. Can't generate PCA data on these. Can see how many by: sum(sapply(data_log, is.infinite))

...


# To remove infinite rows, use 'is.finte' or '!is.infinite'

...


data_log <- data_log[is.finite(rowSums(data_log)),]


colnames(data_log) <- colnames(groupcounts)



### Set up the PCA plot base

...

 data ###

# We're using the FactoMineR package to generate PCA plots (http://factominer.free.fr/index.html)

...



# Need to transpose the data first

...


data_log_t <- t(data_log)


# Add the group data

...


data_log_t_vars <- data.frame(meta$group[meta$group %in% plotgroups], data_log_t)


# Generate the PCA data using FactoMineR package

...


res.pca <- PCA(data_log_t_vars, quali.sup = 1, graph=FALSE)



## Set up the dendogram/heatmaps base

...

Calculate the distance matrix:

 data ##

# Calculate the distance matrix:
distance_matrix <- as.matrix(dist(t(data_log)))





#### 5a. PCA

...

Generate the PCA plot. Groups are shaded with ellipses at 95% confidence level. NOTE: at least 4 replicates need to be in a group for an ellipses to be drawn.

...

 plot ####

# Generate the PCA plot. Groups are shaded with ellipses at 95% confidence level. NOTE: at least 4 replicates need to be in a group for an ellipses to be drawn.
# NOTE: change the group point colours by changing 'palette = ' below. Use the 'RColourBrewer' colour names (https://r-graph-gallery.com/38-rcolorbrewers-palettes.html). For example, if you are plotting 3 groups and choose palette = "Set1", this will use the first 3 colours from the Set1 colour palette.

...


p <- fviz_pca_ind(res.pca,


                  geom.ind = c("point", "text"), # show points only (but not "text")


                  col.ind = meta$group[meta$group %in% plotgroups], # color by

groups
pointsize = 5, label =

 groups
                  pointsize = 5, label = "all", title = "", legend.title = "Treatment groups", palette = "Dark2",

addEllipses = TRUE,


                  addEllipses = TRUE, ellipse.type = "t", ellipse.level = 0.95) + theme(legend.text = element_text(size = 12), legend.title = element_text(size = 14), axis.title=element_text(size=16), axis.text=element_text(size=14))

p

...



p

# Output as publication quality (300dpi) tiff and pdf.

...


# This will name your output files with the treatment groups you selected.

...



# Create a 'results_outliers_included' subdirectory where all results_outliers_included will be output

...


dir.create("results_outliers_included", showWarnings = FALSE)


# Create a (300dpi) tiff

...


ggsave(file = paste0("./results_outliers_included/PCA_", paste(plotgroups, collapse = "_Vs_"), ".tiff"), dpi = 300, compression = "lzw", device = "tiff", width = 10, height = 8, plot = p)


# Create a pdf

...


ggsave(file = paste0("./results_outliers_included/PCA_", paste(plotgroups, collapse = "_Vs_"), ".pdf"), device = "pdf", width = 10, height = 8, plot = p)

5b. Samples heatmap and dendrogram

This section plots a heatmap and dendrogram of pairwise relationships between samples. In this way you can see if samples cluster by treatment group.

...






#### 5b. Samples heatmap and dendrogram ####

# This section plots a heatmap and dendrogram of pairwise relationships between samples. In this way you can see if samples cluster by treatment group.

# See here: https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/

...



# Define annotation column

...


annot_columns <- data.frame(meta$group[meta$group %in% plotgroups])


# Make the row names the sample IDs

...


row.names(annot_columns) <- meta$sample_ID[meta$group %in% plotgroups]


colnames(annot_columns) <- "Treatment groups"


# Need to factorise it

...


annot_columns[[1]] <- factor(annot_columns[[1]])



# Generate dendrogram and heatmap

...


pheatmap(distance_matrix, color=colorRampPalette(c("white", "#9999FF", "#990000"))(50), cluster_rows = TRUE, show_rownames = TRUE, treeheight_row = 0, treeheight_col = 70, fontsize_col = 12, annotation_names_col = F, annotation_col = annot_columns, filename = paste0("./results_outliers_included/Pairwise_sample_heatmap_", paste(plotgroups, collapse = "_Vs_"), ".tiff"))



# Notes about heatmap colours.

...


# You can change the colours used in the heatmap itself by changing the colour names (color=colorRampPalette....)

...


# If you want to change the annotation colours, see here: https://zhiganglu.com/post/pheatmap_change_annotation_colors/

...





#### 6. Differential expression

...

In this section we use the Deseq2 package to identify differentially expressed genes.

USER INPUT

Choose the treatment groups you want to compare.

To see what groups are present, run the following:

unique(meta$group)

Enter which groups you want to compare (two groups only). BASELINE OR CONTROL GROUP SHOULD BE LISTED FIRST.

degroups <-

 analysis ####

# In this section we use the Deseq2 package to identify differentially expressed genes.

## USER INPUT
# Choose the treatment groups you want to compare. 
# To see what groups are present, run the following:
unique(meta$group)
# Enter which groups you want to compare (two groups only). BASELINE OR CONTROL GROUP SHOULD BE LISTED FIRST.
degroups <- c("normal", "Huntingtons_disease")



# From the count table, pull out only the counts from the above groups

...


expdata <- as.matrix(counts[,meta$group %in% degroups])



# Set up the experimental condition

...


# 'factor' sets up the reference level, i.e. which is the baseline group (otherwise the default baseline level is in alphabetic order)

...


condition <- factor(meta$group[meta$group %in% degroups], levels = degroups)


# Type 'condition' in the console to see is the levels are set correctly

...



# Set up column data (treatment groups and sample ID)

...


coldata <- data.frame(row.names=colnames(expdata), condition)



# Create the DESeq2 dataset (dds)

...


dds <- DESeq2::DESeqDataSetFromMatrix(countData=expdata, colData=coldata, design=~condition)


dds$condition <- factor(dds$condition, levels = degroups)



# Run DESeq2 to identify differentially expressed genes

...


deseq <- DESeq(dds)



# Extract a results table from the DESeq analysis

...


res <- results(deseq)



# Reorder results by adjusted p vales, so that the most signififcantly DE genes are at the top

...


res <- res[order(res$padj), ]



# You can do a summary of the results to see how many significantly (alpha=0.05, adjust to 0.01 if needed) upregulated and downregulated DE genes were found

...


summary(res, alpha=0.05)



# Convert from DESeq object to a data frame.

...


res <- data.frame(res)



# Look at the top 6 DE genes

...


head(res)




# Add normalised counts to the output table. This is so you can later plot expression trends for individual genes in R, Excel, etc.

...


# Need to normalise the counts first, using the size factors calculated by DESeq2 (in the 'deseq' object)

...


expdata_norm <- as.matrix(expdata) %*% diag(deseq$sizeFactor)


colnames(expdata_norm) <- colnames(expdata)


annot_counts <- merge(x = res, y = expdata_norm, by = 0, all = TRUE)




# Pull out just significant genes (change from 0.05 to 0.01 if needed)

...


DE_genes <- subset(annot_counts, padj < 0.05, select=colnames(annot_counts))



# Export as a csv table

...


write.csv(DE_genes, file=paste0("./results_outliers_included/DE_genes_", paste(degroups, collapse = "_Vs_"), ".csv"), row.names = FALSE)




#### 6b. Volcano plot

...

 ####

p <- EnhancedVolcano(res, lab = row.names(res), selectLab = row.names(res)[1:20], drawConnectors = TRUE, title = NULL, subtitle = NULL, x = 'log2FoldChange', y = 'pvalue')



p <- EnhancedVolcano(res, lab = rownames(res), pointSize = 3, drawConnectors = TRUE, title = NULL, subtitle = NULL, x = 'log2FoldChange', y = 'pvalue')

p




p

# NOTE: the above plot shows labels for the top significantly DE (i.e. by lowest adjusted p value) genes.

...



# Output as publication quality (300dpi) tiff and pdf.

...


# Create a (300dpi) tiff

...


ggsave(file = paste0("./results_outliers_included/volcano_", paste(degroups, collapse = "_Vs_"), ".tiff"), dpi = 300, compression = "lzw", device = "tiff", width = 10, height = 8, plot = p)


# Create a pdf

...


ggsave(file = paste0("./results_outliers_included/volcano_", paste(degroups, collapse = "_Vs_"), ".pdf"), device = "pdf", width = 10, height = 8, plot = p)






#### 6c. DE genes heatmaps and

...

 dendrograms ####

# sort by p-value

...


DE_genes <- DE_genes[order(DE_genes$padj), ]


row.names(DE_genes) <- DE_genes$Row.names



# Pull out normalised counts only

...


siggc <- DE_genes[colnames(DE_genes) %in% colnames(expdata)]



# Scale and center each row. This is important to visualise relative differences between groups and not have row-wise colouration dominated by high or low gene expression.

...


xts <- scale(t(siggc))


xtst <- t(xts)



# Define annotation column

...


annot_columns <- data.frame(meta$group[meta$group %in% degroups])


# Make the row names the sample IDs

...


row.names(annot_columns) <- meta$sample_ID[meta$group %in% degroups]


colnames(annot_columns) <- "Treatment groups"


# Need to factorise it

...


annot_columns[[1]] <- factor(annot_columns[[1]])



# Generate dendrogram and heatmap for ALL DE genes

...


pheatmap(xtst, color=colorRampPalette(c("#D55E00", "white", "#0072B2"))(100), annotation_col=annot_columns, annotation_names_col = F, fontsize_col = 12, fontsize_row = 7, labels_row = row.names(siggc), show_rownames = T, filename = paste0("./results_outliers_included/All_DEG_Heatmap_", paste(plotgroups, collapse = "_Vs_"), ".tiff"))

OUTLIER REMOVAL

This section repeats the above, but removes outliers first

REMOVE OUTLIERS FROM METADATA TABLE AN COUNT TABLE.

meta <- meta[-
















#### OUTLIER REMOVAL ####

# This section repeats the above, but removes outliers first

# REMOVE OUTLIERS FROM METADATA TABLE AN COUNT TABLE.
meta <- meta[- grep(c("WT4"), meta$sample_ID),]


counts <- counts[- grep(c("WT4"), colnames(counts))]









#### 5. Outliers and batch

...

This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap

USER INPUT

Choose the groups you want to plot in a PCA/Heatmap. You can select any 2 or more of the groups (or all of the groups) you have in your 'groups' column of your metadata table

To see what groups are present, run the following:

unique(meta$group)

Now add which groups you want to plot (i.e. replace the groupnames below, and add more, separated by a comma and in "quotes", as needed). NOTE: R is case-sensitive, so these group names must be named EXACTLY the same as in the metadata table.

plotgroups <- c("normal", "Huntingtons_disease")

Pull out only the counts from the above groups

groupcounts <- counts[meta$group %in% plotgroups]

Normalise counts by library size, using DeSeq2's estimateSizeFactors() function. Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.

Set up the initial DeSeq2 experimental parameters.

condition <-

 effects ####

# This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap

## USER INPUT
# Choose the groups you want to plot in a PCA/Heatmap. You can select any 2 or more of the groups (or all of the groups) you have in your 'groups' column of your metadata table
# To see what groups are present, run the following:
unique(meta$group)
# Now add which groups you want to plot (i.e. replace the groupnames below, and add more, separated by a comma and in "quotes", as needed). NOTE: R is case-sensitive, so these group names must be named EXACTLY the same as in the metadata table.
plotgroups <- c("normal", "Huntingtons_disease")

# Pull out only the counts from the above groups
groupcounts <- counts[meta$group %in% plotgroups]

# Normalise counts by library size, using DeSeq2's estimateSizeFactors() function. Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.
# Set up the initial DeSeq2 experimental parameters.
condition <- factor(1:length(groupcounts))


# Set up the column data. A data frame of sample ID's and conditions

...


coldata <- data.frame(row.names=colnames(groupcounts), condition)


# Set up the DeSeq2 data set structure

...


f <- DESeqDataSetFromMatrix(countData = groupcounts, colData = coldata, design= ~ condition)


# Estimate the size factors. See DeSeq2 manual for details

...


f <- estimateSizeFactors(f)


# Size factors can be viewed by: sizeFactors(f)

...



# Multiply each row (sample) by the corresponding size factor

...


subcount_norm <- as.matrix(groupcounts) %*% diag(sizeFactors(f))


# Re-add column names

...


colnames(subcount_norm) <- colnames(groupcounts)



## Remove low coverage transcripts (mean count < 10)

...

Find the mean of each row (and output as a data frame object)

means <-

 ##

# Find the mean of each row (and output as a data frame object)
means <- as.data.frame(rowMeans(subcount_norm))


# Then join the means data with the counts

...


means <- cbind(means, subcount_norm)


# Then subset out only genes with mean > 10

...


data <- subset(means, means[ , 1] > 10)


# Remove the means column

...


data <- data[,-1]



# Transform data

...


data_log <- vst(round(as.matrix(data)), nsub = nrow(data)-20)


# Transformation can create some infinite values. Can't generate PCA data on these. Can see how many by: sum(sapply(data_log, is.infinite))

...


# To remove infinite rows, use 'is.finte' or '!is.infinite'

...


data_log <- data_log[is.finite(rowSums(data_log)),]


colnames(data_log) <- colnames(groupcounts)



### Set up the PCA plot base

...

 data ###

# We're using the FactoMineR package to generate PCA plots (http://factominer.free.fr/index.html)

...



# Need to transpose the data first

...


data_log_t <- t(data_log)


# Add the group data

...


data_log_t_vars <- data.frame(meta$group[meta$group %in% plotgroups], data_log_t)


# Generate the PCA data using FactoMineR package

...


res.pca <- PCA(data_log_t_vars, quali.sup = 1, graph=FALSE)



## Set up the dendogram/heatmaps base

...

Calculate the distance matrix:

 data ##

# Calculate the distance matrix:
distance_matrix <- as.matrix(dist(t(data_log)))





#### 5a. PCA

...

Generate the PCA plot. Groups are shaded with ellipses at 95% confidence level. NOTE: at least 4 replicates need to be in a group for an ellipses to be drawn.

...

 plot ####

# Generate the PCA plot. Groups are shaded with ellipses at 95% confidence level. NOTE: at least 4 replicates need to be in a group for an ellipses to be drawn.
# NOTE: change the group point colours by changing 'palette = ' below. Use the 'RColourBrewer' colour names (https://r-graph-gallery.com/38-rcolorbrewers-palettes.html). For example, if you are plotting 3 groups and choose palette = "Set1", this will use the first 3 colours from the Set1 colour palette.

...


p <- fviz_pca_ind(res.pca,


                  geom.ind = c("point", "text"), # show points only (but not

"text")

 "text")
                  col.ind = meta$group[meta$group %in% plotgroups], # color by

groups
pointsize = 5, label =

 groups
                  pointsize = 5, label = "all", title = "", legend.title = "Treatment groups", palette = "Dark2",

addEllipses = TRUE,


                  addEllipses = TRUE, ellipse.type = "t", ellipse.level = 0.95) + theme(legend.text = element_text(size = 12), legend.title = element_text(size = 14), axis.title=element_text(size=16), axis.text=element_text(size=14))

p

...



p

# Output as publication quality (300dpi) tiff and pdf.

...


# This will name your output files with the treatment groups you selected.

...



# Create a 'results_outliers_removed' subdirectory where all results_outliers_removed will be output

...


dir.create("results_outliers_removed", showWarnings = FALSE)


# Create a (300dpi) tiff

...


ggsave(file = paste0("./results_outliers_removed/PCA_", paste(plotgroups, collapse = "_Vs_"), ".tiff"), dpi = 300, compression = "lzw", device = "tiff", width = 10, height = 8, plot = p)


# Create a pdf

...


ggsave(file = paste0("./results_outliers_removed/PCA_", paste(plotgroups, collapse = "_Vs_"), ".pdf"), device = "pdf", width = 10, height = 8, plot = p)






#### 5b. Samples heatmap and

...

This section plots a heatmap and dendrogram of pairwise relationships between samples. In this way you can see if samples cluster by treatment group.

...

 dendrogram ####

# This section plots a heatmap and dendrogram of pairwise relationships between samples. In this way you can see if samples cluster by treatment group.

# See here: https://davetang.org/muse/2018/05/15/making-a-heatmap-in-r-with-the-pheatmap-package/

...



# Define annotation column

...


annot_columns <- data.frame(meta$group[meta$group %in% plotgroups])


# Make the row names the sample IDs

...


row.names(annot_columns) <- meta$sample_ID[meta$group %in% plotgroups]


colnames(annot_columns) <- "Treatment groups"


# Need to factorise it

...


annot_columns[[1]] <- factor(annot_columns[[1]])



# Generate dendrogram and heatmap

...


pheatmap(distance_matrix, color=colorRampPalette(c("white", "#9999FF", "#990000"))(50), cluster_rows = TRUE, show_rownames = TRUE, treeheight_row = 0, treeheight_col = 70, fontsize_col = 12, annotation_names_col = F, annotation_col = annot_columns, filename = paste0("./results_outliers_removed/Pairwise_sample_heatmap_", paste(plotgroups, collapse = "_Vs_"), ".tiff"))



# Notes about heatmap colours.

...


# You can change the colours used in the heatmap itself by changing the colour names (color=colorRampPalette....)

...


# If you want to change the annotation colours, see here: https://zhiganglu.com/post/pheatmap_change_annotation_colors/

...





#### 6. Differential expression

...

In this section we use the Deseq2 package to identify differentially expressed genes.

USER INPUT

Choose the treatment groups you want to compare.

To see what groups are present, run the following:

unique(meta$group)

Enter which groups you want to compare (two groups only). BASELINE OR CONTROL GROUP SHOULD BE LISTED FIRST.

degroups <-

 analysis ####

# In this section we use the Deseq2 package to identify differentially expressed genes.

## USER INPUT
# Choose the treatment groups you want to compare. 
# To see what groups are present, run the following:
unique(meta$group)
# Enter which groups you want to compare (two groups only). BASELINE OR CONTROL GROUP SHOULD BE LISTED FIRST.
degroups <- c("normal", "Huntingtons_disease")



# From the count table, pull out only the counts from the above groups

...


expdata <- as.matrix(counts[,meta$group %in% degroups])



# Set up the experimental condition

...


# 'factor' sets up the reference level, i.e. which is the baseline group (otherwise the default baseline level is in alphabetic order)

...


condition <- factor(meta$group[meta$group %in% degroups], levels = degroups)


# Type 'condition' in the console to see is the levels are set correctly

...



# Set up column data (treatment groups and sample ID)

...


coldata <- data.frame(row.names=colnames(expdata), condition)



# Create the DESeq2 dataset (dds)

...


dds <- DESeq2::DESeqDataSetFromMatrix(countData=expdata, colData=coldata, design=~condition)


dds$condition <- factor(dds$condition, levels = degroups)



# Run DESeq2 to identify differentially expressed genes

...


deseq <- DESeq(dds)



# Extract a results table from the DESeq analysis

...


res <- results(deseq)



# Reorder results by adjusted p vales, so that the most signififcantly DE genes are at the top

...


res <- res[order(res$padj), ]



# You can do a summary of the results to see how many significantly (alpha=0.05, adjust to 0.01 if needed) upregulated and downregulated DE genes were found

...


summary(res, alpha=0.05)



# Convert from DESeq object to a data frame.

...


res <- data.frame(res)



# Look at the top 6 DE genes

...


head(res)




# Add normalised counts to the output table. This is so you can later plot expression trends for individual genes in R, Excel, etc.

...


# Need to normalise the counts first, using the size factors calculated by DESeq2 (in the 'deseq' object)

...


expdata_norm <- as.matrix(expdata) %*% diag(deseq$sizeFactor)


colnames(expdata_norm) <- colnames(expdata)


annot_counts <- merge(x = res, y = expdata_norm, by = 0, all = TRUE)




# Pull out just significant genes (change from 0.05 to 0.01 if needed)

...


DE_genes <- subset(annot_counts, padj < 0.05, select=colnames(annot_counts))



# Export as a csv table

...


write.csv(DE_genes, file=paste0("./results_outliers_removed/DE_genes_", paste(degroups, collapse = "_Vs_"), ".csv"), row.names = FALSE)




#### 6b. Volcano plot

...

 ####

p <- EnhancedVolcano(res, lab = row.names(res), selectLab = row.names(res)[1:20], drawConnectors = TRUE, title = NULL, subtitle = NULL, x = 'log2FoldChange', y = 'pvalue')



p <- EnhancedVolcano(res, lab = rownames(res), pointSize = 3, drawConnectors = TRUE, title = NULL, subtitle = NULL, x = 'log2FoldChange', y = 'pvalue')

p




p

# NOTE: the above plot shows labels for the top significantly DE (i.e. by lowest adjusted p value) genes.

...



# Output as publication quality (300dpi) tiff and pdf.

...


# Create a (300dpi) tiff

...


ggsave(file = paste0("./results_outliers_removed/volcano_", paste(degroups, collapse = "_Vs_"), ".tiff"), dpi = 300, compression = "lzw", device = "tiff", width = 10, height = 8, plot = p)


# Create a pdf

...


ggsave(file = paste0("./results_outliers_removed/volcano_", paste(degroups, collapse = "_Vs_"), ".pdf"), device = "pdf", width = 10, height = 8, plot = p)






#### 6c. DE genes heatmaps and

...

 dendrograms ####

# sort by p-value

...


DE_genes <- DE_genes[order(DE_genes$padj), ]


row.names(DE_genes) <- DE_genes$Row.names



# Pull out normalised counts only

...


siggc <- DE_genes[colnames(DE_genes) %in% colnames(expdata)]



# Scale and center each row. This is important to visualise relative differences between groups and not have row-wise colouration dominated by high or low gene expression.

...


xts <- scale(t(siggc))


xtst <- t(xts)



# Define annotation column

...


annot_columns <- data.frame(meta$group[meta$group %in% degroups])


# Make the row names the sample IDs

...


row.names(annot_columns) <- meta$sample_ID[meta$group %in% degroups]


colnames(annot_columns) <- "Treatment groups"


# Need to factorise it

...


annot_columns[[1]] <- factor(annot_columns[[1]])



# Generate dendrogram and heatmap for ALL DE genes

...


pheatmap(xtst, color=colorRampPalette(c("#D55E00", "white", "#0072B2"))(100), annotation_col=annot_columns, annotation_names_col = F, fontsize_col = 12, fontsize_row = 7, labels_row = row.names(siggc), show_rownames = T, filename = paste0("./results_outliers_removed/All_DEG_Heatmap_", paste(plotgroups, collapse = "_Vs_"), ".tiff"))

Version	Old Version 19	New Version Current
Changes made by	Roberto Barrero Gumiel	Roberto Barrero Gumiel
Saved on	Oct 26, 2023	Oct 26, 2023

Content Comparison

Versions Compared

Key

This section needs to be run every time

Load packages

Set working directory.

Change this to your working directory

Your home wd

This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap

USER INPUT

Choose the groups you want to plot in a PCA/Heatmap. You can select any 2 or more of the groups (or all of the groups) you have in your 'groups' column of your metadata table

To see what groups are present, run the following:

Now add which groups you want to plot (i.e. replace the groupnames below, and add more, separated by a comma and in "quotes", as needed). NOTE: R is case-sensitive, so these group names must be named EXACTLY the same as in the metadata table.

Pull out only the counts from the above groups

Normalise counts by library size, using DeSeq2's estimateSizeFactors() function. Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.

Set up the initial DeSeq2 experimental parameters.

Find the mean of each row (and output as a data frame object)

Calculate the distance matrix:

Generate the PCA plot. Groups are shaded with ellipses at 95% confidence level. NOTE: at least 4 replicates need to be in a group for an ellipses to be drawn.

5b. Samples heatmap and dendrogram

This section plots a heatmap and dendrogram of pairwise relationships between samples. In this way you can see if samples cluster by treatment group.

In this section we use the Deseq2 package to identify differentially expressed genes.

USER INPUT

Choose the treatment groups you want to compare.

To see what groups are present, run the following:

Enter which groups you want to compare (two groups only). BASELINE OR CONTROL GROUP SHOULD BE LISTED FIRST.

OUTLIER REMOVAL

This section repeats the above, but removes outliers first

REMOVE OUTLIERS FROM METADATA TABLE AN COUNT TABLE.

This section normalises and transforms the count data so that it can be plotted on a PCA plot and a heatmap

USER INPUT

Choose the groups you want to plot in a PCA/Heatmap. You can select any 2 or more of the groups (or all of the groups) you have in your 'groups' column of your metadata table

To see what groups are present, run the following:

Now add which groups you want to plot (i.e. replace the groupnames below, and add more, separated by a comma and in "quotes", as needed). NOTE: R is case-sensitive, so these group names must be named EXACTLY the same as in the metadata table.

Pull out only the counts from the above groups

Normalise counts by library size, using DeSeq2's estimateSizeFactors() function. Note that DeSeq2 does this internally during DEG calling. The normalisation below is done separately for PCA and density plotting.

Set up the initial DeSeq2 experimental parameters.

Find the mean of each row (and output as a data frame object)

Calculate the distance matrix:

Generate the PCA plot. Groups are shaded with ellipses at 95% confidence level. NOTE: at least 4 replicates need to be in a group for an ellipses to be drawn.

This section plots a heatmap and dendrogram of pairwise relationships between samples. In this way you can see if samples cluster by treatment group.

In this section we use the Deseq2 package to identify differentially expressed genes.

USER INPUT

Choose the treatment groups you want to compare.

To see what groups are present, run the following:

Enter which groups you want to compare (two groups only). BASELINE OR CONTROL GROUP SHOULD BE LISTED FIRST.