Markdown document collecting the code used in the paper by Nazzari, M., Romitti, M., Kip, A. M., Kamps, R., Costagliola, S., van de Beucken, T., Moroni, L., & Caiment, F. (2024) Impact of benzo[a]pyrene, PCB153 and sex hormones on human ESC-derived thyroid follicles using single cell transcriptomics. Environment International, 188, 108748. https://doi.org/10.1016/j.envint.2024.108748
# main working directory
setwd('/project/')
dir.create('/project/analysis/', showWarnings = F)
dir.create('/project/analysis/plots/', showWarnings = F)
# sequenced reads
read.table('/project/sequencing_qc.txt', sep = '\t', header = T) %>% # this table is stored locally
rename(`Number of cells` = 'Number.of.cells', # need to rename some columns that contain symbols
`Number of Reads, x10^6` = 'Number.of.Reads..x10.6',
`Library Size, MBases` = 'Library.Size..MBases',
`% >= Q30 bases` = 'X.....Q30.bases',
`Mean Quality Score` = 'Mean.Quality.Score',
`Median Genes per Cell` = 'Median.Genes.per.Cell',
`Sequencing Saturation, %` = 'Sequencing.Saturation...',
`Reads Mapped Confidently to Genome, %` = 'Reads.Mapped.Confidently.to.Genome...',
`Median Reads per Cell` = 'Mean.Reads.per.Cell') %>%
dplyr::select(-`Library Size, MBases`) %>%
reshape2::melt(id.vars = 'Sample',
value.name = 'value',
variable.name = 'metric') %>%
ggplot(aes(x = value, y = 1)) +
geom_boxplot(aes(fill = metric), alpha = .3) +
geom_point() +
ggrepel::geom_text_repel(aes(label = Sample),
segment.alpha = .5,
size = 3,
point.padding = .1,
min.segment.length = 0) +
theme_bw() +
labs(x = '', y = '') +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_blank()) +
scale_x_continuous(labels = scales::label_comma()) +
facet_wrap(vars(metric), scales = 'free', ncol = 2, nrow = 4) +
guides(fill = 'none')
ggsave('/project/analysis/plots/scrnaseq_qc.png', width = 20, height = 15, units = 'cm', dpi = 300)
# number of cells before/after filter
read.table('cells_before_after_filter.txt', sep = '\t', header = T) %>% # table is sotred locally
dplyr::rename(`Cells before filter` = 'Cells.before.filter', # need to rename some columns that contain symbols
`Cells after filter` = 'Cells.after.filter') %>%
reshape2::melt(id.vars = 'Sample',
value.name = 'Number of cells',
variable.name = 'metric') %>%
ggplot(aes(x = Sample, y = `Number of cells`, fill = Sample)) +
geom_bar(stat = 'identity') +
theme_bw() +
theme(axis.ticks.x = element_blank(),
axis.text.x = element_text(hjust = 1, vjust = .5, angle = 90),) +
facet_grid(rows = vars(metric), scales = 'free') +
scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7',
'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa',
'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
ggsave('/project/analysis/plots/cells_before_after_filter.png', width = 10, height = 15, units = 'cm', dpi = 300)
library(tidyverse)
library(magrittr)
library(Seurat)
library(HGNChelper)
library(tuple)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggraph)
library(igraph)
library(tidyverse)
library(data.tree)
library(scater)
library(ReactomePA)
library(scCustomize)
library(scater)
library(ggvenn)
library(topGO)
library(Rgraphviz)
`%notin%` = Negate(`%in%`)
FDR = 0.05
set.seed(42)
resolution = 0.5
# Load the dataset
Untr_Ctrl.data = Read10X(data.dir = '/project/raw_data/Untr_Ctrl/outs/filtered_feature_bc_matrix/')
Untr_Ctrl.data = CreateSeuratObject(counts = Untr_Ctrl.data, project = 'Untr_Ctrl')
Untr_Ctrl.data
Untr_BaP.data = Read10X(data.dir = '/project/raw_data/Untr_BaP/outs/filtered_feature_bc_matrix/')
Untr_BaP.data = CreateSeuratObject(counts = Untr_BaP.data, project = 'Untr_BaP')
Untr_BaP.data
Male_PCB153.data = Read10X(data.dir = '/project/raw_data/Male_PCB153/outs/filtered_feature_bc_matrix/')
Male_PCB153.data = CreateSeuratObject(counts = Male_PCB153.data, project = 'Male_PCB153')
Male_PCB153.data
Male_Ctrl.data = Read10X(data.dir = '/project/raw_data/Male_Ctrl/outs/filtered_feature_bc_matrix/')
Male_Ctrl.data = CreateSeuratObject(counts = Male_Ctrl.data, project = 'Male_Ctrl')
Male_Ctrl.data
Male_BaP.data = Read10X(data.dir = '/project/raw_data/Male_BaP/outs/filtered_feature_bc_matrix/')
Male_BaP.data = CreateSeuratObject(counts = Male_BaP.data, project = 'Male_BaP')
Male_BaP.data
Female_PCB153.data = Read10X(data.dir = '/project/raw_data/Female_PCB153/outs/filtered_feature_bc_matrix/')
Female_PCB153.data = CreateSeuratObject(counts = Female_PCB153.data, project = 'Female_PCB153')
Female_PCB153.data
Female_Ctrl.data = Read10X(data.dir = '/project/raw_data/Female_Ctrl/outs/filtered_feature_bc_matrix/')
Female_Ctrl.data = CreateSeuratObject(counts = Female_Ctrl.data, project = 'Female_Ctrl')
Female_Ctrl.data
Female_BaP.data = Read10X(data.dir = '/project/raw_data/Female_BaP/outs/filtered_feature_bc_matrix/')
Female_BaP.data = CreateSeuratObject(counts = Female_BaP.data, project = 'Female_BaP')
Female_BaP.data
# sample names
samples = c('Untr_Ctrl', 'Untr_BaP',
'Male_PCB153','Male_Ctrl','Male_BaP',
'Female_PCB153','Female_Ctrl', 'Female_BaP')
# initialize the Seurat object with the raw (non-normalized data).
sobj = merge(Untr_Ctrl.data, y = c(Untr_BaP.data, Male_PCB153.data, Male_Ctrl.data, Male_BaP.data,
Female_PCB153.data, Female_Ctrl.data, Female_BaP.data),
add.cell.ids = c('Untr_Ctrl', 'Untr_BaP', 'Male_PCB153','Male_Ctrl','Male_BaP',
'Female_PCB153','Female_Ctrl', 'Female_BaP'),
project = 'human_follicles_hormones')
# add sample info to metadata
DefaultAssay(sobj) = ASSAY = 'RNA'
sobj@meta.data %<>% dplyr::rename(sample = 'orig.ident') %>%
separate(col = 'sample', into = c('hormones', 'EDC'), sep = '_', remove = F)
# calculate % of reads mapping to mitochondrial DNA
sobj[['percent.mt']] = PercentageFeatureSet(sobj, pattern = '^MT-')
# visualize QC metrics as a violin plot
p1 = VlnPlot(seurat_object, features = 'nFeature_RNA') +
geom_hline(yintercept = c(1700, 10000), color = 'red') +
scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7',
'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa',
'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
p2 = VlnPlot(seurat_object, features = 'nCount_RNA') +
geom_hline(yintercept = 800, color = 'red') +
scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7',
'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa',
'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
p3 = VlnPlot(seurat_object, features = 'percent.mt') +
geom_hline(yintercept = 12.5, color = 'red') +
scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7',
'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa',
'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
patchwork::wrap_plots(p1, p2, p3, ncol = 3)
ggsave('/plots/before_filter.png', width = 16, height = 8, unit = 'in', dpi = 300)
sobj = subset(sobj, subset = nFeature_RNA > 1700 & nFeature_RNA < 10000 & nCount_RNA > 800 & percent.mt < 12.5)
# normalize data
sobj[['percent.mt']] = PercentageFeatureSet(sobj, pattern = '^MT-')
sobj = NormalizeData(sobj, normalization.method = 'LogNormalize', scale.factor = 10000)
sobj = FindVariableFeatures(sobj, selection.method = 'vst', nfeatures = 2000)
# scale and run PCA
sobj = ScaleData(sobj, features = rownames(sobj))
sobj = RunPCA(sobj, features = VariableFeatures(object = sobj))
# investigate genes influencing the loading scores
print(sobj[['pca']], dims = 1:5, nfeatures = 5)
VizDimLoadings(sobj, dims = 1:3, reduction = 'pca')
DimPlot(sobj, reduction = 'pca')
# first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity).
# input: selected amount of PCs
sobj = FindNeighbors(sobj, dims = 1:15)
# modularity optimization techniques: Louvain algorithm (default) or SLM
# setting the resolution between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets.
sobj = FindClusters(sobj, resolution = resolution)
# determines the amount of clusters to use more precisely then reading the elbowplot. Resolution = between 0.4 (least resolution) and 1.2 (max resolution)
clusters = Idents(sobj)
sobj = RunUMAP(sobj, dims = 1:length(levels(clusters)))
DimPlot(sobj, reduction = 'umap', label = T, seed = 42, group.by = 'seurat_clusters')
# load gene set preparation function
source('/project/scripts/gene_sets_prepare.R')
# load cell type annotation function
source('/project/scripts/sctype_score_.R')
# DB file
db = '/project/sc_annotation_file.xlsx'
tissue = c('Thyroid')
# prepare gene sets
gs_list = gene_sets_prepare(db, tissue)
# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = sobj[['RNA']]@scale.data, scaled = TRUE,
gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)
# NOTE: scRNAseqData parameter should correspond to your input scRNA-seq matrix.
# In case Seurat is used, it is either sobj[['RNA']]@scale.data (default), sobj[['SCT']]@scale.data, in case sctransform is used for normalization,
# or sobj[['integrated']]@scale.data, in case a joint analysis of multiple single-cell datasets is performed.
# merge by cluster
cL_resutls = do.call('rbind', lapply(unique(sobj@meta.data$seurat_clusters), function(cl){
es.max.cl = sort(rowSums(es.max[ , rownames(sobj@meta.data[sobj@meta.data$seurat_clusters == cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(sobj@meta.data$seurat_clusters == cl)), 10)
}))
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to 'unknown'
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = 'Unknown'
print(sctype_scores[, 1:3])
# Overlay the annotation on the UMAP
sobj@meta.data$customclassif = ''
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster == j,];
sobj@meta.data$customclassif[sobj@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
DimPlot(sobj, reduction = 'umap', label = TRUE, seed = 42, repel = T, group.by = c('seurat_clusters', 'customclassif'))
ggsave('/project/analysis/plots/UMAP_classification.png', width = 16, height = 8, unit = 'in', dpi = 300)
# genes of interest
genes = c('TPO', 'TG', 'SLC5A5', 'IYD', 'PAX8', 'NKX2-1', 'FOXE1', 'HHEX', 'DUOX1', 'DUOX2', 'DUOXA1', 'DUOXA2', 'SLC16A2', 'DIO1', 'DIO2', 'SLC26A7')
# Note on VlnPlot(): by default it plots `slot = 'data'` (= ln-normalized values). If you want the raw counts, set `slot = 'counts'`
VlnPlot(sobj,
features = genes,
idents = 'Mature Thyrocytes',
group.by = 'sample',
cols = c('#FFBB00', '#FFEA61', '#FFFFB7', '#408af1', '#78b6fa', '#a8d7fa', 'red', 'pink'))
ggsave('/project/analysis/plots/violin_thyroid_gene.png', width = 40, height = 40, unit = 'cm', dpi = 300)
FeaturePlot(sobj, reduction = 'umap', features = genes, ncol = 4)
FeaturePlot(sobj, reduction = 'umap', features = genes, split.by = 'sample') # one plot per sample
# number of cells per type per sample
table(sobj@meta.data$customclassif, sobj@meta.data$sample) %>%
as.data.frame() %>%
transform(percentage = Freq / tapply(Freq, Var2, sum)[Var2] * 100) %>% # find composition in percentage
ggplot(aes(x = Var2, y = percentage, fill = Var1)) +
geom_bar(stat = 'identity', position = 'fill', color = 'black') +
theme_bw() +
labs(x = 'Sample', y = 'Composition, %') +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(labels = scales::percent) +
scale_fill_discrete(name = 'Cell type')
ggsave('/project/analysis/plots/samples_cell_composition.png', width = 10, height = 10, unit = 'cm', dpi = 300)
# expression of hormone receptor genes per cell type
genes = c('ESR1', 'ESR2', 'PGR', 'AR')
AverageExpression(sobj, features = genes, group.by = 'customclassif') %>%
as.data.frame() %>%
rownames_to_column(var = 'gene') %>%
reshape2::melt(id.vars = 'gene',
variable.name = 'Annotated cell type',
value.name = 'Average expression') %>%
mutate(`Annotated cell type` = sub('RNA\\.', '', `Annotated cell type`)) %>%
mutate(`Annotated cell type` = gsub('\\.', ' ', `Annotated cell type`)) %>%
ggplot(aes(x = gene, y = `Average expression`, fill = `Annotated cell type`)) +
geom_bar(stat = 'identity', color = 'black', linewidth = .5, position = 'dodge') +
theme_bw() +
ggtitle('All cells') +
theme(plot.title = element_text(hjust = .5))
ggsave('/project/analysis/plots/average_expression_hormone_receptors.png', width = 15, height = 10, units = 'cm', dpi = 300)
# % of cells expressing hormone receptor genes per cell type
scCustomize::Percent_Expressing(sobj, features = genes, group_by = 'customclassif') %>%
as.data.frame() %>%
rownames_to_column(var = 'Gene') %>%
reshape2::melt(id.vars = 'Gene',
variable.name = 'Annotated cell type',
value.name = 'Percent expressing') %>%
mutate(`Annotated cell type` = sub('RNA\\.', '', `Annotated cell type`)) %>%
mutate(`Annotated cell type` = gsub('\\.', ' ', `Annotated cell type`)) %>%
ggplot(aes(x = Gene, y = `Percent expressing`, fill = `Annotated cell type`)) +
geom_bar(stat = 'identity', position = 'dodge', color = 'black', linewidth = .5) +
geom_hline(yintercept = 10, linetype = 'dotted', col = 'grey25') + # for 10% threshold
theme_bw() +
ggtitle('All cells') +
theme(plot.title = element_text(hjust = .5))
ggsave('/project/analysis/plots/percentage_expression_hormone_receptors.png', width = 15, height = 10, units = 'cm', dpi = 300)
# add a col to the @meta.data that concatenates cell type and hormone treatment and EDC treatment
sobj$cell_hormone_edc = paste(sobj@meta.data[, 'customclassif'],
sobj@meta.data[, 'hormones'],
sobj@meta.data[, 'EDC'], sep = '_')
sobj$cell_hormone = paste(sobj@meta.data[, 'customclassif'],
sobj@meta.data[, 'hormones'], sep = '_')
Idents(sobj) = sobj$customclassif
# comparisons i want to test
test_comparisons = list(
male = c('Mature Thyrocytes_Male_Ctrl', 'Mature Thyrocytes_Untr_Ctrl'),
female = c('Mature Thyrocytes_Female_Ctrl', 'Mature Thyrocytes_Untr_Ctrl'),
male_female = c('Mature Thyrocytes_Male_Ctrl', 'Mature Thyrocytes_Female_Ctrl'),
bapResponse_affectedByMale = c('Mature Thyrocytes_Male_BaP', 'Mature Thyrocytes_Untr_BaP'),
bapResponse_affectedByFemale = c('Mature Thyrocytes_Female_BaP', 'Mature Thyrocytes_Untr_BaP'),
bapResponse = c('Mature Thyrocytes_Untr_BaP', 'Mature Thyrocytes_Untr_Ctrl'),
bapResponse_inMaleEnv = c('Mature Thyrocytes_Male_BaP', 'Mature Thyrocytes_Male_Ctrl'),
bapResponse_inFemaleEnv = c('Mature Thyrocytes_Female_BaP', 'Mature Thyrocytes_Female_Ctrl'),
PCB153Response_inMaleEnv = c('Mature Thyrocytes_Male_PCB153', 'Mature Thyrocytes_Male_Ctrl'),
PCB153Response_inFemaleEnv = c('Mature Thyrocytes_Female_PCB153', 'Mature Thyrocytes_Female_Ctrl')
)
# where to store all results
degs_all = list() # degs
genes_all = list() # all genes
for (x in 1:length(test_comparisons)) {
print(paste0('Comparing ', test_comparisons[[x]][1], ' VS ', test_comparisons[[x]][2]))
# find DEGs
all_genes = FindMarkers(Mature_Thyrocytes_sobj, ident.1 = test_comparisons[[x]][1], ident.2 = test_comparisons[[x]][2], group.by = 'cell_hormone_edc', logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, min.cells.feature = 3, min.cells.group = 3, base = 2)
# assign DEGs results to list that collects them all
degs_all[[x]] = filter(all_genes, p_val_adj < FDR)
genes_all[[x]] = all_genes
names(degs_all)[[x]] = paste0(test_comparisons[[x]][1], '_VS_', test_comparisons[[x]][2])
names(genes_all)[[x]] = paste0(test_comparisons[[x]][1], '_VS_', test_comparisons[[x]][2])
# also, save results as a table
# all genes
write.csv(genes_all[[x]], paste0(test_comparisons[[x]][1], '_', test_comparisons[[x]][2], '_All_genes.csv'))
# DEGs
write.csv(degs_all[[x]], paste0(test_comparisons[[x]][1], '_', test_comparisons[[x]][2], '_DEGs_FDR_', FDR, '.csv'))
# how many genes have been tested and how many DEGs (FDR < 0.05) there are
num_genes = nrow(genes_all[[x]])
num_DEGs = nrow(degs_all[[x]])
DEG_percent = (num_DEGs/num_genes)*100
# give some info on number of tested genes and DEGs
print(paste0('Found ', num_DEGs, ' unique DEGs (', num_genes, ' genes tested) when comparing ', test_comparisons[[x]][1], ' - ', test_comparisons[[x]][2], '; ', round(DEG_percent, digits = 1), '% is differentially expressed.'))
}
# save all tested genes and DEGs in two excel files
degs_all_save = degs_all
for (x in 1:10) { # number of comparisons
names(degs_all_save)[x] = paste0(map_chr(str_split(test_comparisons[[x]][1], 'Thyrocytes_'), 2),
'_VS_',
map_chr(str_split(test_comparisons[[x]][2], 'Thyrocytes_'), 2))
degs_all_save[[x]] %<>% rownames_to_column(var = 'gene')
}
writexl::write_xlsx(degs_all_save, '/project/analysis/DEGs_all_comparisons.xlsx')
genes_all_save = genes_all
for (x in 1:10) {
names(genes_all_save)[x] = paste0(map_chr(str_split(test_comparisons[[x]][1], 'Thyrocytes_'), 2),
'_VS_',
map_chr(str_split(test_comparisons[[x]][2], 'Thyrocytes_'), 2))
genes_all_save[[x]] %<>% rownames_to_column(var = 'gene')
}
writexl::write_xlsx(genes_all_save, '/project/analysis/all_genes_all_comparisons.xlsx')
Color OXPHOS, ribosomal genes and lipid transport genes
# KEGG pathway hsa00190 “Oxidative phosphorylation” (KEGG release 107)
oxphos = read.table('project/KEGG_PATHWAY_hsa00190_genes_31_07_2023.txt', sep = '\t', header = T)
# GO term “Lipid transport” (GO:0006869)
lipid_transport = read.table('project/GO_6869_lipid_transport.txt', header = F, sep = '\t', quote = '') %>%
filter(V12 == 'protein') %>%
pull(V1) %>%
unique()
titles = c('Male Ctrl vs Untr Ctrl',
'Female Ctrl vs Untr Ctrl',
'Male Ctrl vs Female Ctrl',
'Male BaP vs Untr BaP',
'Female BaP vs Untr BaP',
'Untr BaP vs Untr Ctrl',
'Male BaP vs Male Ctrl',
'Female BaP vs Female Ctrl',
'Male PCB153 vs Male Ctrl')
for (x in 1:length(titles)) {
genes_all[[x]] %>%
rownames_to_column(var = 'gene') %>%
mutate(gene_category = case_when(
grepl('^RP[SL]', gene) & p_val_adj < 0.05 ~ 'Ribosomal protein',
gene %in% oxphos[oxphos$location == 'mitochondrial', ]$gene & p_val_adj < 0.05 ~ 'OXPHOS (mitochondrial genome)',
gene %in% oxphos[oxphos$location == 'nuclear', ]$gene & p_val_adj < 0.05 ~ 'OXPHOS (nuclear genome)',
gene %in% lipid_transport & p_val_adj < 0.05 ~ 'Lipid transport',
TRUE ~ 'none')) %>% # all other cases
ggplot(aes(x = avg_log2FC, y = -log10(p_val_adj))) +
geom_point(aes(colour = gene_category)) +
geom_hline(yintercept = -log10(0.05), linetype = 'dotted', col = 'grey25') +
geom_vline(xintercept = 0, linetype = 'solid', col = 'grey25') +
ggrepel::geom_text_repel(data = . %>% filter(gene_category == 'Lipid transport'), aes(label = gene)) +
scale_color_manual(name = '',
values = c('chartreuse2', 'cadetblue1', 'blue', 'darkgoldenrod1', 'black'),
breaks = c('Ribosomal protein', 'OXPHOS (mitochondrial genome)', 'OXPHOS (nuclear genome)', 'Lipid transport', 'none'),
labels = c('Ribosomal gene', 'OXPHOS (mitochondrial genome)', 'OXPHOS (nuclear genome)', 'Lipid transport', 'other genes')) +
labs(title = titles[x], x = 'Average log2(Fold Change)', y = '-log10(FDR)') +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5)) +
xlim(c(-1.7, 1.8))
ggsave(paste0('/project/analysis/plots/Volcano_', titles[x] ,'.png'), width = 10, height = 15, units = 'cm', dpi = 300)
}
# clusterProfiler
ontology = 'BP'
go_analysis = function(DEG_LIST, i, ...) {
gene_list = bitr(rownames(DEG_LIST), fromType = 'SYMBOL', toType = 'ENTREZID', OrgDb = 'org.Hs.eg.db')
# if some genes failed to map, report it
print('These genes failed to map: ')
print(DEG_LIST[rownames(DEG_LIST) %notin% gene_list$SYMBOL, 1:2])
x = enrichGO(gene = gene_list$ENTREZ,
OrgDb = org.Hs.eg.db,
qvalueCutoff = 0.01,
pvalueCutoff = 0.01,
ont = ontology,
readable = TRUE)
x = clusterProfiler::simplify(x, by = 'qvalue')
npathway = x@result%>%
as.data.frame %>%
filter(qvalue < 0.01) %>%
nrow()
print(paste0('Enriched GO pathways: ', npathway))
# also, save results as a table
x@result %>%
as.data.frame %>%
filter(qvalue < 0.01) %>%
write.csv(., paste('project/analysis/GO', ontology, 'simplified', test_comparisons[[i]][1], test_comparisons[[i]][2], ..., 'qvalue_0.01.csv', sep = '_'))
return(x)
}
go_male_vs_untr = go_analysis(degs_all[[1]], 1)
# emap plot
go_male_vs_untr_EMAP = pairwise_termsim(go_male_vs_untr)
emapplot(go_male_vs_untr_EMAP, color = 'qvalue', pie.params = list(pie = 'count'), cex.params = list(category_node = 1)) +
scale_fill_continuous(low = 'blue', high = 'grey90', name = 'qvalue') +
theme(plot.title = element_text(face = 'bold', hjust = .5)) +
ggtitle(titles[1])
ggsave(paste0('emapplot_GO_', ontology, '_', titles[1], '.png'), dpi = 300, units = 'cm', width = 25, height = 20, bg = 'white')
# define function for coloring based on log2(fold change)
heatplot_fc = function(DEG_LIST) {
gene_fc = DEG_LIST$avg_log2FC
names(gene_fc) = rownames(DEG_LIST)
return(gene_fc)
}
# heatplot
heatplot(go_male_vs_untr, foldChange = heatplot_fc(degs_all[[1]])) +
ggtitle(paste(titles[1], '- GO Biological Process')) +
theme(plot.title = element_text(face = 'bold', hjust = .5))
ggsave(paste0('heatplot_GO_', ontology, '_', titles[1], '.png'), width = 25, height = 15, dpi = 300, units = 'cm', bg = 'white')
N.B.: only in the comparisons Untr BaP vs Untr Ctrl, Male BaP vs Male Ctrl, Female BaP vs Female Ctrl
mylist = list()
# extract genes only if they are DEGs
for (i in 1:3) {
print(i)
mylist[[i]] = degs_all[[i+5]] %>% filter(grepl('CYP1|NQO1|ALDH3|TIPARP|AHR', rownames(.)))
names(mylist)[[i]] = titles[i+5] # interested in comparisons 6, 7, 8
}
mylist
# $`Untr BaP vs Untr Ctrl`
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# CYP1B1 2.121654e-21 0.7621296 0.439 0.068 7.765465e-17
# NQO1 4.166497e-07 0.5565888 0.735 0.633 1.524980e-02
#
# $`Male BaP vs Male Ctrl`
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# CYP1B1 8.545337e-19 0.8081897 0.458 0.126 3.127679e-14
# CYP1A1 2.905552e-13 0.3147409 0.228 0.009 1.063461e-08
# NQO1 5.812601e-10 0.5464255 0.707 0.604 2.127470e-05
#
# $`Female BaP vs Female Ctrl`
# p_val avg_log2FC pct.1 pct.2 p_val_adj
# CYP1B1 3.898564e-16 1.5897009 0.544 0.080 1.426914e-11
# CYP1A1 5.454316e-09 1.1776597 0.262 0.000 1.996334e-04
# NQO1 3.050631e-07 0.8004275 0.725 0.554 1.116561e-02
# add column with comparison name
for (i in 1:3) {
mylist[[i]]$Comparison = names(mylist)[i]
mylist[[i]] %<>% rownames_to_column(var = 'Gene')
}
mylist
# $`Untr BaP vs Untr Ctrl`
# Gene p_val avg_log2FC pct.1 pct.2 p_val_adj Comparison
# 1 CYP1B1 2.121654e-21 0.7621296 0.439 0.068 7.765465e-17 Untr BaP vs Untr Ctrl
# 2 NQO1 4.166497e-07 0.5565888 0.735 0.633 1.524980e-02 Untr BaP vs Untr Ctrl
#
# $`Male BaP vs Male Ctrl`
# Gene p_val avg_log2FC pct.1 pct.2 p_val_adj Comparison
# 1 CYP1B1 8.545337e-19 0.8081897 0.458 0.126 3.127679e-14 Male BaP vs Male Ctrl
# 2 CYP1A1 2.905552e-13 0.3147409 0.228 0.009 1.063461e-08 Male BaP vs Male Ctrl
# 3 NQO1 5.812601e-10 0.5464255 0.707 0.604 2.127470e-05 Male BaP vs Male Ctrl
#
# $`Female BaP vs Female Ctrl`
# Gene p_val avg_log2FC pct.1 pct.2 p_val_adj Comparison
# 1 CYP1B1 3.898564e-16 1.5897009 0.544 0.080 1.426914e-11 Female BaP vs Female Ctrl
# 2 CYP1A1 5.454316e-09 1.1776597 0.262 0.000 1.996334e-04 Female BaP vs Female Ctrl
# 3 NQO1 3.050631e-07 0.8004275 0.725 0.554 1.116561e-02 Female BaP vs Female Ctrl
# make single dataframe and plot
rbind(mylist[[1]], mylist[[2]], mylist[[3]]) %>%
ggplot(aes(x = Gene, y = avg_log2FC, fill = Comparison)) +
geom_bar(stat = 'identity', position = position_dodge2(preserve = 'single')) +
scale_fill_manual(name = 'Comparison',
values = c('#f37d80', '#777aba', '#feea7f'),
labels = c('Untr BaP vs Untr Ctrl', 'Male BaP vs Male Ctrl', 'Female BaP vs Female Ctrl'),
breaks = c('Untr BaP vs Untr Ctrl', 'Male BaP vs Male Ctrl', 'Female BaP vs Female Ctrl')) +
theme_bw() +
labs(y = 'Average log2(Fold Change)', title = 'Mature Thyrocytes') +
theme(plot.title = element_text(hjust = 0.5))
ggsave('project/analysis/plots/fc_AHR_and_targets.png', width = 10, height = 10, dpi = 300, units = 'cm')
# calculate %RP
sobj[['percent.rp']] = PercentageFeatureSet(sobj, pattern = '^RP[SL]')
# subset sobj for thyrocytes only
Mature_Thyrocytes_sobj = subset(sobj, subset = customclassif == 'Mature Thyrocytes')
# find min, max, median of %RP per sample
df = data.frame(matrix(nrow = 8, ncol = 3))
colnames(df) = c('Min', 'Median', 'Max')
rownames(df) = samples
for (i in 1:length(samples)) {
df[i, 1] = subset(Mature_Thyrocytes_sobj, subset = sample == samples[i])$percent.rp %>% min
df[i, 2] = subset(Mature_Thyrocytes_sobj, subset = sample == samples[i])$percent.rp %>% median
df[i, 3] = subset(Mature_Thyrocytes_sobj, subset = sample == samples[i])$percent.rp %>% max
}
# projection on UMAP of %RP
FeaturePlot(Mature_Thyrocytes_sobj, reduction = 'umap', features = 'percent.rp', split.by = 'sample')
ggsave('percent_RP_thyrocytes.png', height = 10, width = 90, dpi = 300, units = 'cm')
# Violin plot of %RP
median.stat = function(x){
out = quantile(x, probs = c(0.5))
names(out) = c('ymed')
return(out)
}
VlnPlot(object = Mature_Thyrocytes_sobj, group.by = 'sample', features = 'percent.rp') +
stat_summary(fun.y = median.stat, geom = 'crossbar') + # to add median expression as a bar
labs(title = '', y = 'Percentage RP', x = 'Samples') +
scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7',
'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa',
'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
ggsave('/project/analysis/plots/violin_percent_RP_thyrocytes.png', height = 15, width = 15, dpi = 300, units = 'cm')
a = VlnPlot(Mature_Thyrocytes_sobj, features = 'percent.rp', group.by = 'sample')
rp_thyrocytes = list()
# extract %RP
for (i in 1:8) {
print(samples[i])
rp_thyrocytes[[i]] = filter(a$data, ident == samples[i]) %>% pull(percent.rp)
names(rp_thyrocytes)[[i]] = samples[i]
}
# check if samples are normally distributed - they are not
for (i in 1:8) {
print(names(samples)[i])
shap = shapir o.test(rp_thyrocytes[[i]])
summary(shap)
}
# kruskal-wallis test (non-parametric equivalent of ANOVA)
kruskal.test(a$data$percent.rp ~ a$data$ident, a$data)
# post-hoc test
FSA::dunnTest(a$data$percent.rp ~ a$data$ident, a$data, method = 'bh')
# degs_all[[6]] = Untr BaP vs Untr Ctrl
# degs_all[[7]] = Male BaP vs Male Ctrl
# degs_all[[8]] = Female BaP vs Female Ctrl
data.frame(all_genes_venn = unique(c(rownames(degs_all[[6]]),
rownames(degs_all[[7]]),
rownames(degs_all[[8]])))) %>%
mutate(venn6 = if_else(all_genes_venn %in% rownames(degs_all[[6]]), T, F),
venn7 = if_else(all_genes_venn %in% rownames(degs_all[[7]]), T, F),
venn8 = if_else(all_genes_venn %in% rownames(degs_all[[8]]), T, F)) %>%
ggplot() +
geom_venn(aes(A = venn6, B = venn7, C = venn8),
show_percentage = FALSE,
set_names = c(paste0('Untr BaP vs Untr Ctrl\n(', nrow(degs_all[[6]]), ')'),
paste0('Male BaP vs Male Ctrl\n(', nrow(degs_all[[7]]), ')'),
paste0('Female BaP vs Female Ctrl\n(', nrow(degs_all[[8]]), ')')),
fill_color = c('red', 'blue', 'gold'),
fill_alpha = 0.5,
stroke_size = 0.7,
text_size = 5.5) +
coord_fixed() +
theme_void()
ggsave('project/analysis/plots/BAP_DEGs.png', width = 15, height = 15, units = 'cm', dpi = 300, bg = 'white')