Markdown document collecting the code used in the paper by Nazzari, M., Romitti, M., Kip, A. M., Kamps, R., Costagliola, S., van de Beucken, T., Moroni, L., & Caiment, F. (2024) Impact of benzo[a]pyrene, PCB153 and sex hormones on human ESC-derived thyroid follicles using single cell transcriptomics. Environment International, 188, 108748. https://doi.org/10.1016/j.envint.2024.108748

1 Set up working directory and create analysis subfolders

# main working directory
setwd('/project/')
dir.create('/project/analysis/', showWarnings = F)
dir.create('/project/analysis/plots/', showWarnings = F)

2 scRNA-Seq dataset stats

# sequenced reads
read.table('/project/sequencing_qc.txt', sep = '\t', header = T) %>% # this table is stored locally
  rename(`Number of cells` = 'Number.of.cells', # need to rename some columns that contain symbols
         `Number of Reads, x10^6` = 'Number.of.Reads..x10.6', 
         `Library Size, MBases` = 'Library.Size..MBases', 
         `% >= Q30 bases` = 'X.....Q30.bases',
         `Mean Quality Score` = 'Mean.Quality.Score',
         `Median Genes per Cell` = 'Median.Genes.per.Cell',
         `Sequencing Saturation, %` = 'Sequencing.Saturation...',
         `Reads Mapped Confidently to Genome, %` = 'Reads.Mapped.Confidently.to.Genome...',
         `Median Reads per Cell` = 'Mean.Reads.per.Cell') %>%
  dplyr::select(-`Library Size, MBases`) %>% 
  reshape2::melt(id.vars = 'Sample',
                 value.name = 'value',
                 variable.name = 'metric') %>% 
  ggplot(aes(x = value, y = 1)) +
  geom_boxplot(aes(fill = metric), alpha = .3) +
  geom_point() +  
  ggrepel::geom_text_repel(aes(label = Sample), 
                           segment.alpha = .5, 
                           size = 3, 
                           point.padding = .1,
                           min.segment.length = 0) +
  theme_bw() +
  labs(x = '', y = '') +
  theme(axis.ticks.y = element_blank(),
        axis.text.y = element_blank()) +
  scale_x_continuous(labels = scales::label_comma()) +
  facet_wrap(vars(metric), scales = 'free', ncol = 2, nrow = 4) +
  guides(fill = 'none')

ggsave('/project/analysis/plots/scrnaseq_qc.png', width = 20, height = 15, units = 'cm', dpi = 300)

# number of cells before/after filter
read.table('cells_before_after_filter.txt', sep = '\t', header = T) %>% # table is sotred locally
  dplyr::rename(`Cells before filter` = 'Cells.before.filter', # need to rename some columns that contain symbols
         `Cells after filter` = 'Cells.after.filter') %>%
  reshape2::melt(id.vars = 'Sample',
                 value.name = 'Number of cells',
                 variable.name = 'metric') %>% 
  ggplot(aes(x = Sample, y = `Number of cells`, fill = Sample)) +
  geom_bar(stat = 'identity') +
  theme_bw() +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_text(hjust = 1, vjust = .5, angle = 90),) +
  facet_grid(rows = vars(metric), scales = 'free') +
  scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7', 
                         'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa', 
                         'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))

ggsave('/project/analysis/plots/cells_before_after_filter.png', width = 10, height = 15, units = 'cm', dpi = 300)

3 scRNA-Seq data analysis

3.1 Create Seurat object

library(tidyverse)
library(magrittr)
library(Seurat)
library(HGNChelper)
library(tuple)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggraph)
library(igraph)
library(tidyverse)
library(data.tree)
library(scater)
library(ReactomePA)
library(scCustomize)
library(scater)
library(ggvenn)
library(topGO)
library(Rgraphviz)

`%notin%` = Negate(`%in%`)
FDR = 0.05
set.seed(42)

resolution = 0.5


# Load the dataset
Untr_Ctrl.data = Read10X(data.dir = '/project/raw_data/Untr_Ctrl/outs/filtered_feature_bc_matrix/')
Untr_Ctrl.data = CreateSeuratObject(counts = Untr_Ctrl.data, project = 'Untr_Ctrl')
Untr_Ctrl.data

Untr_BaP.data = Read10X(data.dir = '/project/raw_data/Untr_BaP/outs/filtered_feature_bc_matrix/')
Untr_BaP.data = CreateSeuratObject(counts = Untr_BaP.data, project = 'Untr_BaP')
Untr_BaP.data

Male_PCB153.data = Read10X(data.dir = '/project/raw_data/Male_PCB153/outs/filtered_feature_bc_matrix/')
Male_PCB153.data = CreateSeuratObject(counts = Male_PCB153.data, project = 'Male_PCB153')
Male_PCB153.data

Male_Ctrl.data = Read10X(data.dir = '/project/raw_data/Male_Ctrl/outs/filtered_feature_bc_matrix/')
Male_Ctrl.data = CreateSeuratObject(counts = Male_Ctrl.data, project = 'Male_Ctrl')
Male_Ctrl.data

Male_BaP.data = Read10X(data.dir = '/project/raw_data/Male_BaP/outs/filtered_feature_bc_matrix/')
Male_BaP.data = CreateSeuratObject(counts = Male_BaP.data, project = 'Male_BaP')
Male_BaP.data

Female_PCB153.data = Read10X(data.dir = '/project/raw_data/Female_PCB153/outs/filtered_feature_bc_matrix/')
Female_PCB153.data = CreateSeuratObject(counts = Female_PCB153.data, project = 'Female_PCB153')
Female_PCB153.data

Female_Ctrl.data = Read10X(data.dir = '/project/raw_data/Female_Ctrl/outs/filtered_feature_bc_matrix/')
Female_Ctrl.data = CreateSeuratObject(counts = Female_Ctrl.data, project = 'Female_Ctrl')
Female_Ctrl.data

Female_BaP.data = Read10X(data.dir = '/project/raw_data/Female_BaP/outs/filtered_feature_bc_matrix/')
Female_BaP.data = CreateSeuratObject(counts = Female_BaP.data, project = 'Female_BaP')
Female_BaP.data

# sample names
samples = c('Untr_Ctrl', 'Untr_BaP', 
            'Male_PCB153','Male_Ctrl','Male_BaP', 
            'Female_PCB153','Female_Ctrl', 'Female_BaP')

# initialize the Seurat object with the raw (non-normalized data).
sobj = merge(Untr_Ctrl.data, y = c(Untr_BaP.data, Male_PCB153.data, Male_Ctrl.data, Male_BaP.data,
                                   Female_PCB153.data, Female_Ctrl.data, Female_BaP.data), 
             add.cell.ids = c('Untr_Ctrl', 'Untr_BaP', 'Male_PCB153','Male_Ctrl','Male_BaP', 
                              'Female_PCB153','Female_Ctrl', 'Female_BaP'), 
             project = 'human_follicles_hormones')

# add sample info to metadata
DefaultAssay(sobj) = ASSAY = 'RNA'
sobj@meta.data %<>% dplyr::rename(sample = 'orig.ident') %>% 
  separate(col = 'sample', into = c('hormones', 'EDC'), sep = '_', remove = F)

# calculate % of reads mapping to mitochondrial DNA 
sobj[['percent.mt']] = PercentageFeatureSet(sobj, pattern = '^MT-')

# visualize QC metrics as a violin plot
p1 = VlnPlot(seurat_object, features = 'nFeature_RNA') + 
    geom_hline(yintercept = c(1700, 10000), color = 'red') +
    scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7', 
                                 'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa', 
                                 'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
p2 = VlnPlot(seurat_object, features = 'nCount_RNA') + 
    geom_hline(yintercept = 800, color = 'red') +
    scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7', 
                                 'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa', 
                                 'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))
p3 = VlnPlot(seurat_object, features = 'percent.mt') + 
    geom_hline(yintercept = 12.5, color = 'red') +
    scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7', 
                                 'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa', 
                                 'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))

patchwork::wrap_plots(p1, p2, p3, ncol = 3)
ggsave('/plots/before_filter.png', width = 16, height = 8, unit = 'in', dpi = 300)

3.2 Filter for number of features (i.e. genes), count (i.e. read count per cell) and percentage of reads mapping to the mitochondrial genome

sobj = subset(sobj, subset = nFeature_RNA > 1700 & nFeature_RNA < 10000 & nCount_RNA > 800 & percent.mt < 12.5)

3.3 Normalize data, run PCA and divide into clusters

# normalize data
sobj[['percent.mt']] = PercentageFeatureSet(sobj, pattern = '^MT-')
sobj = NormalizeData(sobj, normalization.method = 'LogNormalize', scale.factor = 10000)
sobj = FindVariableFeatures(sobj, selection.method = 'vst', nfeatures = 2000)

# scale and run PCA
sobj = ScaleData(sobj, features = rownames(sobj))
sobj = RunPCA(sobj, features = VariableFeatures(object = sobj))

# investigate genes influencing the loading scores
print(sobj[['pca']], dims = 1:5, nfeatures = 5)
VizDimLoadings(sobj, dims = 1:3, reduction = 'pca')
DimPlot(sobj, reduction = 'pca')

# first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity).
# input: selected amount of PCs
sobj = FindNeighbors(sobj, dims = 1:15)

# modularity optimization techniques: Louvain algorithm (default) or SLM
# setting the resolution between 0.4-1.2 typically returns good results for single-cell datasets of around 3K cells. Optimal resolution often increases for larger datasets.
sobj = FindClusters(sobj, resolution = resolution) 
# determines the amount of clusters to use more precisely then reading the elbowplot. Resolution = between 0.4 (least resolution) and 1.2 (max resolution)
clusters = Idents(sobj)

sobj = RunUMAP(sobj, dims = 1:length(levels(clusters)))
DimPlot(sobj, reduction = 'umap', label = T, seed = 42, group.by = 'seurat_clusters')

3.4 Annotate clusters

# load gene set preparation function
source('/project/scripts/gene_sets_prepare.R')
# load cell type annotation function
source('/project/scripts/sctype_score_.R')

# DB file
db = '/project/sc_annotation_file.xlsx'
tissue = c('Thyroid')

# prepare gene sets
gs_list = gene_sets_prepare(db, tissue)

# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = sobj[['RNA']]@scale.data, scaled = TRUE, 
                      gs = gs_list$gs_positive, gs2 = gs_list$gs_negative) 

# NOTE: scRNAseqData parameter should correspond to your input scRNA-seq matrix. 
# In case Seurat is used, it is either sobj[['RNA']]@scale.data (default), sobj[['SCT']]@scale.data, in case sctransform is used for normalization,
# or sobj[['integrated']]@scale.data, in case a joint analysis of multiple single-cell datasets is performed.

# merge by cluster
cL_resutls = do.call('rbind', lapply(unique(sobj@meta.data$seurat_clusters), function(cl){
    es.max.cl = sort(rowSums(es.max[ , rownames(sobj@meta.data[sobj@meta.data$seurat_clusters == cl, ])]), decreasing = !0)
    head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(sobj@meta.data$seurat_clusters == cl)), 10)
}))
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  


# set low-confident (low ScType score) clusters to 'unknown'
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = 'Unknown'
print(sctype_scores[, 1:3])


# Overlay the annotation on the UMAP
sobj@meta.data$customclassif = ''
for(j in unique(sctype_scores$cluster)){
  cl_type = sctype_scores[sctype_scores$cluster == j,]; 
  sobj@meta.data$customclassif[sobj@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}

DimPlot(sobj, reduction = 'umap', label = TRUE, seed = 42, repel = T, group.by = c('seurat_clusters', 'customclassif')) 
ggsave('/project/analysis/plots/UMAP_classification.png', width = 16, height = 8, unit = 'in', dpi = 300)

3.5 [Example] Violin plot (level of gene expression) and UMAP with overlapped pattern of gene expression

# genes of interest
genes = c('TPO', 'TG', 'SLC5A5', 'IYD', 'PAX8', 'NKX2-1', 'FOXE1', 'HHEX', 'DUOX1', 'DUOX2', 'DUOXA1', 'DUOXA2', 'SLC16A2', 'DIO1', 'DIO2', 'SLC26A7')

# Note on VlnPlot(): by default it plots `slot = 'data'` (= ln-normalized values). If you want the raw counts, set `slot = 'counts'` 
VlnPlot(sobj, 
        features = genes, 
        idents = 'Mature Thyrocytes', 
        group.by = 'sample', 
        cols = c('#FFBB00', '#FFEA61', '#FFFFB7', '#408af1', '#78b6fa', '#a8d7fa', 'red', 'pink')) 

ggsave('/project/analysis/plots/violin_thyroid_gene.png', width = 40, height = 40, unit = 'cm', dpi = 300)


FeaturePlot(sobj, reduction = 'umap', features = genes, ncol = 4) 
FeaturePlot(sobj, reduction = 'umap', features = genes, split.by = 'sample') # one plot per sample

3.6 Samples cell composition

# number of cells per type per sample
table(sobj@meta.data$customclassif, sobj@meta.data$sample) %>% 
  as.data.frame() %>% 
  transform(percentage = Freq / tapply(Freq, Var2, sum)[Var2] * 100) %>% # find composition in percentage
  ggplot(aes(x = Var2, y = percentage, fill = Var1)) +
  geom_bar(stat = 'identity', position = 'fill', color = 'black') +
  theme_bw() +
  labs(x = 'Sample', y = 'Composition, %') + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_discrete(name = 'Cell type')

ggsave('/project/analysis/plots/samples_cell_composition.png', width = 10, height = 10, unit = 'cm', dpi = 300)   

3.7 [Example] Average gene expression and percent of expressing cells

# expression of hormone receptor genes per cell type
genes = c('ESR1', 'ESR2', 'PGR', 'AR')

AverageExpression(sobj, features = genes, group.by = 'customclassif') %>%
    as.data.frame() %>%
    rownames_to_column(var = 'gene') %>%
    reshape2::melt(id.vars = 'gene',
        variable.name = 'Annotated cell type',
        value.name = 'Average expression') %>%
    mutate(`Annotated cell type` = sub('RNA\\.', '', `Annotated cell type`)) %>%
    mutate(`Annotated cell type` = gsub('\\.', ' ', `Annotated cell type`)) %>%
    ggplot(aes(x = gene, y = `Average expression`, fill = `Annotated cell type`)) +
        geom_bar(stat = 'identity', color = 'black', linewidth = .5, position = 'dodge') +
        theme_bw() +
        ggtitle('All cells') +
        theme(plot.title = element_text(hjust = .5)) 

ggsave('/project/analysis/plots/average_expression_hormone_receptors.png', width = 15, height = 10, units = 'cm', dpi = 300)


# % of cells expressing hormone receptor genes per cell type
scCustomize::Percent_Expressing(sobj, features = genes, group_by = 'customclassif') %>%
    as.data.frame() %>%
    rownames_to_column(var = 'Gene') %>%
    reshape2::melt(id.vars = 'Gene',
        variable.name = 'Annotated cell type',
        value.name = 'Percent expressing') %>%
    mutate(`Annotated cell type` = sub('RNA\\.', '', `Annotated cell type`)) %>%
    mutate(`Annotated cell type` = gsub('\\.', ' ', `Annotated cell type`)) %>%
    ggplot(aes(x = Gene, y = `Percent expressing`, fill = `Annotated cell type`)) +
        geom_bar(stat = 'identity', position = 'dodge', color = 'black', linewidth = .5) +
        geom_hline(yintercept = 10, linetype = 'dotted', col = 'grey25') + # for 10% threshold
        theme_bw() +
        ggtitle('All cells') +
        theme(plot.title = element_text(hjust = .5))

ggsave('/project/analysis/plots/percentage_expression_hormone_receptors.png', width = 15, height = 10, units = 'cm', dpi = 300)

3.8 Differential expression analysis using Seurat::FindMarkers()

# add a col to the @meta.data that concatenates cell type and hormone treatment and EDC treatment
sobj$cell_hormone_edc = paste(sobj@meta.data[, 'customclassif'],
                      sobj@meta.data[, 'hormones'],
                      sobj@meta.data[, 'EDC'], sep = '_')

sobj$cell_hormone = paste(sobj@meta.data[, 'customclassif'],
                  sobj@meta.data[, 'hormones'], sep = '_')

Idents(sobj) = sobj$customclassif


# comparisons i want to test
test_comparisons = list(
    male = c('Mature Thyrocytes_Male_Ctrl', 'Mature Thyrocytes_Untr_Ctrl'),
    female = c('Mature Thyrocytes_Female_Ctrl', 'Mature Thyrocytes_Untr_Ctrl'),
    male_female = c('Mature Thyrocytes_Male_Ctrl', 'Mature Thyrocytes_Female_Ctrl'),
    bapResponse_affectedByMale = c('Mature Thyrocytes_Male_BaP', 'Mature Thyrocytes_Untr_BaP'),
    bapResponse_affectedByFemale = c('Mature Thyrocytes_Female_BaP', 'Mature Thyrocytes_Untr_BaP'),
    bapResponse = c('Mature Thyrocytes_Untr_BaP', 'Mature Thyrocytes_Untr_Ctrl'),
    bapResponse_inMaleEnv = c('Mature Thyrocytes_Male_BaP', 'Mature Thyrocytes_Male_Ctrl'),
    bapResponse_inFemaleEnv = c('Mature Thyrocytes_Female_BaP', 'Mature Thyrocytes_Female_Ctrl'),
    PCB153Response_inMaleEnv = c('Mature Thyrocytes_Male_PCB153', 'Mature Thyrocytes_Male_Ctrl'), 
    PCB153Response_inFemaleEnv = c('Mature Thyrocytes_Female_PCB153', 'Mature Thyrocytes_Female_Ctrl')
    )

# where to store all results
degs_all = list() # degs
genes_all = list() # all genes

for (x in 1:length(test_comparisons)) {
    
    print(paste0('Comparing ', test_comparisons[[x]][1], ' VS ', test_comparisons[[x]][2]))

    # find DEGs
    all_genes = FindMarkers(Mature_Thyrocytes_sobj, ident.1 = test_comparisons[[x]][1], ident.2 = test_comparisons[[x]][2], group.by = 'cell_hormone_edc', logfc.threshold = 0.25, test.use = "wilcox", min.pct = 0.1, min.cells.feature = 3, min.cells.group = 3, base = 2)
    
    # assign DEGs results to list that collects them all
    degs_all[[x]] = filter(all_genes, p_val_adj < FDR)
    genes_all[[x]] = all_genes
    names(degs_all)[[x]] = paste0(test_comparisons[[x]][1], '_VS_', test_comparisons[[x]][2])
    names(genes_all)[[x]] = paste0(test_comparisons[[x]][1], '_VS_', test_comparisons[[x]][2])

    # also, save results as a table
    # all genes
    write.csv(genes_all[[x]], paste0(test_comparisons[[x]][1], '_', test_comparisons[[x]][2], '_All_genes.csv'))
    
    # DEGs
    write.csv(degs_all[[x]], paste0(test_comparisons[[x]][1], '_', test_comparisons[[x]][2], '_DEGs_FDR_', FDR, '.csv'))
    
    # how many genes have been tested and how many DEGs (FDR < 0.05) there are
    num_genes = nrow(genes_all[[x]])
    num_DEGs = nrow(degs_all[[x]])
    DEG_percent = (num_DEGs/num_genes)*100

    # give some info on number of tested genes and DEGs
    print(paste0('Found ', num_DEGs, ' unique DEGs (', num_genes, ' genes tested) when comparing ', test_comparisons[[x]][1], ' - ', test_comparisons[[x]][2], '; ', round(DEG_percent, digits = 1), '% is differentially expressed.'))
    
    }

# save all tested genes and DEGs in two excel files
degs_all_save = degs_all

for (x in 1:10) { # number of comparisons
    names(degs_all_save)[x] = paste0(map_chr(str_split(test_comparisons[[x]][1], 'Thyrocytes_'), 2), 
                                     '_VS_',
                                     map_chr(str_split(test_comparisons[[x]][2], 'Thyrocytes_'), 2))
    
    degs_all_save[[x]] %<>% rownames_to_column(var = 'gene')
}

writexl::write_xlsx(degs_all_save, '/project/analysis/DEGs_all_comparisons.xlsx')

genes_all_save = genes_all


for (x in 1:10) {
    names(genes_all_save)[x] = paste0(map_chr(str_split(test_comparisons[[x]][1], 'Thyrocytes_'), 2), 
                                      '_VS_',
                                      map_chr(str_split(test_comparisons[[x]][2], 'Thyrocytes_'), 2))
    
    genes_all_save[[x]] %<>% rownames_to_column(var = 'gene')
    }

writexl::write_xlsx(genes_all_save, '/project/analysis/all_genes_all_comparisons.xlsx')

3.9 Volcano plot of genes

Color OXPHOS, ribosomal genes and lipid transport genes

# KEGG pathway hsa00190 “Oxidative phosphorylation” (KEGG release 107)
oxphos = read.table('project/KEGG_PATHWAY_hsa00190_genes_31_07_2023.txt', sep = '\t', header = T)

# GO term “Lipid transport” (GO:0006869) 
lipid_transport = read.table('project/GO_6869_lipid_transport.txt', header = F, sep = '\t', quote = '') %>% 
  filter(V12 == 'protein') %>% 
  pull(V1) %>% 
  unique()

titles = c('Male Ctrl vs Untr Ctrl',
    'Female Ctrl vs Untr Ctrl',
    'Male Ctrl vs Female Ctrl',
    'Male BaP vs Untr BaP',
    'Female BaP vs Untr BaP',
    'Untr BaP vs Untr Ctrl',
    'Male BaP vs Male Ctrl',
    'Female BaP vs Female Ctrl',
    'Male PCB153 vs Male Ctrl')

for (x in 1:length(titles)) {
  
    genes_all[[x]] %>%
    rownames_to_column(var = 'gene') %>%
    mutate(gene_category = case_when(
        grepl('^RP[SL]', gene) & p_val_adj < 0.05 ~ 'Ribosomal protein',
        gene %in% oxphos[oxphos$location == 'mitochondrial', ]$gene & p_val_adj < 0.05 ~ 'OXPHOS (mitochondrial genome)',
        gene %in% oxphos[oxphos$location == 'nuclear', ]$gene & p_val_adj < 0.05 ~ 'OXPHOS (nuclear genome)',
        gene %in% lipid_transport & p_val_adj < 0.05 ~ 'Lipid transport',
        TRUE ~ 'none')) %>% # all other cases 
    ggplot(aes(x = avg_log2FC, y = -log10(p_val_adj))) +
        geom_point(aes(colour = gene_category)) + 
        geom_hline(yintercept = -log10(0.05), linetype = 'dotted', col = 'grey25') +
        geom_vline(xintercept = 0, linetype = 'solid', col = 'grey25') +
        ggrepel::geom_text_repel(data = . %>% filter(gene_category == 'Lipid transport'), aes(label = gene)) +
        scale_color_manual(name = '', 
            values = c('chartreuse2', 'cadetblue1', 'blue', 'darkgoldenrod1', 'black'), 
            breaks = c('Ribosomal protein', 'OXPHOS (mitochondrial genome)', 'OXPHOS (nuclear genome)', 'Lipid transport', 'none'), 
            labels = c('Ribosomal gene', 'OXPHOS (mitochondrial genome)', 'OXPHOS (nuclear genome)', 'Lipid transport', 'other genes')) +
        labs(title = titles[x], x = 'Average log2(Fold Change)', y = '-log10(FDR)') + 
        theme_bw() +
        theme(plot.title = element_text(hjust = 0.5)) +
        xlim(c(-1.7, 1.8))
  
  ggsave(paste0('/project/analysis/plots/Volcano_', titles[x] ,'.png'), width = 10, height = 15, units = 'cm', dpi = 300)
  
  } 

3.10 [Example] Gene Ontology (GO) analysis

# clusterProfiler 

ontology = 'BP'

go_analysis = function(DEG_LIST, i, ...) {
  
    gene_list = bitr(rownames(DEG_LIST), fromType = 'SYMBOL', toType = 'ENTREZID', OrgDb = 'org.Hs.eg.db')

    # if some genes failed to map, report it
    print('These genes failed to map: ')
    print(DEG_LIST[rownames(DEG_LIST) %notin% gene_list$SYMBOL, 1:2])

    x = enrichGO(gene = gene_list$ENTREZ, 
             OrgDb = org.Hs.eg.db,
             qvalueCutoff = 0.01,
             pvalueCutoff = 0.01,
             ont = ontology, 
             readable = TRUE)  

    x = clusterProfiler::simplify(x, by = 'qvalue') 

    npathway = x@result%>% 
        as.data.frame %>% 
        filter(qvalue < 0.01) %>% 
        nrow() 

    print(paste0('Enriched GO pathways: ', npathway))
    
    # also, save results as a table 
    x@result %>% 
        as.data.frame %>% 
        filter(qvalue < 0.01) %>% 
        write.csv(., paste('project/analysis/GO', ontology, 'simplified', test_comparisons[[i]][1], test_comparisons[[i]][2], ..., 'qvalue_0.01.csv', sep = '_'))
    
    return(x)
    
}

go_male_vs_untr = go_analysis(degs_all[[1]], 1)

3.11 [Example] Emap plot and heatplot

# emap plot

go_male_vs_untr_EMAP = pairwise_termsim(go_male_vs_untr)

emapplot(go_male_vs_untr_EMAP, color = 'qvalue', pie.params = list(pie = 'count'), cex.params = list(category_node = 1)) + 
    scale_fill_continuous(low = 'blue', high = 'grey90', name = 'qvalue') +
    theme(plot.title = element_text(face = 'bold', hjust = .5)) + 
    ggtitle(titles[1])

ggsave(paste0('emapplot_GO_', ontology, '_', titles[1], '.png'), dpi = 300, units = 'cm', width = 25, height = 20, bg = 'white')

# define function for coloring based on log2(fold change)
heatplot_fc = function(DEG_LIST) {
  
  gene_fc = DEG_LIST$avg_log2FC
  names(gene_fc) = rownames(DEG_LIST)
  return(gene_fc)
  
  }

# heatplot
heatplot(go_male_vs_untr, foldChange = heatplot_fc(degs_all[[1]])) + 
    ggtitle(paste(titles[1], '- GO Biological Process')) +
    theme(plot.title = element_text(face = 'bold', hjust = .5)) 

ggsave(paste0('heatplot_GO_', ontology, '_', titles[1], '.png'), width = 25, height = 15, dpi = 300, units = 'cm', bg = 'white')

3.12 Fold change of AHR and its target genes

N.B.: only in the comparisons Untr BaP vs Untr Ctrl, Male BaP vs Male Ctrl, Female BaP vs Female Ctrl

mylist = list()

# extract genes only if they are DEGs
for (i in 1:3) { 
    print(i)        
    mylist[[i]] = degs_all[[i+5]] %>% filter(grepl('CYP1|NQO1|ALDH3|TIPARP|AHR', rownames(.))) 
    names(mylist)[[i]] = titles[i+5] # interested in comparisons 6, 7, 8
}

mylist
# $`Untr BaP vs Untr Ctrl`
#               p_val avg_log2FC pct.1 pct.2    p_val_adj
# CYP1B1 2.121654e-21  0.7621296 0.439 0.068 7.765465e-17
# NQO1   4.166497e-07  0.5565888 0.735 0.633 1.524980e-02
# 
# $`Male BaP vs Male Ctrl`
#               p_val avg_log2FC pct.1 pct.2    p_val_adj
# CYP1B1 8.545337e-19  0.8081897 0.458 0.126 3.127679e-14
# CYP1A1 2.905552e-13  0.3147409 0.228 0.009 1.063461e-08
# NQO1   5.812601e-10  0.5464255 0.707 0.604 2.127470e-05
# 
# $`Female BaP vs Female Ctrl`
#               p_val avg_log2FC pct.1 pct.2    p_val_adj
# CYP1B1 3.898564e-16  1.5897009 0.544 0.080 1.426914e-11
# CYP1A1 5.454316e-09  1.1776597 0.262 0.000 1.996334e-04
# NQO1   3.050631e-07  0.8004275 0.725 0.554 1.116561e-02


# add column with comparison name
for (i in 1:3) { 
    mylist[[i]]$Comparison = names(mylist)[i] 
    mylist[[i]] %<>% rownames_to_column(var = 'Gene') 
    }

mylist
# $`Untr BaP vs Untr Ctrl`
#     Gene        p_val avg_log2FC pct.1 pct.2    p_val_adj            Comparison
# 1 CYP1B1 2.121654e-21  0.7621296 0.439 0.068 7.765465e-17 Untr BaP vs Untr Ctrl
# 2   NQO1 4.166497e-07  0.5565888 0.735 0.633 1.524980e-02 Untr BaP vs Untr Ctrl
# 
# $`Male BaP vs Male Ctrl`
#     Gene        p_val avg_log2FC pct.1 pct.2    p_val_adj            Comparison
# 1 CYP1B1 8.545337e-19  0.8081897 0.458 0.126 3.127679e-14 Male BaP vs Male Ctrl
# 2 CYP1A1 2.905552e-13  0.3147409 0.228 0.009 1.063461e-08 Male BaP vs Male Ctrl
# 3   NQO1 5.812601e-10  0.5464255 0.707 0.604 2.127470e-05 Male BaP vs Male Ctrl
# 
# $`Female BaP vs Female Ctrl`
#     Gene        p_val avg_log2FC pct.1 pct.2    p_val_adj                Comparison
# 1 CYP1B1 3.898564e-16  1.5897009 0.544 0.080 1.426914e-11 Female BaP vs Female Ctrl
# 2 CYP1A1 5.454316e-09  1.1776597 0.262 0.000 1.996334e-04 Female BaP vs Female Ctrl
# 3   NQO1 3.050631e-07  0.8004275 0.725 0.554 1.116561e-02 Female BaP vs Female Ctrl

# make single dataframe and plot
rbind(mylist[[1]], mylist[[2]], mylist[[3]]) %>%
    ggplot(aes(x = Gene, y = avg_log2FC, fill = Comparison)) +
    geom_bar(stat = 'identity', position = position_dodge2(preserve = 'single')) +
    scale_fill_manual(name = 'Comparison',
              values = c('#f37d80', '#777aba', '#feea7f'),
              labels = c('Untr BaP vs Untr Ctrl', 'Male BaP vs Male Ctrl', 'Female BaP vs Female Ctrl'),
              breaks = c('Untr BaP vs Untr Ctrl', 'Male BaP vs Male Ctrl', 'Female BaP vs Female Ctrl')) +
    theme_bw() +
    labs(y = 'Average log2(Fold Change)', title = 'Mature Thyrocytes') +
    theme(plot.title = element_text(hjust = 0.5))

ggsave('project/analysis/plots/fc_AHR_and_targets.png', width = 10, height = 10, dpi = 300, units = 'cm')

3.13 Plot % reads mapping to ribosome genes in the Mature Thyrocytes cluster

# calculate %RP
sobj[['percent.rp']] = PercentageFeatureSet(sobj, pattern = '^RP[SL]')

# subset sobj for thyrocytes only 
Mature_Thyrocytes_sobj = subset(sobj, subset = customclassif == 'Mature Thyrocytes') 

# find min, max, median of %RP per sample
df = data.frame(matrix(nrow = 8, ncol = 3))
colnames(df) = c('Min', 'Median', 'Max') 
rownames(df) = samples 

for (i in 1:length(samples)) {
    df[i, 1] = subset(Mature_Thyrocytes_sobj, subset = sample == samples[i])$percent.rp %>% min 
    df[i, 2] = subset(Mature_Thyrocytes_sobj, subset = sample == samples[i])$percent.rp %>% median
    df[i, 3] = subset(Mature_Thyrocytes_sobj, subset = sample == samples[i])$percent.rp %>% max
    }

# projection on UMAP of %RP
FeaturePlot(Mature_Thyrocytes_sobj, reduction = 'umap', features = 'percent.rp', split.by = 'sample') 
ggsave('percent_RP_thyrocytes.png', height = 10, width = 90, dpi = 300, units = 'cm')

# Violin plot of %RP
median.stat = function(x){
    out = quantile(x, probs = c(0.5))
    names(out) = c('ymed')
    return(out) 
}

VlnPlot(object = Mature_Thyrocytes_sobj, group.by = 'sample', features = 'percent.rp') + 
    stat_summary(fun.y = median.stat, geom = 'crossbar') + # to add median expression as a bar
    labs(title = '', y = 'Percentage RP', x = 'Samples') +
    scale_fill_manual(values = c('Female_BaP' = '#FFBB00', 'Female_Ctrl' = '#FFEA61', 'Female_PCB153' = '#FFFFB7', 
                     'Male_BaP' = '#408af1', 'Male_Ctrl' = '#78b6fa', 'Male_PCB153' = '#a8d7fa', 
                     'Untr_BaP' = 'red', 'Untr_Ctrl' = 'pink'))

ggsave('/project/analysis/plots/violin_percent_RP_thyrocytes.png', height = 15, width = 15, dpi = 300, units = 'cm')

3.14 Test difference in %RP among the samples Mature Thyrocytes subpopulation

a = VlnPlot(Mature_Thyrocytes_sobj, features = 'percent.rp', group.by = 'sample')

rp_thyrocytes = list()

# extract %RP 
for (i in 1:8) { 
    print(samples[i])       
    rp_thyrocytes[[i]] = filter(a$data, ident == samples[i]) %>% pull(percent.rp)
    names(rp_thyrocytes)[[i]] = samples[i]
}

# check if samples are normally distributed - they are not
for (i in 1:8) { 
    print(names(samples)[i])        
    shap = shapir o.test(rp_thyrocytes[[i]])
    summary(shap)
}

# kruskal-wallis test (non-parametric equivalent of ANOVA)
kruskal.test(a$data$percent.rp ~ a$data$ident, a$data)

# post-hoc test
FSA::dunnTest(a$data$percent.rp ~ a$data$ident, a$data, method = 'bh')

3.15 Overlap of DEGs among comparisons Untr BaP vs Untr Ctrl, Male BaP vs Male Ctrl, Female BaP vs Female Ctrl

# degs_all[[6]] = Untr BaP vs Untr Ctrl
# degs_all[[7]] = Male BaP vs Male Ctrl
# degs_all[[8]] = Female BaP vs Female Ctrl 

data.frame(all_genes_venn = unique(c(rownames(degs_all[[6]]), 
                                     rownames(degs_all[[7]]),
                                     rownames(degs_all[[8]])))) %>%
  mutate(venn6 = if_else(all_genes_venn %in% rownames(degs_all[[6]]), T, F),
         venn7 = if_else(all_genes_venn %in% rownames(degs_all[[7]]), T, F),
     venn8 = if_else(all_genes_venn %in% rownames(degs_all[[8]]), T, F)) %>%
  ggplot() +
  geom_venn(aes(A = venn6, B = venn7, C = venn8), 
                    show_percentage = FALSE,
                    set_names = c(paste0('Untr BaP vs Untr Ctrl\n(', nrow(degs_all[[6]]), ')'), 
                                  paste0('Male BaP vs Male Ctrl\n(', nrow(degs_all[[7]]), ')'),
                                  paste0('Female BaP vs Female Ctrl\n(', nrow(degs_all[[8]]), ')')),
                    fill_color = c('red', 'blue', 'gold'),
                    fill_alpha = 0.5,
                    stroke_size = 0.7,
                    text_size = 5.5) +
  coord_fixed() +
  theme_void()
ggsave('project/analysis/plots/BAP_DEGs.png', width = 15, height = 15, units = 'cm', dpi = 300, bg = 'white')