Deep Learning for Predicting Gene Regulatory Networks: A Step-by-Step Protocol in R

Transcriptional regulatory network predictions

Deep learning
GRN
Author

Vijaykumar Yogesh Muley

Computational and software requirements

## One-time environment setup: install the R packages and the Python
## backend required by keras/tensorflow.
install.packages("tensorflow")

# Point reticulate at a Python interpreter and create a dedicated
# virtual environment for the R TensorFlow bindings.
# (The original listing repeated these three lines twice; once is enough.)
library(reticulate)
path_to_python <- "/usr/local/bin/python3"
virtualenv_create("r-reticulate", python = path_to_python)

# Install the TensorFlow Python library into that environment.
library(tensorflow)
install_tensorflow(envname = "r-reticulate")

# Install the keras R package and its Python counterpart.
install.packages("keras")
library(keras)
install_keras(envname = "r-reticulate")

# Smoke test: a working installation prints a TensorFlow tensor.
library(tensorflow)
tf$constant("Hello Tensorflow!")

# Helper packages used later in the protocol.
install.packages(c("readr", "tibble", "caret", "verification"))

Methods

# Load the pipe operator and the deep-learning stack.
library(magrittr)
library(keras)
library(tensorflow)

# Fix the RNG seed so the data splits below are reproducible.
set.seed(1979)

# Import DREAM5 E. coli expression data; after transposition rows are
# genes and columns are experiments/samples.
# (Use TRUE rather than the reassignable shorthand T.)
exp <- read.table("net3_expression_data.tsv", header = TRUE)
exp <- t(exp)
dim(exp)
exp[1:5, 1:5]

# Inspect the overall distribution of expression intensities.
hist(exp, 100, col = "darkgrey", border = "white",
     xlab = "Expression intensity", main = "Gene expression")

# Export the same plot to a PDF file in the current directory.
pdf(file = "DeepLearning_Figure1.pdf",
    width = 5, height = 5, pointsize = 10, useDingbats = FALSE)
hist(exp, 100, col = "darkgrey", border = "white",
     xlab = "Expression intensity", main = "Gene expression")
dev.off()
# Import the DREAM5 gold-standard network: TF id, target id, label (0/1).
gold <- readr::read_table("DREAM5_NetworkInference_GoldStandard_Network3.tsv",
                          col_names = FALSE)
gold

# Keep only pairs whose TF and target both appear in the expression matrix.
gold <- gold[gold$X1 %in% rownames(exp) & gold$X2 %in% rownames(exp), ]
table(gold$X3)

# Rebalance: keep every positive pair plus a random sample of negatives
# twice the size of the positive set (labels are 0/1, so sum(X3) counts
# the positives).
keep_indices <- c(which(gold$X3 == 1),
                  sample(which(gold$X3 == 0), size = sum(gold$X3) * 2))
gold <- gold[keep_indices, ]
table(gold$X3)

# Feature table per pair: the TF expression profile, the target profile,
# their Pearson correlation, and the class label.
inputdata <- data.frame(tf = gold$X1, gene = gold$X2,
                        exp[gold$X1, ], exp[gold$X2, ])
# vapply with seq_len is safer than sapply over seq.int(dim(...)[1]):
# the output type is fixed and an empty table cannot misbehave.
inputdata$pcc <- vapply(seq_len(nrow(gold)),
                        function(i) cor(exp[gold$X1[i], ], exp[gold$X2[i], ]),
                        numeric(1))
inputdata$class <- gold$X3

# Convert to a tibble for safer subsetting and nicer printing.
inputdata <- tibble::as_tibble(inputdata)

# Feature columns are everything except the identifiers and the label.
featurenames <- setdiff(colnames(inputdata), c("class", "tf", "gene"))

# Compute (without drawing) histograms of feature values for the
# positive (class 1) and negative (class 0) pairs.
h1 <- hist(as.matrix(inputdata[inputdata$class == "1", featurenames]),
           breaks = 100, plot = FALSE)
h2 <- hist(as.matrix(inputdata[inputdata$class == "0", featurenames]),
           breaks = 100, plot = FALSE)

# Overlay the two distributions (blue = negatives, red = positives)
# and export the figure to PDF.
pdf(file = "DeepLearning_Figure2.pdf",
    width = 5, height = 5, pointsize = 10, useDingbats = FALSE)
plot(h2, col = rgb(0, 0, 1, 0.4), freq = FALSE,
     xlab = "Expression intensity",
     main = "Distribution of feature values for training data")
plot(h1, xaxt = "n", yaxt = "n", col = rgb(1, 0, 0, 0.4),
     freq = FALSE, add = TRUE)
dev.off()
# Split the pairs 80/10/10: 80% for training, then the remaining 20%
# halved into validation and test sets. Plain base calls replace the
# magrittr dot pipeline; the RNG draws are identical.
n_total <- nrow(inputdata)
indices <- sample.int(n_total, ceiling(n_total * 0.8))
traindata <- inputdata[indices, ]
testdata <- inputdata[-indices, ]

n_rest <- nrow(testdata)
indices <- sample.int(n_rest, ceiling(n_rest * 0.5))
valdata <- testdata[-indices, ]
testdata <- testdata[indices, ]

# Check the class balance within each split.
table(traindata$class)
table(valdata$class)
table(testdata$class)
# Seed TensorFlow's RNG so weight initialization is reproducible.
set_random_seed(1979)

# Single-hidden-layer classifier: the input width equals the number of
# feature columns; the sigmoid output yields a probability that the
# TF-gene pair is a regulatory interaction.
model <- keras_model_sequential(
  input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")

# Show layer shapes and parameter counts.
summary(model)

# Binary cross-entropy loss with the Adam optimizer; track accuracy.
model %>% compile(optimizer = optimizer_adam(),
 loss = "binary_crossentropy",
 metrics = c("accuracy"))

# Train for 100 epochs in mini-batches of 64, monitoring the held-out
# validation split after every epoch.
history <- model %>% fit(as.matrix(traindata[,featurenames]), 
                         traindata$class,
                         epochs=100, batch_size = 64,
                         validation_data = list(as.matrix(valdata[,featurenames]), 
                                                valdata$class))
print(history)

# Final loss/accuracy on the training and validation sets.
evaluate(model, x = as.matrix(traindata[,featurenames]), y = traindata$class)
evaluate(model, x = as.matrix(valdata[,featurenames]), y = valdata$class)

# Quick built-in plot of the training history.
plot(history)
# Export side-by-side loss and accuracy curves to a two-panel PDF.
pdf(file = "DeepLearning_Figure3.pdf",
    width = 7, height = 4, pointsize = 10, useDingbats = FALSE)

par(mfrow = c(1, 2), cex = 0.8)

train_loss <- history$metrics$loss
val_loss <- history$metrics$val_loss
maxLoss <- max(c(train_loss, val_loss))

# Left panel: training vs validation loss per epoch.
plot(train_loss, main = "Model Loss", xlab = "Epoch", ylab = "Loss",
     xlim = c(0, length(train_loss)), ylim = c(0, maxLoss),
     col = "red2", type = "b", lwd = 1)
lines(val_loss, col = "steelblue1", type = "b", lwd = 1)
legend("topright", c("Training", "Validation"),
       col = c("red2", "steelblue1"), lty = c(1, 1), bty = "n")

# Right panel: training vs validation accuracy; the dotted grey line
# marks the 0.5 chance level.
train_acc <- history$metrics$accuracy
plot(train_acc, col = "red2", type = "b", lwd = 1,
     main = "Model Accuracy", xlab = "Epoch", ylab = "Accuracy",
     xlim = c(0, length(train_acc)), ylim = c(0, 1))
lines(history$metrics$val_accuracy, col = "steelblue1", type = "b", lwd = 1)
abline(h = 0.5, col = "grey", lty = 3)
dev.off()
# Predicted probabilities for the held-out test pairs (used for the ROC
# curve below).
predprob <- model %>% predict(as.matrix(testdata[,featurenames]))

# Confusion matrix for model predictions on a feature table.
# Relies on the globals `model` and `featurenames` defined above.
#
# indata     : tibble/data.frame containing the feature columns.
# truelabels : true 0/1 class labels, same length as nrow(indata).
# threshold  : probability cut-off for calling a pair positive
#              (default 0.5 preserves the original behavior).
f1 <- function(indata, truelabels, threshold = 0.5){
  res <- model %>% predict(as.matrix(indata[, featurenames]))
  res <- as.numeric(res > threshold)
  # Level order c(1, 0) puts the positive class first so that
  # caret reports sensitivity/precision for class "1".
  res <- factor(res, c(1, 0))
  caret::confusionMatrix(res, factor(truelabels, c(1, 0)))
}

# Confusion-matrix summaries for each data split.
trainAccuracy <- f1(traindata, traindata$class)
validationAccuracy <- f1(valdata, valdata$class)
testAccuracy <- f1(testdata, testdata$class)

# Collect the per-class statistics (sensitivity, specificity, F1, ...)
# into one table with one column per split.
performance <- round(t(as.data.frame(rbind(t(trainAccuracy$byClass),
                                           t(validationAccuracy$byClass),
                                           t(testAccuracy$byClass)))), 3)
# Fixed typo in the original column label ("Traning").
colnames(performance) <- c("Training", "Validation", "Test")

performance
# ROC curve of the test-set predictions, exported to PDF.
pdf(file = "DeepLearning_Figure4.pdf",
    width = 5, height = 5, pointsize = 12, useDingbats = FALSE)

verification::roc.plot(testdata$class, predprob, 
                       ylab = "True Positive Rate", 
                       xlab = "False Positive Rate")
# NOTE(review): these hard-coded coordinates appear to mark one specific
# operating point on the curve; they are tied to a particular trained
# model and will not match a re-run — confirm or recompute before reuse.
abline(v = 0.5596, col = "red2", lty = 3)
abline(h = 0.0509, col = "steelblue1", lty = 3)

dev.off()

# Persist the trained network (HDF5) and the R-side objects so the
# prediction section below can be run in a fresh session.
save_model_hdf5(model, "EcoliRegModel.h5")
save(exp, performance, gold, history, file = "EcoliRegModel.RData")
# ---- Genome-wide GRN prediction with the trained model ----
library(magrittr)
library(keras)
library(tensorflow)

set.seed(1979)

# Reload the trained classifier saved in the previous section.
model <- load_model_hdf5("EcoliRegModel.h5")

# Import expression data (genes in rows after transposition).
exp <- t(read.table("net3_expression_data.tsv", header = TRUE))

# Mapping from anonymized gene ids to original gene names.
genenames <- read.table("net3_gene_ids.tsv", header = FALSE)
genes <- genenames$V2; names(genes) <- genenames$V1

# Import list of all transcription factors of Escherichia coli.
# tfs <- names(genes)[genes %in% c("gadX", "flhC", "flhD","dnaA")] # trial run
tfs <- read.table("net3_transcription_factors.tsv", header = FALSE)$V1

# Total number of TF-gene pairs that will be scored.
length(tfs)*nrow(exp)

# Score all target genes one TF at a time; keep pairs whose predicted
# probability exceeds 0.5. Per-TF results are collected in a
# preallocated list and bound once at the end, avoiding the quadratic
# cost of growing a data frame with rbind() inside the loop.
hits <- vector("list", length(tfs))

for (k in seq_along(tfs)) {

  tf_id <- tfs[k]

  tfdata <- data.frame(tf = tf_id, gene = rownames(exp),
                       tfname = genes[tf_id],
                       genename = genes[rownames(exp)])
  # Drop TF-self pairs.
  tfdata <- tibble::as_tibble(tfdata[tfdata$tf != tfdata$gene, ])

  # Feature table in the same layout used for training:
  # TF profile, target profile, then their Pearson correlation.
  inpreddata <- cbind(exp[tfdata$tf, ], exp[tfdata$gene, ])
  inpreddata <- cbind(inpreddata,
                      vapply(seq_len(nrow(tfdata)),
                             function(i)
                               cor(exp[tfdata$tf[i], ],
                                   exp[tfdata$gene[i], ]),
                             numeric(1)))

  tfdata$probability <- (model %>% predict(inpreddata))[, 1]

  hits[[k]] <- tfdata[tfdata$probability > 0.5, ]

}

predictions <- do.call(rbind, hits)

# Sort predicted interactions by decreasing probability.
predictions <- predictions[rev(order(predictions$probability)), ]
predictions

# Spot-check a few well-studied regulators.
predictions[predictions$tfname == "gadX", ]
predictions[predictions$tfname == "flhC", ]
predictions[predictions$tfname == "flhD", ]

# Novel predictions: pairs absent from the positive training examples.
predictions[!paste0(predictions$tf, predictions$gene) %in%
              paste0(inputdata$tf[inputdata$class == 1],
                     inputdata$gene[inputdata$class == 1]), ]

# Export the results in tab-delimited, CSV and Excel formats.
# (The original write_xlsx() call had a trailing comma — a syntax error.)
write.table(predictions, file = "grnPredictionsEcoli.txt",
            col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
write.csv(predictions, file = "grnPredictionsEcoli.csv", row.names = FALSE)
writexl::write_xlsx(list(Table1 = predictions),
                    path = "grnPredictionsEcoli.xlsx", col_names = TRUE)

Handling imbalanced gold-standard data

# Inverse-frequency class weights: the rarer class receives a
# proportionally larger weight. The list names ("0", "1") come from the
# class labels via table(), matching what fit()'s class_weight expects.
weights <- as.list((1/table(traindata$class)))

# Re-fit with class weighting so errors on the minority class contribute
# more to the loss.
history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class, 
                         epochs = 10, batch_size = 16, 
                         validation_data = list(as.matrix(valdata[,featurenames]), valdata$class),
                         class_weight = weights)

Increasing generalization and depth of DNN

# A deeper, regularized network: batch normalization after selected
# dense layers, and progressively lighter dropout (0.5 -> 0.2 -> 0.1)
# to reduce overfitting as the representation narrows.
model <- keras_model_sequential(input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_dropout(0.5) %>%
  layer_dense(units = 512, activation = "relu") %>%
  layer_dropout(0.2) %>%
  layer_dense(units = 56, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(0.1) %>%
  # Sigmoid output: probability that the TF-gene pair is regulatory.
  layer_dense(units = 1, activation = "sigmoid")

Performance measures

# A richer set of metrics to monitor during training: raw confusion
# counts (fn/fp/tn/tp) plus accuracy, precision, recall and AUC.
metrics <- list(
  "fn" = metric_false_negatives(),
  "fp" = metric_false_positives(),
  "tn" = metric_true_negatives(),
  "tp" = metric_true_positives(),
  "accuracy" = metric_accuracy(),
  "precision" = metric_precision(),
  "recall" = metric_recall(),
  "auc" = metric_auc())

# Recompile the model so fit()/evaluate() report all of the above.
model %>% compile(optimizer = optimizer_adam(),
                  loss = "binary_crossentropy",
                  metrics = metrics)

Alternative code for predicting GRNs on high-end computational resources

# ---- Alternative: score every TF-gene pair in one batch ----
# Faster than the per-TF loop but requires memory for the full pair
# table; intended for high-end computational resources.
library(magrittr)
library(keras)
library(tensorflow)

set.seed(1979)

# Reload the trained classifier.
model <- load_model_hdf5("EcoliRegModel.h5")

# Import expression data (genes in rows after transposition).
exp <- t(read.table("net3_expression_data.tsv", header = TRUE))

# Mapping from anonymized gene ids to original gene names.
genenames <- read.table("net3_gene_ids.tsv", header = FALSE)
genes <- genenames$V2; names(genes) <- genenames$V1

# Import list of all transcription factors of Escherichia coli.
tfs <- read.table("net3_transcription_factors.tsv", header = FALSE)$V1

# Create all possible pairs between transcription factors and genes,
# dropping TF-self pairs.
# BUG FIX: the original filtered an undefined object `all_pairs`;
# the pair table is held in `predictions`.
predictions <- expand.grid(tfs, rownames(exp), stringsAsFactors = FALSE)
predictions <- tibble::as_tibble(
  predictions[predictions$Var1 != predictions$Var2, ])

# Add original gene names.
predictions$tfname <- genes[predictions$Var1]
predictions$genename <- genes[predictions$Var2]

# Create the feature table: TF profile, target profile, correlation.
# BUG FIX: cbind() returns a matrix, so `inpreddata$pcc <- ...` would
# fail; append the correlation column with cbind() instead. The original
# statement also carried an unbalanced extra closing parenthesis.
inpreddata <- cbind(exp[predictions$Var1, ], exp[predictions$Var2, ])
inpreddata <- cbind(inpreddata,
                    vapply(seq_len(nrow(predictions)),
                           function(i) cor(exp[predictions$Var1[i], ],
                                           exp[predictions$Var2[i], ]),
                           numeric(1)))

# Predict regulatory pairs: keep probability > 0.5, sort descending.
predictions$probability <- (model %>% predict(inpreddata))[, 1]
predictions <- predictions[predictions$probability > 0.5, ]
predictions <- predictions[rev(order(predictions$probability)), ]
predictions

Controlling DNN model training by Callback functions

# Stop training once validation loss fails to improve for 3 epochs.
early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 3)
# Checkpoint the best model (by validation accuracy) seen so far.
model_checkpoint <- callback_model_checkpoint(filepath = "best_model.h5", 
                                              monitor = "val_accuracy", 
                                              save_best_only = TRUE)
# Shrink the learning rate tenfold when validation loss plateaus.
reduce_lr <- callback_reduce_lr_on_plateau(monitor = "val_loss", 
                                           factor = 0.1, patience = 2)
# Write training logs for visualization with TensorBoard.
tensorboard <- callback_tensorboard(log_dir = "logs")
# NOTE(review): only early_stopping and reduce_lr are passed to fit();
# add model_checkpoint and tensorboard to `callbacks` to activate them.
history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class,
                           epochs=40, batch_size = 50, validation_split=0.2,
                           callbacks = list(early_stopping, reduce_lr))

Citation

BibTeX citation:
@article{muley2023,
  author = {Muley, VY},
  title = {Deep {Learning} for {Predicting} {Gene} {Regulatory}
    {Networks:} {A} {Step-by-Step} {Protocol} in {R}},
  journal = {Methods in Molecular Biology},
  volume = {2719},
  date = {2023-10},
  langid = {en}
}
For attribution, please cite this work as:
Muley, VY. 2023. “Deep Learning for Predicting Gene Regulatory Networks: A Step-by-Step Protocol in R.” Methods in Molecular Biology 2719 (October).