install.packages("tensorflow")
Computational and software requirements
library(reticulate)
path_to_python <- "/usr/local/bin/python3"
virtualenv_create("r-reticulate", python = path_to_python)
library(tensorflow)
install_tensorflow(envname = "r-reticulate")
install.packages("keras")
library(keras)
install_keras(envname = "r-reticulate")
library(tensorflow)
$constant("Hello Tensorflow!") tf
install.packages(c("readr", "tibble", "caret", "verification"))
Methods
library(magrittr)
library(keras)
library(tensorflow)
set.seed(1979)
<- read.table("net3_expression_data.tsv", header = T)
exp <- t(exp)
exp dim(exp)
1:5,1:5] exp[
# plot
hist(exp, 100, col = "darkgrey", border = "white",
xlab = "Expression intensity", main = "Gene expression")
# export plot to pdf file in the current directory
pdf(file = "DeepLearning_Figure1.pdf",
width = 5, height = 5, pointsize = 10, useDingbats = FALSE)
hist(exp, 100, col = "darkgrey", border = "white",
xlab = "Expression intensity", main = "Gene expression")
dev.off()
gold <- readr::read_table("DREAM5_NetworkInference_GoldStandard_Network3.tsv",
                          col_names = F)
gold
gold <- gold[gold$X1 %in% rownames(exp) & gold$X2 %in% rownames(exp), ]
table(gold$X3)
keep_indices <- c(which(gold$X3==1),
                  sample(which(gold$X3==0), size = sum(gold$X3)*2))
gold <- gold[keep_indices,]
table(gold$X3)
inputdata <- data.frame(tf = gold$X1, gene = gold$X2,
                        exp[gold$X1,], exp[gold$X2,])
inputdata$pcc <- sapply(seq.int(dim(gold)[1]), function(i)
  cor(exp[gold$X1[i],], exp[gold$X2[i],]))
inputdata$class <- gold$X3

inputdata <- tibble::as_tibble(inputdata)
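Each row of inputdata now concatenates the TF's expression profile, the candidate target's profile, their Pearson correlation (pcc), and the class label. A minimal sanity check on the expected column count (a sketch; the 4 accounts for the tf, gene, pcc, and class columns):
stopifnot(ncol(inputdata) == 2*ncol(exp) + 4)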
featurenames <- colnames(inputdata) %>% setdiff("class") %>%
  setdiff("tf") %>% setdiff("gene")
h1 <- hist(as.matrix(inputdata[inputdata$class=="1",featurenames]),
           breaks = 100, plot = FALSE)
h2 <- hist(as.matrix(inputdata[inputdata$class=="0",featurenames]),
           breaks = 100, plot = FALSE)
pdf(file = "DeepLearning_Figure2.pdf",
width = 5, height = 5, pointsize = 10, useDingbats = FALSE)
plot(h2, col = rgb(0,0,1,0.4), freq = FALSE,
xlab = "Expression intensity",
main = "Distribution of feature values for training data")
plot(h1, xaxt = "n", yaxt = "n", col = rgb(1,0,0,0.4),
freq = FALSE, add = TRUE)
dev.off()
indices <- nrow(inputdata) %>% sample.int(., ceiling( . * 0.8))
traindata <- inputdata[indices, ]
testdata <- inputdata[-indices, ]
indices <- nrow(testdata) %>% sample.int(., ceiling( . * 0.5))
valdata <- testdata[-indices, ]
testdata <- testdata[indices, ]
table(traindata$class)
table(valdata$class)
table(testdata$class)
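The random split above does not guarantee that the class ratio is preserved in each subset. If that matters, a class-stratified split is an option; a sketch using caret (installed earlier), where createDataPartition samples within each class:
# optional: class-stratified 80/20 split (sketch)
strat_idx <- caret::createDataPartition(factor(inputdata$class),
                                        p = 0.8, list = FALSE)[, 1]
strat_train <- inputdata[strat_idx, ]
strat_test  <- inputdata[-strat_idx, ]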
set_random_seed(1979)
model <- keras_model_sequential(
  input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
summary(model)
model %>% compile(optimizer = optimizer_adam(),
                  loss = "binary_crossentropy",
                  metrics = c("accuracy"))
history <- model %>% fit(as.matrix(traindata[,featurenames]),
                         traindata$class,
                         epochs = 100, batch_size = 64,
                         validation_data = list(as.matrix(valdata[,featurenames]),
                                                valdata$class))
print(history)
evaluate(model, x = as.matrix(traindata[,featurenames]), y = traindata$class)
evaluate(model, x = as.matrix(valdata[,featurenames]), y = valdata$class)
plot(history)
pdf(file = "DeepLearning_Figure3.pdf",
width = 7, height = 4, pointsize = 10, useDingbats = FALSE)
par(mfrow = c(1,2), cex = 0.8)
maxLoss <- max(c(history$metrics$loss, history$metrics$val_loss))
plot(history$metrics$loss, main="Model Loss", xlab = "Epoch", ylab="Loss",
xlim = c(0, length(history$metrics$loss)), ylim = c(0, maxLoss),
col="red2", type="b",lwd=1)
lines(history$metrics$val_loss, col="steelblue1", type="b",lwd=1)
legend("topright", c("Training","Validation"), col=c("red2", "steelblue1"),
lty=c(1,1), bty = "n")
plot(history$metrics$accuracy, col="red2", type="b",lwd=1,
main="Model Accuracy", xlab = "Epoch", ylab="Accuracy",
xlim = c(0, length(history$metrics$accuracy)), ylim = c(0, 1))
lines(history$metrics$val_accuracy, col="steelblue1", type="b",lwd=1)
abline(h = 0.5, col = "grey", lty = 3)
dev.off()
predprob <- model %>% predict(as.matrix(testdata[,featurenames]))
f1 <- function(indata, truelabels){
  res <- model %>% predict(as.matrix(indata[,featurenames]))
  res <- as.numeric(res>0.5)
  res <- factor(res, c(1,0))
  caret::confusionMatrix(res, factor(truelabels, c(1,0)))
}
trainAccuracy <- f1(traindata, traindata$class)
validationAccuracy <- f1(valdata, valdata$class)
testAccuracy <- f1(testdata, testdata$class)
performance <- round(t(as.data.frame(rbind(t(trainAccuracy$byClass),
                                           t(validationAccuracy$byClass),
                                           t(testAccuracy$byClass)))), 3)
colnames(performance) <- c("Training", "Validation", "Test")
performance
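The rows of performance come from caret's byClass slot, so individual measures can be pulled out by name; a usage sketch:
performance[c("Sensitivity", "Specificity", "F1"), ]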
pdf(file = "DeepLearning_Figure4.pdf",
width = 5, height = 5, pointsize = 12, useDingbats = FALSE)
verification::roc.plot(testdata$class, predprob,
                       ylab = "True Positive Rate",
                       xlab = "False Positive Rate")
abline(v = 0.5596, col = "red2", lty = 3)
abline(h = 0.0509, col = "steelblue1", lty = 3)
dev.off()
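To complement the plot with a single summary number, the same package can compute the area under the ROC curve (a sketch; roc.area expects a plain numeric vector of predicted probabilities):
verification::roc.area(testdata$class, predprob[, 1])$A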
save_model_hdf5(model, "EcoliRegModel.h5")
save(exp, performance, gold, history, file = "EcoliRegModel.RData")
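HDF5 is the legacy Keras format; the keras R package also supports TensorFlow's SavedModel format, which may be preferable with newer TensorFlow versions (an optional sketch, not part of the protocol):
save_model_tf(model, "EcoliRegModel")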
library(magrittr)
library(keras)
library(tensorflow)
set.seed(1979)
model <- load_model_hdf5("EcoliRegModel.h5")
# import expression data file
exp <- t(read.table("net3_expression_data.tsv", header = T))
# import original gene names mapping
<- read.table("net3_gene_ids.tsv", header = F)
genenames <- genenames$V2; names(genes) <- genenames$V1
genes
# import list of all transcription factors of Escherichia coli
# tfs <- names(genes)[genes %in% c("gadX", "flhC", "flhD","dnaA")] # trial run
tfs <- read.table("net3_transcription_factors.tsv", header = F)$V1
length(tfs)*nrow(exp)
predictions <- NULL
for(i in tfs){
  tfdata <- data.frame(tf = i, gene = rownames(exp),
                       tfname = genes[i],
                       genename = genes[rownames(exp)])
  tfdata <- tibble::as_tibble(tfdata[tfdata$tf != tfdata$gene,])

  inpreddata <- cbind(exp[tfdata$tf,], exp[tfdata$gene,])
  inpreddata <- cbind(inpreddata,
                      sapply(seq.int(dim(tfdata)[1]),
                             function(i)
                               cor(exp[tfdata$tf[i],],
                                   exp[tfdata$gene[i],])))

  tfdata$probability <- (model %>% predict(inpreddata))[,1]

  predictions <- rbind(predictions, tfdata[tfdata$probability>0.5,])
}
predictions <- predictions[rev(order(predictions$probability)),]
predictions
predictions[predictions$tfname=="gadX",]
predictions[predictions$tfname=="flhC",]
predictions[predictions$tfname=="flhD",]
predictions[!paste0(predictions$tf, predictions$gene) %in%
              paste0(inputdata$tf[inputdata$class==1],
                     inputdata$gene[inputdata$class==1]),]
write.table(predictions, file = "grnPredictionsEcoli.txt",
col.names = T, row.names = F, quote = F, sep = "\t")
write.csv(predictions, file = "grnPredictionsEcoli.csv", row.names = F)
writexl::write_xlsx(list(Table1 = predictions),
                    path = "grnPredictionsEcoli.xlsx", col_names = T)
Handling imbalanced gold standard data
weights <- as.list(1/table(traindata$class))

history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class,
                         epochs = 10, batch_size = 16,
                         validation_data = list(as.matrix(valdata[,featurenames]),
                                                valdata$class),
                         class_weight = weights)
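Inverse class frequencies make each class contribute equally to the loss. A common variant (a sketch, not from the protocol) rescales the weights so they average to one across examples, which keeps the overall loss magnitude comparable to the unweighted case:
counts <- table(traindata$class)
weights <- as.list(as.numeric(sum(counts)/(length(counts)*counts)))
names(weights) <- names(counts)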
Increasing generalization and depth of the DNN
model <- keras_model_sequential(input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_dropout(0.5) %>%
  layer_dense(units = 512, activation = "relu") %>%
  layer_dropout(0.2) %>%
  layer_dense(units = 56, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(0.1) %>%
  layer_dense(units = 1, activation = "sigmoid")
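Dropout and batch normalization are the regularizers used here; L2 weight decay on individual layers is another option keras supports. A sketch on a small two-layer model (model_l2 is an illustrative name, not part of the protocol):
model_l2 <- keras_model_sequential(input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu",
              kernel_regularizer = regularizer_l2(l = 0.001)) %>%
  layer_dense(units = 1, activation = "sigmoid")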
Performance measures
metrics <- list(
  "fn" = metric_false_negatives(),
  "fp" = metric_false_positives(),
  "tn" = metric_true_negatives(),
  "tp" = metric_true_positives(),
  "accuracy" = metric_accuracy(),
  "precision" = metric_precision(),
  "recall" = metric_recall(),
  "auc" = metric_auc())

model %>% compile(optimizer = optimizer_adam(),
                  loss = "binary_crossentropy",
                  metrics = metrics)
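Once the model is compiled with this metric list, fit() and evaluate() report every entry alongside the loss; a minimal usage sketch (after fitting as above):
evaluate(model, x = as.matrix(valdata[,featurenames]), y = valdata$class)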
Alternative code for GRN prediction on high-end computational resources
library(magrittr)
library(keras)
library(tensorflow)
set.seed(1979)
model <- load_model_hdf5("EcoliRegModel.h5")
# import expression data file
exp <- t(read.table("net3_expression_data.tsv", header = T))
# import original gene names mapping
<- read.table("net3_gene_ids.tsv", header = F)
genenames <- genenames$V2; names(genes) <- genenames$V1
genes
# import list of all transcription factors of Escherichia coli
<- read.table("net3_transcription_factors.tsv", header = F)$V1
tfs
# create all possible pairs between transcription factors and genes
predictions <- expand.grid(tfs, rownames(exp), stringsAsFactors = FALSE)
predictions <- tibble::as_tibble(predictions[predictions$Var1 != predictions$Var2,])
predictions
# add original gene names
predictions$tfname <- genes[predictions$Var1]
predictions$genename <- genes[predictions$Var2]
# create feature table
inpreddata <- cbind(exp[predictions$Var1,], exp[predictions$Var2,])
inpreddata <- cbind(inpreddata,
                    pcc = sapply(seq.int(dim(predictions)[1]),
                                 function(i) cor(exp[predictions$Var1[i],],
                                                 exp[predictions$Var2[i],])))
# predict regulatory pairs
predictions$probability <- (model %>% predict(inpreddata))[,1]
predictions <- predictions[predictions$probability>0.5,]
predictions <- predictions[rev(order(predictions$probability)),]
predictions
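This variant builds the feature matrix for every TF-gene pair at once, which is why it needs substantial memory. A rough estimate of its size in gigabytes (a sketch; assumes 8-byte doubles and that every TF appears in the expression matrix):
n_pairs <- length(tfs) * (nrow(exp) - 1)   # each TF against all other genes
n_feats <- 2 * ncol(exp) + 1               # two expression profiles plus pcc
n_pairs * n_feats * 8 / 2^30               # approximate size in GiB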
Controlling DNN model training with callback functions
early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 3)
model_checkpoint <- callback_model_checkpoint(filepath = "best_model.h5",
                                              monitor = "val_accuracy",
                                              save_best_only = TRUE)
reduce_lr <- callback_reduce_lr_on_plateau(monitor = "val_loss",
                                           factor = 0.1, patience = 2)
tensorboard <- callback_tensorboard(log_dir = "logs")
history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class,
                         epochs = 40, batch_size = 50, validation_split = 0.2,
                         callbacks = list(early_stopping, reduce_lr))
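Note that only early_stopping and reduce_lr are passed above; if model_checkpoint and tensorboard are added to the callbacks list, the best weights land in best_model.h5 and the training curves can be browsed interactively (a sketch under that assumption):
best_model <- load_model_hdf5("best_model.h5")
tensorflow::tensorboard(log_dir = "logs")   # launches the TensorBoard dashboard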
Citation
BibTeX citation:
@article{muley2023,
author = {Muley, VY},
title = {Deep {Learning} for {Predicting} {Gene} {Regulatory}
{Networks:} {A} {Step-by-Step} {Protocol} in {R}},
journal = {Methods in Molecular Biology},
volume = {2719},
date = {2023-10},
langid = {en}
}
For attribution, please cite this work as:
Muley, VY. 2023. “Deep Learning for Predicting Gene Regulatory
Networks: A Step-by-Step Protocol in R.” Methods in Molecular
Biology 2719 (October).