install.packages("tensorflow")Computational and software requirements
library(reticulate)
path_to_python <- "/usr/local/bin/python3"
virtualenv_create("r-reticulate", python = path_to_python)library(reticulate)
path_to_python <- "/usr/local/bin/python3"
virtualenv_create("r-reticulate", python = path_to_python)library(tensorflow)
install_tensorflow(envname = "r-reticulate")

install.packages("keras")
library(keras)
install_keras(envname = "r-reticulate")

library(tensorflow)
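Optionally, confirm which Python environment and TensorFlow build reticulate picked up before the hello-world check below (a minimal sketch; tf_config() and tf_version() are exported by the tensorflow R package):

# report the Python/TensorFlow configuration seen by R
tf_config()
# installed TensorFlow version, e.g. 2.x
tf_version()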
tf$constant("Hello Tensorflow!")

install.packages(c("readr", "tibble", "caret", "verification"))

Methods
library(magrittr)
library(keras)
library(tensorflow)

set.seed(1979)

# import expression data file
exp <- read.table("net3_expression_data.tsv", header = T)
exp <- t(exp)
dim(exp)
exp[1:5,1:5]

# plot
hist(exp, 100, col = "darkgrey", border = "white",
xlab = "Expression intensity", main = "Gene expression")
# export plot to pdf file in the current directory
pdf(file = "DeepLearning_Figure1.pdf",
width = 5, height = 5, pointsize = 10, useDingbats = FALSE)
hist(exp, 100, col = "darkgrey", border = "white",
xlab = "Expression intensity", main = "Gene expression")
dev.off()

gold <- readr::read_table("DREAM5_NetworkInference_GoldStandard_Network3.tsv",
col_names = F)
gold

gold <- gold[gold$X1 %in% rownames(exp) & gold$X2 %in% rownames(exp), ]
table(gold$X3)

# keep all positive pairs and twice as many randomly sampled negatives
keep_indices <- c(which(gold$X3==1),
sample(which(gold$X3==0), size = sum(gold$X3)*2))
gold <- gold[keep_indices,]
table(gold$X3)

inputdata <- data.frame(tf = gold$X1, gene = gold$X2,
exp[gold$X1,], exp[gold$X2,])
# Pearson correlation of each TF-gene expression pair as an extra feature
inputdata$pcc <- sapply(seq.int(dim(gold)[1]), function(i)
cor(exp[gold$X1[i],], exp[gold$X2[i],]))
inputdata$class <- gold$X3
inputdata <- tibble::as_tibble(inputdata)
featurenames <- colnames(inputdata) %>% setdiff("class") %>%
setdiff("tf") %>% setdiff("gene")h1 <- hist(as.matrix(inputdata[inputdata$class=="1",featurenames]),
breaks = 100, plot = FALSE)
h2 <- hist(as.matrix(inputdata[inputdata$class=="0",featurenames]),
breaks = 100, plot = FALSE)
pdf(file = "DeepLearning_Figure2.pdf",
width = 5, height = 5, pointsize = 10, useDingbats = FALSE)
plot(h2, col = rgb(0,0,1,0.4), freq = FALSE,
xlab = "Expression intensity",
main = "Distribution of feature values for training data")
plot(h1, xaxt = "n", yaxt = "n", col = rgb(1,0,0,0.4),
freq = FALSE, add = TRUE)
dev.off()

# split: 80% for training; the remainder is halved into validation and test
indices <- nrow(inputdata) %>% sample.int(., ceiling( . * 0.8))
traindata <- inputdata[indices, ]
testdata <- inputdata[-indices, ]
indices <- nrow(testdata) %>% sample.int(., ceiling( . * 0.5))
valdata <- testdata[-indices, ]
testdata <- testdata[indices, ]
table(traindata$class)
table(valdata$class)
table(testdata$class)
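Note that sample.int() splits at random without regard to class, so the positive:negative ratio can drift between subsets. If a stratified split is preferred, caret (already used below for confusion matrices) provides one; a minimal sketch for the training portion:

# createDataPartition() samples within each class level,
# preserving the class ratio in the selected indices
idx <- caret::createDataPartition(factor(inputdata$class), p = 0.8, list = FALSE)
strat_train <- inputdata[idx, ]
strat_rest  <- inputdata[-idx, ]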
set_random_seed(1979)

model <- keras_model_sequential(
  input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
summary(model)

model %>% compile(optimizer = optimizer_adam(),
loss = "binary_crossentropy",
metrics = c("accuracy"))history <- model %>% fit(as.matrix(traindata[,featurenames]),
traindata$class,
epochs=100, batch_size = 64,
validation_data = list(as.matrix(valdata[,featurenames]),
                                               valdata$class))

print(history)
evaluate(model, x = as.matrix(traindata[,featurenames]), y = traindata$class)
evaluate(model, x = as.matrix(valdata[,featurenames]), y = valdata$class)
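For completeness, the held-out test set can be scored with the same call (a one-line addition mirroring the two calls above); detailed test-set measures follow below via the confusion matrix.

evaluate(model, x = as.matrix(testdata[,featurenames]), y = testdata$class)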
plot(history)

pdf(file = "DeepLearning_Figure3.pdf",
width = 7, height = 4, pointsize = 10, useDingbats = FALSE)
par(mfrow = c(1,2), cex = 0.8)
maxLoss <- max(c(history$metrics$loss, history$metrics$val_loss))
plot(history$metrics$loss, main="Model Loss", xlab = "Epoch", ylab="Loss",
xlim = c(0, length(history$metrics$loss)), ylim = c(0, maxLoss),
col="red2", type="b",lwd=1)
lines(history$metrics$val_loss, col="steelblue1", type="b",lwd=1)
legend("topright", c("Training","Validation"), col=c("red2", "steelblue1"),
lty=c(1,1), bty = "n")
plot(history$metrics$accuracy, col="red2", type="b",lwd=1,
main="Model Accuracy", xlab = "Epoch", ylab="Accuracy",
xlim = c(0, length(history$metrics$accuracy)), ylim = c(0, 1))
lines(history$metrics$val_accuracy, col="steelblue1", type="b",lwd=1)
abline(h = 0.5, col = "grey", lty = 3)
dev.off()

predprob <- model %>% predict(as.matrix(testdata[,featurenames]))

# confusion matrix at the 0.5 probability cutoff
f1 <- function(indata, truelabels){
  res <- model %>% predict(as.matrix(indata[,featurenames]))
  res <- as.numeric(res>0.5)
  res <- factor(res, c(1,0))
  caret::confusionMatrix(res, factor(truelabels, c(1,0)))
}
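The 0.5 cutoff inside f1 is a convention, not a requirement. A variant with an adjustable threshold (a sketch, not part of the original protocol; f1_at is a hypothetical helper name) could look like:

f1_at <- function(indata, truelabels, cutoff = 0.5){
  res <- model %>% predict(as.matrix(indata[,featurenames]))
  # label a pair positive only when its probability exceeds the chosen cutoff
  res <- factor(as.numeric(res > cutoff), c(1,0))
  caret::confusionMatrix(res, factor(truelabels, c(1,0)))
}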
trainAccuracy <- f1(traindata, traindata$class)
validationAccuracy <- f1(valdata, valdata$class)
testAccuracy <- f1(testdata, testdata$class)
performance <- round(t(as.data.frame(rbind(t(trainAccuracy$byClass),
t(validationAccuracy$byClass),
t(testAccuracy$byClass)))),3)
colnames(performance) <- c("Training", "Validation", "Test")
performance
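The area under the ROC curve can also be reported as a single number (a sketch using roc.area() from the verification package, which is installed above; predprob holds the test-set probabilities computed earlier):

verification::roc.area(testdata$class, predprob[,1])$A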
pdf(file = "DeepLearning_Figure4.pdf",
    width = 5, height = 5, pointsize = 12, useDingbats = FALSE)
verification::roc.plot(testdata$class, predprob,
ylab = "True Positive Rate",
xlab = "False Positive Rate")
abline(v = 0.5596, col = "red2", lty = 3)
abline(h = 0.0509, col = "steelblue1", lty = 3)
dev.off()

save_model_hdf5(model, "EcoliRegModel.h5")
save(exp, performance, gold, history, file = "EcoliRegModel.RData")

library(magrittr)
library(keras)
library(tensorflow)
set.seed(1979)
model <- load_model_hdf5("EcoliRegModel.h5")
# import expression data file
exp <- t(read.table("net3_expression_data.tsv", header = T))
# import original gene names mapping
genenames <- read.table("net3_gene_ids.tsv", header = F)
genes <- genenames$V2; names(genes) <- genenames$V1
# import list of all transcription factors of Escherichia coli
# tfs <- names(genes)[genes %in% c("gadX", "flhC", "flhD","dnaA")] # trial run
tfs <- read.table("net3_transcription_factors.tsv", header = F)$V1
# total number of TF-gene pairs to be scored
length(tfs)*nrow(exp)
predictions <- NULL
for(i in tfs){
  tfdata <- data.frame(tf = i, gene = rownames(exp),
                       tfname = genes[i],
                       genename = genes[rownames(exp)])
  # drop self-pairs (a TF regulating itself)
  tfdata <- tibble::as_tibble(tfdata[tfdata$tf != tfdata$gene,])
  inpreddata <- cbind(exp[tfdata$tf,], exp[tfdata$gene,])
  inpreddata <- cbind(inpreddata,
                      sapply(seq.int(dim(tfdata)[1]),
                             function(j)
                               cor(exp[tfdata$tf[j],],
                                   exp[tfdata$gene[j],])))
  tfdata$probability <- (model %>% predict(inpreddata))[,1]
  predictions <- rbind(predictions, tfdata[tfdata$probability>0.5,])
}
predictions <- predictions[rev(order(predictions$probability)),]
predictions

predictions[predictions$tfname=="gadX",]
predictions[predictions$tfname=="flhC",]
predictions[predictions$tfname=="flhD",]predictions[!paste0(predictions$tf,predictions$gene) %in%
paste0(inputdata$tf[inputdata$class==1],
              inputdata$gene[inputdata$class==1]),]

write.table(predictions, file = "grnPredictionsEcoli.txt",
col.names = T, row.names = F, quote = F, sep = "\t")
write.csv(predictions, file = "grnPredictionsEcoli.csv", row.names = F)
writexl::write_xlsx(list(Table1 = predictions),
path = "grnPredictionsEcoli.xlsx", col_names = T, )Handiling imbalanced gold standard data
# weight each class inversely to its frequency in the training data
weights <- as.list(1/table(traindata$class))
history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class,
epochs = 10, batch_size = 16,
validation_data = list(as.matrix(valdata[,featurenames]), valdata$class),
                         class_weight = weights)
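Raw inverse-frequency weights can make the loss scale very small. An alternative scheme that keeps the average weight near 1 (a sketch, not from the protocol) is:

counts <- table(traindata$class)
# scale so the weights average to 1 across classes
weights <- as.list(sum(counts) / (length(counts) * counts))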
Increasing generalization and depth of DNN

model <- keras_model_sequential(input_shape = c(length(featurenames))) %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dense(units = 806, activation = "relu") %>%
  layer_dropout(0.5) %>%
  layer_dense(units = 512, activation = "relu") %>%
  layer_dropout(0.2) %>%
  layer_dense(units = 56, activation = "relu") %>%
  layer_batch_normalization() %>%
  layer_dropout(0.1) %>%
  layer_dense(units = 1, activation = "sigmoid")
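The deeper model is compiled and trained with the same calls as before; a sketch repeating the compile step from Methods:

model %>% compile(optimizer = optimizer_adam(),
                  loss = "binary_crossentropy",
                  metrics = c("accuracy"))
summary(model)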
Performance measures

metrics <- list(
"fn" = metric_false_negatives(),
"fp" = metric_false_positives(),
"tn" = metric_true_negatives(),
"tp" = metric_true_positives(),
"accuracy" = metric_accuracy(),
"precision" = metric_precision(),
"recall" = metric_recall(),
"auc" = metric_auc())
model %>% compile(optimizer = optimizer_adam(),
loss = "binary_crossentropy",
                  metrics = metrics)
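Once the model is (re)fit after compiling with this richer metric set, evaluate() reports every metric at once. A sketch reusing the validation split from the Methods section:

model %>% evaluate(as.matrix(valdata[,featurenames]), valdata$class)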
An alternative code for predicting GRNs on high-end computational resources

library(magrittr)
library(keras)
library(tensorflow)
set.seed(1979)
model <- load_model_hdf5("EcoliRegModel.h5")
# import expression data file
exp <- t(read.table("net3_expression_data.tsv", header = T))
# import original gene names mapping
genenames <- read.table("net3_gene_ids.tsv", header = F)
genes <- genenames$V2; names(genes) <- genenames$V1
# import list of all transcription factors of Escherichia coli
tfs <- read.table("net3_transcription_factors.tsv", header = F)$V1
# create all possible pairs between transcription factors and genes
predictions <- expand.grid(tfs, rownames(exp), stringsAsFactors = FALSE)
predictions <- tibble::as_tibble(predictions[predictions$Var1 != predictions$Var2,])
# add original gene names
predictions$tfname <- genes[predictions$Var1]
predictions$genename <- genes[predictions$Var2]
# create feature table
inpreddata <- cbind(exp[predictions$Var1,], exp[predictions$Var2,])
inpreddata <- cbind(inpreddata,
                    sapply(seq.int(dim(predictions)[1]),
                           function(i) cor(exp[predictions$Var1[i],],
                                           exp[predictions$Var2[i],])))
# predict regulatory pairs
predictions$probability <- (model %>% predict(inpreddata))[,1]
predictions <- predictions[predictions$probability>0.5,]
predictions <- predictions[rev(order(predictions$probability)),]
predictions
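If calling predict() on all pairs at once is still too demanding, the feature matrix can be scored in row chunks (a sketch, not part of the protocol; the chunk size is arbitrary):

chunk <- 100000
probs <- numeric(nrow(inpreddata))
for(s in seq(1, nrow(inpreddata), by = chunk)){
  e <- min(s + chunk - 1, nrow(inpreddata))
  # score one slice of rows at a time to bound memory use
  probs[s:e] <- (model %>% predict(inpreddata[s:e, , drop = FALSE]))[,1]
}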
Controlling DNN model training by Callback functions

early_stopping <- callback_early_stopping(monitor = "val_loss", patience = 3)

model_checkpoint <- callback_model_checkpoint(filepath = "best_model.h5",
                                              monitor = "val_accuracy",
                                              save_best_only = TRUE)

reduce_lr <- callback_reduce_lr_on_plateau(monitor = "val_loss",
                                           factor = 0.1, patience = 2)

tensorboard <- callback_tensorboard(log_dir = "logs")

history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class,
                         epochs=40, batch_size = 50, validation_split=0.2,
                         callbacks = list(early_stopping, reduce_lr))
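Note that model_checkpoint and tensorboard are defined above but not passed to fit(). To activate all four callbacks (a sketch using the objects defined above):

history <- model %>% fit(as.matrix(traindata[,featurenames]), traindata$class,
                         epochs = 40, batch_size = 50, validation_split = 0.2,
                         callbacks = list(early_stopping, model_checkpoint,
                                          reduce_lr, tensorboard))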
Citation

BibTeX citation:
@article{muley2023,
  author = {Muley, VY},
  title = {Deep {Learning} for {Predicting} {Gene} {Regulatory}
           {Networks:} {A} {Step-by-Step} {Protocol} in {R}},
  journal = {Methods in Molecular Biology},
  volume = {2719},
  date = {2023-10},
  langid = {en}
}
For attribution, please cite this work as:
Muley, VY. 2023. “Deep Learning for Predicting Gene Regulatory
Networks: A Step-by-Step Protocol in R.” Methods in Molecular
Biology 2719 (October).