标签:
反馈方式:
本程序待改进地方:
脚步说明:
使用方法:
代码如下:
#! /data02/software/R/bin/R
## Created by Roger Young, 2015-02-05
## nohup /data02/software/R/bin/R --file=~/mRNA_workflow.R &> ./nohup.log
## nohup /data02/software/R/bin/R CMD BATCH ~/mRNA_workflow.R &> ./nohup.log
##-------Those lines should be changed accordingly-----------------------------
## 正常情况下,需要根据需要更改以下几行:output_folder和output_folder;
## directory: 代表一个目录的绝对路径, 以"/"结尾;另有说明的除外!
## foler: 代表一个文件夹的文件名,以"/"结尾;另有说明的除外!
## path: 代表一个文件的绝对路径;另有说明的除外!
DEBUG_FLAG <- FALSE;
## R路径: /data02/software/R/bin/R
## source("~/Documents/mRNA_profiles/mRNA_workflow.R")
working_directory <- "~/Documents/mRNA_profiles/";
## 如果在Linux系统下测试,请酌情修改。
## C:\Program Files\R\R-3.1.2\bin\i386\R.exe --file=~/mRNA_workflow.R
if(DEBUG_FLAG){
working_directory <- "D:/roger/data/mRNA/";
}
output_folder <- paste("output_files_",
format(Sys.time(), "%Y%m%d_%I%M%S"),"/", sep="");
output_directory <- paste(working_directory, output_folder, sep ="");
dir.create(path = output_directory, recursive = TRUE, showWarnings = FALSE);
##-------所用到的软件目录,需根据具体设备更改------------------------------------------------
fastqc_path <- "~/bin/FastQC/fastqc";
tophat_app_path <- "tophat";
cufflinks_app_path <- "cufflinks";
cuffmerge_app_path <- "cuffmerge";
cuffdiff_app_path <- "cuffdiff";
## Parameters used by all those softwares!
threads_params <- "-p 10";
debug_flag <- " DEBUG FLAG: ";
##-------输出文件夹,一般无需修改----------------------------------------------------
log_folder <- "log/";
log_directory <- paste(output_directory, log_folder,sep = "");
dir.create(path = log_directory, recursive = TRUE, showWarnings = FALSE);
sink(file = paste(log_directory, "log_",
format(Sys.time(), "%Y%m%d_%I%M%S"), ".log", sep=""),
split = TRUE);
##-----获取本脚本的路径,并保存在输出文件夹中(注,尚需修改以保证可移植性)--------------
current_file_path_getter <- function() {
cmdArgs <- commandArgs(trailingOnly = FALSE);
print(paste("cmdArgs: ",paste(commandArgs(), collapse = " ")));
R_CMD_indicator <- "[[:blank:]]CMD[[:blank:]]";
CMD_MODE_matched <- grep(pattern = R_CMD_indicator,
x = cmdArgs);
if(length(CMD_MODE_matched) > 0){
}else{
file_arg_indicator <- "--file=";
matched <- grep(pattern = file_arg_indicator, x = cmdArgs);
if (length(matched) > 0) {
# Rscript
return(normalizePath(sub(pattern = file_arg_indicator,
replacement = "",
x = cmdArgs[matched])));
} else {
# ‘source‘d via R console
return(normalizePath(sys.frames()[[1]]$ofile));
}
}
}
current_script_path <- current_file_path_getter();
print(paste("current_script_path: ", current_script_path));
file.copy(from = current_file_path_getter(),
to = paste(output_directory,"/RScript.R",sep = ""),
overwrite = TRUE, copy.mode = TRUE, copy.date = TRUE);
##-------定义输入文件和文件夹-----------------------------------------------
input_folder <- "input_files/";
input_directory <- paste(working_directory, input_folder, sep = "");
sequenced_reads_folder <- "sequenced_reads"; ## 没有"/"
sequenced_reads_directory <- paste(input_directory,
sequenced_reads_folder, sep = "");
##-------Genome References-----------------------------------------------------
specie_name <- "Homo_sapiens/";
annotation_file_name <- "genes.gtf";
annotation_file_folder <- "Genes/";
annotation_index_directory <- paste(input_directory, specie_name,
annotation_file_folder , sep = "");
annotation_index_path <-paste(annotation_index_directory,
annotation_file_name, sep = "");
bowtie_index_name <- "genome";
bowtie_index_folder <- "Bowtie2Index/";
bowtie_index_directory <- paste(input_directory, specie_name,
bowtie_index_folder , sep = "");
bowtie_index_path <-paste(bowtie_index_directory,
bowtie_index_name, sep = "");
##-------用于确认所依赖的包已正确安装------------------------------------------
check_bioconductor_pkg <- function(pkgs) {
if(require(pkgs , character.only = TRUE)){
print(paste("******", pkgs,"is loaded correctly", "******"));
}
else {
print(paste("******Trying to install ", pkgs, "******"));
## Update R packages
print("******Update R packages******");
update.packages(ask = FALSE);
source("http://bioconductor.org/biocLite.R");
biocLite();
## Install desired Packages
biocLite(pkgs);
## Re-check
if(require(pkgs, character.only = TRUE)){
print(paste("******",pkgs, "installed and loaded!", "******"));
}
else {
stop(paste("??????","Error: could not install",pkgs, "??????"));
}
}
}
check_pkg <- function(pkgs) {
if(require(pkgs , character.only = TRUE)){
print(paste("******", pkgs,"is loaded correctly", "******"));
}
else {
print(paste("******Trying to install ", pkgs, "******"));
# ## Update R packages
print("******Update R packages******");
update.packages(ask = FALSE);
install.packages(pkgs);
## Re-check
if(require(pkgs, character.only = TRUE)){
print(paste("******",pkgs, "installed and loaded!", "******"));
}
else {
stop(paste("??????????","Error: could not install",
pkgs, "??????????"));
}
}
}
## (check_pkg("sendmailR"));
## (check_bioconductor_pkg("cummeRbund"));
##-------获取所需处理的测序文件列表------------------------------------------
(sequenced_files <- list.files(path = sequenced_reads_directory,
pattern = "*[^(.md5)|(.txt)]$",
all.files = TRUE,
full.names = TRUE,
recursive = TRUE));
##-------测序质量评估--------------------------------
(fastqc_output_folder <- "fastqc_output/");
(fastqc_output_directory <- paste(output_directory,
fastqc_output_folder,sep = ""));
(dir.create(path = fastqc_output_directory, recursive = TRUE, showWarnings = FALSE));
(sequenced_file_lists <- sequenced_files);
print(paste("sequenced_file_lists: ", sequenced_file_lists));
for (files_to_test in sequenced_file_lists){
print(paste("files_to_test: ", files_to_test));
fastqc_params <- "--outdir=";
fastqc_output_directory;
## Tedious lines for output folder construction
temp_output_directory_dot_repaced <- gsub(pattern = "\\.",
replacement = "_",
x = files_to_test);
fastqc_output_directory <- gsub(pattern = paste(input_folder,
sequenced_reads_folder,
"/", sep=""), ## specical
replacement = paste(output_folder,
fastqc_output_folder,
sep = ""),,
x = temp_output_directory_dot_repaced);
print(paste(debug_flag, "fastqc_output_directory: ", fastqc_output_directory));
dir.create(fastqc_output_directory, recursive = TRUE);
## 构建用于运行fastqc的命令
fastqc_cmd <- paste(fastqc_path, files_to_test,
paste( fastqc_params, fastqc_output_directory,
sep = ""),
sep = " ");
print(paste(debug_flag,"fasqc_cmd: ", fastqc_cmd));
##质量评估与后续部分无联系,所以在后台运行,以提高速度!
if(!DEBUG_FLAG){
print("Running fastqc!")
system(command = fastqc_cmd, wait = FALSE);
} else {
print("In Debug mode!")
}
}
##-------Align the RNA-seq reads to the genomes--------------------------------
## Map the reads for each sample to the reference genome
(tophat_output_folder <- "tophat_output/");
(tophat_output_directory <- paste(output_directory,
tophat_output_folder, sep = ""));
(dir.create(path = tophat_output_directory, recursive = TRUE, showWarnings = FALSE));
(cufflinks_output_folder <- "cufflinks_output/");
(cufflinks_output_directory <- paste(output_directory,
cufflinks_output_folder, sep = ""));
(dir.create(path = cufflinks_output_directory, recursive = TRUE, showWarnings = FALSE));
(sequenced_file_lists <- sequenced_files);
(cuffmerge_assemblies_file <- "assemblies.txt")
(cuffmerge_assemblies_file_text <- "");
(cuffmerge_assemblies_file_path <- paste(output_directory,
cuffmerge_assemblies_file,
sep = ""));
(accepted_hits_bam_file <- "accepted_hits_bam.txt")
(accepted_hits_bam_file_text <- "");
(accepted_hits_bam_file_path <- paste(output_directory,
accepted_hits_bam_file,
sep = ""));
## 该部分用于构建cuffdiff所需要用的accepted_hits_bam_file_path文件;
## 该部分运行完后需要按要求修改accepted_hits_bam_file_path文件。
for(sequenced_read in sequenced_file_lists){
print(paste(debug_flag, "The file to process: ", sequenced_read));
temp_output_directory_dot_repaced <- gsub(pattern = "\\.",
replacement = "_",
x = sequenced_read);
tophat_output_directory <- gsub(pattern = paste(input_folder,
sequenced_reads_folder,
"/", sep=""), ## specical
replacement = paste(output_folder,
tophat_output_folder,
sep = ""),,
x = temp_output_directory_dot_repaced);
accepted_hits_bam_file_text <- paste(accepted_hits_bam_file_text,
tophat_output_directory, "/",
"accepted_hits.bam",
"\n",
sep = "");
cufflinks_output_directory <- gsub(pattern = paste(input_folder,
sequenced_reads_folder,
"/", sep=""), ## specical
replacement = paste(output_folder,
cufflinks_output_folder,
sep = ""),,
x = temp_output_directory_dot_repaced);
cuffmerge_assemblies_file_text <- paste(cuffmerge_assemblies_file_text,
cufflinks_output_directory, "/",
"transcripts.gtf",
"\n",
sep = "");
}
write(cuffmerge_assemblies_file_text,
file = cuffmerge_assemblies_file_path);
write(accepted_hits_bam_file_text,
file = accepted_hits_bam_file_path);
accepted_hits_bam_file_modify_time <- file.info(accepted_hits_bam_file_path);
## 给用户发邮件,告知accepted_hits_bam_file_path文件已建立,请修改;
print(paste("Please edit file: ", accepted_hits_bam_file_path));
print(paste("First, insert the following line (seperated by tab): \"sample_id group_label\";"));
print("The, Append each other lines with a tab, followed by the sample‘s label (group)")
print("Finally, Save the file!!")
number_of_files_had_processed <- 0; ##length(sequenced_file_lists);
print(paste(debug_flag, "number_of_files_had_processed: ", number_of_files_had_processed));
for(sequenced_read in sequenced_file_lists){
print(paste(debug_flag, "The file to process: ", sequenced_read));
temp_output_directory_dot_repaced <- gsub(pattern = "\\.",
replacement = "_",
x = sequenced_read);
tophat_output_directory <- gsub(pattern = paste(input_folder,
sequenced_reads_folder,
"/", sep=""), ## specical
replacement = paste(output_folder,
tophat_output_folder,
sep = ""),,
x = temp_output_directory_dot_repaced);
print(paste(debug_flag, "Tophat output folder: ",tophat_output_directory));
(dir.create(path = tophat_output_directory, recursive = TRUE, showWarnings = FALSE));
##构建运行tophat的命令
(tophat_cmd <- paste(tophat_app_path, threads_params, "-G",
annotation_index_path,
"-o",
tophat_output_directory,
bowtie_index_path,
sequenced_read, sep = " "));
print(paste(debug_flag, "tophat_cmd: ",tophat_cmd));
if(!DEBUG_FLAG){
system(command = tophat_cmd, wait = TRUE);
}
## 构建运行cufflinks的命令
cufflinks_output_directory <- gsub(pattern = paste(input_folder,
sequenced_reads_folder,
"/", sep=""), ## specical
replacement = paste(output_folder,
cufflinks_output_folder,
sep = ""),,
x = temp_output_directory_dot_repaced);
print(paste(debug_flag, "cufflinks output directory", cufflinks_output_directory))
(dir.create(path = cufflinks_output_directory, recursive = TRUE, showWarnings = FALSE));
(cufflinks_cmd <- paste(cufflinks_app_path, threads_params, "-o",
cufflinks_output_directory,
paste(tophat_output_directory,"/",
"accepted_hits.bam", sep = ""),
sep = " "));
print(paste(debug_flag, "cufflinks_cmd: ",cufflinks_cmd));
##同步运行cufflinks!
number_of_files_had_processed <- number_of_files_had_processed + 1;
print(paste(debug_flag, "Number of files had been handled: ", number_of_files_had_processed));
if(!DEBUG_FLAG){
system(command = cufflinks_cmd, wait = TRUE);
}
}
warnings()
##-------Run Cuffmerge to create a single merged transcriptome annotation------
(cuffmerge_output_folder <- "cuffmerge_output"); ##没有“/”
(cuffmerge_output_directory <- paste(output_directory,
cuffmerge_output_folder, sep = ""));
dir.create(path = cuffmerge_output_directory, recursive = TRUE, showWarnings = FALSE);
cuffmerge_cmd <- paste(cuffmerge_app_path, "--ref-gtf", annotation_index_path,
"-o", cuffmerge_output_directory,
"--ref-sequence", paste(bowtie_index_directory,"genome.fa",
sep = ""),
threads_params,
cuffmerge_assemblies_file_path,
sep = " ");
print(paste(debug_flag, "cuffmerge_cmd: ",cuffmerge_cmd));
if(!DEBUG_FLAG){
system(command = cuffmerge_cmd, wait = TRUE);
}
##-------Identify differentially expressed genes and transcripts------
(cuffdiff_output_folder <- "cuffdiff_output/");
(cuffdiff_output_directory <- paste(output_directory,
cuffdiff_output_folder, sep = ""));
(dir.create(path = cuffdiff_output_directory, recursive = TRUE, showWarnings = FALSE));
cuffdiff_cmd <- paste(cuffdiff_app_path, "-use-sample-sheet",
"-o", cuffdiff_output_directory,
"-b", paste(bowtie_index_directory,"genome.fa",
sep = ""),
threads_params,
"-u", paste(cuffmerge_output_directory,"/","merged.gtf",sep=""),
accepted_hits_bam_file_path,
sep=" ")
print(paste(debug_flag, "cuffdiff_cmd: ",cuffdiff_cmd, sep = ""));
if(DEBUG_FLAG){
Sys.sleep(time = 20);
}
## 此处,我们认为用户已正确修改了accepted_hits_bam_file_path所指代的文件
if(identical(file.info(accepted_hits_bam_file_path),
accepted_hits_bam_file_modify_time)){
print(paste("This file stays unchanged: ", accepted_hits_bam_file_path, sep = ""));
print("cuffdiff is skipped! Please run it manually!!");
print(paste("The cmd for running cuffdiff is: ",cuffdiff_cmd, sep = ""));
} else {
print("cuffdiff_cmd is running");
if(!DEBUG_FLAG){
system(command = cuffdiff_cmd, wait = TRUE,);
}
}
print("Done!");
sink();
print("Done!");
if(!DEBUG_FLAG){
quit(save = "no");
}
完成!
标签:
原文地址:http://www.cnblogs.com/previewslider/p/4332992.html