码迷,mamicode.com
首页 > 其他好文 > 详细

RandomForest&ROC

时间:2016-08-18 18:28:09      阅读:183      评论:0      收藏:0      [点我收藏+]

标签:

技术分享
  1 # -*- coding: utf-8 -*-
  2 # __author__ = ‘JieYao‘
  3 from biocluster.agent import Agent
  4 from biocluster.tool import Tool
  5 import os
  6 import types
  7 import subprocess
  8 from biocluster.core.exceptions import OptionError
  9 
 10 
 11 class RandomforestAgent(Agent):
 12     """
 13     需要RandomForest.pl
 14     version v1.0
 15     author: JieYao
 16     last_modified:2016.07.18
 17     """
 18 
 19     def __init__(self, parent):
 20         super(RandomforestAgent, self).__init__(parent)
 21         options = [
 22             {"name": "otutable", "type": "infile", "format": "meta.otu.otu_table, meta.otu.tax_summary_dir"},
 23             {"name": "level", "type": "string", "default": "otu"},
 24             {"name": "envtable", "type": "infile", "format": "meta.otu.group_table"},
 25             {"name": "envlabs", "type": "string", "default": ""},
 26             {"name": "ntree", "type": "int", "default": 500 },
 27             {"name": "problem_type", "type": "int", "default": 2 },
 28             {"name": "top_number", "type": "int", "default": 50}
 29         ]
 30         self.add_option(options)
 31         self.step.add_steps(RandomforestAnalysis)
 32         self.on(start, self.step_start)
 33         self.on(end, self.step_end)
 34 
 35     def step_start(self):
 36         self.step.RandomforestAnalysis.start()
 37         self.step.update()
 38 
 39     def step_end(self):
 40         self.step.RandomforestAnalysis.finish()
 41         self.step.update()
 42 
 43     def gettable(self):
 44         """
 45         根据输入的otu表和分类水平计算新的otu表
 46         :return:
 47         """
 48         if self.option(otutable).format == "meta.otu.tax_summary_dir":
 49             return self.option(otutable).get_table(self.option(level))
 50         else:
 51             return self.option(otutable).prop[path]
 52         
 53     def check_options(self):
 54         """
 55         重写参数检查
 56         """
 57         if not self.option(otutable).is_set:
 58             raise OptionError(必须提供otu表)
 59         self.option(otutable).get_info()
 60         if self.option(otutable).prop[sample_num] < 2:
 61             raise OptionError(otu表的样本数目少于2,不可进行随机森林特征分析)
 62         if self.option(envtable).is_set:
 63             self.option(envtable).get_info()
 64             if self.option(envlabs):
 65                 labs = self.option(envlabs).split(,)
 66                 for lab in labs:
 67                     if lab not in self.option(envtable).prop[group_scheme]:
 68                         raise OptionError(envlabs中有不在物种(环境因子)表中存在的因子:%s % lab)
 69             else:
 70                 pass
 71             if len(self.option(envtable).prop[sample]) < 2:
 72                 raise OptionError(物种(环境因子)表的样本数目少于2,不可进行随机森林特征分析)
 73         samplelist = open(self.gettable()).readline().strip().split(\t)[1:]
 74         if self.option(envtable).is_set:
 75             self.option(envtable).get_info()
 76             if len(self.option(envtable).prop[sample]) > len(samplelist):
 77                 raise OptionError(OTU表中的样本数量:%s少于物种(环境因子)表中的样本数量:%s % (len(samplelist),
 78                                   len(self.option(envtable).prop[sample])))
 79             for sample in self.option(envtable).prop[sample]:
 80                 if sample not in samplelist:
 81                     raise OptionError(物种(环境因子)表的样本中存在OTU表中未知的样本%s % sample)
 82         table = open(self.gettable())
 83         if len(table.readlines()) < 4 :
 84             raise OptionError(数据表信息少于3行)
 85         table.close()
 86         if self.option(top_number) > self.option(otutable).prop[otu_num]:
 87             self.option(top_number, self.option(otutable).prop[otu_num])
 88         return True
 89     
 90     def set_resource(self):
 91         """
 92         设置所需资源
 93         """
 94         self._cpu = 2
 95         self._memory = ‘‘
 96     
 97     def end(self):
 98         result_dir = self.add_upload_dir(self.output_dir)
 99         result_dir.add_relpath_rules([
100             [".", "", "RandomForest分析结果和ROC计算数据输出目录"],
101             ["./randomforest_confusion_table.xls", "xls", "RandomForest样本分组模拟结果"],
102             ["./randomforest_mds_sites.xls", "xls", "样本点坐标表"],
103             ["./randomforest_proximity_table.xls", "xls", "样本相似度临近矩阵"],
104             ["./randomforest_topx_vimp.xls", "xls", "Top-X物种(环境因子)丰度表"],
105             ["./randomforest_vimp_table.xls", "xls", "所有物种(环境因子)重要度表"],
106             ["./randomforest_predicted_answer.xls", "xls", "随机森林预测分组结果表"],
107             ["./randomforest_votes_probably.xls","xls", "随机森林各样本分组投票预测概率表"],
108             ["./roc_table.xls", "xls", "ROC数据标准化后数据表"],
109             ["./roc_point.xls", "xls", "ROC作图坐标点数据表"],
110             ["./auc.xls", "xls", "ROC折线下方面积值"],
111         ])
112         print self.get_upload_files()
113         super(RandomforestAgent, self).end()
114 
115         
116 class RandomforestTool(Tool):
117     def __init__(self, config):
118         super(RandomforestTool, self).__init__(config)
119         self._version = 1.0.1
120         self.cmd_path = self.config.SOFTWARE_DIR + /bioinfo/meta/scripts/RandomForest_perl.pl
121         self.env_table = self.get_new_env()
122         self.otu_table = self.get_otu_table()
123     
124     def get_otu_table(self):
125         """
126         根据调用的level参数重构otu表
127         :return:
128         """
129         if self.option(otutable).format == "meta.otu.tax_summary_dir":
130             otu_path = self.option(otutable).get_table(self.option(level))
131         else:
132             otu_path = self.option(otutable).prop[path]
133         if self.option(envtable).is_set:
134             return self.filter_otu_sample(otu_path, self.option(envtable).prop[sample],
135                                           os.path.join(self.work_dir, temp_filter.otutable))
136         else:
137             return otu_path
138     
139     def filter_otu_sample(self, otu_path, filter_samples, newfile):
140         if not isinstance(filter_samples, types.ListType):
141             raise Exception(过滤otu表样本的样本名称应为列表)
142         try:
143             with open(otu_path, rb) as f, open(newfile, wb) as w:
144                 one_line = f.readline()
145                 all_samples = one_line.rstrip().split(\t)[1:]
146                 if not ((set(all_samples) & set(filter_samples)) == set(filter_samples)):
147                     raise Exception(提供的过滤样本集合中存在otu表中不存在的样本all:%s,filter_samples:%s % (all_samples, filter_samples))
148                 if len(all_samples) == len(filter_samples):
149                     return otu_path
150                 samples_index = [all_samples.index(i) + 1 for i in filter_samples]
151                 w.write(OTU\t + \t.join(filter_samples) + \n)
152                 for line in f:
153                     all_values = line.rstrip().split(\t)
154                     new_values = [all_values[0]] + [all_values[i] for i in samples_index]
155                     w.write(\t.join(new_values) + \n)
156                 return newfile
157         except IOError:
158             raise Exception(无法打开OTU相关文件或者文件不存在)
159 
160     def get_new_env(self):
161         """
162         根据envlabs生成新的envtable
163         """
164         if self.option(envlabs):
165             new_path = self.work_dir + /temp_env_table.xls
166             self.option(envtable).sub_group(new_path, self.option(envlabs).split(,))
167             return new_path
168         else:
169             return self.option(envtable).path
170     
171     def run(self):
172         """
173         运行
174         """
175         super(RandomforestTool, self).run()
176         self.run_RandomForest_perl()
177     
178     def formattable(self, tablepath):
179         with open(tablepath) as table:
180             if table.read(1) == #:
181                 newtable = os.path.join(self.work_dir, temp_format.table)
182                 with open(newtable, w) as w:
183                     w.write(table.read())
184                 return newtable
185         return tablepath
186     
187     def run_RandomForest_perl(self):
188         """
189         运行RandomForest.pl
190         """
191         real_otu_path = self.formattable(self.otu_table)
192         cmd = self.config.SOFTWARE_DIR + /program/perl/perls/perl-5.24.0/bin/perl  + self.cmd_path
193         cmd +=  -i %s -o %s % (real_otu_path, self.work_dir + /RandomForest)
194         if self.option(envtable).is_set:
195             cmd +=  -g %s -m %s % (self.env_table, self.env_table)
196         cmd +=  -ntree %s % (str(self.option(ntree)))
197         cmd +=  -type %s % (str(self.option(problem_type)))
198         cmd +=  -top %s % (str(self.option(top_number)))
199         self.logger.info(运行RandomForest_perl.pl程序进行RandomForest计算)
200         
201         try:
202             subprocess.check_output(cmd, shell=True)
203             self.logger.info(生成 cmd.r 文件成功)
204         except subprocess.CalledProcessError:
205             self.logger.info(生成 cmd.r 文件失败)
206             self.set_error(无法生成 cmd.r 文件)
207         try:
208             subprocess.check_output(self.config.SOFTWARE_DIR + 
209                                     /program/R-3.3.1/bin/R --restore --no-save < %s/cmd.r % (self.work_dir + /RandomForest), shell=True)
210             self.logger.info(RandomForest计算成功)
211         except subprocess.CalledProcessError:
212             self.logger.info(RandomForest计算失败)
213             self.set_error(R运行计算RandomForest失败)
214         self.logger.info(运行RandomForest_perl.pl程序进行RandomForest计算完成)
215         allfiles = self.get_filesname()        
216         self.linkfile(self.work_dir + /RandomForest/ + allfiles[1], randomforest_mds_sites.xls)
217         self.linkfile(self.work_dir + /RandomForest/ + allfiles[2], randomforest_proximity_table.xls)
218         self.linkfile(self.work_dir + /RandomForest/ + allfiles[3], randomforest_topx_vimp.xls)
219         self.linkfile(self.work_dir + /RandomForest/ + allfiles[4], randomforest_vimp_table.xls)
220         if self.option(envtable).is_set:
221             if allfiles[0] and allfiles[5] and allfiles[6]:
222                 self.linkfile(self.work_dir + /RandomForest/ + allfiles[0], randomforest_confusion_table.xls)
223                 self.linkfile(self.work_dir + /RandomForest/ + allfiles[5], randomforest_predicted_answer.xls)
224                 self.linkfile(self.work_dir + /RandomForest/ + allfiles[6], randomforest_votes_probably.xls)
225             else:
226                 self.set_error(按分组计算的文件生成出错)
227         if not self.option(envtable).is_set:
228             self.end()
229         if not (allfiles[5] and allfiles[6]):
230             self.end()
231         cmd = self.config.SOFTWARE_DIR + /program/perl/perls/perl-5.24.0/bin/perl  + self.config.SOFTWARE_DIR + /bioinfo/meta/scripts/calc_roc.pl
232         cmd +=  -i1 %s %(self.work_dir + /RandomForest/randomforest_votes_probably.xls)
233         cmd +=  -i2 %s %(self.work_dir + /RandomForest/randomforest_predicted_answer.xls)
234         cmd +=  -o %s %(self.work_dir + /ROC/)
235         self.logger.info(开始运行calc_roc.pl计算ROC相关数据)
236         
237         try:
238             subprocess.check_output(cmd, shell=True)
239             self.logger.info(生成 roc_cmd.r 成功)
240         except subprocess.CalledProcessError:
241             self.logger.info(生成 roc_cmd.r 失败)
242             self.set_error(无法生成 roc_cmd.r 文件)
243         try:
244             subprocess.check_output(self.config.SOFTWARE_DIR +
245                                     /program/R-3.3.1/bin/R --restore --no-save < %s/roc_cmd.r % (self.work_dir + /ROC), shell=True)
246             self.logger.info(ROC计算成功)
247         except subprocess.CalledProcessError:
248             self.logger.info(ROC计算失败)
249             self.set_error(R运行计算ROC失败)
250         self.logger.info(运行calc_roc.pl程序进行ROC计算完成)
251         allfiles = self.get_roc_filesname()
252         self.linkfile(self.work_dir + /ROC/ + allfiles[0], roc_table.xls)
253         self.linkfile(self.work_dir + /ROC/ + allfiles[1], roc_point.xls)
254         self.linkfile(self.work_dir + /ROC/ + allfiles[2], auc.xls)
255         self.end()
256 
257     def linkfile(self, oldfile, newname):
258         """
259         link文件到output文件夹
260         :param oldfile: 资源文件路径
261         :param newname: 新的文件名
262         :return:
263         """
264         newpath = os.path.join(self.output_dir, newname)
265         if os.path.exists(newpath):
266             os.remove(newpath)
267         os.link(oldfile, newpath)
268 
269     def get_roc_filesname(self):
270         filelist = os.listdir(self.work_dir + /ROC)
271         roc_table_file = None
272         roc_point_file = None
273         auc_file = None
274         for name in filelist:
275             if roc_table.xls in name:
276                 roc_table_file = name
277             elif roc_point.xls in name:
278                 roc_point_file = name
279             elif auc.xls in name:
280                 auc_file = name
281         if (roc_table_file and roc_point_file and auc_file):
282             return [roc_table_file, roc_point_file, auc_file]
283         else:
284             self.set_error("未知原因,ROC计算结果丢失")
285     
286     def get_filesname(self):
287         filelist = os.listdir(self.work_dir + /RandomForest)
288         randomforest_confusion_table_file = None
289         randomforest_mds_sites_file = None
290         randomforest_proximity_table_file = None
291         randomforest_topx_vimp_file = None
292         randomforest_vimp_table_file = None
293         randomforest_predicted_answer_file = None
294         randomforest_votes_probably_file = None
295         for name in filelist:
296             if randomforest_confusion_table.xls in name:
297                 randomforest_confusion_table_file = name
298             elif randomforest_mds_sites.xls in name:
299                 randomforest_mds_sites_file = name
300             elif randomforest_proximity_table.xls in name:
301                 randomforest_proximity_table_file = name
302             elif randomforest_topx_vimp.xls in name:
303                 randomforest_topx_vimp_file = name
304             elif randomforest_vimp_table.xls in name:
305                 randomforest_vimp_table_file = name
306             elif randomforest_predicted_answer.xls in name:
307                 randomforest_predicted_answer_file = name
308             elif randomforest_votes_probably.xls in name:
309                 randomforest_votes_probably_file = name
310         if (randomforest_mds_sites_file and randomforest_proximity_table_file and 
311             randomforest_topx_vimp_file and randomforest_vimp_table_file):
312             if self.option(envtable).is_set:
313                 if not randomforest_confusion_table_file:
314                     self.set_error(未知原因,样本分组模拟结果丢失或未生成)
315                 if not randomforest_predicted_answer_file:
316                     self.set_error(未知原因,样本分组预测结果文件丢失或未生成)
317                 if not randomforest_votes_probably_file:
318                     self.set_error(未知原因,样本分组预测概率表丢失或未生成)
319             return [randomforest_confusion_table_file, randomforest_mds_sites_file,
320                     randomforest_proximity_table_file, randomforest_topx_vimp_file,
321                     randomforest_vimp_table_file, randomforest_predicted_answer_file,
322                     randomforest_votes_probably_file]
323         else:
324             self.set_error(未知原因,数据计算结果丢失或者未生成)
View Code
技术分享
  1 #!/mnt/ilustre/users/sanger-dev/app/program/perl/perls/perl-5.24.0/bin/perl
  2 use strict;
  3 use warnings;
  4 use Getopt::Long;
  5 my %opts;
  6 my $VERSION = "V2.20160708";
  7 GetOptions( \%opts,"i=s","m=s","o=s","g=s","ntree=i","top=i","type=s");
  8 my $usage = <<"USAGE";
  9        Program : $0   
 10        Discription: Program used to caculate randomforest,with mds plot and importance variables given .  
 11        Version : $VERSION
 12        Contact : jie.yao\@majorbio.com
 13        Usage :perl $0 [options]         
 14                         -i      * input otu table file 
 15                         -o      * output dir
 16                         -m      input mapping file if you want set points\s color and pch by groups. If omitted, randomForest will run in unsupervised mode.
 17                 Default:none
 18                         -g      group name in mapping file .Default:none
 19                         -ntree    Number of trees to grow. This should not be set to too     small a number, to ensure that every input row gets predicted at least a few times.Default:500    
 20                         -top    How many variables to show? 
 21                         -type     either 1,2 or 3, specifying the type of importance measure (1=mean decrease in accuracy, 2=mean decrease in node impurity).
 22             
 23        Example:$0 -i otu_table.xls -o randomForest  -m group -g group  
 24 
 25 USAGE
 26 
 27 die $usage if(!($opts{i}&&$opts{o}));
 28 die $usage if($opts{m}&& !$opts{g});
 29 die $usage if(!$opts{m}&& $opts{g});
 30 
 31 $opts{m}=defined $opts{m}?$opts{m}:"none";
 32 $opts{g}=defined $opts{g}?$opts{g}:"none";
 33 $opts{ntree}=defined $opts{ntree}?$opts{ntree}:"500";
 34 $opts{type}=defined $opts{type}?$opts{type}:"1";
 35 $opts{top}=defined $opts{top}?$opts{top}:"50";
 36 
 37 
 38 if(! -e $opts{o}){
 39                 `mkdir $opts{o}`;
 40 }
 41 
 42 
 43 open CMD,">$opts{o}/cmd.r";
 44 print CMD "
 45 library(sp,warn.conflicts = F)
 46 library(randomForest,warn.conflicts = F)
 47 library(maptools,warn.conflicts = F)
 48 basename=\"randomforest\"
 49 
 50 
 51 # if read otu data
 52 otu <-read.table(\"$opts{i}\",sep=\"\\t\",head=T,check.names = F)
 53 rownames(otu) <-as.factor(otu[,1])
 54 otu <-otu[,-1]
 55 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"_*{.+}\",\" \",x,perl = TRUE))
 56 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"-\",\"_\",x,perl = TRUE))
 57 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\[\",\"\",x,perl = TRUE))
 58 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\]\",\"\",x,perl = TRUE))
 59 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\(\",\"\",x,perl = TRUE))
 60 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\\\\)\",\"\",x,perl = TRUE))
 61 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"^[0-9]\",\"X\\\\1\",x,perl = TRUE))
 62 rownames(otu) <-sapply(rownames(otu),function(x) gsub(\"\/\",\"\",x,perl = TRUE))
 63 otu <-as.data.frame(t(otu),stringsAsFactors=T)
 64 
 65 map=\"$opts{m}\"
 66 if(map !=\"none\"){
 67                 sd <-read.table(\"$opts{m}\",head=T,sep=\"\\t\",comment.char = \"\",check.names = FALSE)        
 68                 rownames(sd) <- as.character(sd[,1])
 69                 sd[,1] <-as.character(sd[,1])
 70                 sd\$group <-as.factor(sd\$group )
 71                 legend <- as.matrix(unique(sd\$group)) 
 72 }
 73 
 74 set.seed(1)
 75 if(map != \"none\"){
 76 otu.rf <- randomForest(sd\$group ~ .,otu,importance=T,proximity=T,ntree=$opts{ntree})
 77 
 78 
 79 
 80 class_count <-as.matrix(table(sd\$group))
 81 class <-data.frame(count=class_count)
 82 
 83 ##randomforest votes probably
 84 votes_probably<- paste(\"$opts{o}/\",basename,\"_votes_probably.xls\",sep=\"\")
 85 write.table(otu.rf\$votes,votes_probably,sep=\"\\t\",quote=F)
 86 
 87 ##randomforest predicted answer
 88 predicted_answer <- paste(\"$opts{o}/\",basename,\"_predicted_answer.xls\",sep=\"\")
 89 write.table(otu.rf\$predicted,predicted_answer,sep=\"\\t\",quote=F)
 90 
 91 ##randomforest classification table
 92 rf_table <- paste(\"$opts{o}/\",basename,\"_confusion_table.xls\",sep=\"\")
 93 write.table(otu.rf\$confusion,rf_table,sep=\"\\t\",quote=F)
 94 mds <- cmdscale(1-otu.rf\$proximity)  
 95 }else{
 96 otu.rf <- randomForest(otu,importance=T,proximity=T,ntree=$opts{ntree})
 97 mds <- cmdscale(1-otu.rf\$proximity)
 98 }
 99 
100 
101 ##mds points
102 mds_points <- paste(\"$opts{o}/\",basename,\"_mds_sites.xls\",sep=\"\")
103 write.table(mds,mds_points,sep=\"\\t\",quote=F)
104 
105 ##proximity table
106 proximity <- paste(\"$opts{o}/\",basename,\"_proximity_table.xls\",sep=\"\")
107 write.table(otu.rf\$proximity,proximity,sep=\"\\t\",quote=F)
108 
109 ## importance table
110 vimp_table <- paste(\"$opts{o}/\",basename,\"_vimp_table.xls\",sep=\"\")
111 write.table(otu.rf\$importance,vimp_table,sep=\"\\t\",quote=F)
112 
113 
114 ## top importance species table
115 topx_vimp <- paste(\"$opts{o}/\",basename,\"_topx_vimp.xls\",sep=\"\")
116 imp <- importance(otu.rf)
117 if($opts{type} == 1){
118     top <- imp[order(imp[,\"MeanDecreaseAccuracy\"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
119     write.table(t(otu)[rownames(top),],topx_vimp,sep=\"\\t\",quote=F)
120     
121 }else if ($opts{type} == 2){
122     top <- imp[order(imp[,\"MeanDecreaseGini\"],decreasing=T),][1:min($opts{top},length(imp[,1])),]
123         write.table(t(otu)[rownames(top),],topx_vimp,sep=\"\\t\",quote=F)
124 }
125 
126 ";
127 
128 `R --restore --no-save < $opts{o}/cmd.r`;
View Code

 

RandomForest&ROC

标签:

原文地址:http://www.cnblogs.com/neverforget/p/5784965.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!