码迷,mamicode.com
首页 > 其他好文 > 详细

Concept Learning

时间:2016-06-08 22:56:15      阅读:214      评论:0      收藏:0      [点我收藏+]

标签:

Candidate Elimination

 


Thanks to akashrajkn@gmail.com. This demo, based on his good work, is a nice way to understand the candidate elimination algorithm.

rika@rika-UX303UB$ ./a.out 
<Input> Number of Features:6
<Input> Number of Attributes[0]:2
<Input> 	(1):rainy
<Input> 	(2):sunny
<Input> Number of Attributes[1]:2
<Input> 	(1):cold
<Input> 	(2):warm
<Input> Number of Attributes[2]:2
<Input> 	(1):normal
<Input> 	(2):high
<Input> Number of Attributes[3]:2
<Input> 	(1):weak
<Input> 	(2):strong
<Input> Number of Attributes[4]:2
<Input> 	(1):cool
<Input> 	(2):warm 
<Input> Number of Attributes[5]:2
<Input> 	(1):same
<Input> 	(2):change

Input training data:

sunny,warm,normal,Strong,warm,same,yes

	Specific border
	< sunny warm normal Strong warm same >

	Generic border
	< ? ? ? ? ? ? >

sunny,warm,high,weak,warm,same,yes

	Specific border
	< sunny warm ? weak warm same >

	Generic border
	< ? ? ? ? ? ? >

rainy,cold,high,strong,warm,change,no

	Specific border
	< sunny warm ? weak warm same >

	Generic border
	< sunny ? ? ? ? ? >
	< ? warm ? ? ? ? >
	< ? ? ? weak ? ? >
	< ? ? ? ? ? same >

sunny,warm,high,strong,cool,change,yes

	Specific border
	< sunny warm ? ? ? ? >

	Generic border
	< sunny ? ? ? ? ? >
	< ? warm ? ? ? ? >

This is the complete code:

技术分享
  1 /*
  2   Candidate Elimination
  3 */
  4 
  5 #include <cstdio>
  6 #include <cstdlib>
  7 #include <cctype>
  8 #include <vector>
  9 #include <string>
 10 #include <stack>
 11 #include <queue>
 12 #include <iterator>
 13 #include <set>
 14 #include <map>
 15 #include <iostream>
 16 #include <sstream>
 17 #include <deque>
 18 #include <cmath>
 19 #include <memory.h>
 20 #include <algorithm>
 21 #include <utility>
 22 #include <climits>
 23 
 24 typedef long double ld;
 25 typedef long long ll;
 26 #define all(c) c.begin(),c.end()
 27 
 28 using namespace std;
 29 
 30 int main()
 31 {
 32     int numberFeatures; //number of features in the dataset
 33     string data;
 34     bool consistent;
 35     cout << "<Input> Number of Features:";
 36     cin >> numberFeatures; //input the number of features
 37 
 38     vector <int> numberAttributes(numberFeatures); //number of attributes for each feature.
 39     map <string, int> instance[numberFeatures+1];
 40 
 41     for(int i=0; i<numberFeatures; i++)
 42     {
 43         cout <<"<Input> Number of Attributes[" << i << "]:";
 44         cin >> numberAttributes[i];
 45 
 46         for(int j=1; j<=numberAttributes[i]; j++)
 47         {
 48             string temp;
 49             cout <<"<Input>     (" << j << "):";
 50             cin>> temp;
 51             instance[i][temp] = j; //map attribute name with number
 52         }
 53     }
 54 
 55     instance[numberFeatures]["no"] = 0;
 56     instance[numberFeatures]["yes"] = 1;
 57     /*
 58        for any feature if attribute=0 -> null value;
 59        attribute = INT_MAX -> all
 60      */
 61 
 62     vector <int> currdata(numberFeatures+1), tmpData(numberFeatures);
 63 
 64     //set used to represent the generic and specific boundaries.
 65     set < vector<int> > specific, generic;
 66 
 67     //initialization
 68     for(int i=0; i<numberFeatures; i++)
 69     {
 70         tmpData[i]=0;
 71     }
 72     specific.insert(tmpData);
 73 
 74     for(int i=0; i<numberFeatures; i++)
 75     {
 76         tmpData[i]=INT_MAX;
 77     }
 78     generic.insert(tmpData);
 79 
 80 
 81 
 82 
 83     cout << endl << "Input training data:" << endl << endl;
 84     //input the training data
 85     while( getline(cin, data) )
 86     {
 87         if(data.size()==0)
 88         {
 89             continue;
 90         }
 91 
 92         string temp;
 93         int st = 0, count = 0;
 94         int len = data.size();
 95 
 96         for( int i=0; i<len; i++ )
 97         {
 98             if(data[i]==,|| i==len-1)
 99             {
100                 if(i==len-1)
101                     temp = data.substr(st, i-st+1);
102                 else
103                     temp = data.substr(st, i-st);
104 
105                 currdata[count] = instance[count][temp];
106                 count++;
107                 st = i+1;
108             }
109         }
110 
111         vector<int> m, n, p;
112 
113         if( currdata[count-1]==1 ) //if positive example
114         {
115             //remove inconsistent hypotheses from generic border
116             set < vector<int> > tempg, temps;
117             tempg = generic;
118             for(set<vector<int> >::iterator it= generic.begin(); it!=generic.end(); it++)
119             {
120                 m = *it;
121                 int er =0;
122                 for(int i=0; i<numberFeatures; i++)
123                 {
124                     if(m[i]!=currdata[i] && m[i]!=INT_MAX )
125                     {
126                         er = 1;
127                         break;
128                     }
129                 }
130                 if(er==1)
131                     tempg.erase(m);
132             }
133             generic = tempg;
134 
135             m = *(specific.begin());
136             n = *(specific.begin());
137             specific.erase(n);
138 
139             for(int i=0; i<numberFeatures; i++)
140             {
141                 if(m[i]==0)
142                 {
143                     m[i] = currdata[i];
144                 }
145                 else if(m[i]!=currdata[i])
146                 {
147                     m[i]=INT_MAX;
148                 }
149             }
150 
151             specific.insert(m);
152         }
153         else //if negative example
154         {
155             /*
156                if example is inconsistent with spec border, then it is noise
157              */
158 
159             set < vector<int> > tempg, temps;
160             n = *(specific.begin());
161 
162             for(set< vector<int> >::iterator it= generic.begin(); it!=generic.end(); it++)
163             {
164                 m = *it;
165 
166                 int er=0;
167                 //check if given example is consistent with the present hypothesis
168                 for(int i=0; i<numberFeatures; i++)
169                 {
170                     if( m[i]!=INT_MAX && m[i]!=currdata[i])
171                     {
172                         er=1; //curr hyp is consistent
173                         break;
174                     }
175                 }
176 
177                 if(er==1)//if hyp is consistent with the example
178                 {
179                     tempg.insert(m);
180                 }
181                 else//hyp is not consistent with the example
182                 {
183                     vector<int> temphyp;
184                     for(int i=0; i<numberFeatures; i++)
185                     {
186                         if( m[i]==INT_MAX )
187                         {
188                             temphyp = m;
189                             for(int j=1; j<=numberAttributes[i]; j++)//values are 1-based
190                             {
191                                 if(j==currdata[i])
192                                     continue;
193                                 else
194                                 {
195                                     temphyp[i] = j;
196 
197                                     //check if temphyp is more general than specifc hyp.
198                                     consistent = true;
199                                     for(int k=0; k<numberFeatures; k++)
200                                     {
201                                         if(temphyp[k]!=INT_MAX && temphyp[k]!=n[k] && n[k]!=0)
202                                         {
203                                             consistent = false;
204                                             break;
205                                         }
206                                     }
207                                     if(consistent)
208                                         tempg.insert(temphyp); // new hypo is consistent
209                                 }
210                             }
211                         }
212                     }
213 
214                 }
215 
216             }
217 
218             generic.clear();
219             bool mGen;
220             set<vector<int> > tempgg;
221 
222             //remove from generic any hyp that is more specific than another hyp in generic
223             for(set< vector<int> >::iterator it= tempg.begin(); it!=tempg.end(); it++)
224             {
225                 m = *it;
226 
227                 for( set< vector<int> >::iterator jt= tempg.begin(); jt!=tempg.end(); jt++ )
228                 {
229                     if(it==jt)
230                         continue;
231 
232                     p = *jt;
233                     consistent = true;
234                     for(int k=0; k<numberFeatures; k++)
235                     {
236                         if(m[k]!=INT_MAX && m[k]!=p[k])
237                         {
238                             consistent = false;
239                             break;
240                         }
241                     }
242                     if(consistent)
243                         tempgg.insert(p);
244                 }
245             }
246 
247             // generic = set_difference(tempg, tempgg );
248             for( set< vector<int> >::iterator it= tempg.begin(); it!=tempg.end(); it++ )
249             {
250                 m = *it;
251 
252                 if(tempgg.find(m)==tempgg.end())
253                 {
254                     generic.insert(m);
255                 }
256             }
257         }
258 
259 
260         /*
261            Printing Specific and General borders
262          */
263         vector<int> abc;
264 
265         cout<<"\n\tSpecific border"<<endl;
266         abc = *(specific.begin());
267         cout<<"\t< ";
268         for(int i=0; i<numberFeatures; i++)
269         {
270             if( abc[i]==INT_MAX )
271                 cout<<"?"<<" ";
272             else
273             {
274                 for(map<string,int>::iterator jt = instance[i].begin(); jt!=instance[i].end();jt++)
275                 {
276                     if((*jt).second == abc[i])
277                         cout<<(*jt).first<<" ";
278                 }
279             }
280         }
281         cout<<">\n";
282 
283         cout<<"\n\tGeneric border"<<endl;
284         for(set< vector<int> >::iterator it= generic.begin(); it!=generic.end(); it++)
285         {
286             abc = *it;
287             cout<<"\t< ";
288             for(int i=0; i< numberFeatures; i++)
289             {
290                 if( abc[i]==INT_MAX )
291                     cout<<"?"<<" ";
292                 else
293                 {
294                     for(map<string,int>::iterator jt = instance[i].begin(); jt!=instance[i].end();jt++)
295                     {
296                         if((*jt).second == abc[i])
297                             cout<<(*jt).first<<" ";
298                     }
299                 }
300             }
301             cout<<">\n";
302         }
303         cout<<endl;
304 
305     }
306 
307     cout<<"\ncandidate elem done! :)\n";
308 
309     return 0;
310 }
View Code

 

 

The following explanation, from 血糯米Otomii, will help you understand more.


样本集:

技术分享

 

把S集合初始化为H中极大特殊假设:技术分享

把G集合初始化为H中极大一般假设:技术分享

 

首先加载第一条和第二条样本:

技术分享

这个过程是特殊向一般的转变,这个过程非常地类似FIND-S算法

接着我们处理第三条样本:

让我们回到数据技术分享

我们会发现,Sky,AirTemp和Forecast和以前的数据不一致,我们可以怀疑是这三个数据导致最后结果的变化。

所以,我们就针对这3个数据进行一次特殊化:

技术分享

接着,我们输入第四条样本:

 

技术分享

 

 

在处理第四条样本的时候,我们先对于S集合进行一般化:技术分享

 

 

然后,为了让G集合覆盖S集合,我们需要剔除技术分享,过程为技术分享

 

在处理完了这四个样本后,我们就可以获取所有的假设:

技术分享

 

当前为6个假设,当我们可以获取到更多的训练集的时候,我们可以划出更小的设计空间。

当我使用这6个假设对测试集进行测试的时候,我们可以使用这6个假设同时对测试样本进行检测,每个假设都有自己的权重,如果最后的结果超过80%,那么就测试通过。

 

本算法弊端

  1. 对噪点兼容性非常差
  2. 当我们Sky属性有10+个的时候,往往这个属性必然被一般化,所以我们需要对数据进行预处理

Concept Learning

标签:

原文地址:http://www.cnblogs.com/jesse123/p/5571374.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!