标签:
学习了数据挖掘这门课,但对其中的算法只是稍作了解,并没有动手实现过,所以打算试着把每个算法都实现一遍……
1、决策树之ID3
下表记录了在不同气候条件下是否去打球的情况,要求根据该表用程序输出决策树。
Day  Outlook   Temperature  Humidity  Wind    PlayTennis
1    Sunny     Hot          High      Weak    no
2    Sunny     Hot          High      Strong  no
3    Overcast  Hot          High      Weak    yes
4    Rainy     Mild         High      Weak    yes
5    Rainy     Cool         Normal    Weak    yes
6    Rainy     Cool         Normal    Strong  no
7    Overcast  Cool         Normal    Strong  yes
8    Sunny     Mild         High      Weak    no
9    Sunny     Cool         Normal    Weak    yes
10   Rainy     Mild         Normal    Weak    yes
11   Sunny     Mild         Normal    Strong  yes
12   Overcast  Mild         High      Strong  yes
13   Overcast  Hot          Normal    Weak    yes
14   Rainy     Mild         High      Strong  no
end
下面是ID3的部分程序,还没有写完,慢慢再补。
// ID3 decision tree builder.
//
// Input (stdin): whitespace-separated tokens, attribute_name_size tokens per
// row, terminated by the token "end" (or end of input). The first row is the
// header. In every row the first column is a record id, the last column is
// the class label, and the columns in between are the candidate attributes.
//
// Output (stdout): the learned tree, one node per line, indented by depth.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <list>
#include <map>
#include <string>
#include <vector>

// NOTE: deliberately no `using namespace std;` -- with it, the global `data`
// below is ambiguous with std::data when compiled as C++17 or later.

// Row-major table of strings; row 0 holds the column names.
typedef std::vector<std::vector<std::string> > Table;

// One node of the decision tree, stored in the global `node` array.
class Node
{
public:
    std::vector<int> next;  // indices (into `node`) of the child nodes
    std::string attr;       // value of the parent's split attribute leading here (empty for the root)
    std::string split;      // name of the attribute this node splits on (empty for a leaf)
    std::string ans;        // class label when this node is a leaf (empty otherwise)
};

const std::string yes = "yes";
const std::string no = "no";
const int attribute_name_size = 6;  // tokens per row: day weather temperature humidity wind play_or_not
const int max_node_count = 1000;    // capacity of the node pool below
Table data;                         // header row followed by the training rows
Node node[max_node_count];          // node[0] is the root
int cnt_of_node = 0;                // index of the most recently allocated node

// Reads rows of attribute_name_size tokens from stdin into `data` until the
// token "end" or end of input. A trailing row that is cut short by end of
// input is discarded instead of being stored half-filled.
void input()
{
    std::string str;
    while (std::cin >> str && str != "end")
    {
        std::vector<std::string> row;
        row.push_back(str);
        for (int i = 1; i < attribute_name_size && (std::cin >> str); ++i)
            row.push_back(str);
        if (row.size() != static_cast<std::size_t>(attribute_name_size))
            break;
        data.push_back(row);
    }
}

// Returns the Shannon entropy (in bits) of the values found in the column
// whose header equals `element`, taken over rows 1..n-1 of `vec`.
// Returns -1 when `vec` is empty and 0 when the column does not exist or
// there are no data rows.
double calcEntropy(const Table& vec, const std::string& element)
{
    if (vec.empty()) return -1;

    std::map<std::string, int> mp;
    for (std::size_t j = 0; j < vec[0].size(); ++j)
    {
        if (vec[0][j] != element) continue;
        for (std::size_t i = 1; i < vec.size(); ++i)
            mp[vec[i][j]]++;
        break;  // use the first matching column only, so the counts are never doubled
    }

    const double cnt = static_cast<double>(vec.size() - 1);
    double ans = 0;
    for (std::map<std::string, int>::const_iterator it = mp.begin(); it != mp.end(); ++it)
    {
        const double p = it->second / cnt;
        ans -= p * std::log2(p);
    }
    return ans;
}

// Returns the expected entropy of the class label (last column) after
// partitioning the data rows of `vec` by the value in column `idx`:
//     sum over values v of  |S_v| / |S| * H(label | v)
// Returns -1 when there are no data rows or `idx` is out of range.
double calcInfo(const Table& vec, int idx)
{
    if (vec.size() <= 1) return -1;
    const int len = static_cast<int>(vec[0].size());
    if (idx < 0 || idx >= len) return -1;

    // attribute value -> (class label -> number of rows)
    std::map<std::string, std::map<std::string, int> > mp;
    for (std::size_t j = 1; j < vec.size(); ++j)
        mp[vec[j][idx]][vec[j][len - 1]]++;

    const double size = static_cast<double>(vec.size() - 1);
    double ans = 0;
    for (std::map<std::string, std::map<std::string, int> >::const_iterator it = mp.begin();
         it != mp.end(); ++it)
    {
        double subtotal = 0;
        for (std::map<std::string, int>::const_iterator itr = it->second.begin();
             itr != it->second.end(); ++itr)
            subtotal += itr->second;

        // Every stored count is at least 1, so p > 0 and log2(p) is finite.
        double entropy = 0;
        for (std::map<std::string, int>::const_iterator itr = it->second.begin();
             itr != it->second.end(); ++itr)
        {
            const double p = itr->second / subtotal;
            entropy -= p * std::log2(p);
        }
        ans += subtotal / size * entropy;
    }
    return ans;
}

// Returns the index of the column with the highest information gain with
// respect to the class label (last column). Column 0 (record id) and the
// label column are never candidates. Ties go to the lowest column index.
// Returns -1 when there are no data rows or no candidate column is left.
int findBestAttribute(const Table& tmp)
{
    if (tmp.size() <= 1) return -1;
    const int len = static_cast<int>(tmp[0].size());
    if (len < 3) return -1;  // only the id and label columns remain

    const double info_result = calcEntropy(tmp, tmp[0][len - 1]);
    int idx = -1;
    double max_info_gain = 0;
    for (int i = 1; i < len - 1; ++i)
    {
        const double gain = info_result - calcInfo(tmp, i);
        if (idx < 0 || gain > max_info_gain)
        {
            max_info_gain = gain;
            idx = i;
        }
    }
    return idx;
}

// Returns a copy of `row` without the element at position `idx`.
static std::vector<std::string> dropColumn(const std::vector<std::string>& row, int idx)
{
    std::vector<std::string> out;
    out.reserve(row.size());
    for (std::size_t k = 0; k < row.size(); ++k)
        if (static_cast<int>(k) != idx) out.push_back(row[k]);
    return out;
}

// Returns the most frequent class label among the data rows of `vec`
// (ties go to the lexicographically smallest label).
// Precondition: `vec` has a non-empty header row.
static std::string majorityLabel(const Table& vec)
{
    const std::size_t label = vec[0].size() - 1;
    std::map<std::string, int> counts;
    for (std::size_t j = 1; j < vec.size(); ++j)
        counts[vec[j][label]]++;

    std::string best;
    int best_count = 0;
    for (std::map<std::string, int>::const_iterator it = counts.begin(); it != counts.end(); ++it)
    {
        if (it->second > best_count)
        {
            best_count = it->second;
            best = it->first;
        }
    }
    return best;
}

// Recursively grows the subtree rooted at node[now_node_num] from `source`
// (header row + data rows). One child is created per distinct value of the
// best attribute, in order of first appearance. A child whose rows all share
// one label becomes a leaf; otherwise the recursion continues on the rows
// that carry that value, with the used attribute column removed.
void work(const Table& source, int now_node_num)
{
    if (source.size() <= 1 || source[0].empty()) return;
    if (now_node_num < 0 || now_node_num >= max_node_count) return;

    const std::size_t label = source[0].size() - 1;
    const int idx = findBestAttribute(source);
    if (idx < 0)
    {
        // No attribute left to split on: settle for the most common label.
        node[now_node_num].ans = majorityLabel(source);
        return;
    }
    node[now_node_num].split = source[0][idx];

    std::vector<char> vis(source.size(), 0);
    for (std::size_t i = 1; i < source.size(); ++i)
    {
        if (vis[i]) continue;

        // Gather every not-yet-visited row that shares row i's attribute value.
        Table subset;
        subset.push_back(dropColumn(source[0], idx));
        std::map<std::string, int> mp;  // class label -> count within this branch
        for (std::size_t j = i; j < source.size(); ++j)
        {
            if (source[j][idx] != source[i][idx]) continue;
            vis[j] = 1;
            mp[source[j][label]]++;
            subset.push_back(dropColumn(source[j], idx));
        }

        if (cnt_of_node + 1 >= max_node_count)
        {
            std::cerr << "decision tree exceeds " << max_node_count << " nodes\n";
            return;
        }
        const int child = ++cnt_of_node;
        node[now_node_num].next.push_back(child);
        node[child].attr = source[i][idx];

        if (mp.size() == 1)
            node[child].ans = source[i][label];
        else
            work(subset, child);
    }
}

// Prints every row of `data`, each field followed by a tab.
void outputSourceData()
{
    for (std::size_t i = 0; i < data.size(); ++i)
    {
        for (std::size_t j = 0; j < data[i].size(); ++j)
            std::cout << data[i][j] << '\t';
        std::cout << '\n';
    }
}

// Prints the subtree rooted at node[idx], two spaces of indent per level.
// Each line reads "<branch value> -> <split attribute or leaf label>";
// the root has no branch value and prints only its split attribute or label.
void outputTree(int idx, int depth)
{
    const Node& cur = node[idx];
    std::cout << std::string(2 * depth, ' ');
    if (!cur.attr.empty()) std::cout << cur.attr << " -> ";
    std::cout << (cur.ans.empty() ? cur.split : cur.ans) << '\n';
    for (std::size_t k = 0; k < cur.next.size(); ++k)
        outputTree(cur.next[k], depth + 1);
}

int main()
{
    input();
    if (data.size() <= 1) return 0;  // no data rows: nothing to learn
    work(data, 0);
    outputTree(0, 0);
    return 0;
}
标签:
原文地址:http://www.cnblogs.com/JustForCS/p/4885233.html