现在已经对一些文档求出了倒排索引,对于一些词得出了这些词在哪些文档中出现的列表。
要求对于倒排索引实现一些简单的查询,即查询某些词同时出现,或者有些词出现有些词不出现的文档有哪些。
输入样例:
3
3 1 2 3
1 2
1 3
3
1 1 1
1 -1 0
1 -1 -1
输出样例:
NOT FOUND
1 3
1
#include <algorithm>
#include <climits>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <set>

using namespace std;

// Boolean queries over an inverted index.
//
// Input file "data.txt":
//   N
//   N posting lists, each "m id_1 ... id_m" (documents containing word i)
//   M
//   M query rows of N flags: 1 = word must appear, -1 = word must not appear,
//   anything else = word is ignored.
// For every query the matching document ids are printed on one line, or
// "NOT FOUND" when there is none.

// Cursor into one word's posting list.
class Node
{
public:
    int word_id;  // row of the posting table (which word)
    int bit;      // 1-based position in that row; slot 0 of a row holds its length
};

// qsort comparator for ints in ascending order. Uses comparisons rather than
// subtraction so that operands far apart cannot overflow.
int compare(const void *a, const void *b)
{
    const int lhs = *static_cast<const int *>(a);
    const int rhs = *static_cast<const int *>(b);
    return (lhs > rhs) - (lhs < rhs);
}

void print_list(const list<int> &mylist);
list<Node> initialize_list(int **search, int row, int N, bool need);
list<int> sub_data(const list<int> &wanted_list, const list<int> &not_wanted_list);
list<int> find_same_need(list<Node> mylist, int **data, const int length);
list<int> find_same_not_need(const list<Node> &mylist, int **data, const int length);
void read_data(int ** &data, int ** &search, int &M, int &N);
void free_table(int **table, int rows);

// Loads the index and the queries, answers each query, releases the tables.
// Returns 1 when the input could not be loaded, 0 otherwise.
int main()
{
    int **data = NULL;
    int **search = NULL;
    int M = 0;
    int N = 0;

    read_data(data, search, M, N);
    if (data == NULL || search == NULL)
        return 1;  // read_data has already reported the problem

    for (int i = 0; i < M; i++)
    {
        const list<Node> wanted_node_list = initialize_list(search, i, N, true);
        const list<Node> not_wanted_node_list = initialize_list(search, i, N, false);

        const list<int> wanted_list = find_same_need(wanted_node_list, data, N);
        const list<int> not_wanted_list = find_same_not_need(not_wanted_node_list, data, N);

        print_list(sub_data(wanted_list, not_wanted_list));
    }

    free_table(data, N);
    free_table(search, M);
    return 0;
}

// Intersects the posting lists referenced by `mylist`.
//
// mylist : one cursor per required word (taken by value; the cursors are
//          advanced locally).
// data   : posting table; data[w][0] is the length of word w's list and
//          data[w][1..] are its document ids in ascending order.
// length : unused, kept for interface compatibility.
//
// Returns the ascending, duplicate-free list of document ids present in every
// referenced posting list. The result is empty when `mylist` is empty or when
// any referenced posting list is empty.
list<int> find_same_need(list<Node> mylist, int **data, const int length)
{
    (void)length;

    list<int> result;
    if (mylist.empty())
        return result;  // no required word: nothing to intersect

    while (true)
    {
        // 1. Find the smallest and largest document id under the cursors.
        //    An exhausted cursor means no further common document can exist.
        int lowest = INT_MAX;
        int highest = INT_MIN;
        for (list<Node>::iterator node = mylist.begin(); node != mylist.end(); ++node)
        {
            if (node->bit > data[node->word_id][0])
                return result;
            const int doc_id = data[node->word_id][node->bit];
            lowest = min(lowest, doc_id);
            highest = max(highest, doc_id);
        }

        if (lowest == highest)
        {
            // 2. All cursors agree: record the document id (not the cursor
            //    position) and move every cursor forward. The back() check
            //    drops repeats caused by duplicate ids in the input lists.
            if (result.empty() || result.back() != lowest)
                result.push_back(lowest);
            for (list<Node>::iterator node = mylist.begin(); node != mylist.end(); ++node)
                ++node->bit;
        }
        else
        {
            // 3. Cursors disagree: every id below the largest one is missing
            //    from at least one list, so skip past it.
            for (list<Node>::iterator node = mylist.begin(); node != mylist.end(); ++node)
            {
                if (data[node->word_id][node->bit] < highest)
                    ++node->bit;
            }
        }
    }
}

// Builds the union of the posting lists referenced by `mylist`.
//
// mylist : one cursor per excluded word; each list is read from the cursor's
//          position to its end.
// data   : posting table, same layout as in find_same_need.
// length : unused, kept for interface compatibility.
//
// Returns the ascending, duplicate-free list of document ids that contain at
// least one of the excluded words.
list<int> find_same_not_need(const list<Node> &mylist, int **data, const int length)
{
    (void)length;

    set<int> excluded;
    for (list<Node>::const_iterator node = mylist.begin(); node != mylist.end(); ++node)
    {
        const int *postings = data[node->word_id];
        for (int pos = node->bit; pos <= postings[0]; ++pos)
            excluded.insert(postings[pos]);
    }
    return list<int>(excluded.begin(), excluded.end());
}

// Returns the elements of `wanted_list` that do not occur in
// `not_wanted_list`. Both inputs must be sorted in ascending order; the
// result keeps that order.
list<int> sub_data(const list<int> &wanted_list, const list<int> &not_wanted_list)
{
    list<int> result;
    set_difference(wanted_list.begin(), wanted_list.end(),
                   not_wanted_list.begin(), not_wanted_list.end(),
                   back_inserter(result));
    return result;
}

// Creates one cursor, positioned at the first posting, for every word whose
// flag in query row `row` matches the requested kind.
//
// search : query table, search[row][w] is the flag for word w.
// row    : query to inspect.
// N      : number of words (columns of `search`).
// need   : true selects flag 1 (required words), false selects flag -1
//          (excluded words).
list<Node> initialize_list(int **search, int row, int N, bool need)
{
    const int selected_flag = need ? 1 : -1;

    list<Node> result;
    for (int i = 0; i < N; i++)
    {
        if (search[row][i] == selected_flag)
        {
            Node node;
            node.word_id = i;
            node.bit = 1;
            result.push_back(node);
        }
    }
    return result;
}

// Prints the ids separated (and followed) by a single space, or "NOT FOUND"
// for an empty list, then ends the line.
void print_list(const list<int> &mylist)
{
    if (mylist.empty())
    {
        cout << "NOT FOUND";
    }
    else
    {
        for (list<int>::const_iterator it = mylist.begin(); it != mylist.end(); ++it)
            cout << *it << " ";
    }
    cout << '\n';
}

// Frees a table of `rows` heap-allocated rows plus the row array itself.
// Accepts NULL tables and NULL rows.
void free_table(int **table, int rows)
{
    if (table == NULL)
        return;
    for (int i = 0; i < rows; i++)
        delete[] table[i];
    delete[] table;
}

// Reads `rows` posting lists ("m id_1 ... id_m") into `table` and sorts each
// one ascending. Returns false on malformed input; rows allocated so far stay
// owned by `table`.
static bool read_postings(istream &in, int **table, int rows)
{
    for (int i = 0; i < rows; i++)
    {
        int m = 0;
        if (!(in >> m) || m < 0)
            return false;

        table[i] = new int[m + 1];
        table[i][0] = m;  // slot 0 stores the list length
        for (int j = 1; j <= m; j++)
        {
            if (!(in >> table[i][j]))
                return false;
        }
        qsort(table[i] + 1, m, sizeof(int), compare);
    }
    return true;
}

// Reads `rows` query rows of `cols` flags each into `table`. Returns false on
// malformed input; rows allocated so far stay owned by `table`.
static bool read_queries(istream &in, int **table, int rows, int cols)
{
    for (int i = 0; i < rows; i++)
    {
        table[i] = new int[cols];
        for (int j = 0; j < cols; j++)
        {
            if (!(in >> table[i][j]))
                return false;
        }
    }
    return true;
}

// Loads the posting table and the query table from "data.txt".
//
// On success `data` holds N posting lists, `search` holds M query rows of N
// flags, and the caller owns both tables (release them with free_table).
// On failure a message is written to cerr, `data` and `search` are NULL and
// M and N are 0.
void read_data(int ** &data, int ** &search, int &M, int &N)
{
    data = NULL;
    search = NULL;
    M = 0;
    N = 0;

    ifstream reader("data.txt");
    if (!reader)
    {
        cerr << "cannot open data.txt" << endl;
        return;
    }

    int word_count = 0;
    if (!(reader >> word_count) || word_count < 0)
    {
        cerr << "data.txt: invalid word count" << endl;
        return;
    }

    // Value-initialised so that a partially filled table can be freed safely.
    int **postings = new int *[word_count]();
    if (!read_postings(reader, postings, word_count))
    {
        cerr << "data.txt: invalid posting list" << endl;
        free_table(postings, word_count);
        return;
    }

    int query_count = 0;
    if (!(reader >> query_count) || query_count < 0)
    {
        cerr << "data.txt: invalid query count" << endl;
        free_table(postings, word_count);
        return;
    }

    int **queries = new int *[query_count]();
    if (!read_queries(reader, queries, query_count, word_count))
    {
        cerr << "data.txt: invalid query row" << endl;
        free_table(postings, word_count);
        free_table(queries, query_count);
        return;
    }

    data = postings;
    search = queries;
    N = word_count;
    M = query_count;
}
3 3 1 2 3 1 2 1 3 3 1 1 1 1 -1 0 1 -1 -1
原文地址:http://blog.csdn.net/cqs_experiment/article/details/40374987