码迷,mamicode.com
首页 > 其他好文 > 详细

大规模字符串检索-压缩trie树

时间:2015-03-05 14:45:20      阅读:495      评论:0      收藏:0      [点我收藏+]

标签:

本文使用压缩trie树实现字符串检索的功能。首先将字符串通过编码转化为二进制串,随后将二进制串插入到trie树中,在插入过程中同时实现压缩的功能。

字符编码采用Huffman,但最终测试发现不采用Huffman的方法不仅省下了编码时间,同时trie树的插入时间也有所减少。

  1 /**
  2     程序主函数与编码
  3 */
  4 #include <stdio.h>
  5 #include <stdlib.h>
  6 #include <string.h>
  7 #include "huffman.h"
  8 #include "compress_trie.h"
  9 //#include <time.h>
 10 
 11 #define NUM_OF_HUFFMAN 81
 12 #define LENGTH_OF_LINE 10000
 13 #define RESULT_OF_HUFFMAN "result_of_HUFFMAN.dat"
 14 //#define EMAIL "strpool.dat"
 15 //#define CHECKED_EMAIL "checkedemail.dat"
 16 #define RESULT "result.dat"
 17 
 18 void str_to_bin(char buf[],char binary[],huffman_node hufm[]);
 19 
 20 
 21 int main(int argc, char *argv[])
 22 {
 23     //time_t time_start,time_end;
 24     //time_start = time(NULL);
 25 
 26     char* EMAIL = argv[1];
 27     char* CHECKED_EMAIL = argv[2];
 28 
 29     huffman_node hufm[NUM_OF_HUFFMAN];
 30     hufm_init(hufm,NUM_OF_HUFFMAN);
 31     char buf[LENGTH_OF_LINE];
 32     char binary[LENGTH_OF_LINE];
 33 
 34     FILE* fin_of_huffman;
 35     fin_of_huffman = fopen(RESULT_OF_HUFFMAN,"r");
 36     if(fin_of_huffman == NULL)
 37     {
 38         hufm_init(hufm,NUM_OF_HUFFMAN);
 39         int i;
 40         for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++)
 41         {
 42             hufm[i].num_of_ch = NUM_OF_HUFFMAN - i;
 43         }
 44         huffman_coding(hufm,NUM_OF_HUFFMAN);
 45     }
 46     else
 47     {
 48         char temp_char;
 49         int i;
 50         for(i=0;i<(NUM_OF_HUFFMAN+1)/2;i++)
 51         {
 52             fgets(buf,sizeof(buf),fin_of_huffman);
 53             sscanf(buf,"%c %d %s",&temp_char,&hufm[i].num_of_ch,hufm[i].code);
 54         }
 55     }
 56     fclose(fin_of_huffman);
 57 
 58     printf("building trie...");
 59     FILE* fin_of_email;
 60     fin_of_email = fopen(EMAIL,"r");
 61     trie_node *root;
 62     root = (trie_node*)malloc(sizeof(trie_node));
 63     trie_node_init(&root);
 64 
 65     while(fgets(buf,sizeof(buf),fin_of_email)!=NULL)
 66     {
 67         str_to_bin(buf,binary,hufm);
 68         trie_insert(&root,binary);
 69     }
 70     fclose(fin_of_email);
 71     printf("\r");
 72     printf("build trie success.\n");
 73 
 74     FILE *fin_of_checked,*fout_of_result;
 75     fin_of_checked = fopen(CHECKED_EMAIL,"r");
 76     fout_of_result = fopen(RESULT,"w");
 77     int num_yes = 0;
 78     int num_no = 0;
 79     while(fgets(buf,sizeof(buf),fin_of_checked)!=NULL)
 80     {
 81         str_to_bin(buf,binary,hufm);
 82         if(trie_search(root,binary))
 83         {
 84             fprintf(fout_of_result,"YES\n");
 85             num_yes++;
 86         }
 87         else
 88         {
 89             fprintf(fout_of_result,"NO\n");
 90             num_no++;
 91         }
 92     }
 93     fprintf(fout_of_result,"num of YES is:%d\n",num_yes);
 94     fprintf(fout_of_result,"num of NO is:%d\n",num_no);
 95     printf("search success!\n");
 96     fclose(fin_of_checked);
 97     fclose(fout_of_result);
 98     //time_end = time(NULL);
 99     //printf("用时:%.0lfs\n", difftime(time_end, time_start));
100     return 0;
101 }
102 
103 
104 void str_to_bin(char buf[],char binary[],huffman_node hufm[])
105 {
106     int i;
107     binary[0] = \0;
108     for(i=strlen(buf)-1;i>=0;i--)
109     {
110         if(buf[i]>=a && buf[i]<=z)
111         {
112             strcat(binary,hufm[buf[i]-a].code);
113         }
114         else if(buf[i]>=A && buf[i]<=Z)
115         {
116             strcat(binary,hufm[buf[i]-A].code);
117         }
118         else if(buf[i]>=0 && buf[i]<=9)
119         {
120             strcat(binary,hufm[26+buf[i]-0].code);
121         }
122         else if(buf[i]==_)
123         {
124             strcat(binary,hufm[36].code);
125         }
126         else if(buf[i]==-)
127         {
128             strcat(binary,hufm[37].code);
129         }
130         else if(buf[i]==.)
131         {
132             strcat(binary,hufm[38].code);
133         }
134         else if(buf[i]==@)
135         {
136             strcat(binary,hufm[39].code);
137         }
138         else
139         {
140             strcat(binary,hufm[40].code);
141         }
142     }
143 }
  1 /**
  2     完成trie树的插入,查找。
  3 */
  4 
  5 typedef struct TRIE_NODE
  6 {
  7     char is_str;
  8     unsigned short num_of_bit;
  9     unsigned char* compress_of_bit;
 10     struct TRIE_NODE *point_of_zero,*point_of_one;
 11 }trie_node;
 12 
 13 //long int temp_of_new = 0;
 14 
 15 
 16 void trie_node_init(trie_node **root);
 17 int trie_insert(trie_node **root,char* bit_of_insert);
 18 int trie_search(trie_node *root,char* bit_of_insert);
 19 void trie_delete(trie_node *root);
 20 void compress(trie_node *root,char* bit_of_insert);
 21 int compare_of_bit(trie_node *root,char* bit_of_insert);
 22 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop);
 23 
 24 
 25 
 26 
 27 void trie_node_init(trie_node **root)
 28 {
 29     (*root)->is_str = (char)0;
 30     (*root)->num_of_bit = 0;
 31     (*root)->compress_of_bit = NULL;
 32     (*root)->point_of_zero = NULL;
 33     (*root)->point_of_one = NULL;
 34 }
 35 
 36 void compress(trie_node *root,char* bit_of_insert)
 37 {
 38     int i,j,len_of_insert;
 39     len_of_insert = strlen(bit_of_insert);
 40     root->num_of_bit = len_of_insert;
 41     if(root->num_of_bit<=32)
 42     {
 43         int temp;
 44         for(i=len_of_insert-1,j=0;i>=0;i--,j++)
 45         {
 46             if(bit_of_insert[i] == 0)
 47             {
 48                 clearbit(temp,j);
 49             }
 50             else
 51             {
 52                 setbit(temp,j);
 53             }
 54         }
 55         root->compress_of_bit = (unsigned char*)temp;
 56     }
 57     else
 58     {
 59         root->compress_of_bit = (unsigned char*)malloc((len_of_insert%8)?(len_of_insert/8+1):(len_of_insert/8));
 60         for(i=len_of_insert-1,j=0;i>=0;i--,j++)
 61         {
 62             if(bit_of_insert[i] == 0)
 63             {
 64                 clearbit(root->compress_of_bit[j/8],j%8);
 65             }
 66             else
 67             {
 68                 setbit(root->compress_of_bit[j/8],j%8);
 69             }
 70         }
 71     }
 72 }
 73 
 74 
 75 int trie_insert(trie_node **root,char* bit_of_insert)
 76 {
 77     int ret;
 78     char bit_of_pop[10000];
 79     if(root == NULL)
 80     {
 81         ret = 0;
 82     }
 83     else
 84     {
 85         if((*root)->num_of_bit == 0)
 86         {
 87             if(!(*bit_of_insert))
 88             {
 89                 (*root)->is_str = (char)1;
 90                 ret = 1;
 91             }
 92             else
 93             {
 94                 if((*root)->is_str == 0
 95                    && (*root)->point_of_zero == NULL
 96                    && (*root)->point_of_one == NULL)
 97                 {
 98                     compress((*root),bit_of_insert);
 99                     (*root)->is_str = (char)1;
100                     ret = 1;
101                 }
102                 else
103                 {
104                     if(*bit_of_insert == 0)
105                     {
106                         if((*root)->point_of_zero == NULL)
107                         {
108                             (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
109                             trie_node_init(&(*root)->point_of_zero);
110                             //temp_of_new++;
111                         }
112                         ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1);
113                     }
114                     else
115                     {
116                         if((*root)->point_of_one == NULL)
117                         {
118                             (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node));
119                             trie_node_init(&(*root)->point_of_one);
120                             //temp_of_new++;
121                         }
122                         ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1);
123                     }
124                 }
125             }
126         }
127         else
128         {
129             int ans_of_compare = compare_of_bit((*root),bit_of_insert);
130             if(ans_of_compare == 0)
131             {
132                 trie_node *father = (trie_node*)malloc(sizeof(trie_node));
133                 trie_node_init(&father);
134                 //temp_of_new++;
135                 pop_bit((*root),bit_of_pop,1);
136                 if(bit_of_pop[0] == 0)
137                 {
138                     father->point_of_zero = (*root);
139                 }
140                 else
141                 {
142                     father->point_of_one = (*root);
143                 }
144                 if(!(*bit_of_insert))
145                 {
146                     father->is_str = (char)1;
147                     ret = 1;
148                 }
149                 else
150                 {
151                     if(*bit_of_insert == 0)
152                     {
153                         father->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
154                         trie_node_init(&father->point_of_zero);
155                         //temp_of_new++;
156                         ret = trie_insert(&father->point_of_zero,bit_of_insert+1);
157                     }
158                     else
159                     {
160                         father->point_of_one = (trie_node*)malloc(sizeof(trie_node));
161                         trie_node_init(&father->point_of_one);
162                         //temp_of_new++;
163                         ret = trie_insert(&father->point_of_one,bit_of_insert+1);
164                     }
165                 }
166                 (*root) = father;
167             }
168             else
169             {
170                 if(ans_of_compare == (int)(*root)->num_of_bit
171                    && ans_of_compare == strlen(bit_of_insert))
172                 {
173                     (*root)->is_str = (char)1;
174                     ret = 1;
175                 }
176                 else if(ans_of_compare == (int)(*root)->num_of_bit)
177                 {
178                     bit_of_insert += ans_of_compare;
179                     if(*bit_of_insert == 0)
180                     {
181                         if((*root)->point_of_zero == NULL)
182                         {
183                             (*root)->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
184                             trie_node_init(&(*root)->point_of_zero);
185                             //temp_of_new++;
186                         }
187                         ret = trie_insert(&(*root)->point_of_zero,bit_of_insert+1);
188                     }
189                     else
190                     {
191                         if((*root)->point_of_one == NULL)
192                         {
193                             (*root)->point_of_one = (trie_node*)malloc(sizeof(trie_node));
194                             trie_node_init(&(*root)->point_of_one);
195                             //temp_of_new++;
196                         }
197                         ret = trie_insert(&(*root)->point_of_one,bit_of_insert+1);
198                     }
199                 }
200                 else if(ans_of_compare == strlen(bit_of_insert))
201                 {
202                     trie_node *father = (trie_node*)malloc(sizeof(trie_node));
203                     trie_node_init(&father);
204                     //temp_of_new++;
205                     pop_bit((*root),bit_of_pop,ans_of_compare);
206                     compress(father,bit_of_pop);
207                     father->is_str = (char)1;
208                     pop_bit((*root),bit_of_pop,1);
209                     if(bit_of_pop[0] == 0)
210                     {
211                         father->point_of_zero = (*root);
212                     }
213                     else
214                     {
215                         father->point_of_one = (*root);
216                     }
217                     (*root) = father;
218                 }
219                 else
220                 {
221                     trie_node *father = (trie_node*)malloc(sizeof(trie_node));
222                     trie_node_init(&father);
223                     //temp_of_new++;
224                     pop_bit((*root),bit_of_pop,ans_of_compare);
225                     compress(father,bit_of_pop);
226                     pop_bit((*root),bit_of_pop,1);
227                     bit_of_insert += ans_of_compare+1;
228 
229                     if(bit_of_pop[0] == 0)
230                     {
231                         father->point_of_zero = (*root);
232                         father->point_of_one = (trie_node*)malloc(sizeof(trie_node));
233                         trie_node_init(&father->point_of_one);
234                         //temp_of_new++;
235                         ret = trie_insert(&father->point_of_one,bit_of_insert);
236                     }
237                     else
238                     {
239                         father->point_of_one = (*root);
240                         father->point_of_zero = (trie_node*)malloc(sizeof(trie_node));
241                         trie_node_init(&father->point_of_zero);
242                         //temp_of_new++;
243                         ret = trie_insert(&father->point_of_zero,bit_of_insert);
244                     }
245                     (*root) = father;
246                 }
247             }
248         }
249     }
250     return ret;
251 }
252 
253 
254 int trie_search(trie_node *root,char *bit_of_search)
255 {
256     trie_node *p = root;
257     while(p!=NULL && *bit_of_search)
258     {
259         if(p->num_of_bit!=0)
260         {
261             if((int)p->num_of_bit == compare_of_bit(p,bit_of_search))
262             {
263                 bit_of_search += (int)p->num_of_bit;
264             }
265             else
266             {
267                 p=NULL;
268                 break;
269             }
270         }
271         if(!(*bit_of_search))
272         {
273             break;
274         }
275         if(bit_of_search[0]==0)
276         {
277             p = p->point_of_zero;
278             bit_of_search++;
279         }
280         else if(bit_of_search[0]==1)
281         {
282             p = p->point_of_one;
283             bit_of_search++;
284         }
285         if(!(*bit_of_search) && p!=NULL && p->num_of_bit!=0)
286         {
287             p=NULL;
288             break;
289         }
290     }
291     if(p!=NULL)
292     {
293         return p->is_str;
294     }
295     else
296     {
297         return 0;
298     }
299 }
300 
301 
302 void trie_delete(trie_node *root)
303 {
304     if(root == NULL)
305         return;
306     trie_delete(root->point_of_zero);
307     trie_delete(root->point_of_one);
308     free(root);
309 }
310 
311 
312 int compare_of_bit(trie_node *root,char* bit_of_insert)
313 {
314     int len_of_insert = strlen(bit_of_insert);
315     int i,j,tempbit;
316     if(root->num_of_bit<=32)
317     {
318         for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--)
319         {
320             tempbit = getbit((int)root->compress_of_bit,j);
321             if(bit_of_insert[i]-0 != tempbit)
322             {
323                 break;
324             }
325         }
326     }
327     else
328     {
329         for(i=0,j=root->num_of_bit-1;i<len_of_insert && i<root->num_of_bit;i++,j--)
330         {
331             tempbit = getbit(root->compress_of_bit[j/8],j%8);
332             if(bit_of_insert[i]-0 != tempbit)
333             {
334                 break;
335             }
336         }
337     }
338     return i;
339 }
340 
341 void pop_bit(trie_node *root,char* bit_of_pop,int len_of_pop)
342 {
343     int i,j;
344     short num_of_bit = root->num_of_bit - (short)len_of_pop;
345     if(root->num_of_bit<=32)
346     {
347         for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--)
348         {
349             bit_of_pop[i] = getbit((int)root->compress_of_bit,j) +0;
350         }
351         bit_of_pop[i] = \0;
352     }
353     else
354     {
355         for(i=0,j=root->num_of_bit-1;i<len_of_pop;i++,j--)
356         {
357             bit_of_pop[i] = getbit(root->compress_of_bit[j/8],j%8) +0;
358         }
359         bit_of_pop[i] = \0;
360 
361         if(num_of_bit == 0)
362         {
363             free(root->compress_of_bit);
364         }
365         else if(num_of_bit<=32)
366         {
367             int temp;
368             for(j=num_of_bit-1;j>=0;j--)
369             {
370                 if(getbit(root->compress_of_bit[j/8],j%8) == 0)
371                 {
372                     clearbit(temp,j);
373                 }
374                 else
375                 {
376                     setbit(temp,j);
377                 }
378             }
379             free(root->compress_of_bit);
380             root->compress_of_bit = (unsigned char*)temp;
381         }
382         else
383         {
384             unsigned char *p;
385             short num_of_byte = (num_of_bit%8)?(num_of_bit/8+1):(num_of_bit/8);
386             if(((root->num_of_bit%8)?(root->num_of_bit/8+1):(root->num_of_bit/8)) != num_of_byte)
387             {
388                 p = (unsigned char*)malloc(num_of_byte);
389                 short i;
390                 for(i=0;i<num_of_byte;i++)
391                 {
392                     p[i] = root->compress_of_bit[i];
393                 }
394                 free(root->compress_of_bit);
395                 root->compress_of_bit = p;
396             }
397         }
398     }
399     root->num_of_bit = num_of_bit;
400 }

 

大规模字符串检索-压缩trie树

标签:

原文地址:http://www.cnblogs.com/akb48/p/4315677.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!