码迷,mamicode.com
首页 > 其他好文 > 详细

26、蛤蟆的数据结构笔记之二十六串应用之建立词索引表

时间:2015-07-07 09:36:07      阅读:116      评论:0      收藏:0      [点我收藏+]

标签:

26、蛤蟆的数据结构笔记之二十六串应用之建立词索引表

         本篇名言:“生命是一条美丽而曲折的幽径,路旁有妍花的丽蝶,累累的美果,但我们很少去停留观赏,或咀嚼它,只一心一意地渴望赶到我们幻想中更加美丽的豁然开朗的大道。然而在前进的程途中,却逐渐树影凄凉,花蝶匿迹,果实无存,最后终于发觉到达一个荒漠。-- 萨拉”

 

1.  信息检索

信息检索是计算机应用的重要领域之一。为了提高图书馆书目检索的效率,可以建立书名关键词索引,实现书目检索的自动化:读者根据关键词索引表,就可以方便地查询到自己感兴趣的书目。

         准备一个BookInfo.txt文件,内容如下。注意最后一行之后不要留空行,不然程序会出错。

2.  BookInfo.txt

005Computer Data Structure

010Introduction to Data Structure

023Fundamental of Data Structure

034The Design and Analysis of Computer Algorithms

050Introduction to Numerical Analysis

067Numerical Analysis

3.  BookIdx.txt

最后代码运行完毕会生成如下的文本:

关键字              书号        

algorithms          034,

analysis            034,050,067,

computer            005,034,

data                005,010,023,

design              034,

fundamental         023,

introduction        010,050,

numerical           050,067,

structure           005,010,023,

 

 

4.  代码具体实现

4.1         定义

定义如下

#define MaxBookNum 1000   /* assume at most 1000 books are indexed */
#define MaxKeyNum  2500   /* capacity of the word list and of the index table */
#define MaxLineLen 500    /* maximum length of one catalogue line */
#define MaxWordNum 10     /* maximum number of words taken from one title */

/* Word list (sequential table) of the keywords extracted so far. */
typedef struct {
    char *item[MaxKeyNum];  /* keyword strings */
    int last;               /* number of entries in use */
} WordListType;

/* Node of the singly linked list of book numbers kept for one keyword. */
typedef struct LNode {
    int data[3];            /* one book number, stored as three decimal digits */
    struct LNode *next;
} LNode, *LinkList;

/* Keyword string. */
typedef struct {
    char *ch;
    int length;
} HString;

/* One index entry: a keyword plus the list of books whose title contains it. */
typedef struct {
    HString *key;
    LinkList bnolist;
} IdxTermType;

/* Index table (kept sorted by keyword). */
typedef struct {
    IdxTermType item[MaxKeyNum + 1];
    int last;               /* number of entries in use */
} IdxListType;

char *buf;            /* the catalogue line currently being processed */
WordListType wdlist;  /* word list shared by ExtractKeyWord and InsIdxList */
int i;                /* number of keywords stored in wdlist so far */
int b;                /* value of i before the current line was processed */
int v;                /* number of lines read so far by GetLine */

/* Common words that never become keywords; unused slots stay NULL. */
char *com[10] = {"and", "a", "an", "the", "a", "an", "to", "of"};

 

 

 

4.2         InitIdxList

初始化操作,置索引表idxlist为空表,且在idxlist.item[0]设一空串。

/*
 * Create an empty index table.
 *
 * The incoming value of idxlist is ignored; a new table is allocated on the
 * heap and returned.  Slot 0 is pre-allocated with an empty key and a list
 * head node so that the first insertion can reuse it.  All other slots are
 * zeroed.  The process exits with status -1 if memory cannot be obtained.
 */
IdxListType *InitIdxList(IdxListType *idxlist)
{
    idxlist = calloc(1, sizeof *idxlist);
    if (idxlist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key = malloc(sizeof *idxlist->item[0].key);
    idxlist->item[0].bnolist = malloc(sizeof *idxlist->item[0].bnolist);
    if (idxlist->item[0].key == NULL || idxlist->item[0].bnolist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key->ch = NULL;
    idxlist->item[0].key->length = 0;
    idxlist->item[0].bnolist->next = NULL;
    idxlist->last = 0;
    return idxlist;
}

 

 

 

4.3         GetLine

从文件f读入一个书目信息到书目缓冲区buf

其中v,buf是一个全局变量。

当前设置的v最大是100,就是最多读取100行书的信息。

void GetLine(FILE *f)

{

         int j= 0;

         static chara[100][100];

         charc;

         c =fgetc(f);

         while(((a[v][j++]= tolower(c) ) != ‘\n‘) && (c != EOF))

                   c= fgetc(f);

         //printf("%c",a[v][j- 1]);

         a[v][j-1]= ‘\0‘;

         buf =a[v++];

         printf("%s",buf);

}

 

4.4         ExtractKeyWord

从buf中提取书目关键词到词表wdlist,书号存入bno。

Buf中是一行从文本中提出来的字符串。

int ExtractKeyWord(int bno[])

{

         intk,s,t = 0,j = 0;

         b = i;

         char*p = buf,*q,*c[10];

         while(!isalnum(*p))

                   p++;

         while(isdigit(*p) || !isalpha(*p))

         {

                   bno[j]= *p - ‘0‘;

                   p++;j++;

         }

         j--;

//处理完书号,开始处理中间空格,j可以用来表示书号的位数。

         while(!isalpha(*p))

                   p++;

//处理完空格,处理书名

         q = p;

         while(*q)

         {

                   while((*q!= ‘ ‘) && *q && (*q != EOF))

                            q++;

                   q++;

                   *(q- 1) = ‘\0‘;

                   c[t++]= p;

                   p= q;

         }

//将每个书名的词,存放到数组c中。

//对比数组c和数组com中存放的是8个字符串,com数组中存放的是常用的字节,不能用来当做关键词。

         for(k= 0; k < t; ++k)

         {

                   for(s= 0; s < 8; ++s)

                   {

                            if(strcmp(c[k],com[s])== 0)

                                     break;

                   }

                   if(s== 8)

                            wdlist.item[i++]= c[k];

         }

//设定当前关键词的数量

         wdlist.last= i;

         printf("词表的长度%d ",wdlist.last);

         printf("书号:");

         for(k= 0; k < j; ++k)

                   printf("%d",bno[k]);

         printf("\n");

         for(k = b; k < i; ++k )

                   printf("关键字%s\n",wdlist.item[k]);

         return0;

}

4.5         InsIdxList

将书号bno的书名关键词按词典顺序插入索引表idxlist

int InsIdxList(IdxListType *idxlist, int bno[])

{

//b是保留i的前一次变换的值,i是关键字个数

         int k, t ,s,j = 0,c = b,d;

         LinkListp,q;

         if(i>= MaxKeyNum + 1)

         {

                   printf("超过索引表最大存储,请调整!\n");

                   exit(-1);

         }

         for(t= idxlist->last ; j < i - b; j++,c++)

         {

                   if(0 == t )                 //第一个关键字插入

                   {

                            idxlist->item[0].key->ch= wdlist.item[c];

                            for(d= 0; d < 3; ++d )

                                     idxlist->item[0].bnolist->data[d]= bno[d];                //书号存储

                            idxlist->item[0].bnolist->next= NULL;                        //链表尾部指向空

                            idxlist->last++;

                            t++;

                   }

                   //t不为0的时候,进行如下分支

                   else

                   {

//与已经存在的该关键词进行对比,是否已经存在一样的关键词了

                            for(k=0; k < t; k++)

                            {

                   //如果已经存在了一样的关键词,则再索引表对应的关键字增加书号到书号链表。

                                     if((s= strcmp(wdlist.item[c], idxlist->item[k].key->ch))== 0) //插入的关键字已存在

                                     {

                                               p= (LinkList)malloc(sizeof(LNode));                        //申请个新结点,保存书号

                                               printf("\n插入的书号:");

                                               for(d= 0; d < 3; ++d)

                                               {

                                                        p->data[d]= bno[d];

                                                        printf("%d",p->data[d]);

                                               }

                                               //书号赋值给新申请的节点

                                               q= idxlist->item[k].bnolist;

                                               for(d= 0; d < 3; ++d)

                                                        fprintf(stdout, "%d",q->data[d]);

                                               while(q->next!= NULL)

                                                        q= q->next;

                                               q->next= p;

                                               p->next= NULL;

                                               //输出该关键字下的所有书号

                                               fprintf(stdout, "\n关键字%-20s\t\n",idxlist->item[k].key->ch);

                                               q= idxlist->item[k].bnolist;

                                               while(q)

                                               {

                                                        for(d= 0; d < 3; ++d)

                                                                 fprintf(stdout, "%d",q->data[d]);

                                                        if(q->next!= NULL)

                                                                 fprintf(stdout, ",");

                                                        q= q->next;

                                               }

                                               printf("\n");

                                               break;

                                     }

//插入的关键字大于最后一个已存在的关键字,则插在最后

// 并不是发现大于当前关键词后,立马处理的,而是在最后一个的时候才处理。因为后面可能还会发现和他相等的关键词的,如果最后一个还是大于当前关键词则进行插入到最后。

                                     else if(s> 0)

                                     {

                                               if(k== t - 1)                                                

                                               {                                                            

                                                        idxlist->item[t].key= (HString *)malloc(sizeof(HString)); 

                                                        idxlist->item[t].bnolist= (LinkList)malloc(sizeof(LNode));

                                                        idxlist->item[t].key->ch= wdlist.item[c];

                                                        for(d= 0; d < 3; ++d )

                                                                 idxlist->item[t].bnolist->data[d]= bno[d];

                                                        idxlist->item[t].bnolist->next=NULL;

                                                        idxlist->last++;

                                                        t++;

                                                        break;

                                               }

                                     }

//插入的关键字小于最后一个已存在的关键字,则插在当前关键词的前面

// 如果是小于当前关键词,则直接插入,因为本来关键词就是安大小排序的不存在未比较的关键词大的情况。

                                     else

                                     {

                                               idxlist->item[t].key= (HString *)malloc(sizeof(HString)); 

                                               idxlist->item[t].bnolist= (LinkList)malloc(sizeof(LNode));

                                               for(s= t-1; s >= k; --s )

                                               {//插入的关键字小于当前已有的关键字,则把当前的到最后的关键字向后移

                                                        idxlist->item[s+1].key->ch= idxlist->item[s].key->ch ;

                                                        idxlist->item[s+1].bnolist->next= idxlist->item[s].bnolist->next;

                                                        for(d= 0; d < 3; d++)

                                                                 idxlist->item[s+1].bnolist->data[d]= idxlist->item[s].bnolist->data[d];

                                                        //idxlist->item[s+1].bnolist->next= NULL;

                                               }

                                               idxlist->item[k].key->ch=wdlist.item[c];                   //在当前关键字上插入新的关键字

                                               for(d= 0; d < 3; ++d )

                                                        idxlist->item[k].bnolist->data[d]= bno[d];

                                               idxlist->item[k].bnolist->next= NULL;

                                               idxlist->last++;

                                               t++;

                                               break;

                                     }

                            }

                   }

         }

         return0;

}

 

4.6         PutText

将生成的索引表idxlist输出到文件g

/*
 * Write the index table to the file g and to stdout.
 *
 * Output is a header line followed by one line per keyword: the keyword
 * left-justified in 20 columns, a tab, then the book numbers separated by
 * commas.  The table is only read; the book lists are walked with a local
 * cursor so they stay intact for later use and for Idxlist_free.
 */
void PutText(FILE *g, IdxListType *idxlist)
{
    int k;
    size_t t;
    LinkList node;

    fprintf(g, "%-20s%-60s\n", "关键字", "书号");
    fprintf(stdout, "%-20s%-60s\n", "关键字", "书号");

    for (k = 0; k < idxlist->last; k++) {
        fprintf(g, "%-20s\t", idxlist->item[k].key->ch);
        fprintf(stdout, "%-20s\t", idxlist->item[k].key->ch);

        for (node = idxlist->item[k].bnolist; node != NULL; node = node->next) {
            for (t = 0; t < sizeof node->data / sizeof node->data[0]; ++t) {
                fprintf(g, "%d", node->data[t]);
                fprintf(stdout, "%d", node->data[t]);
            }
            if (node->next != NULL) {
                fprintf(stdout, ",");
                fprintf(g, ",");
            }
        }

        fprintf(g, "\n");
        fprintf(stdout, "\n");
    }
}

 

 

 

4.7         Idxlist_free

释放索引表空间

 

/*
 * Release the keys and book-number lists held by the index table.
 *
 * The IdxListType object itself is not freed; that remains the caller's job.
 * Freed pointers are reset to NULL and last to 0, so a second call is
 * harmless.  The table must be re-initialized before it is used again.
 * A NULL idxlist is ignored.
 */
void Idxlist_free(IdxListType *idxlist)
{
    int k, used;
    LinkList p, q;

    if (idxlist == NULL)
        return;

    /* InitIdxList pre-allocates slot 0, so it owns storage even when empty. */
    used = (idxlist->last > 0) ? idxlist->last : 1;

    for (k = 0; k < used; k++) {
        free(idxlist->item[k].key);
        idxlist->item[k].key = NULL;

        for (p = idxlist->item[k].bnolist; p != NULL; p = q) {
            q = p->next;
            free(p);
        }
        idxlist->item[k].bnolist = NULL;
    }
    idxlist->last = 0;
}

4.8         Main

看下主函数,定义一个索引表类型结构体。定义文件句柄变量 f,g,以及int 数组。其中f为BookInfo.txt句柄,g为BookIdx.txt句柄。

然后调用函数InitIdxList来实现初始化,从句柄f对应的文件中读取一行字符串,提取其中的关键词,然后在索引表中将关键词和书号对应起来。

循环往复,将每行的关键词都读出来,然后将关键词和书号对应起来,如果关键词已经出现在了之前的关键词中,则将书号关联到已出现的关键词后。

最后将关键词和书号输出到一个文件中。

最后释放索引表,关闭文件句柄,退出。

int main()

{

         IdxListType*idxlist = NULL ;

         FILE*f, *g;

         intBookNo[5];

         if((f= fopen("BookInfo.txt", "r"))== NULL)

         {

                   printf("ERROR!Can not open BookInfo.txt");

                   return0;

         }

         else

         {

 

                   if((g= fopen("BookIdx.txt","w"))== NULL)

                   {

                            printf("ERROR!Can not open BookIdx.txt");

                            return0;

                   }

                   else

                   {

                            idxlist= InitIdxList(idxlist);                           //初始化索引表idxlist为空表

                            while(!feof(f))

                            {

                                     GetLine(f);                                          //从文件f读入一个书目信息到buf

                                     ExtractKeyWord(BookNo);                              //buf中提取关键词到词表,书号存入BookNo

                                     InsIdxList(idxlist,BookNo);                         //将书号为BookNo的关键词插入索引表

                                     printf("\ni=%d,b=%d\n",i,b);

                            }

                            PutText(g,idxlist);                                      //将生成的索引表idxlist输出到文件g

                   }

         }

         Idxlist_free(idxlist);

         fclose(f);

         fclose(g);

         return0;

}

 技术分享

 

 

5.  源码

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <ctype.h>

#define MaxBookNum 1000   /* assume at most 1000 books are indexed */
#define MaxKeyNum  2500   /* capacity of the word list and of the index table */
#define MaxLineLen 500    /* maximum length of one catalogue line */
#define MaxWordNum 10     /* maximum number of words taken from one title */

/* Word list (sequential table) of the keywords extracted so far. */
typedef struct {
    char *item[MaxKeyNum];  /* keyword strings */
    int last;               /* number of entries in use */
} WordListType;

/* Node of the singly linked list of book numbers kept for one keyword. */
typedef struct LNode {
    int data[3];            /* one book number, stored as three decimal digits */
    struct LNode *next;
} LNode, *LinkList;

/* Keyword string. */
typedef struct {
    char *ch;
    int length;
} HString;

/* One index entry: a keyword plus the list of books whose title contains it. */
typedef struct {
    HString *key;
    LinkList bnolist;
} IdxTermType;

/* Index table (kept sorted by keyword). */
typedef struct {
    IdxTermType item[MaxKeyNum + 1];
    int last;               /* number of entries in use */
} IdxListType;

char *buf;            /* the catalogue line currently being processed */
WordListType wdlist;  /* word list shared by ExtractKeyWord and InsIdxList */
int i;                /* number of keywords stored in wdlist so far */
int b;                /* value of i before the current line was processed */
int v;                /* number of lines read so far by GetLine */

/* Common words that never become keywords; unused slots stay NULL. */
char *com[10] = {"and", "a", "an", "the", "a", "an", "to", "of"};

 

/*
 * Create an empty index table.
 *
 * The incoming value of idxlist is ignored; a new table is allocated on the
 * heap and returned.  Slot 0 is pre-allocated with an empty key and a list
 * head node so that the first insertion can reuse it.  All other slots are
 * zeroed.  The process exits with status -1 if memory cannot be obtained.
 */
IdxListType *InitIdxList(IdxListType *idxlist)
{
    idxlist = calloc(1, sizeof *idxlist);
    if (idxlist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key = malloc(sizeof *idxlist->item[0].key);
    idxlist->item[0].bnolist = malloc(sizeof *idxlist->item[0].bnolist);
    if (idxlist->item[0].key == NULL || idxlist->item[0].bnolist == NULL) {
        printf("ERROR——1!");
        exit(-1);
    }

    idxlist->item[0].key->ch = NULL;
    idxlist->item[0].key->length = 0;
    idxlist->item[0].bnolist->next = NULL;
    idxlist->last = 0;
    return idxlist;
}

void GetLine(FILE *f)

         //从文件f读入一个书目信息到书目缓冲区buf

{

         int j= 0;

         static chara[100][100];

         charc;

         c =fgetc(f);

         while(((a[v][j++]= tolower(c) ) != ‘\n‘) && (c != EOF))

                   c= fgetc(f);

         //printf("%c",a[v][j- 1]);

         a[v][j-1]= ‘\0‘;

         buf =a[v++];

         printf("%s",buf);

}

int ExtractKeyWord(int bno[])

         //buf中提取书目关键词到词表wdlist,书号存入bno

{

         intk,s,t = 0,j = 0;

         b = i;

         char*p = buf,*q,*c[10];

         while(!isalnum(*p))

                   p++;

         while(isdigit(*p) || !isalpha(*p))

         {

                   bno[j]= *p - ‘0‘;

                   p++;j++;

         }

         j--;

         while(!isalpha(*p))

                   p++;

         q = p;

         while(*q)

         {

                   while((*q!= ‘ ‘) && *q && (*q != EOF))

                            q++;

                   q++;

                   *(q- 1) = ‘\0‘;

                   c[t++]= p;

                   p= q;

         }

         for(k= 0; k < t; ++k)

         {

                   for(s= 0; s < 8; ++s)

                   {

                            if(strcmp(c[k],com[s])== 0)

                                     break;

                   }

                   if(s== 8)

                            wdlist.item[i++]= c[k];

         }

         wdlist.last= i;

         printf("词表的长度%d ",wdlist.last);

         printf("书号:");

         for(k= 0; k < j; ++k)

                   printf("%d",bno[k]);

         printf("\n");

         for(k = b; k < i; ++k )

                   printf("关键字%s\n",wdlist.item[k]);

         return0;

}

int InsIdxList(IdxListType *idxlist, int bno[])

         //将书号bno的书名关键词按词典顺序插入索引表idxlist

{

         int k, t ,s,j = 0,c = b,d;

         LinkListp,q;

         if(i>= MaxKeyNum + 1)

         {

                   printf("超过索引表最大存储,请调整!\n");

                   exit(-1);

         }

         for(t= idxlist->last ; j < i - b; j++,c++)

         {

                   if(0 == t )                                                      //第一个关键字插入

                   {

                            idxlist->item[0].key->ch= wdlist.item[c];

                            for(d= 0; d < 3; ++d )

                                     idxlist->item[0].bnolist->data[d]= bno[d];                //书号存储

                            idxlist->item[0].bnolist->next= NULL;                        //链表尾部指向空

                            idxlist->last++;

                            t++;

                   }

                   else

                   {

                            for(k=0; k < t; k++)

                            {

                                     if((s= strcmp(wdlist.item[c], idxlist->item[k].key->ch))== 0) //插入的关键字已存在

                                     {

                                               //ListInsert(idxlist->item[k].bnolist,Length(idxlist->item[k].bnolist)+1, bno);

                                               p= (LinkList)malloc(sizeof(LNode));                        //申请个新结点,保存书号

                                               printf("\n插入的书号:");

                                               for(d= 0; d < 3; ++d)

                                               {

                                                        p->data[d]= bno[d];

                                                        printf("%d",p->data[d]);

                                               }

                                               q= idxlist->item[k].bnolist;

                                               for(d= 0; d < 3; ++d)

                                                        fprintf(stdout, "%d",q->data[d]);

                                               while(q->next!= NULL)

                                                        q= q->next;

                                               q->next= p;

                                               p->next= NULL;

                                               fprintf(stdout, "\n关键字%-20s\t\n",idxlist->item[k].key->ch);

                                               q= idxlist->item[k].bnolist;

                                               while(q)

                                               {

                                                        for(d= 0; d < 3; ++d)

                                                                 fprintf(stdout, "%d",q->data[d]);

                                                        if(q->next!= NULL)

                                                                 fprintf(stdout, ",");

                                                        q= q->next;

                                               }

                                               printf("\n");

                                               break;

                                      }

                                     else if(s> 0)

                                     {

                                               if(k== t - 1)                                                //插入的关键字大于最后一个已存在的关键字,则插在最后

                                               {                                                            

                                                        idxlist->item[t].key= (HString *)malloc(sizeof(HString)); 

                                                        idxlist->item[t].bnolist= (LinkList)malloc(sizeof(LNode));

                                                        idxlist->item[t].key->ch= wdlist.item[c];

                                                        for(d= 0; d < 3; ++d )

                                                                 idxlist->item[t].bnolist->data[d]= bno[d];

                                                        idxlist->item[t].bnolist->next=NULL;

                                                        idxlist->last++;

                                                        t++;

                                                        break;

                                               }

                                     }

                                     else

                                     {

                                               idxlist->item[t].key= (HString *)malloc(sizeof(HString)); 

                                               idxlist->item[t].bnolist= (LinkList)malloc(sizeof(LNode));

                                               for(s= t-1; s >= k; --s )

                                               {//插入的关键字小于当前已有的关键字,则把当前的到最后的关键字向后移

                                                        idxlist->item[s+1].key->ch= idxlist->item[s].key->ch ;

                                                        idxlist->item[s+1].bnolist->next= idxlist->item[s].bnolist->next;

                                                        for(d= 0; d < 3; d++)

                                                                 idxlist->item[s+1].bnolist->data[d]= idxlist->item[s].bnolist->data[d];

                                                        //idxlist->item[s+1].bnolist->next= NULL;

                                               }

                                               idxlist->item[k].key->ch=wdlist.item[c];                   //在当前关键字上插入新的关键字

                                               for(d= 0; d < 3; ++d )

                                                        idxlist->item[k].bnolist->data[d]= bno[d];

                                               idxlist->item[k].bnolist->next= NULL;

                                               idxlist->last++;

                                               t++;

                                               break;

                                     }

                            }

                   }

         }

         return0;

}

/*
 * Write the index table to the file g and to stdout.
 *
 * Output is a header line followed by one line per keyword: the keyword
 * left-justified in 20 columns, a tab, then the book numbers separated by
 * commas.  The table is only read; the book lists are walked with a local
 * cursor so they stay intact for later use and for Idxlist_free.
 */
void PutText(FILE *g, IdxListType *idxlist)
{
    int k;
    size_t t;
    LinkList node;

    fprintf(g, "%-20s%-60s\n", "关键字", "书号");
    fprintf(stdout, "%-20s%-60s\n", "关键字", "书号");

    for (k = 0; k < idxlist->last; k++) {
        fprintf(g, "%-20s\t", idxlist->item[k].key->ch);
        fprintf(stdout, "%-20s\t", idxlist->item[k].key->ch);

        for (node = idxlist->item[k].bnolist; node != NULL; node = node->next) {
            for (t = 0; t < sizeof node->data / sizeof node->data[0]; ++t) {
                fprintf(g, "%d", node->data[t]);
                fprintf(stdout, "%d", node->data[t]);
            }
            if (node->next != NULL) {
                fprintf(stdout, ",");
                fprintf(g, ",");
            }
        }

        fprintf(g, "\n");
        fprintf(stdout, "\n");
    }
}

/*
 * Release the keys and book-number lists held by the index table.
 *
 * The IdxListType object itself is not freed; that remains the caller's job.
 * Freed pointers are reset to NULL and last to 0, so a second call is
 * harmless.  The table must be re-initialized before it is used again.
 * A NULL idxlist is ignored.
 */
void Idxlist_free(IdxListType *idxlist)
{
    int k, used;
    LinkList p, q;

    if (idxlist == NULL)
        return;

    /* InitIdxList pre-allocates slot 0, so it owns storage even when empty. */
    used = (idxlist->last > 0) ? idxlist->last : 1;

    for (k = 0; k < used; k++) {
        free(idxlist->item[k].key);
        idxlist->item[k].key = NULL;

        for (p = idxlist->item[k].bnolist; p != NULL; p = q) {
            q = p->next;
            free(p);
        }
        idxlist->item[k].bnolist = NULL;
    }
    idxlist->last = 0;
}

int main()

{

         IdxListType*idxlist = NULL ;

         FILE*f, *g;

         intBookNo[5];

         if((f= fopen("BookInfo.txt", "r"))== NULL)

         {

                   printf("ERROR!Can not open BookInfo.txt");

                   return0;

         }

         else

         {

 

                   if((g= fopen("BookIdx.txt","w"))== NULL)

                   {

                            printf("ERROR!Can not open BookIdx.txt");

                            return0;

                   }

                   else

                   {

                            idxlist= InitIdxList(idxlist);                           //初始化索引表idxlist为空表

                            while(!feof(f))

                            {

                                     GetLine(f);                                          //从文件f读入一个书目信息到buf

                                     ExtractKeyWord(BookNo);                              //buf中提取关键词到词表,书号存入BookNo

                                     InsIdxList(idxlist,BookNo);                         //将书号为BookNo的关键词插入索引表

                                     printf("\ni=%d,b=%d\n",i,b);

                            }

                            PutText(g,idxlist);                                      //将生成的索引表idxlist输出到文件g

                   }

         }

         Idxlist_free(idxlist);

         fclose(f);

         fclose(g);

         return0;

}

版权声明:本文为博主原创文章,未经博主允许不得转载。

26、蛤蟆的数据结构笔记之二十六串应用之建立词索引表

标签:

原文地址:http://blog.csdn.net/notbaron/article/details/46779459

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!