标签:
贴上一个文档,是写这个程序的基本的思路,当然具体的程序和这个有一些出入,但是大体上一样。求批评指正。
/*目标:在一个文件夹下把所有的lrc歌词文件用程序导入,进行分析,最后的结果用TXT表示出来。*/
/*分析:只要建立了倒排索引就可以很容易把歌词的索引文件输入到il.txt中,所以应当有一个函数实现输入倒排索引,而后建立文件输出txt:
0.bool Lyricsindex_out(Lyric_index_list index_list[],int m ){}
//相关的有:
struct Word_item{//词项头存储
String word;
int freq=0;
Word_Doc *head_docID;
};Word_item lyrics_head=new Word_item[];
struct Word_Doc{//词项位置存储
int text_number;
Word_Doc *next;
}*head_docID;
Word_Doc *temp;
ofstream fout("index_lyrics.txt");
fout<<"doc"<<setw(12)<<"freq"<<setw(12)<<"->"<<setw(12)<<"list"<<endl;
for(int i=0,j=0;index_list[i]->next!=NULL;i++){
fout<<index_list[i]->word<<setw(12)<<index_list[i]->freq<<setw(12)<<"->"<<setw(12);
temp=index_list[i]->head_docID;
for( j=0;temp!=NULL;j++){
fout<<temp->text_number<<",";
temp=temp->next;
}
}
输出txt文件的写法:
1.应当有一个类,Class Lyric_Index_Analysis
处理从文件夹中的输入的lrc歌词文件,并且过滤掉”[...]”,最后只留下歌词的主体,然后把歌词的主体进行分析,以空格,回车,标点符号为界限,将每个词解析出来,
存入
lyrics_head[],
例如:
lyrics_head[0].word=xiejiang;
lyrics_head[0].freq=1;
lyrics_head[0].head_docID=new Word_Doc;
lyrics_head[0].head_docID->text_number=0;
lyrics_head[0].head_docID->next=NULL;
每处理一个文档时将最终生成的词项头的数组的最大值记住,这可以设置成一个函数,
Int get_lyrics_head(Word_item lyrics_head[],string filename)
返回的时候返回数组和最大值
2.然后这个类处理完所有的文档之后,应当有一个
lyrics_head[]的数组,大小为n,
然后建立一个函数将所有的词项按照词项的字母序归并排序
之后仍然返回数组
Void Lyric_mergesort(Word_item lyrics_head[],int left,int right)
3.到最后一步,Index_list Analy_Setup_index (Word_item lyrics_head[],int n)
建立链表
输入lyrics_head[]和n
对数组进行遍历,如果两个挨着的数内的word相同那么就将他们的进行合并
用一个指针指向lyrics_head[]和Word_item,直到这个数组相同的部分已经不存在了
然后将两个放在一个链表里。
因为归并排序是稳定的,所以可以直接将两个lyrics_head[].word连起来。如下方法:例如
struct Index_list{//用于生成最终的索引表
String word;
int freq=0;
Word_Doc *head_docID;
Index_list* next;
};
Word_Doc* lyrics_doc; //=new Word_Doc[];
Index_list Lyric_index_list=new Index_list[];
Lyric_index_list[0].word=lyrics_head[0].word;
Lyric_index_list[0].head_docID=lyrics_head[0].head_docID;
Lyric_index_list[0].freq=lyrics_head[0].freq;
Lyric_index_list[0]->next=NULL;
Lyrics_doc=lyrics_head[0].head_docID;
For(int i=0,int j=0;i<n;i++){
If(Lyric_index_list[j].word==lyrics_head[i].word){
Lyric_index_list[j].freq++;
Lyrics_doc->next=lyrics_head[i].head_docID;
Lyrics_doc=Lyrics_doc->next;
}
Else{
J++;
Lyric_index_list[j-1].next=Lyric_index_list[j];
Lyric_index_list[j]->next=NULL;
Lyric_index_list[j].word=lyrics_head[i].word;
Lyric_index_list[j].head_docID=lyrics_head[i].head_docID;
Lyric_index_list[j].freq=lyrics_head[i].freq;
}
}
Return Lyric_index_list;
*/
首先main所在的.cpp:
#include<iostream> #include<fstream> #include<string> #include<iomanip> #include <stdio.h> #include<io.h> #include"Analysis_Lyrics.h" using namespace std; int main(){ Lyric_Index_Analysis Lyric_A;//建立歌词分析的类 string str = "\0"; int bound = 0, upper = 0, i = 1;//有n个数,则有上标为n,这是词项表的上下界 Word_item *temp = new Word_item[Max_size - 500];//用于归并交换的需要 struct _finddata_t fileinfo;//文件处理,寻找默认目录下的歌词文件 long pFile;//调用成功返回0,否则返回-1 // cout << "第 " << pFile << " 个文档是:" << fileinfo.name << " 编号为:1" << endl; if ((pFile = _findfirst("*.lrc", &fileinfo)) == -1) { cout << "不存在.lrc文件" << endl; return 0; } else { cout << "第 " << i++ << " 个文档是:" << fileinfo.name << " 编号为:1" << endl; Lyric_A.Lyrics_input(fileinfo.name, upper, bound, 1);//输入文件调用 cout << "下界为->" << bound << "上界为->" << upper << endl; while (_findnext(pFile, &fileinfo) == 0) { cout << "第 " << i << " 个文档是:" << fileinfo.name << " 编号为:" << i << endl; bound = upper; Lyric_A.Lyrics_input(fileinfo.name, upper, bound, i); cout << "下界为->" << bound << "上界为->" << upper << endl; i++; } } _findclose(pFile); bound = 0; cout << "总的词项表的下界为->" << bound << "上界为->" << upper << endl; Lyric_A.Lyric_mergesort(Lyric_A.Return_lyrics_head(),temp, bound, upper - 1); Lyric_A.print(upper); Lyric_A.Lyricsindex_out(upper); } //如下的小代码是遍历一个文件夹下的文档的程序 /*#include<iostream> #include <io.h> using namespace std; int main() { struct _finddata_t fileinfo; long hFile; if ((hFile = _findfirst("*.lrc", &fileinfo)) == -1) return -1; else { cout << fileinfo.name << endl; while (_findnext(hFile, &fileinfo) == 0){ cout << fileinfo.name << endl; } } _findclose(hFile); return 0; } */
接着是处理歌词文件的类.h:
#ifndef ANALYSIS_LYRICS_H
#define ANALYSIS_LYRICS_H

#include <fstream>
#include <iostream>
#include <string>
using namespace std;  // kept on purpose: the accompanying .cpp files rely on it

// Number of Word_item slots allocated for the term table.
static const int Max_size = 4000;
// Secondary size constant kept for the implementation file (historically the
// size of its per-line character buffer).
static const int max_size = 200;

// One posting: the number of a document containing the word, plus a link to
// the next posting for the same word.
struct Word_Doc {
    int text_number = 0;
    Word_Doc *next = nullptr;
};

// One entry of the flat term table filled while reading the documents:
// a word and the posting for the document it was found in.
struct Word_item {
    string word;
    Word_Doc *head_docID = nullptr;
};

// One node of the final index list: the word, the number of documents that
// contain it, the head of its posting chain, and the next word's node.
struct Index_list {
    string word;
    int freq = 0;
    Word_Doc *head_docID = nullptr;  // non-owning alias of a Word_item's posting
    Index_list* next = nullptr;
};

// Analyses .lrc lyric files: collects the words of every document into a term
// table, sorts it, and turns it into an inverted index that is written out as
// text.
//
// Ownership: the object owns the term table, the single Word_Doc that each
// table slot points to, and every Index_list node. Copying is disabled
// because a copy would free the same memory twice.
class Lyric_Index_Analysis {
private:
    Word_item* lyrics_head;  // term table, Max_size slots
    Index_list* L_H_List;    // head of the index list (always at least one node)

public:
    // Lower/upper bound of the term table. The member functions declared here
    // take their bounds as parameters instead, so these mirror, rather than
    // drive, the processing.
    int bound = 0, upper = 0;

    Lyric_Index_Analysis() {
        lyrics_head = new Word_item[Max_size];
        L_H_List = new Index_list;  // head node of the index list
    }

    Lyric_Index_Analysis(const Lyric_Index_Analysis&) = delete;
    Lyric_Index_Analysis& operator=(const Lyric_Index_Analysis&) = delete;

    ~Lyric_Index_Analysis() {
        // Every table slot owns at most one Word_Doc (the one allocated when
        // the word was inserted). Chains between postings only link nodes that
        // are owned by other slots, so deleting each slot's head exactly once
        // releases every posting. Unused slots hold nullptr.
        for (int i = 0; i < Max_size; ++i) {
            delete lyrics_head[i].head_docID;
        }
        delete[] lyrics_head;

        // Walk the index list, remembering the successor before each delete.
        while (L_H_List != nullptr) {
            Index_list* following = L_H_List->next;
            delete L_H_List;
            L_H_List = following;
        }
    }

    // Returns the term table (Max_size slots); the object keeps ownership.
    Word_item* Return_lyrics_head() { return lyrics_head; }

    // Returns the head of the index list; the object keeps ownership.
    Index_list* Return_L_H_List() { return L_H_List; }

    // Opens `filename` and adds its words to the term table.
    // Parameters: file name, upper bound of the table (updated), lower bound
    // of the table for this document, and the document number.
    void Lyrics_input(string filename, int& upper, int bound, int number);

    // Inserts `elem` into the term table unless it is already present in
    // [bound, upper) (i.e. already seen in the current document).
    // Parameters: table, upper bound (updated), lower bound, word, document number.
    bool insert_Word_List(Word_item lyrics_head[], int& upper, int bound, char* elem, int position);

    // Merge-sorts lyrics_head[left..right] by word, using temp as scratch.
    void Lyric_mergesort(Word_item lyrics_head[], Word_item temp[], int left, int right);

    // Builds the final index list from the first n (sorted) table entries.
    void Analys_Setup_index(Word_item lyrics_head[], int n);

    // Builds the index from the first n table entries, writes it to
    // "Lyrics_Index_List.txt" (one "word#freq@doc,doc,..." line per word) and
    // echoes it to stdout.
    //
    // The list is traversed with a local cursor so that the member still
    // points at the head afterwards (needed by Return_L_H_List and by the
    // destructor). With n <= 0 the file is truncated and nothing is written.
    //
    // NOTE(review): the function has always returned false, even on success;
    // that value is preserved so existing callers see no change.
    bool Lyricsindex_out(int n) {
        const bool has_terms = n > 0;
        if (has_terms) {
            Analys_Setup_index(lyrics_head, n);
        }

        ofstream fout("Lyrics_Index_List.txt", ios::trunc);
        if (!fout) {
            cerr << "无法打开输出文件 Lyrics_Index_List.txt" << endl;
        }
        fout.setf(ios::left);

        for (Index_list* node = has_terms ? L_H_List : nullptr; node != nullptr; node = node->next) {
            fout << node->word << "#" << node->freq << "@";
            cout << node->word << " 出现在";

            // Comma-separated document numbers, no trailing comma.
            for (Word_Doc* doc = node->head_docID; doc != nullptr; doc = doc->next) {
                fout << doc->text_number;
                cout << doc->text_number;
                if (doc->next != nullptr) {
                    fout << ",";
                    cout << ",";
                }
            }
            fout << endl;
            cout << " 号文档,频率为 " << node->freq << endl;
        }

        fout.close();
        return false;
    }

    // Prints the first n table entries with the document each one came from.
    void print(int n) {
        for (int i = 0; i < n; i++) {
            cout << lyrics_head[i].word << " 出现在"
                 << lyrics_head[i].head_docID->text_number << " 号文档" << endl;
        }
    }
};

#endif  // ANALYSIS_LYRICS_H
再然后是类的具体的.cpp文件:
#include<fstream> #include<string> #include<iomanip> #include"Analysis_Lyrics.h" //打开文件输入歌词,参数分别为:文件名,词项表的上界,词项表的下界,文档的编号。调用insert_Word_List(Word_item lyrics_head[], int &upper,int bound, char* elem,int position),最终返回词项表 void Lyric_Index_Analysis::Lyrics_input(string filename, int& upper, int bound, int number) { //lyrics_head = new Word_item[Max_size]; ifstream fin(filename); if (!fin.is_open()) { cout << "文件读取失败!\n"; exit(0); } string str; getline(fin, str); //遍历整个文档,每次读取一行,然后进行分析 do { cout << str << endl; char c[max_size] = { ‘\0‘ }; int i = 0, ic = 0; for (i = 0; str[i] != ‘]‘; i++); for (int j = i + 1; str[j] != ‘\r‘&&str[j] != ‘\n‘&&str[j] != ‘\0‘; j++) { //去掉引号后面的字符,但是如果是t的话就不去 if ((int)str[j] == 39) { while (str[j] != ‘ ‘&&str[j] != ‘\r‘&&str[j] != ‘\n‘&&str[j] != ‘\0‘) { j++; if (str[j] == ‘t‘) { j--; break; } } if (str[j] == ‘\r‘ || str[j] == ‘\n‘ || str[j] == ‘\0‘) break; } //除去大小写 if (((int)str[j] >= 65) && ((int)str[j] <= 91)) c[ic++] = (int)str[j] + 32; else c[ic++] = str[j]; //cout << "daxiao--------------->" << (int)str[j] << endl; } // cout <<"分割后的字符串: "<< c << endl; const char *d = "[] -;,:/?!.()";//以这些字符为分界符 char *p = NULL; char *next_p = NULL; p = strtok_s(c, d, &next_p); while (p) { insert_Word_List(lyrics_head, upper, bound, p, number); // cout <<"上标是"<<upper<< "分出来了:" << lyrics_head[upper-1].word << endl; p = strtok_s(NULL, d, &next_p); } getline(fin, str); } while (!fin.eof()); fin.close(); }; //将单词插入词项表,如果不在就插入,在则直接退出(仅限于当前的文档),参数为:词项表,上界,下界,单词,文档的编号 bool Lyric_Index_Analysis::insert_Word_List(Word_item lyrics_head[], int& upper, int bound, char* elem, int position) { for (int i = bound; i<upper; i++) { if (lyrics_head[i].word == elem) return false; } lyrics_head[upper].head_docID = new Word_Doc; lyrics_head[upper].head_docID->text_number = position; lyrics_head[upper].head_docID->next = nullptr; lyrics_head[upper].word = elem; //cout << "分出来了(后面):" << lyrics_head[upper].word << endl; upper++; return true; }; //对词项的表进行归并排序 void 
Lyric_Index_Analysis::Lyric_mergesort(Word_item lyrics_head[], Word_item temp[], int left, int right) { int i, j, k, mid = (left + right) / 2; if (left == right) return; Lyric_mergesort(lyrics_head, temp, left, mid); Lyric_mergesort(lyrics_head, temp, mid + 1, right); for (i = mid; i >= left; i--) temp[i] = lyrics_head[i]; for (j = 1; j <= right - mid; j++) temp[right - j + 1] = lyrics_head[j + mid]; for (i = left, j = right, k = left; k <= right; k++) if (temp[i].word <= temp[j].word) lyrics_head[k] = temp[i++]; else lyrics_head[k] = temp[j--]; }; //对词项建立最终的索引表 void Lyric_Index_Analysis::Analys_Setup_index(Word_item lyrics_head[], int n) { Word_Doc *temp;//用于每个词项出现在文档中的位置的索引 Index_list* t_L_H_List = L_H_List; t_L_H_List->word = lyrics_head[0].word; t_L_H_List->freq = 1; t_L_H_List->head_docID = lyrics_head[0].head_docID; t_L_H_List->next = nullptr; temp = t_L_H_List->head_docID; cout << "单词是" << t_L_H_List->word << " 出现在 " << temp->text_number << " 号文档,此时频率是" << t_L_H_List->freq << endl; for (int i = 1; i < n; i++) { while (lyrics_head[i - 1].word == lyrics_head[i].word) { temp->next = lyrics_head[i].head_docID; temp = temp->next; cout << " 词项和上一个相等,出现在" << temp->text_number << " 号文档,此时频率是" << t_L_H_List->freq + 1 << endl; t_L_H_List->freq++; ++i; } if (i == n) break; Index_list* temp_L_H_List = new Index_list; temp_L_H_List->word = lyrics_head[i].word; temp_L_H_List->freq = 1; temp_L_H_List->head_docID = lyrics_head[i].head_docID; temp = temp_L_H_List->head_docID; temp_L_H_List->next = nullptr; t_L_H_List->next = temp_L_H_List; t_L_H_List = t_L_H_List->next; cout << "单词是" << t_L_H_List->word << " 出现在 " << temp->text_number << " 号文档,此时频率是" << t_L_H_List->freq << endl; } };
c++下lrc歌词文件检索(自己写的检索歌词文件,记录点滴)
标签:
原文地址:http://www.cnblogs.com/1996313xjf/p/5911311.html