码迷,mamicode.com
首页 > 编程语言 > 详细

c++下lrc歌词文件检索(自己写的检索歌词文件,记录点滴)

时间:2016-09-27 01:48:46      阅读:238      评论:0      收藏:0      [点我收藏+]

标签:

贴上一个文档,是写这个程序的基本的思路,当然具体的程序和这个有一些出入,但是大体上一样。求批评指正。

/*目标:在一个文件夹下把所有的lrc歌词文件用程序导入,进行分析,最后的结果用TXT表示出来。*/

/*分析:只要建立了倒排索引就可以很容易把歌词的索引文件输入到il.txt中,所以应当有一个函数实现输入倒排索引,而后建立文件输出txt:

0.bool Lyricsindex_out(Lyric_index_list index_list[],int m ){}

//相关的有:

struct Word_item{//词项头存储

String word;

int freq=0;

 Word_Doc *head_docID;

};Word_item lyrics_head=new Word_item[];

 

struct Word_Doc{//词项位置存储

int text_number;

Word_Doc *next;

}*head_docID;

 

Word_Doc *temp;

Fstream  fout(“index_lyrics.txt”);

Fout<<”doc”<<setw(12)<<”freq”<<setw(12)<<”->”<<setw(12)<<”list”<<endl;

 

for(int i=0,int j=0;index_list[i]->next!=NULL;i++){

Fout<<index_list[i]->word<<setw(12)<<index_list[i]->freq<<setw(12)<<”->”<<setw(12);

temp=index_list[i]->head_docID;

For( j=0;temp!=NULL;j++){

Fout<<temp->text_number<<”,”;

temp=temp->next;

}

}

输出txt文件的写法:

1.应当有一个类,Class Lyric_Index_Analysis

处理从文件夹中的输入的lrc歌词文件,并且过滤掉”[...]”,最后只留下歌词的主体,然后把歌词的主体进行分析,以空格,回车,标点符号为界限,将每个词解析出来,

 

存入

lyrics_head[],

例如:

lyrics_head[0].word=xiejiang;

lyrics_head[0].freq=1;      

lyrics_head[0].head_docID=new Word_Doc;  

lyrics_head[0].head_docID->text_number=0;

lyrics_head[0].head_docID->next=NULL;

每处理一个文档时将最终生成的词项头的数组的最大值记住,这可以设置成一个函数,

Int get_lyrics_head(Word_item lyrics_head[],string  filename)

返回的时候返回数组和最大值

2.然后这个类处理完所有的文档之后,应当有一个

lyrics_head[]的数组,大小为n,

然后建立一个函数将所有的词项按照词项的字母序归并排序

之后仍然返回数组

Void Lyric_mergesort(Word_item lyrics_head[],int left,int right)

3.到最后一步,Index_list  Analy_Setup_index (Word_item lyrics_head[],int n)

建立链表

输入lyrics_head[]和n

对数组进行遍历,如果两个挨着的数内的word相同那么就将他们的进行合并

 

用一个指针指向lyrics_head[]和Word_item,直到这个数组相同的部分已经不存在了

然后将两个放在一个链表里。

因为归并排序是稳定的,所以可以直接将两个lyrics_head[].word连起来。如下方法:例如

 

struct Index_list{//用于生成最终的索引表

String word;

int freq=0;

Word_Doc *head_docID;

Index_list* next;

};

 

Word_Doc* lyrics_doc;                //=new Word_Doc[];

Index_list Lyric_index_list=new Index_list[];

 

Lyric_index_list[0].word=lyrics_head[0].word;

Lyric_index_list[0].head_docID=lyrics_head[0].head_docID;

Lyric_index_list[0].freq=lyrics_head[0].freq;

Lyric_index_list[0]->next=NULL;

 

Lyrics_doc=lyrics_head[0].head_docID;

 

For(int i=0,int j=0;i<n;i++){

If(Lyric_index_list[j].word==lyrics_head[i].word){

 

Lyric_index_list[j].freq++;

Lyrics_doc->next=lyrics_head[i].head_docID;

Lyrics_doc=Lyrics_doc->next;

}

Else{

J++;

Lyric_index_list[j-1].next=Lyric_index_list[j];

Lyric_index_list[j]->next=NULL;

Lyric_index_list[j].word=lyrics_head[i].word;

Lyric_index_list[j].head_docID=lyrics_head[i].head_docID;

Lyric_index_list[j].freq=lyrics_head[i].freq;

}

}

 Return Lyric_index_list;

*/

好了废话少说贴代码

首先main所在的.cpp:

#include<iostream>
#include<fstream>
#include<string>
#include<iomanip>
#include <stdio.h>
#include<io.h>
#include"Analysis_Lyrics.h"
using namespace std;

int main(){
	Lyric_Index_Analysis Lyric_A;//建立歌词分析的类
	string str = "\0";
	int bound = 0, upper = 0, i = 1;//有n个数,则有上标为n,这是词项表的上下界
	Word_item *temp = new Word_item[Max_size - 500];//用于归并交换的需要
	struct _finddata_t fileinfo;//文件处理,寻找默认目录下的歌词文件
	long pFile;//调用成功返回0,否则返回-1
//	cout << "第 " << pFile << " 个文档是:" << fileinfo.name << "  编号为:1" << endl;
	if ((pFile = _findfirst("*.lrc", &fileinfo)) == -1) {
		cout << "不存在.lrc文件" << endl;
		return 0;
	}
	else {
		cout << "第 " << i++ << " 个文档是:" << fileinfo.name << "  编号为:1" << endl;
		Lyric_A.Lyrics_input(fileinfo.name, upper, bound, 1);//输入文件调用
		cout << "下界为->" << bound << "上界为->" << upper << endl;
		while (_findnext(pFile, &fileinfo) == 0) {
			cout << "第 " << i << " 个文档是:" << fileinfo.name << "  编号为:" << i << endl;
			bound = upper;
			Lyric_A.Lyrics_input(fileinfo.name, upper, bound, i);
			cout << "下界为->" << bound << "上界为->" << upper << endl;
			i++;
		}
	}
	_findclose(pFile);
	bound = 0;
	cout << "总的词项表的下界为->" << bound << "上界为->" << upper << endl;
	Lyric_A.Lyric_mergesort(Lyric_A.Return_lyrics_head(),temp, bound, upper - 1);
	Lyric_A.print(upper);
	Lyric_A.Lyricsindex_out(upper);
}
//如下的小代码是遍历一个文件夹下的文档的程序
/*#include<iostream>
#include <io.h>  
using namespace std;
int main()
{
	struct _finddata_t fileinfo;
	long hFile;
	if ((hFile = _findfirst("*.lrc", &fileinfo)) == -1)
		return -1;
	else {
		cout << fileinfo.name << endl;
		while (_findnext(hFile, &fileinfo) == 0){
			
			cout << fileinfo.name << endl;
		}
	}
	_findclose(hFile);
	return 0;
}
*/

接着是处理歌词文件的类.h:

#include<iostream>
using namespace std;
// Number of slots in the term table allocated by Lyric_Index_Analysis.
static constexpr int Max_size = 4000;
// Size constant for fixed per-line character buffers.
static constexpr int max_size = 200;
// One posting of a term: the number of a document that contains the word,
// linked to the next posting of the same word.
struct Word_Doc {
	int text_number = 0;      // document number; 0 until a real number is assigned
	Word_Doc *next = nullptr; // next posting, nullptr at the end of the chain
};

// Entry of the raw term table filled while the documents are read:
// a word together with the head of its posting chain.
struct Word_item {
	std::string word;              // the term itself
	Word_Doc* head_docID{nullptr}; // first posting of the term, nullptr while unset
};

// Node of the final index list: a word, the number of documents it occurs in,
// the chain of those documents, and the link to the next word.
struct Index_list {
	std::string word;              // the term
	int freq{0};                   // number of postings chained under head_docID
	Word_Doc* head_docID{nullptr}; // first posting of the term
	Index_list* next{nullptr};     // next term, nullptr at the end of the list
};

class Lyric_Index_Analysis {//分析lrc歌词文件的主类,用各个函数将歌词文档分析出来然后建立成索引文档
private:
	Word_item* lyrics_head;
	Index_list* L_H_List;
public:
	int bound = 0, upper = 0;//代表目前词项表的上界和下界,上界和下界随着文档数的处理有所不同
	Lyric_Index_Analysis() {
		lyrics_head = new Word_item[Max_size];
		L_H_List = new Index_list;//建立一个索引链表
	};
	~Lyric_Index_Analysis() {
		delete[]lyrics_head;
		Index_list*temp = L_H_List;
		while (temp != nullptr) {
			temp = L_H_List->next;
			delete L_H_List;
		}
	};

	Word_item* Return_lyrics_head() {
		return lyrics_head;
	}
	Index_list* Return_L_H_List() {
		return L_H_List;
	}
	void Lyrics_input(string filename, int& upper, int bound, int number);//打开文件输入歌词,参数分别为:文件名,词项表的上界,词项表的下界,文档的编号。调用insert_Word_List(Word_item lyrics_head[], int &upper,int bound, char* elem,int position),最终返回词项表
	bool insert_Word_List(Word_item lyrics_head[], int& upper, int bound, char* elem, int position);//将单词插入词项表,如果不在就插入,在则直接退出(仅限于当前的文档),参数为:词项表,上界,下界,单词,文档的编号
	void Lyric_mergesort(Word_item lyrics_head[], Word_item temp[], int left, int right);//对词项的表进行归并排序
	void  Analys_Setup_index(Word_item lyrics_head[], int n);//对词项建立最终的索引表
	bool Lyricsindex_out(int n) {//Index_list L_index_list[], int m){//将最终的索引程序输出来,输入在Lyrics_Index_List.txt中
		Analys_Setup_index(lyrics_head, n);
		ofstream fout("Lyrics_Index_List.txt", ios::trunc);
		fout.setf(ios::left);
		//fout << setw(20) << "word" << setw(5) << "freq" << setw(3) << "       " << "docID" << endl;
		while (L_H_List != nullptr) {
			Word_Doc* temp = L_H_List->head_docID;
			fout << L_H_List->word << "#" << L_H_List->freq << "@";
			cout << L_H_List->word << " 出现在";
			while (temp->next != nullptr) {
				cout << temp->text_number << ",";
				fout << temp->text_number << ",";
				temp = temp->next;
			}
			fout << temp->text_number << endl;
			cout << temp->text_number;
			cout << "  号文档,频率为 " << L_H_List->freq << endl;
			L_H_List = L_H_List->next;
		}
		fout.close();
		return false;
	}
	void print(int n) {
		for (int i = 0; i < n; i++) {
			cout << lyrics_head[i].word << " 出现在" << lyrics_head[i].head_docID->text_number << " 号文档" << endl;
		}
	}
};

  再然后是类的具体的.cpp文件:

#include<fstream>
#include<string>
#include<iomanip>
#include"Analysis_Lyrics.h"
//打开文件输入歌词,参数分别为:文件名,词项表的上界,词项表的下界,文档的编号。调用insert_Word_List(Word_item lyrics_head[], int &upper,int bound, char* elem,int position),最终返回词项表

void Lyric_Index_Analysis::Lyrics_input(string filename, int& upper, int bound, int number) {
	//lyrics_head = new Word_item[Max_size];

	ifstream fin(filename);
	if (!fin.is_open()) {
		cout << "文件读取失败!\n";
		exit(0);
	}
	string str;
	getline(fin, str);
	//遍历整个文档,每次读取一行,然后进行分析
	do {
		cout << str << endl;
		char c[max_size] = { ‘\0‘ };
		int i = 0, ic = 0;
		for (i = 0; str[i] != ‘]‘; i++);
		for (int j = i + 1; str[j] != ‘\r‘&&str[j] != ‘\n‘&&str[j] != ‘\0‘; j++) {
			//去掉引号后面的字符,但是如果是t的话就不去
			if ((int)str[j] == 39) {
				while (str[j] != ‘ ‘&&str[j] != ‘\r‘&&str[j] != ‘\n‘&&str[j] != ‘\0‘) {
					j++;
					if (str[j] == ‘t‘) {
						j--;
						break;
					}
				}
				if (str[j] == ‘\r‘ || str[j] == ‘\n‘ || str[j] == ‘\0‘)
					break;
			}
			//除去大小写
			if (((int)str[j] >= 65) && ((int)str[j] <= 91))
				c[ic++] = (int)str[j] + 32;
			else
				c[ic++] = str[j];
			//cout << "daxiao--------------->" << (int)str[j] << endl;
		}

		//	cout <<"分割后的字符串:  "<< c << endl;
		const char *d = "[] -;,:/?!.()";//以这些字符为分界符
		char *p = NULL;
		char *next_p = NULL;
		p = strtok_s(c, d, &next_p);
		while (p)
		{
			insert_Word_List(lyrics_head, upper, bound, p, number);
			//	cout <<"上标是"<<upper<< "分出来了:" << lyrics_head[upper-1].word << endl;
			p = strtok_s(NULL, d, &next_p);
		}
		getline(fin, str);
	} while (!fin.eof());
	fin.close();
};
//将单词插入词项表,如果不在就插入,在则直接退出(仅限于当前的文档),参数为:词项表,上界,下界,单词,文档的编号
// Adds the word `elem` to the term table unless the slice [bound, upper)
// already contains it, so each document contributes a word only once.
//
// lyrics_head: term table; assumed to provide Max_size slots, as the table
//              allocated by the class constructor does
// upper:       index of the first free slot, incremented after an insertion
// bound:       first slot that belongs to the current document
// elem:        NUL-terminated word
// position:    document number stored in the new posting
//
// Returns true when the word was inserted; false when it was already present,
// when an argument is null, or when the table has no free slot left.
bool Lyric_Index_Analysis::insert_Word_List(Word_item lyrics_head[], int& upper, int bound, char* elem, int position) {
	if (lyrics_head == nullptr || elem == nullptr)
		return false;
	for (int i = bound; i < upper; i++) {
		if (lyrics_head[i].word == elem)
			return false;
	}
	// Writing past the last slot would corrupt memory, so refuse instead.
	if (upper < 0 || upper >= Max_size) {
		std::cerr << "词项表已满,无法插入: " << elem << std::endl;
		return false;
	}
	Word_Doc* posting = new Word_Doc;
	posting->text_number = position;
	posting->next = nullptr;
	lyrics_head[upper].head_docID = posting;
	lyrics_head[upper].word = elem;
	upper++;
	return true;
}

//对词项的表进行归并排序
// Sorts lyrics_head[left..right] (both inclusive) by word in ascending order
// with a top-down merge sort.
//
// temp must offer at least right + 1 slots; it is used as scratch space.
// The merge is stable: entries with equal words keep their relative order, so
// the documents of a word stay in the order in which they were read.
// An empty or single-element range (left >= right) is left untouched.
void Lyric_Index_Analysis::Lyric_mergesort(Word_item lyrics_head[], Word_item temp[], int left, int right) {
	if (left >= right)
		return;
	const int mid = left + (right - left) / 2;
	Lyric_mergesort(lyrics_head, temp, left, mid);
	Lyric_mergesort(lyrics_head, temp, mid + 1, right);
	for (int k = left; k <= right; k++)
		temp[k] = lyrics_head[k];
	int i = left;    // next unmerged entry of the left half
	int j = mid + 1; // next unmerged entry of the right half
	for (int k = left; k <= right; k++) {
		// Take from the left half on ties; this is what keeps the sort stable.
		if (j > right || (i <= mid && temp[i].word <= temp[j].word))
			lyrics_head[k] = temp[i++];
		else
			lyrics_head[k] = temp[j++];
	}
}

//对词项建立最终的索引表
// Builds the index list from the first n entries of the sorted term table.
//
// Neighbouring entries with the same word are merged into one list node: their
// postings are chained together and freq counts them. The list starts at
// L_H_List; nodes left over from an earlier build are released first.
// Every entry is expected to carry a posting (head_docID != nullptr), which is
// what insert_Word_List produces.
// Does nothing when lyrics_head is null or n <= 0.
void Lyric_Index_Analysis::Analys_Setup_index(Word_item lyrics_head[], int n) {
	if (lyrics_head == nullptr || n <= 0)
		return;
	if (L_H_List == nullptr)
		L_H_List = new Index_list;
	// Release the tail of a previous build; only the head node is reused.
	for (Index_list* stale = L_H_List->next; stale != nullptr;) {
		Index_list* following = stale->next;
		delete stale;
		stale = following;
	}
	L_H_List->next = nullptr;

	Index_list* entry = L_H_List;     // node of the word currently being collected
	Word_Doc* posting_tail = nullptr; // last posting chained under `entry`
	for (int i = 0; i < n; i++) {
		Word_Doc* posting = lyrics_head[i].head_docID;
		posting->next = nullptr;// forget links of an earlier build
		if (i > 0 && lyrics_head[i - 1].word == lyrics_head[i].word) {
			// same word as the previous entry: append the posting to the open node
			posting_tail->next = posting;
			posting_tail = posting;
			entry->freq++;
			std::cout << "        词项和上一个相等,出现在" << posting->text_number << " 号文档,此时频率是" << entry->freq << std::endl;
			continue;
		}
		if (i > 0) {
			// a new word starts: open a fresh node behind the current one
			entry->next = new Index_list;
			entry = entry->next;
		}
		entry->word = lyrics_head[i].word;
		entry->freq = 1;
		entry->head_docID = posting;
		entry->next = nullptr;
		posting_tail = posting;
		std::cout << "单词是" << entry->word << " 出现在 " << posting->text_number << " 号文档,此时频率是" << entry->freq << std::endl;
	}
}

  

c++下lrc歌词文件检索(自己写的检索歌词文件,记录点滴)

标签:

原文地址:http://www.cnblogs.com/1996313xjf/p/5911311.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!