贴上一个文档,是写这个程序的基本的思路,当然具体的程序和这个有一些出入,但是大体上一样。求批评指正。
/*目标:在一个文件夹下把所有的lrc歌词文件用程序导入,进行分析,最后的结果用TXT表示出来。*/
/*分析:只要建立了倒排索引就可以很容易把歌词的索引文件输入到il.txt中,所以应当有一个函数实现输入倒排索引,而后建立文件输出txt:
0.bool Lyricsindex_out(Lyric_index_list index_list[],int m ){}
//相关的有:
struct Word_item{//词项头存储
String word;
int freq=0;
Word_Doc *head_docID;
};Word_item lyrics_head=new Word_item[];
struct Word_Doc{//词项位置存储
int text_number;
Word_Doc *next;
}*head_docID;
Word_Doc *temp;
Fstream fout(“index_lyrics.txt”);
Fout<<”doc”<<setw(12)<<”freq”<<setw(12)<<”->”<<setw(12)<<”list”<<endl;
for(int i=0,int j=0;index_list[i]->next!=NULL;i++){
Fout<<index_list[i]->word<<setw(12)<<index_list[i]->freq<<setw(12)<<”->”<<setw(12);
temp=index_list[i]->head_docID;
For( j=0;temp!=NULL;j++){
Fout<<temp->text_number<<”,”;
temp=temp->next;
}
}
输出txt文件的写法:
1.应当有一个类,Class Lyric_Index_Analysis
处理从文件夹中的输入的lrc歌词文件,并且过滤掉”[...]”,最后只留下歌词的主体,然后把歌词的主体进行分析,以空格,回车,标点符号为界限,将每个词解析出来,
存入
lyrics_head[],
例如:
lyrics_head[0].word=xiejiang;
lyrics_head[0].freq=1;
lyrics_head[0].head_docID=new Word_item;
lyrics_head[0].head_docID->text_number=0;
lyrics_head[0].head_docID->next=NULL;
每处理一个文档时将最终生成的词项头的数组的最大值记住,这可以设置成一个函数,
Int get_lyrics_head(Word_item lyrics_head[],string filename)
返回的时候返回数组和最大值
2.然后这个类处理完所有的文档之后,应当有一个
lyrics_head[]的数组,大小为n,
然后建立一个函数将所有的词项按照词项的字母序归并排序
之后仍然返回数组
Void Lyric_mergesort(Word_item lyrics_head[],int left,int right)
3.到最后一步,Index_list Analy_Setup_index (Word_item lyrics_head[],int n)
建立链表
输入lyrics_head[]和n
对数组进行遍历,如果两个挨着的数内的word相同那么就将它们进行合并
用一个指针指向lyrics_head[]和Word_item,直到这个数组相同的部分已经不存在了
然后将两个放在一个链表里。
因为归并排序是稳定的,所以可以直接将两个lyrics_head[].word连起来。如下方法:例如
struct Index_list{//用于生成最终的索引表
String word;
int freq=0;
Word_Doc *head_docID;
Index_list* next;
};
Word_Doc* lyrics_doc; //=new Word_Doc[];
Index_list Lyric_index_list=new Index_list[];
Lyric_index_list[0].word=lyrics_head[0].word;
Lyric_index_list[0].head_docID=lyrics_head[0].head_docID;
Lyric_index_list[0].freq=lyrics_head[0].freq;
Lyric_index_list[0]->next=NULL;
Lyrics_doc=lyrics_head[0].head_docID;
For(int i=0,int j=0;i<n;i++){
If(Lyric_index_list[j].word==lyrics_head[i].word){
Lyric_index_list[j].freq++;
Lyrics_doc->next=lyrics_head[i].head_docID;
Lyrics_doc=Lyrics_doc->next;
}
Else{
J++;
Lyric_index_list[j-1].next=Lyric_index_list[j];
Lyric_index_list[j]->next=NULL;
Lyric_index_list[j].word=lyrics_head[i].word;
Lyric_index_list[j].head_docID=lyrics_head[i].head_docID;
Lyric_index_list[j].freq=lyrics_head[i].freq;
}
}
Return Lyric_index_list;
*/
首先main所在的.cpp:
#include<iostream>
#include<fstream>
#include<string>
#include<iomanip>
#include <stdio.h>
#include<io.h>
#include"Analysis_Lyrics.h"
using namespace std;
int main(){
Lyric_Index_Analysis Lyric_A;//建立歌词分析的类
string str = "\0";
int bound = 0, upper = 0, i = 1;//有n个数,则有上标为n,这是词项表的上下界
Word_item *temp = new Word_item[Max_size - 500];//用于归并交换的需要
struct _finddata_t fileinfo;//文件处理,寻找默认目录下的歌词文件
long pFile;//调用成功返回0,否则返回-1
// cout << "第 " << pFile << " 个文档是:" << fileinfo.name << " 编号为:1" << endl;
if ((pFile = _findfirst("*.lrc", &fileinfo)) == -1) {
cout << "不存在.lrc文件" << endl;
return 0;
}
else {
cout << "第 " << i++ << " 个文档是:" << fileinfo.name << " 编号为:1" << endl;
Lyric_A.Lyrics_input(fileinfo.name, upper, bound, 1);//输入文件调用
cout << "下界为->" << bound << "上界为->" << upper << endl;
while (_findnext(pFile, &fileinfo) == 0) {
cout << "第 " << i << " 个文档是:" << fileinfo.name << " 编号为:" << i << endl;
bound = upper;
Lyric_A.Lyrics_input(fileinfo.name, upper, bound, i);
cout << "下界为->" << bound << "上界为->" << upper << endl;
i++;
}
}
_findclose(pFile);
bound = 0;
cout << "总的词项表的下界为->" << bound << "上界为->" << upper << endl;
Lyric_A.Lyric_mergesort(Lyric_A.Return_lyrics_head(),temp, bound, upper - 1);
Lyric_A.print(upper);
Lyric_A.Lyricsindex_out(upper);
}
//如下的小代码是遍历一个文件夹下的文档的程序
/*#include<iostream>
#include <io.h>
using namespace std;
int main()
{
struct _finddata_t fileinfo;
long hFile;
if ((hFile = _findfirst("*.lrc", &fileinfo)) == -1)
return -1;
else {
cout << fileinfo.name << endl;
while (_findnext(hFile, &fileinfo) == 0){
cout << fileinfo.name << endl;
}
}
_findclose(hFile);
return 0;
}
*/
接着是处理歌词文件的类.h:
#include<iostream>
using namespace std;
static const int Max_size = 4000;
static const int max_size = 200;
// Node of a singly linked posting list: records one document in which a term
// occurs and links to the next occurrence.
struct Word_Doc {
    int text_number = 0;       // number of the document containing the term (was left uninitialized)
    Word_Doc *next = nullptr;  // next posting for the same term, nullptr at the end of the list
};
// Entry of the flat term table filled during the first pass over the lyric
// files: a word together with the head of its document list.
struct Word_item {
    string word;                    // the term itself
    Word_Doc* head_docID{nullptr};  // first document node for this term, nullptr until assigned
};
// Node of the final inverted index, kept as a singly linked list with one
// node per distinct word.
struct Index_list {
    string word;                    // the term
    int freq{0};                    // number of documents the term appears in
    Word_Doc* head_docID{nullptr};  // head of the list of documents containing the term
    Index_list* next{nullptr};      // next term in the index, nullptr at the end
};
class Lyric_Index_Analysis {//分析lrc歌词文件的主类,用各个函数将歌词文档分析出来然后建立成索引文档
private:
Word_item* lyrics_head;
Index_list* L_H_List;
public:
int bound = 0, upper = 0;//代表目前词项表的上界和下界,上界和下界随着文档数的处理有所不同
Lyric_Index_Analysis() {
lyrics_head = new Word_item[Max_size];
L_H_List = new Index_list;//建立一个索引链表
};
~Lyric_Index_Analysis() {
delete[]lyrics_head;
Index_list*temp = L_H_List;
while (temp != nullptr) {
temp = L_H_List->next;
delete L_H_List;
}
};
Word_item* Return_lyrics_head() {
return lyrics_head;
}
Index_list* Return_L_H_List() {
return L_H_List;
}
void Lyrics_input(string filename, int& upper, int bound, int number);//打开文件输入歌词,参数分别为:文件名,词项表的上界,词项表的下界,文档的编号。调用insert_Word_List(Word_item lyrics_head[], int &upper,int bound, char* elem,int position),最终返回词项表
bool insert_Word_List(Word_item lyrics_head[], int& upper, int bound, char* elem, int position);//将单词插入词项表,如果不在就插入,在则直接退出(仅限于当前的文档),参数为:词项表,上界,下界,单词,文档的编号
void Lyric_mergesort(Word_item lyrics_head[], Word_item temp[], int left, int right);//对词项的表进行归并排序
void Analys_Setup_index(Word_item lyrics_head[], int n);//对词项建立最终的索引表
bool Lyricsindex_out(int n) {//Index_list L_index_list[], int m){//将最终的索引程序输出来,输入在Lyrics_Index_List.txt中
Analys_Setup_index(lyrics_head, n);
ofstream fout("Lyrics_Index_List.txt", ios::trunc);
fout.setf(ios::left);
//fout << setw(20) << "word" << setw(5) << "freq" << setw(3) << " " << "docID" << endl;
while (L_H_List != nullptr) {
Word_Doc* temp = L_H_List->head_docID;
fout << L_H_List->word << "#" << L_H_List->freq << "@";
cout << L_H_List->word << " 出现在";
while (temp->next != nullptr) {
cout << temp->text_number << ",";
fout << temp->text_number << ",";
temp = temp->next;
}
fout << temp->text_number << endl;
cout << temp->text_number;
cout << " 号文档,频率为 " << L_H_List->freq << endl;
L_H_List = L_H_List->next;
}
fout.close();
return false;
}
void print(int n) {
for (int i = 0; i < n; i++) {
cout << lyrics_head[i].word << " 出现在" << lyrics_head[i].head_docID->text_number << " 号文档" << endl;
}
}
};
再然后是类的具体的.cpp文件:
#include<fstream>
#include<string>
#include<iomanip>
#include"Analysis_Lyrics.h"
//打开文件输入歌词,参数分别为:文件名,词项表的上界,词项表的下界,文档的编号。调用insert_Word_List(Word_item lyrics_head[], int &upper,int bound, char* elem,int position),最终返回词项表
void Lyric_Index_Analysis::Lyrics_input(string filename, int& upper, int bound, int number) {
//lyrics_head = new Word_item[Max_size];
ifstream fin(filename);
if (!fin.is_open()) {
cout << "文件读取失败!\n";
exit(0);
}
string str;
getline(fin, str);
//遍历整个文档,每次读取一行,然后进行分析
do {
cout << str << endl;
char c[max_size] = { ‘\0‘ };
int i = 0, ic = 0;
for (i = 0; str[i] != ‘]‘; i++);
for (int j = i + 1; str[j] != ‘\r‘&&str[j] != ‘\n‘&&str[j] != ‘\0‘; j++) {
//去掉引号后面的字符,但是如果是t的话就不去
if ((int)str[j] == 39) {
while (str[j] != ‘ ‘&&str[j] != ‘\r‘&&str[j] != ‘\n‘&&str[j] != ‘\0‘) {
j++;
if (str[j] == ‘t‘) {
j--;
break;
}
}
if (str[j] == ‘\r‘ || str[j] == ‘\n‘ || str[j] == ‘\0‘)
break;
}
//除去大小写
if (((int)str[j] >= 65) && ((int)str[j] <= 91))
c[ic++] = (int)str[j] + 32;
else
c[ic++] = str[j];
//cout << "daxiao--------------->" << (int)str[j] << endl;
}
// cout <<"分割后的字符串: "<< c << endl;
const char *d = "[] -;,:/?!.()";//以这些字符为分界符
char *p = NULL;
char *next_p = NULL;
p = strtok_s(c, d, &next_p);
while (p)
{
insert_Word_List(lyrics_head, upper, bound, p, number);
// cout <<"上标是"<<upper<< "分出来了:" << lyrics_head[upper-1].word << endl;
p = strtok_s(NULL, d, &next_p);
}
getline(fin, str);
} while (!fin.eof());
fin.close();
};
// Inserts a word into the term table unless the current document already
// contributed it.
//
// Parameters:
//   lyrics_head - term table
//   upper       - one past the last used slot; incremented on insertion
//   bound       - first slot belonging to the current document
//   elem        - NUL-terminated word to insert
//   position    - document number recorded for the word
//
// Returns true when the word was inserted; false when it already exists in
// [bound, upper), when `elem` is null, or when the table is full.
bool Lyric_Index_Analysis::insert_Word_List(Word_item lyrics_head[], int& upper, int bound, char* elem, int position) {
    if (elem == nullptr)
        return false;

    // Only the slots of the current document are searched, so the same word
    // is stored once per document.
    for (int i = bound; i < upper; i++) {
        if (lyrics_head[i].word == elem)
            return false;
    }

    // The table has Max_size slots; writing slot `upper` beyond that would
    // corrupt memory.
    if (upper >= Max_size) {
        cerr << "词项表已满,忽略单词: " << elem << endl;
        return false;
    }

    Word_Doc* doc = new Word_Doc;
    doc->text_number = position;
    doc->next = nullptr;
    lyrics_head[upper].head_docID = doc;
    lyrics_head[upper].word = elem;
    upper++;
    return true;
}
// Sorts lyrics_head[left..right] (both inclusive) by word in ascending order
// using a stable top-down merge sort.
//
// Parameters:
//   lyrics_head - term table to sort in place
//   temp        - scratch array that must be valid for indices left..right
//   left, right - inclusive bounds of the range to sort
//
// An empty or single-element range (left >= right) is left untouched; the
// former `left == right` test recursed without end for left > right, e.g.
// when called with right = -1 on an empty table.
void Lyric_Index_Analysis::Lyric_mergesort(Word_item lyrics_head[], Word_item temp[], int left, int right) {
    if (left >= right)
        return;

    const int mid = left + (right - left) / 2;
    Lyric_mergesort(lyrics_head, temp, left, mid);
    Lyric_mergesort(lyrics_head, temp, mid + 1, right);

    // Copy both sorted halves to the scratch array, then merge them back.
    for (int k = left; k <= right; ++k)
        temp[k] = lyrics_head[k];

    int i = left;     // cursor in the left half  [left, mid]
    int j = mid + 1;  // cursor in the right half [mid + 1, right]
    for (int k = left; k <= right; ++k) {
        // On ties the left element goes first, which keeps equal words in
        // their original (document) order. The earlier merge, which copied
        // the right half reversed, could swap equal words.
        if (j > right || (i <= mid && temp[i].word <= temp[j].word))
            lyrics_head[k] = temp[i++];
        else
            lyrics_head[k] = temp[j++];
    }
}
// Builds the final index list (L_H_List) from the first n entries of the
// sorted term table.
//
// Consecutive entries with the same word are merged into one index node:
// their document nodes are chained into a single posting list and `freq`
// counts how many entries were merged. Progress is echoed to cout.
//
// Parameters:
//   lyrics_head - term table, expected to be sorted by word
//   n           - number of valid entries in lyrics_head
//
// With n <= 0 the index is reset to a single empty head node.
void Lyric_Index_Analysis::Analys_Setup_index(Word_item lyrics_head[], int n) {
    if (L_H_List == nullptr)
        L_H_List = new Index_list;

    // Release index nodes left over from an earlier build; only the head
    // node is reused.
    for (Index_list* stale = L_H_List->next; stale != nullptr;) {
        Index_list* following = stale->next;
        delete stale;
        stale = following;
    }
    L_H_List->next = nullptr;

    if (lyrics_head == nullptr || n <= 0) {
        // No entries: there is no lyrics_head[0] to read.
        L_H_List->word.clear();
        L_H_List->freq = 0;
        L_H_List->head_docID = nullptr;
        return;
    }

    Index_list* entry = L_H_List;  // index node of the word currently being merged
    entry->word = lyrics_head[0].word;
    entry->freq = 1;
    entry->head_docID = lyrics_head[0].head_docID;
    Word_Doc* tail = entry->head_docID;  // last node of the current posting list
    cout << "单词是" << entry->word << " 出现在 " << tail->text_number << " 号文档,此时频率是" << entry->freq << endl;

    // Every index is checked against n before it is read; the former inner
    // loop could step onto lyrics_head[n].
    for (int i = 1; i < n; ++i) {
        if (lyrics_head[i].word == entry->word) {
            // Same word as the current index node: append its document.
            tail->next = lyrics_head[i].head_docID;
            tail = tail->next;
            cout << " 词项和上一个相等,出现在" << tail->text_number << " 号文档,此时频率是" << entry->freq + 1 << endl;
            entry->freq++;
            continue;
        }

        // A new word starts: append a fresh index node.
        Index_list* fresh = new Index_list;
        fresh->word = lyrics_head[i].word;
        fresh->freq = 1;
        fresh->head_docID = lyrics_head[i].head_docID;
        fresh->next = nullptr;
        entry->next = fresh;
        entry = fresh;
        tail = entry->head_docID;
        cout << "单词是" << entry->word << " 出现在 " << tail->text_number << " 号文档,此时频率是" << entry->freq << endl;
    }
}
c++下lrc歌词文件检索(自己写的检索歌词文件,记录点滴)
原文:http://www.cnblogs.com/1996313xjf/p/5911311.html