进阶实验5-3.4 迷你搜索引擎 (35 分)

程序员文章站 2022-04-28 18:10:47

...

先说点废话，这个题刚开始头铁全部用C写的，写了300多行而去全部是指针操作，但是不管我怎么调试都没办法过最后一个测试点。调试了1个星期后我觉得肯定是太多的指针操作导致了一些很难发现的问题，所有我用C++重写了一遍，用了string,vector,set来避免指针操作，最后终于过了所有测试点。

先讲一下这个题的一些细节处理：

-用这个strtok()函数来分词。

-HashTable的大小大概在10000到30000左右是够的，再大的话会内存超限。

-auto是真的好用，谁用谁知道

-输入可以用getline()函数,别用gets(),用了gets()函数PTA的编译器会直接报错。输出千万别用cout，会超时，用c_str()把string转为c风格的字符串再用printf输出

-hash 函数的话，我是用单词后3位进行hash，用的是比较简单的线形探测法处理冲突.

思路: 开一个结构（nodeA）数组放单词，每个结构中一个变长数组，用来放该单词出现的文件编号和行号。同时还要开一个string类的二维数组存放每个文件的内容.查询的时候，先查询第一个单词的变长数组，再查询下一个单词的变长数组，两个数组根据文件编号求交集，然后再依次和后面的单词的变长数组求交集，最后再输出结果即可。

直接看代码就可以了，代码上基本上函数名就是说明了再做什么事情。写得比较粗糙，如果有什么疑问可以私信我。

画个图表示一下这个思路：（一个格子表示文件编号，另一个格子表示行号，行号是放在一个set里面的。）

进阶实验5-3.4 迷你搜索引擎 (35 分)

/*2019/10/12 迷你搜索引擎 STL重构*/
#include<string>
#include<cstdio>
#include<vector>
#include<iostream>
#include<cstring>
#include<algorithm>
#include<set>
using namespace std;
string fileName[100];
string file[100][100];//第i个文件第j行
const int MAXTABLESIZE = 300009;
struct nodeB
{
	int fileNum;
	set<int> line;
};
struct nodeA
{	
	int lastFile;
	int lastLine;
	string word;
	vector<nodeB> arrayB;
};
/*Hash*/
struct HashTbl
{
	int TableSize;
	nodeA* Cells;
};
typedef struct HashTbl* HashTable;
HashTable CreateTable()
{
	int i;
	HashTable H = new HashTbl;
	H->TableSize = MAXTABLESIZE;
	H->Cells = new nodeA[MAXTABLESIZE];
	for(i=0; i<H->TableSize; i++)
	{
		H->Cells[i].lastFile = -1;
		H->Cells[i].lastLine = -1;
	}
	return H;
}
int Hash(string Key, int P)
{
	int i,len,N;
    unsigned int h = 0;
	for(i=0; i<N; i++)
	{
		h = (h<<5) + (Key[i] - 'a');
	}
	return (h&(32*32*32-1))%P;
}
int Find(HashTable H, string Key)//线形探测法解决冲突
{
	int newPos, curPos,cnt = 0;//cnt表示冲突次数
	newPos = curPos = Hash(Key, H->TableSize);
	while((H->Cells[newPos].lastFile!=-1)&&(H->Cells[newPos].word!=Key))
	{
		cnt++;
		newPos = curPos + cnt;
		if(newPos>=H->TableSize)
			newPos-=H->TableSize;
	}
	return newPos;
}
void Insert(HashTable H, string Key, int file, int line)
{
	nodeB* tmp;
	int pos = Find(H,Key);
	if(H->Cells[pos].lastFile==-1)//表示这个位置为空,那么要先插入单词，再插入文件和行号
	{
		H->Cells[pos].lastFile = file;
		H->Cells[pos].lastLine = line;
		H->Cells[pos].word = Key;//printf("插入了%s\n",Key.c_str());
		tmp = new nodeB;
		tmp->fileNum = file; tmp->line.insert(line);
		H->Cells[pos].arrayB.push_back(*tmp);
	}
	else if(H->Cells[pos].lastFile!=file)
	{
		H->Cells[pos].lastFile = file;
		H->Cells[pos].lastLine = line;
		tmp = new nodeB;
		tmp->fileNum = file; tmp->line.insert(line);
		H->Cells[pos].arrayB.push_back(*tmp);
	}
	else if(H->Cells[pos].lastLine!=line)
	{
		H->Cells[pos].arrayB[H->Cells[pos].arrayB.size()-1].line.insert(line);
	}
}
/*Hash*/
void readFile(HashTable H)
{
	int N,i,j,k;
	string str,buff;
	scanf("%d",&N);getchar();
	char *cstr,*p;
	for(i=0; i<N; i++)//i是文件的编号从0~N-1
	{
		getline(cin,fileName[i]);j = 0;//j代表行号
		while(getline(cin,str), str != "#")
		{
			file[i][j] = str;
			cstr = new char[str.size()+1];
			strcpy(cstr,str.c_str());
			p = strtok(cstr," ");
			while(p!=NULL)
			{
				for(k=0; k<strlen(p); k++)
				{
					if((p[k]>='A')&&(p[k]<='Z'))
					{
						p[k] = p[k] + 32;
					}
				}
				Insert(H,p,i,j);
				p = strtok(NULL," ");
			}
			j++;
		}
	}
}
void Intersect(vector<nodeB> &v1, vector<nodeB> &v2, vector<nodeB> &v)
{
	vector<nodeB>::iterator it1,it2;
	set<int>::iterator it3,it4;
	it1 = v1.begin(); it2 = v2.begin();
	while(it1!=v1.end()&&it2!=v2.end())
	{
		if(it1->fileNum==it2->fileNum)
		{
			nodeB* tmp = new nodeB;
			tmp->fileNum = it1->fileNum;
			for(it3 = it1->line.begin();it3!=it1->line.end();it3++)
			{
				tmp->line.insert(*it3);
			}
			for(it4 = it2->line.begin();it4!=it2->line.end();it4++)
			{
				tmp->line.insert(*it4);
			}
			v.push_back(*tmp);
			it1++;it2++;
		}
		else if(it1->fileNum<it2->fileNum)
		{
			it1++;
		}
		else
		{
			it2++;
		}
	}
}
void query(HashTable H)
{
	int i,k; string str;
	char* cstr,*p;
	string buff[10];
	getline(cin,str);
	cstr = new char[str.size()+1];
	strcpy(cstr,str.c_str());
	p = strtok(cstr," ");//printf("%s\n", p);
	int cnt = 0;
	while(p!=NULL)
	{
		char* tmp;tmp = p;
		while(*tmp!='\0')
		{
			if(*tmp>='A'&&*tmp<='Z')
			{
				*tmp = *tmp+32;
			}
			tmp++;
		}
		buff[cnt] = p;//cout<<buff[cnt]<<endl;
		cnt++;
		p = strtok(NULL," ");
	}
	vector<nodeB> vi;int pos;vector<nodeB> v;
	for(i=0; i<cnt; i++)
	{
		pos = Find(H,buff[i]);//cout<<"单词是"<<H->Cells[pos].word<<endl;
		if(i==0)
		{
			vi.insert(vi.begin(),H->Cells[pos].arrayB.begin(),H->Cells[pos].arrayB.end());
		}
		else
		{
			Intersect(vi,H->Cells[pos].arrayB,v);
			vi.clear();
			vi.insert(vi.begin(),v.begin(),v.end());
			v.clear();
		}
	}
	cout<<vi.size()<<endl;
	if(vi.size()==0)
	{
		printf("Not Found\n");
		return;
	}
	for(auto it = vi.begin(); it!=vi.end(); it++)
	{
        printf("%s\n",fileName[it->fileNum].c_str());
		for(auto it5=it->line.begin(); it5!=it->line.end(); it5++)
		{
            printf("%s\n",file[it->fileNum][*it5].c_str());
		}
	}
}
int main()
{
	HashTable H = CreateTable();
	readFile(H);
	int M;
	scanf("%d",&M);getchar();
	for(int i=0; i<M;i++)
	{
		query(H);
	}
	return 0;
}

进阶实验5-3.4 迷你搜索引擎 (35 分)

相关标签： PTA 数据结构哈希倒排索引

上一篇： Hadoop 之 MapReduce 的工作原理及其倒排索引的建立

下一篇：初次使用Base64转储文件小结