大数据处理例之STL实战练习
找出3000000条字符串中使用频率最高的10条
要求:高效
目的:锻炼STL的用法,涉及到STL各大类组件使用,包括自定义hash索引类,自定义hash函数,STL堆算法,C++类的设计方法等
主要用到hash_map和heap
生成原始数据脚本(matlab文件):
[vb]
[vb]
base = 97;
alpha = 26;
total = 3000000;
max_len = 4;
min_len = 1;
filename = 'data.txt';
fid = fopen(filename,'w+');
for n=1:total
seed=mod(floor(rand()*100),max_len)+min_len;
for m=min_len:seed
fprintf(fid,char(mod(floor(rand()*100),alpha)+base));
end
fprintf(fid,'\n');
end
fclose(fid);
[vb]
生成的数据文件大致为data.txt:
[vb]
z
cvyh
kf
pjoh
rdr
cukv
ba
[vb]
...
[vb]
STL C++求解实现:
[cpp]
#include <functional>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <string>
#include <hash_map>
using namespace std;
using namespace stdext;
class Node
{
public:
Node(){_str="";};
Node(const Node &n){_str = n.getv();};
Node(const std::string & str):_str(str){};
Node(const char * pstr):_str(pstr){};
~Node(){};
std::string getv() const
{
return _str;
}
void setv(const std::string & str)
{
_str = str;
}
Node & operator = (const Node & m)
{
_str = m.getv();
return *this;
}
friend bool operator < (const Node &n,const Node &m)
{
return n._str < m._str;
}
friend bool operator == (const Node &n,const Node &m)
{
return n._str == m._str;
}
friend ostream & operator << (ostream& o,const Node&n)
{
o<<n.getv();
return o;
}
protected:
private:
std::string _str;
};
inline size_t node_hash_value(const Node & n)
{
return stdext::hash_value(n.getv().c_str());
}
struct node_hash_compare:public hash_compare<Node>
{
size_t operator ()(const Node &n)const
{
return node_hash_value(n);
}
bool operator () (const Node & n, const Node &m)const
{
return n.getv() < m.getv();
}
};
ostream & operator << (std::ostream& o,const pair< Node,int> &n )
{
o<<n.first<<":"<<n.second;
return o;
}
struct node_hash_greater
{
bool operator()(const pair<Node,int> &m,const pair<Node,int> &n)
{
return m.second > n.second;
}
};
const char *filename = "data.txt";
const int find_num = 10;
int main()
{
ifstream out;
hash_map<Node,int,node_hash_compare> nodes;
out.open(filename, ios::in);
std::string line;
while(!out.eof())
{
std::getline(out,line);
if (line.size() == 0)
{
continue;
}
Node n(line);
if (nodes.count(n) > 0)
{
nodes[n] = nodes[n] + 1;
}
else
{
nodes[n] = 1;
}
}
//copy(nodes.begin(),nodes.end(),std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));
out.close();
pair<Node,int> A[find_num+1];
if (nodes.size() < find_num)
{
copy(nodes.begin(),nodes.end(),std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));
return 0;
}
hash_map<Node,int,node_hash_compare>::iterator it = nodes.begin();
int s=0;
for (;s<find_num;s++,it++)
{
A[s].first = it->first;
A[s].second = it->second;
}
make_heap(A,
补充:软件开发 , Vb ,