这个题是阿里的一个电话面试题,我想的头一个解决方案,有点限制,后面再写优化的
实验数据,python从百度抓得:
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
import urllib2
import re
import os
#connect to a URL
#一页的搜索结果中url大概是200个左右
file_url = open('url.txt','ab+')
#搜索框里的东西,这块可以设置成数字好让每次搜索的结果不一样
search = '123'
url = "http://www.baidu.com/s?wd="+search
def setUrlToFile():
website = urllib2.urlopen(url)
#read html code
html = website.read()
#use re.findall to get all the links
links = re.findall('"((http|ftp)s?://.*?)"', html)
for s in links:
print s[0]
if len(s[0]) < 256:
file_url.write(s[0]+'\r\n')
#收集实验数据
for i in range(0,50):
setUrlToFile()
file_url.close()
###需要重新打开再读一下
file_url = open('url.txt','r')
file_lines = len(file_url.readlines())
print "there are %d url in %s" %(file_lines,file_url)
file_url.close()
c++ 写的读 url.xml,放到map里面,对map<string , int>的value进行排序,得到前100个,运行一下也就55s,还是很快的,url长度进行了限制小于256个字符:
#pragma once
/*
//计算代码段运行时间的类
//
*/
#include <iostream>
#ifndef ComputeTime_h
#define ComputeTime_h
//单位毫秒
class ComputeTime
{
private:
int Initialized;
__int64 Frequency;
__int64 BeginTime;
public:
bool Avaliable();
double End();
bool Begin();
ComputeTime();
virtual ~ComputeTime();
};
#endif
#include "stdafx.h"
#include "ComputeTime.h"
#include <iostream>
#include <Windows.h>
ComputeTime::ComputeTime()
{
Initialized=QueryPerformanceFrequency((LARGE_INTEGER *)&Frequency);
}
ComputeTime::~ComputeTime()
{
}
bool ComputeTime::Begin()
{
if(!Initialized)
return 0;
return QueryPerformanceCounter((LARGE_INTEGER *)&BeginTime);
}
double ComputeTime::End()
{
if(!Initialized)
return 0;
__int64 endtime;
QueryPerformanceCounter((LARGE_INTEGER *)&endtime);
__int64 elapsed = endtime-BeginTime;
return ((double)elapsed/(double)Frequency)*1000.0; //单位毫秒
}
bool ComputeTime::Avaliable()
{
return Initialized;
}
// sortUrl.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
//#include <utility>
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include "ComputeTime.h"
using namespace std;
map<string,int> urlfrequency;
typedef pair<string, int> PAIR;
struct CmpByValue
{
bool operator()(const PAIR& lhs, const PAIR& rhs)
{
return lhs.second > rhs.second;
}
};
void find_largeTH(map<string,int> urlfrequency)
{
//把map中元素转存到vector中 ,按照value排序
vector<PAIR> url_quency_vec(urlfrequency.begin(), urlfrequency.end());
sort(url_quency_vec.begin(), url_quency_vec.end(), CmpByValue());
//url_quency_vec.size()
for (int i = 0; i != 100; ++i)
{
cout<<url_quency_vec[i].first<<endl;
cout<<url_quency_vec[i].second<<endl;
}
}
//urlheap的建立过程,URL插入时候存在的
void insertUrl(string url)
{
pair<map<string ,int>::iterator, bool> Insert_Pair;
Insert_Pair = urlfrequency.insert(map<string, int>::value_type(url,1));
if (Insert_Pair.second == false)
{
(Insert_Pair.first->second++);
}
}
int _tmain(int argc, _TCHAR* argv[])
{
fstream URLfile;
char buffer[1024];
URLfile.open("url.txt",ios::in|ios::out|ios::binary);
if (! URLfile.is_open())
{ cout << "Error opening file"; exit (1); }
else
{
cout<<"open file success!"<<endl;
}
ComputeTime cp;
cp.Begin();
int i = 0;
while (!URLfile.eof())
{
URLfile.getline (buffer,1024);
//cout << buffer << endl;
string temp(buffer);
//cout<<i++<<endl;
insertUrl(temp);
}
find_largeTH(urlfrequency);
cout<<"running time: "<<cp.End()<<"ms"<<endl;
getchar();
//system("pause");
return 0;
}
实验结果:
版权声明:本文为博主原创文章,未经博主允许不得转载。
原文地址:http://blog.csdn.net/wangyaninglm/article/details/47049907