标签:blank python import target title
大数据 - 哈希
教你如何迅速秒杀掉:99%的海量数据处理面试题 http://blog.csdn.net/v_july_v/article/details/7382693
1: import operator
2: import heapq
3:
def hashfiles():
    """Partition the query log into ten bucket files by hash of the query.

    Reads ``./data/queryfile.txt`` line by line and appends each query to
    ``<k>.txt`` (k in 0..9, created/truncated in the current directory),
    where ``k = hash(query) % 10``.  Because identical queries always land
    in the same bucket, each bucket can later be counted independently.

    Raises:
        IOError/OSError: if the query file or a bucket file cannot be opened.
    """
    files = []
    try:
        for i in range(10):
            files.append(open(str(i) + '.txt', 'w'))

        with open('./data/queryfile.txt', 'r') as queryfile:
            for query in queryfile:
                # Normalise BEFORE hashing: the counting step strips each
                # line, so "foo\n", "foo" (last line without a newline) and
                # "foo " are the same query and must share one bucket.
                query = query.strip()
                # NOTE: str hashes may differ between interpreter runs
                # (hash randomisation), but they are stable within one
                # process, which is all the bucketing needs.
                files[hash(query) % 10].write(query + '\n')
    finally:
        # Close every bucket that was opened, even if an error occurred.
        for f in files:
            f.close()
18:
def sortqueriesinfiles():
    """Replace each bucket file with its per-query counts, most frequent first.

    For every ``<k>.txt`` (k in 0..9) in the current directory, counts how
    often each stripped line occurs and rewrites the file in place as
    ``query<TAB>count`` lines.

    Lines are ordered by descending count, ties broken by ascending query.
    That is exactly ascending order of the ``(-count, query)`` tuples that
    the merge step feeds to ``heapq.merge``, which requires every input to
    be pre-sorted in its comparison order.

    Raises:
        IOError/OSError: if a bucket file does not exist or cannot be opened.
    """
    for i in range(10):
        # One handle at a time; the ``with`` closes it even on error.
        with open(str(i) + '.txt', 'r+') as f:
            counts = {}
            for query in f:
                query = query.strip()
                counts[query] = counts.get(query, 0) + 1

            ordered = sorted(counts.items(),
                             key=lambda item: (-item[1], item[0]))

            # Rewind and clear the file before writing the summary.
            f.seek(0, 0)
            f.truncate()
            for query, count in ordered:
                f.write(query + '\t' + str(count) + '\n')
39:
def iteratefiles(f):
    """Yield ``(-count, query)`` for each ``query<TAB>count`` line in *f*.

    The count is negated so that ascending tuple order (as used by
    ``heapq.merge``) corresponds to descending frequency.

    Args:
        f: any iterable of text lines (e.g. an open bucket file).

    Yields:
        Tuples ``(-int(count), query)``.

    Raises:
        ValueError: if a non-blank line has no tab or a non-integer count.
    """
    for line in f:
        if not line.strip():
            # A completely blank line carries no record.
            continue
        # Split on the LAST tab: the query text itself may contain tabs,
        # while the count never does.  int() tolerates the trailing newline.
        query, count = line.rsplit('\t', 1)
        yield (-int(count), query)
44:
def mergefiles():
    """Merge the ten counted bucket files into ``dest.txt``.

    Opens ``<k>.txt`` (k in 0..9) from the current directory, streams their
    ``(-count, query)`` records through ``heapq.merge`` and writes one query
    per line to ``dest.txt``.  Each merged record is also echoed to stdout.

    ``heapq.merge`` only interleaves its inputs; the result is globally
    ordered only if every bucket file is already sorted ascending by
    ``(-count, query)``.

    Raises:
        IOError/OSError: if a bucket file or ``dest.txt`` cannot be opened.
    """
    files = []
    try:
        for i in range(10):
            files.append(open(str(i) + '.txt', 'r'))

        with open('dest.txt', 'w') as dest_file:
            for line in heapq.merge(*[iteratefiles(f) for f in files]):
                # Single-argument call form: prints the tuple identically
                # under both the Python 2 statement and Python 3 function.
                print(line)
                dest_file.write(line[1] + '\n')
    finally:
        # Release every bucket handle that was opened, even on error.
        for f in files:
            f.close()
60:
if __name__ == '__main__':
    # Run the pipeline in order: bucket the queries by hash, count and sort
    # each bucket, then merge the buckets into the final ranking.
    hashfiles()
    sortqueriesinfiles()
    mergefiles()