标签:return href add += hog mat append type visit
各种编程语言我都很喜欢,但平时用的最多的是什么呢?
一个 GitHub 小爬虫,获取全部 repo 及其主要语言,并画出饼图。
"""
你是什么成份?
"""
from collections import Counter
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import requests
from pyquery import PyQuery as pq
def parse_page(url):
    """Download one repository-list page and extract its repositories.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``(repos, sons)`` tuple. ``repos`` is a list of dicts with the keys
        ``'name'`` (text of the ``h3`` inside each list item) and
        ``'language'`` (text of the element carrying
        ``itemprop='programmingLanguage'``). ``sons`` is a list of the
        distinct URLs found in the ``.pagination`` links, each resolved
        against ``url``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    print(url)
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    resp = requests.get(url, timeout=30)
    # Fail loudly instead of parsing an error page as "no repositories".
    resp.raise_for_status()
    html = pq(resp.text)

    repos = [
        {
            'name': repo('h3').text(),
            'language': repo("[itemprop='programmingLanguage']").text(),
        }
        for repo in html("#user-repositories-list li").items()
    ]

    # Follow every pagination link, not just the first one; the caller
    # de-duplicates against the pages it has already queued.
    sons = []
    for link in html(".pagination a").items():
        href = link.attr('href')
        if not href:
            continue
        # Resolve relative hrefs so the result can be fetched directly.
        target = urljoin(url, href)
        if target not in sons:
            sons.append(target)
    return repos, sons
def analyze(repos):
    """Show a pie chart of how often each primary language occurs.

    Records are de-duplicated by ``'name'`` (the last record seen for a name
    wins), records with an empty ``'language'`` are ignored, and the slices
    of the three most frequent languages are pulled slightly out of the pie.

    Args:
        repos: Iterable of dicts that each have ``'name'`` and ``'language'``
            keys.
    """
    # Keep a single record per repository name.
    by_name = {entry['name']: entry for entry in repos}
    language_counts = Counter(
        entry['language'] for entry in by_name.values() if entry['language']
    )
    slice_labels = language_counts.keys()
    sizes = np.array(list(language_counts.values()))

    # One offset per slice: 0 leaves the slice in place, 0.1 pulls it outwards.
    explode = np.zeros_like(sizes, dtype=np.float32)
    # argsort is ascending, so the last three indices are the largest slices.
    top_three = np.argsort(sizes)[-3:]
    explode[top_three] = 0.1

    # startangle is the angle at which the first slice begins.
    plt.pie(
        sizes,
        explode=explode,
        labels=slice_labels,
        autopct='%1.1f%%',
        shadow=False,
        startangle=90,
    )
    plt.show()
def schedule(user="weiyinfu"):
    """Crawl the repository-list pages of a GitHub user.

    Starts at the user's ``?tab=repositories`` page and keeps following the
    URLs that ``parse_page`` returns until no unseen URL is left.

    Args:
        user: GitHub login whose repositories are listed. Defaults to the
            account that was previously hard-coded.

    Returns:
        A list with the repository dicts of every visited page, in visit
        order. The same repository can appear more than once if a page is
        reachable under more than one URL.
    """
    seed = "https://github.com/" + user + "?tab=repositories"
    pending = [seed]
    # Mark the seed as seen up front so a link pointing back at it is not
    # queued and fetched a second time.
    visited = {seed}
    repos = []
    while pending:
        now = pending.pop()
        repo_list, url_list = parse_page(now)
        repos += repo_list
        for link in url_list:
            if link not in visited:
                visited.add(link)
                pending.append(link)
    return repos
def main():
    """Crawl the repositories, print the raw records, then chart them."""
    collected = schedule()
    print(collected)
    analyze(collected)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
标签:return href add += hog mat append type visit
原文地址:https://www.cnblogs.com/weiyinfu/p/9704368.html