标签:box raise pattern main cep pen spider test dict
import json
import os
import re
import time
from urllib import parse

import requests
from fake_useragent import UserAgent
from lxml import etree

# How many times a request is attempted before a timeout is propagated.
_MAX_RETRIES = 3

# Characters that cannot appear in a file name on Windows (the configured
# output directories are Windows paths), plus ASCII control characters.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


class MyError(Exception):
    """Scraper-specific error carrying a numeric status and a message.

    Attributes:
        status: numeric error code (10000, 10001, ... as used in this file).
        msg: human-readable description of the problem.
    """

    def __init__(self, status, msg):
        # Pass the values to Exception so str(err) and tracebacks are not empty.
        super().__init__(status, msg)
        self.status = status
        self.msg = msg


def _fetch_page(url, headers, timeout, allow_redirects, retries=_MAX_RETRIES):
    """GET ``url`` and return the response, retrying when the request times out.

    Args:
        url: absolute URL to request.
        headers: HTTP headers to send.
        timeout: per-request timeout in seconds.
        allow_redirects: forwarded to ``requests.get``.
        retries: maximum number of attempts (at least one is always made).

    Returns:
        The ``requests.Response`` of the first attempt that does not time out.

    Raises:
        requests.exceptions.Timeout: if every attempt timed out.
    """
    last_error = None
    for _ in range(max(1, retries)):
        try:
            return requests.get(url, headers=headers, timeout=timeout,
                                allow_redirects=allow_redirects)
        except requests.exceptions.Timeout as e:
            print("Timeout Error>>:", e)
            last_error = e
    raise last_error


def _safe_filename(name):
    """Return ``name`` with characters that are illegal in file names replaced by ``_``."""
    return _ILLEGAL_FILENAME_CHARS.sub("_", name)


def _require_base_path(name):
    """Return the module-level output directory called ``name``, creating it if needed.

    The directory constants are assigned under the ``__main__`` guard, so they
    may be absent when this file is imported; the lookup goes through
    ``globals()`` so that case is reported as ``MyError`` rather than
    ``NameError``.

    Raises:
        MyError: (10005) if the global is missing or empty.
    """
    base_path = globals().get(name)
    if not base_path:
        raise MyError(10005, "需要在全局中配置该参数{},用于文件存放地址".format(name))
    os.makedirs(base_path, exist_ok=True)
    return base_path


class WyRinking:
    """Scrapes the chart (top list) pages and writes one JSON-lines file per chart."""

    def __init__(self):
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/toplist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.nameList = []  # chart names, parallel to urlList
        self.urlList = []   # chart hrefs as found in the page, parallel to nameList

    def __getRinkNameUrl(self, response):
        """Collect the name and href of every chart listed on the landing page."""
        html_selector = self.__etreeSelector(response)
        anchor = ("//div[contains(@class,'item') and contains(@class,'f-cb')]"
                  "/p[@class='name']/a")
        self.nameList = html_selector.xpath(anchor + "/text()") or []
        self.urlList = html_selector.xpath(anchor + "/@href") or []

    def __getPageHtml(self, url, retries=_MAX_RETRIES):
        """Request ``url`` and return the response; timeouts are retried up to ``retries`` times.

        Raises:
            requests.exceptions.Timeout: if every attempt timed out.
        """
        return _fetch_page(url, self.headers, self.timeout, self.allow_redirects, retries)

    def __getRankHtml(self):
        """Download every chart page and hand it to ``__getRankInfo``.

        Raises:
            MyError: (10000) if no charts were collected, (10001) if the name
                and URL lists have different lengths.
        """
        if not self.nameList and not self.urlList:
            raise MyError(10000, "{},{} 数据不能为空".format(self.nameList, self.urlList))
        if len(self.nameList) != len(self.urlList):
            raise MyError(10001, "nameList,urlList数据不能一一对应")
        for name, href in zip(self.nameList, self.urlList):
            url = parse.urljoin(self.stratUrl, href)
            response = self.__getPageHtml(url=url)
            # Attach the chart name to the response so __getRankInfo can build the file name.
            response.customizeName = name
            self.__getRankInfo(response)

    def __getRankInfo(self, response):
        """Extract the JSON song list embedded in a chart page and write it to a file.

        A page without the embedded data, or whose data cannot be parsed, is
        reported and skipped instead of aborting the whole run.

        Raises:
            MyError: (10005) if ``Rink_BASE_PATH`` is not configured.
        """
        html_selector = self.__etreeSelector(response)
        raw_nodes = html_selector.xpath("//*[@id='song-list-pre-data']/text()")
        if not raw_nodes or not raw_nodes[0]:
            print("song-list-pre-data not found, skipped: {}".format(response.customizeName))
            return
        raw_json = raw_nodes[0]
        time_nodes = html_selector.xpath(
            "//span[contains(@class,'sep') and contains(@class,'s-fc3')]/text()")
        updateTime = time_nodes[0] if time_nodes else ""
        try:
            data = json.loads(raw_json)
        except json.decoder.JSONDecodeError:
            # The original author's workaround: on a parse failure, retry with
            # the closing tokens '"}}]' appended to the extracted text.
            try:
                data = json.loads(raw_json + '"}}]')
            except json.decoder.JSONDecodeError as e:
                print("song list JSON could not be parsed, skipped: {} ({})".format(
                    response.customizeName, e))
                return
        fileName = _safe_filename(response.customizeName + '--' + updateTime + ".json")
        base_path = _require_base_path("Rink_BASE_PATH")
        path = os.path.join(base_path, fileName)
        self.__writeToFile(path, data)

    def __writeToFile(self, path, data):
        """Write one JSON object per line to ``path``, one per song, numbered from 1."""
        # ``path`` already ends in ".json"; the original message appended it a second time.
        print('正在写入文件{}'.format(path))
        with open(path, "w", encoding="utf-8") as f:
            for index, data_dic in enumerate(data, start=1):
                dic = {
                    "rankNum": index,
                    "songId": data_dic.get("id"),
                    "songName": data_dic.get("name"),
                    "artistsInfo": data_dic.get("artists"),
                    "commentThreadId": data_dic.get("commentThreadId"),
                }
                f.write(json.dumps(dic, ensure_ascii=False) + "\n")

    def __reSongId(self, songurl: str):
        """Return the numeric id (as a string) found in a song URL.

        :param songurl: a URL shaped like ``/song?id=1336871144``
        :raises MyError: (10002) if the URL contains no ``id=<digits>`` part.
        """
        pattern = r"id=(\d+)"
        try:
            song_id = re.findall(pattern, songurl)[0]
        except IndexError:
            raise MyError(10002, "歌曲id获取失败")
        return song_id

    def collectRanking(self):
        """Entry point: scrape every chart and write each one to a file."""
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getRinkNameUrl(response)
        self.__getRankHtml()

    def __etreeSelector(self, response):
        """Parse the response body as HTML and return the lxml tree for XPath queries."""
        return etree.HTML(response.text)


class WySinger:
    """Scrapes the artist category pages and writes one text file per artist."""

    # The A-Z "initial" parameters are read from the first category page only.
    __isFirstStatus = True

    def __init__(self):
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/artist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.sCategoryNameList = []
        self.sCategoryIdList = []
        self.sCategoryUrlList = []
        self.initialIdList = []  # values of the "initial" query parameter
        self.markList = []       # link texts, parallel to initialIdList

    def __getPageHtml(self, url, retries=_MAX_RETRIES):
        """Request ``url`` and return the response; timeouts are retried up to ``retries`` times.

        Raises:
            requests.exceptions.Timeout: if every attempt timed out.
        """
        return _fetch_page(url, self.headers, self.timeout, self.allow_redirects, retries)

    def __getSingerCategory(self, response):
        """Collect name, id and absolute URL of every artist category.

        The instance lists are only updated when URLs were found and all three
        lists have the same length.
        """
        htmlSelector = self.__etreeSelector(response)
        anchor = "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']"
        names = htmlSelector.xpath(anchor + "/text()")
        ids = htmlSelector.xpath(anchor + "/@data-cat")
        hrefs = htmlSelector.xpath(anchor + "/@href")
        if hrefs and len(names) == len(ids) == len(hrefs):
            self.sCategoryNameList = names
            self.sCategoryIdList = ids
            self.sCategoryUrlList = [parse.urljoin(self.stratUrl, href) for href in hrefs]

    def __getSingerListPage(self):
        """Yield the response of every (category, initial) artist list page.

        Raises:
            MyError: (10000) if no categories were collected, (10001) if the
                name and URL lists have different lengths.
        """
        if not self.sCategoryNameList and not self.sCategoryUrlList:
            raise MyError(10000, "{},{} 数据不能为空".format(self.sCategoryNameList, self.sCategoryUrlList))
        if len(self.sCategoryNameList) != len(self.sCategoryUrlList):
            raise MyError(10001, "nameList,urlList数据不能一一对应")
        for sCategoryUrl in self.sCategoryUrlList:
            if self.__isFirstStatus:
                # Only the first category page is needed (to read the initial
                # ids), so the bare category page is requested just once.
                response = self.__getPageHtml(sCategoryUrl)
                self.__getInitialId(response)
                self.__isFirstStatus = False
            for initialId in self.initialIdList:
                if initialId == "-1":
                    # -1 means "hot"; those artists also appear under their
                    # letter, so the page is skipped to avoid duplicates.
                    continue
                url = sCategoryUrl + "&initial=" + initialId
                yield self.__getPageHtml(url)

    def __getSingerIdUrl(self, response):
        """Yield a single list of ``(artist_url, artist_name)`` tuples for a list page.

        Anchors lacking an ``href`` or ``title`` attribute are ignored; an
        empty list is yielded when nothing usable is found.
        """
        htmlSelector = self.__etreeSelector(response)
        aSelector = htmlSelector.xpath(
            "//*[@id='m-artist-box']//a[@class='msk'] | //*[@id='m-artist-box']/li[@class='sml']/a[1]")
        singers = []
        for selector in aSelector:
            hrefs = selector.xpath("@href")
            titles = selector.xpath("@title")
            if not hrefs or not titles:
                continue
            singers.append((parse.urljoin(self.stratUrl, hrefs[0]),
                            titles[0].replace("的音乐", "")))
        yield singers

    def __getInitialId(self, response):
        """Read the ``initial`` ids and their link texts from the initial selector.

        The instance lists are only updated when both have the same length.

        Raises:
            MyError: (10003) if a selector link has no ``initial=`` parameter.
        """
        htmlSelector = self.__etreeSelector(response)
        urlList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/@href")
        initialIdList = [self.__reInitialId(url) for url in urlList]
        markList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/text()")
        if len(initialIdList) == len(markList):
            self.initialIdList = initialIdList
            self.markList = markList

    def __reInitialId(self, url):
        """Return the value of the ``initial`` parameter of ``url``.

        :param url: a URL shaped like ``/discover/artist/cat?id=1001&initial=-1``
        :raises MyError: (10003) if the URL has no ``initial=`` parameter.
        """
        pattern = r"initial=(.*)"
        matches = re.findall(pattern, url, re.S)
        if not matches:
            raise MyError(10003, "initial参数获取失败")
        return matches[0]

    def __getSingerDetails(self, response):
        """Return ``(song_list, description)`` parsed from an artist page.

        Returns ``(None, None)`` when the page lacks the embedded data; per the
        original author's note, some artists have no songs, so the lookup fails.
        """
        htmlSelector = self.__etreeSelector(response)
        try:
            data_json = htmlSelector.xpath("//*[@id='song-list-pre-data']/text()")[0]
            data_list = json.loads(data_json, strict=False)
            singerDetails_json = htmlSelector.xpath("//script[@type='application/ld+json']/text()")[0]
            singerDetails_dict = json.loads(singerDetails_json, strict=False)
            singerDetails_content = singerDetails_dict.get("description")
            return data_list, singerDetails_content
        except (IndexError, ValueError, AttributeError) as e:
            # IndexError: node missing; ValueError: invalid JSON;
            # AttributeError: unparsable page or unexpected JSON shape.
            print(e)
            return None, None

    def __writeToFile(self, datalist, singerDetails_content, singerName):
        """Write the artist description and song list to ``<singerName>.txt``.

        Raises:
            MyError: (10005) if ``Singer_BASE_PATH`` is not configured.
        """
        base_path = _require_base_path("Singer_BASE_PATH")
        path = os.path.join(base_path, _safe_filename(singerName))
        print("正在写入{}".format(singerName))
        with open(path + ".txt", 'w', encoding="utf-8") as f:
            f.write("歌手简介:{}".format(singerDetails_content) + "\n")
            for data in datalist or []:
                # Songs without a privilege/album object are written as None
                # instead of raising AttributeError.
                privilege = data.get("privilege") or {}
                album = data.get("album") or {}
                f.write("-" * 50 + "\n")
                f.write("歌曲名:{}".format(data.get("name")) + "\n")
                f.write("歌曲ID:{}".format(privilege.get("id")) + "\n")
                f.write("歌曲专辑:{}".format(album.get("name")) + "\n")
                f.write("歌曲别号:{}".format(data.get("alias") or "无") + "\n")

    def __etreeSelector(self, response):
        """Parse the response body as HTML and return the lxml tree for XPath queries."""
        return etree.HTML(response.text)

    def collectSinger(self):
        """Entry point: walk every category and initial, writing one file per artist."""
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getSingerCategory(response)
        for listPage in self.__getSingerListPage():
            time.sleep(1)  # pause between list pages
            # __getSingerIdUrl is a generator that yields one list of
            # (artist_url, artist_name) tuples for the page.
            for singers in self.__getSingerIdUrl(listPage):
                for singerUrl, singerName in singers:
                    singerResponse = self.__getPageHtml(singerUrl)
                    datalist, singerDetails_content = self.__getSingerDetails(singerResponse)
                    if not datalist and not singerDetails_content:
                        continue
                    self.__writeToFile(datalist, singerDetails_content, singerName)


if __name__ == '__main__':
    Rink_BASE_PATH = r"D:\spidersData\Rinking"
    Singer_BASE_PATH = r"D:\spidersData\SingerInfo"
    wangyiyun = WyRinking()
    wangyiyun.collectRanking()  # fetch the NetEase Cloud Music chart data
    wangyiyun = WySinger()
    wangyiyun.collectSinger()  # fetch all NetEase Cloud Music artists and their songs
标签:box raise pattern main cep pen spider test dict
原文地址:https://www.cnblogs.com/zhuchunyu/p/10765932.html