码迷,mamicode.com
首页 > 编程语言 > 详细

Python 爬虫示例,方便日后参考

时间:2018-07-07 20:24:06      阅读:163      评论:0      收藏:0      [点我收藏+]

标签:return   file   ict   []   append   one   brief   coding   movies   



def _textAt(nodes, index=0, default='NULL'):
    """Return nodes[index], or *default* when the XPath result is too short."""
    return nodes[index] if len(nodes) > index else default


def getOneMoviesInfo(Mid, url, outputPath=r'C:\Users\yuqiao\Desktop\testSpider.txt'):
    """Scrape one movie page and append a pipe-delimited record to a file.

    The page at *url* is downloaded and queried with fixed XPath
    expressions for the title, year, release date, overview, director,
    up to three writers, up to five cast members and the poster URL.
    Any field that cannot be found is written as the literal ``NULL``.

    The record layout is::

        Mid|name|brief|year|date|director|writer1|writer2|writer3|
        star1|star2|star3|star4|star5|picture

    Args:
        Mid: Identifier of the movie; written as the first field and
            printed for progress reporting.
        url: Address of the movie page to download.
        outputPath: File the record is appended to (UTF-8). Defaults to
            the path that was previously hard-coded.

    Returns:
        None. When the page has no title element a message is printed
        and nothing is written.
    """
    # Imported locally so the third-party packages are only required
    # when the scraper is actually invoked.
    import requests
    from lxml import etree

    data = requests.get(url).text   # download the page
    s = etree.HTML(data)            # parse it into an element tree

    # Without a title the page is not a usable movie page: bail out early.
    names = s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/a/h2/text()')
    if not names:
        print("Mid = %s , failed for a lack of TMDB id "%Mid)
        return
    name = names[0]

    # xpath() returns a list; store the first match rather than the list
    # itself so the record does not contain a Python list repr.
    picture = _textAt(s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[1]/div[1]/img/@src'))

    year = _textAt(
        s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[1]/span/span/text()')
    ).strip("(").strip().strip(")")
    # The date is the second text node of the first list item.
    date = _textAt(
        s.xpath('//*[@id="media_v4"]/div[2]/div[2]/div/section/div[1]/div/section[1]/ul/li[1]/text()'),
        1,
    ).strip()
    # Keep the record on one line by escaping embedded newlines.
    brief = _textAt(
        s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/div/p/text()')
    ).replace("\n", "\\n")

    # Main creators: each list item holds a name (p[1]/a) and a job (p[2]).
    mainCreators = s.xpath('//*[@id="main"]/section/div[1]/div/div/section/div[2]/section/div[2]/ol/li')
    writers = []
    director = "NULL"
    for creator in mainCreators:
        creatorNames = creator.xpath('./p[1]/a/text()')
        creatorProfessions = creator.xpath('./p[2]/text()')
        if not creatorNames or not creatorProfessions:
            # Skip incomplete entries instead of wiping what was already
            # collected; missing slots are padded with NULL below.
            continue
        creatorName = creatorNames[0]
        creatorProfession = creatorProfessions[0]
        if 'Director' in creatorProfession:
            director = creatorName
        elif 'Screenplay' in creatorProfession or 'Writer' in creatorProfession:
            writers.append(creatorName)

    # Cast members: entries without a name are skipped.
    stars = []
    starsData = s.xpath('//*[@id="media_v4"]/div[2]/div[1]/div/div/section[1]/ol/li')
    for entry in starsData:
        starNames = entry.xpath('./p[1]/a/text()')
        if starNames:
            stars.append(starNames[0])

    # Pad to the fixed column counts (3 writers, 5 stars); a negative
    # multiplier yields an empty list, so longer lists are left alone.
    writers += ["NULL"] * (3 - len(writers))
    stars += ["NULL"] * (5 - len(stars))

    with open(outputPath, 'a', encoding='utf-8') as f:
        f.write("{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}\n".format(
            Mid, name, brief, year, date, director,
            writers[0], writers[1], writers[2],
            stars[0], stars[1], stars[2], stars[3], stars[4],
            picture))
    print(Mid)
    print(name)
    
# ------------------------------ main script ------------------------------
import time

# Start every run with an empty output file.
with open(r'C:\Users\yuqiao\Desktop\testSpider.txt', 'w', encoding='utf-8') as f:
    f.write("")
language = '?language=zh-CN'  # query-string suffix; not referenced by the visible code
with open(r'D:\git\ZiyeMovie\MidURL.txt', "rt", encoding='utf-8') as in_file:
    # One entry per line; the name `all` is avoided because it shadows the builtin.
    lines = in_file.read().split("\n")

    # Change the range to select other lines, e.g. range(51, 61) for 51-60.
    # NOTE(review): the loop only prints the selected line and never calls
    # getOneMoviesInfo -- presumably trimmed when the snippet was posted; confirm.
    for i in range(9124, 9125):
        line = lines[i]
        print(line)

print('finished')

Python 爬虫示例,方便日后参考

标签:return   file   ict   []   append   one   brief   coding   movies   

原文地址:https://www.cnblogs.com/YuQiao0303/p/9277666.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!