码迷,mamicode.com
首页 > 其他好文 > 详细

lxml 和 pyquery 示例 爬 卡牌

时间:2018-10-28 19:26:03      阅读:182      评论:0      收藏:0      [点我收藏+]

标签:query   request   form   cti   目录   tree   makedirs   name   src   

 

 

import requests
from pyquery import PyQuery as pq
import json
import jsonpath
from lxml import etree
import os

# Sample HTML snippet. It is overwritten by the live page fetched just below,
# so this literal is never parsed by the functions in this script.
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''
# Download the card listing page; the response bytes are decoded as GBK.
html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
# Running counter shared by the scraper functions to number the saved images.
num = 0
def pq方法():
    """Download every card image from the fetched page using pyquery.

    Parses the module-level ``html``, iterates over the ``#dq_list > li``
    items, downloads the image referenced by each item's ``img`` element
    (``lz_src`` attribute) and writes it to ``e:/py3/002/`` as
    ``<counter><card name>.jpg``, where the counter is the global ``num``
    left-padded with zeros to four characters. ``num`` is incremented once
    per item.
    """
    global num
    doc = pq(html)
    items = doc('#dq_list > li').items()
    for item in items:
        # The image URL is read from the ``lz_src`` attribute, not ``src``.
        url = item.find('img').attr('lz_src')
        num += 1
        print(str(num), url)
        url_content = requests.get(url).content
        name = item.find('.kp-name').text()

        with open('e:/py3/002/' + '{:0>4}'.format(str(num)) + name + '.jpg', 'wb') as file:
            file.write(url_content)

def lxml方法():
    """Download every card image from the fetched page using lxml XPath.

    Prints the module-level ``html``, parses it, and selects each ``a``
    element under ``ul#dq_list`` inside ``div.box10-content``. For every
    match it takes the first text node of the child ``div`` as the card name
    and the ``lz_src`` attribute of the child ``img`` as the image URL, then
    writes the downloaded bytes to ``e:/py3/003/`` as
    ``<counter>_<card name>.jpg``, where the counter is the global ``num``
    left-padded with zeros to four characters. ``num`` is incremented once
    per item.

    Raises:
        IndexError: if a matched ``a`` element has no child ``div`` text or
            no ``img/@lz_src`` value.
    """
    print(html)
    global num
    r = etree.HTML(html)
    # Alternative that selects only the image URLs:
    # items = r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a/img/@lz_src")
    items = r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a")
    for item in items:
        kpname = item.xpath("./div/text()")[0]
        lzsrc = item.xpath("./img/@lz_src")[0]
        num += 1
        print(kpname, lzsrc)
        lzcontent = requests.get(lzsrc).content
        with open('e:/py3/003/' + '{:0>4}'.format(str(num)) + '_' + kpname + '.jpg', 'wb') as file:
            file.write(lzcontent)





# Script entry point: run the lxml-based scraper (the pyquery variant is
# left commented out).
if __name__ == '__main__':
    # pq方法()
    lxml方法()

    # Create directories (disabled: kept as an unused string literal).
    '''
    for dirnum in range(1,100):
        dirnum2='{:0>3}'.format(str(dirnum))
        mkpath="e:\\py3\\{}\\".format(dirnum2)
        print(mkpath)
        print('已存在!') if os.path.exists(mkpath) else os.makedirs(mkpath)
    '''

# Disabled BeautifulSoup example, kept as an unused string literal.
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc)

print(soup.prettify()) # 结构化输出文档
print(soup.title) # 获取title标签
print(soup.title.name) # 获取title标签名称 
print(soup.title.parent.name)
print(soup.p['class'])
'''

 

lxml 和 pyquery 示例 爬 卡牌

标签:query   request   form   cti   目录   tree   makedirs   name   src   

原文地址:https://www.cnblogs.com/pscc/p/9866194.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!