
Scraping CNKI



A CNKI (kns.cnki.net) scraper. Do not use it for commercial purposes.

import requests, time, parsel, re
from selenium.webdriver.chrome.options import Options
from urllib.parse import urlencode
from selenium import webdriver

session = requests.session()

proxyHost = "http-dyn.abuyun.com"
proxyPort = "9020"

# Proxy tunnel authentication credentials
proxyUser = "xxxx"
proxyPass = "xxxxx"

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}
proxies = {"http": proxyMeta, "https": proxyMeta}
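
The Abuyun user/pass above are placeholders. Before a long crawl it is worth confirming the tunnel actually works; the sketch below requests an IP-echo endpoint through the proxies dict defined above (httpbin.org/ip is just one convenient choice, not part of the original script):

# Optional sanity check for the proxy tunnel (assumes httpbin.org is
# reachable; any IP-echo service works). With placeholder credentials
# the tunnel will reject the request with 407 Proxy Authentication Required.
def check_proxy():
    try:
        res = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
        print("Proxy exit IP:", res.json().get("origin"))
        return True
    except Exception as e:
        print("Proxy check failed:", e)
        return False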

# Get search cookies from the homepage with a headless browser
def cookie_request(search):
    chrome_options = Options()
    chrome_options.add_argument("--window-size=1920,1080")  # set the browser window size
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(chrome_options=chrome_options)
    url = "https://kns.cnki.net/kns/brief/default_result.aspx"
    driver.get(url)
    driver.find_element_by_xpath('//*[@id="txt_1_value1"]').send_keys(search)

    # driver.switch_to.frame("iframeResult")

    driver.find_element_by_xpath('//*[@id="btnSearch"]').click()

    frame = driver.find_element_by_xpath('//*[@id="iframeResult"]')  # locate the result iframe
    driver.switch_to.frame(frame)  # the frame refreshes at this point

    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="J_ORDER"]/tbody/tr[1]/td/table/tbody/tr/td[2]/div[1]/a[1]').click()

    cookies_dic = {}
    for dict1 in driver.get_cookies():
        name = dict1["name"]
        value = dict1["value"]
        cookies_dic[name] = value
    # print(cookies_dic)
    driver.quit()
    NET_SessionId = cookies_dic.get("ASP.NET_SessionId")
    SID_kns = cookies_dic.get("SID_kns")
    cookie = f"ASP.NET_SessionId={NET_SessionId}; SID_kns={SID_kns};"

    headers = {
        "Referer": "https://kns.cnki.net/kns/brief/default_result.aspx",
        # "Cookie":"ASP.NET_SessionId=kvxz1ynkhwhzb0gqetuvderq; SID_kns=123106;",  # 只需要这两个
        "Cookie": cookie,
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"
    }
    return headers


# Fetch and parse a detail page
def requests_detail(url, title, authors, publication, timestamp, database):
    try:
        res = session.get(url, proxies=proxies, timeout=5)
    except:
        return
    data = parsel.Selector(res.text)
    place = data.xpath('//div[@class="orgn"]/span/a/text()')  # affiliation
    if place:
        place = place[0].extract()
    else:
        place = None
    abstract_list = data.xpath('//*[@id="ChDivSummary"]/text()')
    abstract = ""  # abstract
    if abstract_list:
        abstract_list = abstract_list.extract()
        for abs in abstract_list:
            abstract = abstract + abs
    keywords_list = data.xpath('//label[@id="catalog_KEYWORD"]/following-sibling::a/text()')
    keywords = ""
    if keywords_list:
        keywords_list = keywords_list.extract()
        for keyword in keywords_list:
            keyword = keyword.strip()
            keywords = keywords + keyword
    classno = data.xpath('//label[@id="catalog_ZTCLS"]/parent::p/text()')  # classification number
    if classno:
        classno = classno[0].extract()
    else:
        classno = None

    publicationpic = data.xpath('//div[@class="cover"]/a/img/@src')  # cover image -- not retrievable here
    if publicationpic:
        publicationpic = publicationpic[0].extract()
    else:
        publicationpic = None

    publicationen = data.xpath('//div[@class="sourinfo"]/p[2]/a/text()')  # English journal title
    if publicationen:
        publicationen = publicationen[0].extract()
    else:
        publicationen = None

    publicationdate = data.xpath('//div[@class="sourinfo"]/p[3]/a/text()')  # publication date
    if publicationdate:
        publicationdate = publicationdate[0].extract()
    else:
        publicationdate = None
    publication_title = data.xpath('//div[@class="sourinfo"]/p[1]/a/text()')  # Chinese journal title
    if publication_title:
        publication_title = publication_title[0].extract()
    else:
        publication_title = None

    issn = data.xpath('//div[@class="sourinfo"]/p[4]/text()')  # ISSN
    if issn:
        issn = issn[0].extract().strip().strip("ISSN:")
    else:
        issn = None
    core = data.xpath('//div[@class="sourinfo"]/p[5]/text()')  # core-journal flag
    if core:
        core = core[0].extract()
    else:
        core = None

    dict1 = {}
    dict1["title"] = title
    dict1["authors"] = authors
    dict1["publication"] = publication
    dict1["timestamp"] = timestamp
    dict1["database"] = database
    dict1["place"] = place
    dict1["abstract"] = abstract
    dict1["classno"] = classno
    dict1["publicationpic"] = publicationpic
    dict1["publicationen"] = publicationen
    dict1["publicationdate"] = publicationdate
    dict1["publication_title"] = publication_title
    dict1["issn"] = issn
    dict1["core"] = core
    dict1["href"] = url
    print(dict1)
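
requests_detail only prints each record. To keep the results, one option (an illustration, not part of the original script; the results.jsonl filename is made up) is to append each record as one JSON line and call save_record(dict1) where the print above is:

import json

def save_record(record, path="results.jsonl"):
    # Append one JSON object per line; ensure_ascii=False keeps the
    # Chinese fields human-readable in the output file.
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")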


# List page
def requests_list(count, search):
    headers = cookie_request(search)
    # datas = {
    #     # "pagename": "ASP.brief_default_result_aspx",
    #     # "isinEn": "1",
    #     # "dbPrefix": "CFLS",
    #     # "ConfigFile": "SCDBINDEX.xml",
    #     "keyValue": "肾结石"
    # }
    # url = "https://kns.cnki.net/kns/brief/brief.aspx?pagename=ASP.brief_default_result_aspx&isinEn=1&dbPrefix=SCDB&keyValue=%E8%82%BE%E7%BB%93%E7%9F%B3&S=1&sorttype="
    # url = "https://kns.cnki.net/kns/brief/brief.aspx?&pagename=ASP.brief_default_result_aspx&isinEn=1&dbPrefix=SCDB&S=1&sorttype="
    for i in range(1, count):
        url = "https://kns.cnki.net/kns/brief/brief.aspx?curpage=%s&RecordsPerPage=20&QueryID=11&ID=&turnpage=1&tpagemode=L&dbPrefix=CFLS&Fields=&DisplayMode=listmode&PageName=ASP.brief_default_result_aspx&t=1&" % i
        try:
            res = session.get(url, headers=headers, proxies=proxies, timeout=10)
        except:
            continue
        data = parsel.Selector(res.text)
        table = data.xpath('//table[@class="GridTableContent"]')
        if table:
            tr_list = table.xpath("//tr").extract()
            tr_list = tr_list[7:27]  # skip the leading header rows; keep the 20 result rows
            for tr in tr_list:
                data1 = parsel.Selector(str(tr))
                title = data1.xpath('//a[@class="fz14"]/text()')[0].extract()  # title
                href = data1.xpath('//a[@class="fz14"]/@href')[0].extract()  # detail-page link
                res1 = re.search(r"FileName=(.*?)&", href)
                filename = res1.group().replace("FileName=", "").replace("&", "")
                href = "https://kns.cnki.net/KCMS/detail/detail.aspx?dbcode=CJFQ&dbname=CJFDAUTO&filename=%s" % filename
                author_list = data1.xpath('//td/a[@class="KnowledgeNetLink"]')
                authors = None  # authors
                if author_list:
                    authors = ""
                    author_list = author_list.extract()
                    for author_html in author_list:
                        data2 = parsel.Selector(str(author_html))
                        author = data2.xpath("//a/text()")[0].extract().strip()
                        authors = authors + f"{author};"
                else:
                    author_list = data1.xpath('//td[@class="author_flag"]/text()')
                    if author_list:
                        authors = author_list[0].extract().strip()
                        if not authors:
                            author_list = data1.xpath('//td[@class="author_flag"]/a/text()')
                            if author_list:
                                authors = ""
                                author_list = author_list.extract()
                                for author in author_list:
                                    authors = authors + f"{author};"
                publication = data1.xpath("//tr/td[4]/a/text()")[0].extract()  # source
                timestamp = data1.xpath("//tr/td[5]/text()")[0].extract().strip()  # publish date
                database = data1.xpath("//tr/td[6]/text()")[0].extract().strip()  # database
                # print(title)
                requests_detail(href, title, authors, publication, timestamp, database)
        else:
            print("cookie校验失败!", i)
            # session = requests.session()
            # cookie_request()
            continue
    return True
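
On a failed cookie check, the loop above logs the page number and moves on; the commented-out lines suggest re-fetching the cookie was the intended fix. Below is a minimal sketch of that recovery path (my reading of the intent, not the author's original code):

def get_list_page(url, search, headers, max_refresh=2):
    # Fetch a list page; if the result table is missing (a sign the
    # cookie was rejected), grab fresh cookies and retry the same page.
    for _ in range(max_refresh + 1):
        try:
            res = session.get(url, headers=headers, proxies=proxies, timeout=10)
        except Exception:
            return None, headers
        if 'class="GridTableContent"' in res.text:
            return res, headers
        headers = cookie_request(search)  # refresh the cookie
    return None, headers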


# Pagination: read the total result count
def page(search):
    headers = cookie_request(search)
    url = "https://kns.cnki.net/kns/brief/brief.aspx?curpage=1&RecordsPerPage=20&QueryID=11&ID=&turnpage=1&tpagemode=L&dbPrefix=CFLS&Fields=&DisplayMode=listmode&PageName=ASP.brief_default_result_aspx&isinEn=1&"
    res = session.get(url, headers=headers)
    data = parsel.Selector(res.text)
    try:
        page = data.xpath('//div[@class="pagerTitleCell"]/text()')[0].extract().strip()
        page = page.replace("找到", "").replace("条结果", "").replace(",", "").strip()
        page = int(page)  # total number of results
        print("Total: %s records" % page)
        page = int(page / 20) + 2  # 20 per page; +2 so range(1, page) still covers the final partial page

        return page
    except:
        return False


def main(search):
    count = page(search)
    if count:
        requests_list(count,search)
    else:
        print("获取cookie失败,请重新运行!")


if __name__ == "__main__":
    search = input("Enter a search keyword: ").strip()
    main(search)
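
To run this as-is you need requests, parsel, and selenium installed, plus a chromedriver matching your local Chrome on PATH. Note that find_element_by_xpath and the chrome_options= keyword are Selenium 3 APIs; on Selenium 4 you would use driver.find_element(By.XPATH, ...) and webdriver.Chrome(options=...) instead.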

 


Original post: https://www.cnblogs.com/wukai66/p/13365653.html
