python 未知

时间：2017-12-31 11:56:36 阅读：140 评论：0 收藏：0 [点我收藏+]

标签：url href word text pen page file beautiful join

import time
import requests
from bs4 import BeautifulSoup
import threading


def format_str(s):
    """Return *s* with every newline and tab character removed.

    Used to flatten scraped link text and href values so each fits on a
    single tab-separated output line.
    """
    # NOTE(review): the original chain had a replace("", "") between these
    # two calls. As written that is a no-op, so it is dropped here; the
    # character it was meant to strip (a space? "\r"?) was presumably lost
    # when the snippet was copied -- confirm before adding it back.
    return s.replace("\n", "").replace("\t", "")



def get_urls_in_pages(from_page_num,to_page_num):
    """Scrape book links from a range of phei.com.cn search-result pages.

    For every page number from ``from_page_num`` to ``to_page_num``
    (inclusive) the search-result page for the hard-coded keyword is
    downloaded, and each ``<a>`` tag whose ``href`` contains ``bookid`` but
    not ``shopcar0.jsp`` and whose cleaned text is non-empty is recorded as
    a ``[title, href]`` pair (de-duplicated within each page).

    The pairs are written, one ``title<TAB>href`` per line, to
    ``<from_page_num>_<to_page_num>_all_hrefs.txt`` in the current working
    directory, and a summary line (both page numbers and the number of
    links) is printed.

    Returns:
        None. The results are only available through the output file.
    """
    search_word = '计算机'
    # The original literal had a space after "?", which made the first
    # query parameter name " Page" instead of "Page".
    # NOTE(review): "moudle" is kept exactly as in the original; confirm it
    # matches the site's real path.
    url_part_1 = ('http://www.phei.com.cn/moudle/goods/'
                  'searchkey.jsp?Page=')
    # NOTE(review): the fixed "&Page=2" repeats the Page parameter; it is
    # kept unchanged because its purpose cannot be determined from here.
    url_part_2 = '&Page=2&searchKey='
    urls = [url_part_1 + str(i) + url_part_2 + search_word
            for i in range(from_page_num, to_page_num + 1)]

    all_href_list = []
    for url in urls:
        print(url)
        resp = requests.get(url)
        # Name the parser explicitly so the parse result does not depend on
        # which optional parsers happen to be installed.
        bs = BeautifulSoup(resp.text, 'html.parser')
        needed_list = []
        for a in bs.find_all('a'):
            if 'href' not in a.attrs:
                continue
            href_val = a['href']
            if 'bookid' not in href_val or 'shopcar0.jsp' in href_val:
                continue
            # Clean first, then test emptiness and duplicates on the cleaned
            # values: the stored entries are cleaned, so comparing the raw
            # pair against them would let duplicates through.
            entry = [format_str(a.text), format_str(href_val)]
            if entry[0] != '' and entry not in needed_list:
                needed_list.append(entry)
        all_href_list += needed_list

    out_name = '{}_{}_all_hrefs.txt'.format(from_page_num, to_page_num)
    # Explicit UTF-8: the titles are Chinese and must not depend on the
    # platform default encoding. "with" closes the file even on error.
    with open(out_name, 'w', encoding='utf-8') as all_href_file:
        for href in all_href_list:
            all_href_file.write('\t'.join(href) + '\n')
    print(from_page_num, to_page_num, len(all_href_list))

python 未知

标签：url href word text pen page file beautiful join

原文地址：https://www.cnblogs.com/Justice-V/p/8157180.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行