因为看见老男孩的学员做过这一期，自己也很有兴趣，并且刚开始学 Python，所以用 Python 实现了一下。由于刚学习 Python，肯定还有很多地方可以优化。下面先给出原来的 shell 脚本，然后是 Python 版本。
#!/bin/bash
# oldboy linux training
# 2015-06-01
# Happy Children's Day
# Note: this script was developed by Zhang Yao, a student of oldboy Linux class 21.
#
# Usage: /bin/bash SCRIPT http://edu.51cto.com/course/course_id-NNN.html
#
# Downloads every lesson-list page of the given course, extracts the lesson
# links and writes them as an HTML table to /tmp/oldboy.html.
# Requires bash, curl and GNU sed (for `sed -r`).

EduFile=/tmp/edu.html      # raw HTML of the course page plus all lesson-list pages
EduFile2=/tmp/edu2.html    # one extracted lesson link per line
Url="$1"

# Check for given parameters
[ $# -eq 0 ] && {
    echo "USAGE: /bin/bash $0 http://...."
    exit 1
}

# Judge url is ok?
if ! curl -I "$Url" >/dev/null 2>&1; then
    echo "Bad url,Please check it"
    exit 1
fi

# Defined get pagenum and CourseId Functions
# Sets the globals `pagenum` (number of lesson-list pages, default 1) and
# `CourseId` (numeric id taken from the "course_id-NNN" part of the URL).
function getnum(){
    curl -s "$Url" >"$EduFile"
    # Prefer the "go to last page" link; otherwise fall back to the numbered
    # page links. `sort -n | tail -n 1` keeps the highest page number so that
    # several matching lines can never hand `seq` more than one argument.
    if grep -q '"pagesGoEnd"' "$EduFile"; then
        num=$(sed -rn 's#.*page=([0-9]+)" class="pagesGoEnd".*$#\1#p' "$EduFile" | sort -n | tail -n 1)
    else
        num=$(sed -rn 's|.*page=([0-9]+)#" class="pagesNum".*$|\1|p' "$EduFile" | sort -n | tail -n 1)
    fi
    # No pager on the page means the course has a single lesson-list page.
    pagenum=${num:-1}
    CourseId=$(echo "$Url" | sed -rn 's#.*course_id-([0-9]+).*#\1#p')
}

# Defined curl html Functions
# Appends every lesson-list page of the course to $EduFile.
function Curl(){
    getnum
    for i in $(seq "$pagenum"); do
        curl "http://edu.51cto.com/index.php?do=course&m=lessions&course_id=$CourseId&page=$i" 1>>"$EduFile" 2>/dev/null
    done
}

# Defined Create table Functions
# Builds the table rows in the global `sum`, one numbered row per lesson link.
function table(){
    sum=""
    index=1
    # Keep only lines containing a lesson link and make the href absolute.
    sed -rn '/do=lesson/ s#<.*(<a href=")(.*)</h4>$#\1http://edu.51cto.com\2#gp' "$EduFile" >"$EduFile2"
    while read -r line; do
        sum="$sum<tr><th width=\"40\" scope=\"row\">$index</th><td width=\"520\">$line</td></tr>"
        ((index++))
    done <"$EduFile2"
}

# Defined Create html Functions
# Writes the final document around the rows collected in `sum`.
function html(){
    cat >/tmp/oldboy.html <<END
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>test</title>
</head>
<body>
<table width="560" border="1">
$sum
</table>
</body>
</html>
END
}

function main(){
    Curl
    table
    html
}

main
#!/usr/bin/env python #coding:utf-8 import urllib,urllib2,sys,os,re reload(sys) sys.setdefaultencoding(‘utf-8‘) def get_course_id_page_num(URI): orgin = urllib2.urlopen(URI).read().decode(‘utf-8‘).encode(‘utf-8‘) try: page_pattern = re.compile(r‘<a href="/index.php?.*course_id=(.*)&page=(.*)" class="pagesGoEnd"‘) course_id = page_pattern.search(orgin).group(1) pagenum = page_pattern.search(orgin).group(2) except AttributeError: page_pattern = re.compile(r‘<a href="/index.php?.*course_id=(.*)&page=(.*)#" class="pagesNum"‘) course_id = page_pattern.search(orgin).group(1) pagenum = page_pattern.search(orgin).group(2) return (course_id,pagenum) def get_url_title(course,page): #cto = file(‘/tmp/edu.51cto.html‘,‘a+‘) url= "http://edu.51cto.com/index.php?do=course&m=lessions&course_id=" + str(course) + "&page=" + str(page) request = urllib2.urlopen(url).read().decode(‘utf-8‘).encode(‘utf-8‘) url_title_pattern = re.compile(r‘<a href="(.*)" target="_blank">(.*)</a></h4>‘) ut=url_title_pattern.findall(request) cto_table_1 = ‘<tr><th scope=row>+_+</th><td><a href="‘ cto_table_2 = ‘http://edu.51cto.com‘ cto_table_3 = ‘" target="_blank">‘ cto_table_4 = ‘</a></td>‘ for k,v in ut: if ‘lesson‘ in k: line = cto_table_1 + cto_table_2 + k + cto_table_3 + v + cto_table_4 print >> cto, line #cto.close() if __name__ == "__main__": URI = "http://edu.51cto.com/course/course_id-839.html" #URI = sys.argv[1] cto = file(‘/tmp/edu.51cto.html‘,‘a+‘) cto_head = ‘<head>\n<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>\n<title>51cto</title>\n</head>\n<body><table width="560" border="1">\n‘ print >> cto, cto_head course_id,pagenum = get_course_id_page_num(URI) for i in range(1,int(pagenum) + 1): get_url_title(course_id,i) cto_tail = ‘</table>\n</body>\n</html>‘ print >> cto, cto_tail cto.close()
本文出自 “孜孜不倦的学习着...” 博客，请务必保留此出处：http://jonyisme.blog.51cto.com/3690784/1662243
原文地址:http://jonyisme.blog.51cto.com/3690784/1662243