码迷,mamicode.com
首页 > 编程语言 > 详细

python处理html的table标签

时间:2014-12-04 17:12:48      阅读:238      评论:0      收藏:0      [点我收藏+]

标签:des   http   ar   sp   for   on   数据   div   art   

转载:http://www.xuebuyuan.com/583071.html

python处理html的table标签

2012年01月06日 ⁄ 综合 ⁄ 共 5279字 ⁄ 字号    ⁄ 评论关闭
 
import sys
import csv
import urllib2

import BeautifulSoup

# printtab: re-emit the first <table> of the HTML file named by sys.argv[1]
# as a plain bordered HTML table on stdout, one tag / cell text per line.
#
# This is a Python 2 script (the file imports urllib2 and the BeautifulSoup 3
# module).  The typographic quotes of the published listing are not valid
# Python string delimiters and are restored to ASCII quotes here.  A
# parenthesised single-argument print behaves exactly like the Python 2
# print statement, so the output is unchanged.

# Alternative input: fetch the page from a URL instead of a local file.
#page    = urllib2.urlopen(sys.argv[1]).read()
with open(sys.argv[1]) as html_file:  # close the handle instead of leaking it
    soup = BeautifulSoup.BeautifulSoup(html_file.read())
csvout  = csv.writer(sys.stdout)  # only needed by the disabled CSV output below

for table in soup.findAll('table'):
    print("<table border='1'>")
    # Disabled CSV-style header:
    #print '#'
    #print '# Table'
    #print '# Fields: ' + ','.join([th.text for th in table.findAll('th')])
    for row in table.findAll('tr'):
        print("<tr>")
        # Disabled CSV output of the row:
        #csvout.writerow([td.text for td in row.findAll('td')])
        for cell in row.findAll('td'):
            print("<td>")
            # Only the cell's text is kept; nested markup is dropped.
            print(cell.text.encode("utf-8"))
            print("</td>")
        print("</tr>")
    print("</table>")
    break  # only the first table of the document is converted

 

#!/bin/bash

# process.sh  (the published listing says "process.h" -- presumably a typo)
#
# Usage: process.sh <page.html>
#
# Builds <dir of page>/baobei.html containing:
#   * a table with the product name and price scraped from the page,
#   * the page's first table as re-emitted by ./printtab.py,
#   * an <img> tag for the first entry of <dir of page>/imglist.
# The file is then flattened to a single line with every double quote
# removed.  If imglist yields no image, baobei.html is deleted and the
# script stops.
#
# The typographic quotes of the published listing are restored to ASCII
# single quotes, and all path expansions are quoted so that paths containing
# spaces work.

if [ "$#" -lt 1 ]; then
	echo "usage: $0 <page.html>" >&2
	exit 1
fi

page=$1
basedir=$(dirname -- "$page")
out=$basedir/baobei.html

echo "$basedir"

# Disabled: prepend a UTF-8 content-type header to the output.
#echo '<head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /></head>' >> "$out"

# Text between <title id="id_title"> and </title>.  Several matches are
# joined with single spaces (what the original unquoted echo did), then
# everything from the first underscore on is dropped.
prodname=$(grep -o '<title id="id_title">.*</title>' "$page" | cut -d '>' -f 2 | cut -d '<' -f 1 | paste -sd ' ' -)
prodname=${prodname%%_*}

# Digits inside <span class="s1">...</span>; several matches are joined
# with single spaces.
price=$(grep -o '<span class="s1">[0-9]*</span>' "$page" | cut -d '>' -f 2 | cut -d '<' -f 1 | paste -sd ' ' -)

# One redirection for the whole group: truncates baobei.html, then writes
# the summary rows followed by printtab.py's table.
{
	echo "<table>"

	echo "<tr>"
	echo "<td>Name</td>"
	echo "<td>$prodname</td>"
	echo "</tr>"

	echo "<tr>"
	echo "<td>Price</td>"
	echo "<td>$price</td>"
	echo "</tr>"

	python ./printtab.py "$page"

	echo "</table>"
} > "$out"

# Without an image the result is discarded.
imgsrc=$(head -n 1 "$basedir/imglist")
if [ -z "$imgsrc" ]; then
	rm -f -- "$out"
	exit
fi
echo "<img src='$imgsrc'/>" >> "$out"

# Flatten to one line and strip double quotes; go through a temp file
# because a file cannot be safely read and rewritten in the same pipeline.
tr -d '\n"' < "$out" > "$out.tmp"
mv -- "$out.tmp" "$out"



 

#!/bin/bash

# process2.sh
#
# Usage: process2.sh <file.html>
#
# Reads "<td>LABEL</td><td>VALUE</td>" cells from the given file
# (presumably the single-line baobei.html written by process.sh -- confirm)
# and sets:
#   basedir  directory of the input file
#   name     value of the "Name" cell   (script exits if empty)
#   price    value of the "Price" cell  (script exits if empty)
#   class    product type text, see below
#
# The shebang of the published listing read "#!/bn/bash"; fixed here.

basedir=$(dirname -- "$1")

# cell_after LABEL FILE
# Print the text of the cell that directly follows "<td>LABEL</td>".
# Splitting on '>' puts "VALUE</td..." in field 4; splitting that on '<'
# leaves VALUE.
cell_after() {
	grep -o "<td>$1</td><td>.*</td>" "$2" | cut -d '>' -f 4 | cut -d '<' -f 1
}

name=$(cell_after "Name" "$1")
if [ -z "$name" ]; then
	exit
fi

price=$(cell_after "Price" "$1")
if [ -z "$price" ]; then
	exit
fi

# $class is not initialised here, so a value inherited from the environment
# takes precedence.  Otherwise try the known "type" labels in order and fall
# back to a fixed default.
for label in "产品类型" "设备类型" "打印针数"; do
	if [ -n "$class" ]; then
		break
	fi
	class=$(cell_after "$label" "$1")
done

if [ -z "$class" ]; then
	class="条形码打印机"
fi


# Map the free-text product type in $class to a numeric category id.
#
# Patterns are tested top to bottom and the first match wins, exactly like
# the if/elif chain this replaces.  The published listing wrapped every
# keyword in typographic quotes, which became part of the grep pattern so no
# branch could match; plain substring globs are used instead.
# The English names are translations of the classtable listing at the end of
# this file.
# NOTE(review): classtable also lists 536187478 (dot-matrix printer), but the
# original chain maps the dot-matrix keyword to 536187477; that is preserved
# here -- confirm whether it is intentional.
case "$class" in
	*票据*|*发票*|*票证*|*存折*|*针*)
		class="536187477" ;;	# receipt / bill printer
	*灯泡*|*UHE*|*UHP*|*HSCR*)
		class="536187479" ;;	# projector lamp
	*条形码打印机*)
		class="536187480" ;;	# barcode printer
	*证卡打印*)
		class="536187483" ;;	# ID-card printer
	*条码*|*扫描*|*阅读*|*采集*|*手持*|*数据终端*)
		class="536187481" ;;	# barcode equipment (scanners, readers, terminals)
	*激光*)
		class="536187484" ;;	# laser printer
	*喷墨*)
		class="536187486" ;;	# inkjet printer
	*复印*)
		class="536187615" ;;	# copier
	*一体机*)
		class="536187485" ;;	# multifunction all-in-one
	*硒鼓*|*墨盒*)
		class="536187616" ;;	# toner / ink cartridge
	*)
		class="536187616" ;;	# anything unrecognised uses the same id
esac

################################################################
################################################################
# Copy the product image to the template directory under the name
# <md5 of the image>.tbi.  Only the first *.jpg found below $basedir is used:
# several matches would otherwise break both md5sum's output parsing and the
# single-target cp.  With no image at all the script stops.
imagepath=$(find "$basedir" -type f -iname "*.jpg" | head -n 1)
if [ -z "$imagepath" ]; then
	exit
fi
# md5sum prints "<hash>  <path>"; keep the hash only.
image=$(md5sum "$imagepath" | cut -d ' ' -f 1)
cp -f -- "$imagepath" "$basedir/../../template/$image.tbi"
################################################################
# The whole input file becomes the description field.
desc=$(cat "$1")
################################################################

# Emit one tab-separated record on stdout ("\t" is expanded by echo -e).
# The line is kept byte-for-byte as published: $name, $class, $price, $desc
# and $image are deliberately left unquoted here, so whitespace runs inside
# them collapse to single spaces in the output.
echo -e \"$name\""\t"110514"\t"\",$class,\""\t"1"\t"\"上海\""\t"\"上海\""\t"\"b\""\t"$price"\t"0.000000"\t"1"\t"7"\t"2"\t"0.000000"\t"0.000000"\t"0.000000"\t""\t""\t"1"\t"1"\t"0"\t"1"\t"1"\t"0"\t"\"2012-10-16 13:09:48\""\t""\t"\"$desc\""\t""\t"\"20000:31140\;20196:3228846\;29969:107401\;30681:32998\;31468:102250\;31479:92188\;3415558:27513\;3415563:21959\;3415571:21959\;3415581:10122\;3415609:22041\;7884463:75957615\;14319244:80897641\;14319250:123483713\;14791484:10285019\;\""\t""\t""\t"0"\t"0"\t"\"2012-10-16 13:37:51\""\t"100"\t""\t"0"\t"\"$image:0:0:\|\;\""\t"\"\""\t"\"\""\t"\",\""\t"\",\""\t"\"\""\t"\"\""\t"0"\t"\"15758222730\""\t"15758222730

 

 # Category id -> category name.  The listing line numbers that were fused
 # into each entry of the published snippet have been removed so the literal
 # parses again; keys and values are unchanged.
 classtable = {
     "536187477": "票据打印机",
     "536187478": "针式打印机",
     "536187479": "投影灯泡",
     "536187480": "条形码打印机",
     "536187481": "条码设备",
     "536187483": "证卡打印机",
     "536187484": "激光打印机",
     "536187485": "多功能一体机",
     "536187486": "喷墨打印机",
     "536187615": "复印复合机",
     "536187616": "硒鼓",
 }

python处理html的table标签

标签:des   http   ar   sp   for   on   数据   div   art   

原文地址:http://www.cnblogs.com/stepit/p/4143039.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!