码迷,mamicode.com
首页 > Web开发 > 详细

网页行为分析

时间:2016-09-20 12:05:17      阅读:218      评论:0      收藏:0      [点我收藏+]

标签:

去掉重复的IP,并且存入数据库:

import MySQLdb

# Collect all visitor IPs from the trace table, deduplicate them, and store
# each distinct IP with a sequential id in t_userIP_list.
# NOTE: the scraped original had every string literal unquoted; quotes restored.
db = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()
try:
    # Fetch every recorded visit IP (one row per visit, duplicates expected).
    cur.execute("select ip from t_useripvisittrace")
    data = cur.fetchall()

    # First column of each row is the IP; a set removes the duplicates.
    # (Iteration order of the set is arbitrary, same as the original code.)
    unique_ips = list(set(row[0] for row in data))

    # Pair each distinct IP with a sequential id and bulk-insert.
    value = [(i, ip) for i, ip in enumerate(unique_ips)]
    cur.executemany("insert into t_userIP_list values(%s,%s)", value)
    db.commit()
finally:
    # Always release the cursor and connection, even if the insert fails.
    cur.close()
    db.close()

 根据网站情况,进行网站的分类:

# -*- coding: utf-8 -*-
import MySQLdb
import requests
from lxml import etree

# Classify every visited page path and store (path, category) rows in
# t_url_classify.  Root / index pages are labelled "首页"; category pages are
# labelled by scraping the breadcrumb link text from the live site; anything
# unrecognised is labelled "其他".
# NOTE: the scraped original had every string literal unquoted; quotes restored.
count = 0
base_url = "http://www.tipdm.org"

db = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()

cur.execute("select page_path from t_useripvisittrace")
data = cur.fetchall()
print(len(data))

insert_sql = "insert into t_url_classify values(%s,%s)"

for each in data:
    path = each[0]
    if path == "/" or path == "/index.jhtml":
        # Site root and the explicit index page are both the home page.
        label = "首页"
    elif "index" in path:
        # Section index page: scrape the category name from the breadcrumb.
        html = requests.get(base_url + path)
        selector = etree.HTML(html.text)
        texts = selector.xpath("/html/body/div[6]/div[2]/div[1]/div/a[2]/text()")
        # xpath() returns a list; take the first text node (empty if missing).
        label = texts[0] if texts else ""
    elif ".jhtml" in path:
        # Article page: breadcrumb lives at a slightly different position.
        html = requests.get(base_url + path)
        selector = etree.HTML(html.text)
        texts = selector.xpath("/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()")
        label = texts[0] if texts else ""
    else:
        label = "其他"

    # Insert exactly one row per path.  The original appended to a growing
    # list and passed the WHOLE list to executemany on every iteration,
    # re-inserting all previous rows each time (quadratic duplicate rows).
    cur.execute(insert_sql, (path, label))
    db.commit()
    count += 1
    print(count)

print("finish")
cur.close()
db.close()

 使用pandas读取数据库进行统计

import pandas as pd
from sqlalchemy import create_engine
import MySQLdb

# Count visits per IP with pandas, reading the trace table in 10k-row chunks,
# and store the per-chunk (ip, count) pairs in the `userip` table.
# Connection URL layout: dialect(mysql)+driver(pymysql)://user:password@host:port/dbname
# NOTE: the scraped original had every string literal unquoted; quotes restored.
engine = create_engine(
    "mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8")
# With chunksize set, read_sql returns a lazy iterator of DataFrames;
# nothing is actually read from the database until we loop over it.
sql = pd.read_sql("t_useripvisittrace", engine, chunksize=10000)
# Unused spreadsheet path kept from the original script.
output = "C:\\Users\\lenovo\\Desktop\\count_.xls"

db = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()

value = []
for chunk in sql:
    # value_counts() gives a Series indexed by IP with the visit count.
    counts = chunk["ip"].value_counts()
    rows = list(counts.items())  # one (ip, count) tuple per distinct IP
    value.extend(rows)
    # Insert only THIS chunk's rows.  The original passed the whole
    # accumulated list each iteration (duplicate inserts), built rows as
    # tuples of lists that the driver cannot escape, and used a broken
    # placeholder string "%,%s" (typo for "%s,%s").
    cur.executemany("insert into userip values(%s,%s)", rows)
    db.commit()

print(value)
cur.close()
db.close()

 

网页行为分析

标签:

原文地址:http://www.cnblogs.com/caicaihong/p/5869653.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!