自动检测CSDN博客文章阅读次数的爬虫

自动检测CSDN博客文章阅读次数的爬虫

平时没事我会去CSDN上看一看博客有什么变化。文章有了评论或者收到私信时，CSDN都不会主动通知，所以只能经常手动查看博客信息。其实这种事情完全可以写一个脚本来自动检测：如果博客里的信息发生了变化，就自动发一封邮件给我，并且告诉我哪些信息发生了变化；再将这个脚本放到计划任务中，每天定时执行，扫描博客内容。

下面具体说明一下这个脚本的思路和注意事项：

1. 我的想法是先将每篇文章的阅读次数保存到数据库

2. 然后每次将数据库里的数据与博客里的数据进行对比

3. 再将有差别的记录下来，具体信息放到邮件中发送给我

4. URL中list后面是页数，当页数特别大时就显示所有文章，不用分页

5. 文章信息摘取使用BeautifulSoup库，嵌套了几层终于提取了所有信息

6. 写完脚本放到服务器里，开一个计划任务，写一个Shell让它自动执行就好了

下面是源码：

# -*- coding=utf-8 -*-
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
import urllib2
import MySQLdb
import time
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
from email.header import Header
def download(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``urlopen(url).read()`` yields, or ``None`` when the request
        fails with ``urllib2.URLError``.
    """
    try:
        return urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print("error")
        # Only HTTPError (a URLError subclass) carries ``code``. A plain
        # URLError (DNS failure, refused connection, ...) has just ``reason``,
        # so reading ``e.code`` unconditionally raised AttributeError from
        # inside this handler.
        code = getattr(e, 'code', None)
        if code is not None:
            print(code)  # HTTP status code such as 404
        print(e.reason)
        return None
def email(text, toemail):
    """Send the report ``text`` to ``toemail`` over SMTP-over-SSL.

    Args:
        text: Plain-text body of the message.
        toemail: Recipient address.

    Errors raised by ``smtplib`` during login or delivery are not caught and
    propagate to the caller; the connection is shut down either way.
    """
    sender = "rain@rain1024.com"  # sending mailbox
    password = ""  # mailbox authorization code; must be filled in before use
    smtpserver = 'smtp.exmail.qq.com'
    data_time = time.strftime("%Y-%m-%d")
    subject = data_time + "的CSDN博客报告"

    # 'utf-8' is passed because body and subject contain non-ASCII text.
    msg = MIMEText(text, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['from'] = sender
    msg['to'] = toemail

    # SMTP_SSL already connects when it is given a host, so the second
    # connect() call the original made right after construction is dropped.
    smtp = smtplib.SMTP_SSL(smtpserver, 465)
    try:
        smtp.login(sender, password)
        smtp.sendmail(sender, toemail, msg.as_string())
    finally:
        # Close the session even when login or delivery fails.
        smtp.quit()
    print(data_time + "的邮件发送成功!")
def operator_SQL(flag, update=None):
    """Run one of three operations against the ``csdn_article`` table.

    Args:
        flag: Operation selector.
            1 -- insert a new row; ``update`` is indexed as
                 ``[article_id, article_read, article_title]``.
            2 -- store a new read count and its delta; ``update`` is indexed
                 as ``[article_id, article_read, <unused>, change_read]``.
            3 -- look up the stored read count; ``update`` is the article id.
        update: Payload whose shape depends on ``flag`` (see above).

    Returns:
        For ``flag == 3``: the stored ``article_read`` value, or ``-1`` when
        no row exists for the id. If several rows share the id the first
        row's value is returned (the original returned the row count in that
        case). ``None`` for every other flag.
    """
    try:
        conn = MySQLdb.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='root',
            db='test',
            charset='utf8',
        )
    except MySQLdb.Error:
        # Fall back to the password-less "test" account. Only database errors
        # trigger the fallback; the original bare ``except:`` also swallowed
        # KeyboardInterrupt and programming errors.
        conn = MySQLdb.connect(
            host='127.0.0.1',
            port=3306,
            user='test',
            passwd='',
            db='test',
            charset='utf8',
        )

    # The nested try/finally guarantees cursor and connection are released on
    # every path. The original returned from the flag == 3 branch before
    # reaching close(), leaking one connection per lookup.
    try:
        cur = conn.cursor()
        try:
            if flag == 1:
                # The new row's first column is "current row count + 1".
                # NOTE(review): this collides if rows are ever deleted;
                # an auto-increment key would be safer -- confirm schema.
                row_count = cur.execute("select * from csdn_article")
                # Values are passed as parameters so quotes in a scraped
                # title cannot break or inject into the statement.
                # Assumes column order (row id, article_id, article_read,
                # article_title, change_read) -- TODO confirm against schema.
                cur.execute(
                    "insert into csdn_article values(%s, %s, %s, %s, %s)",
                    (row_count + 1, update[0], update[1], update[2], 0),
                )
            elif flag == 2:
                cur.execute(
                    "update csdn_article set article_read=%s, change_read=%s"
                    " where article_id=%s",
                    (update[1], update[3], update[0]),
                )
            elif flag == 3:
                cur.execute(
                    "select article_read from csdn_article where article_id=%s",
                    (update,),
                )
                row = cur.fetchone()
                return -1 if row is None else row[0]
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()
def extract(html):
    """Parse the article-list page, sync read counts to the DB, build a report.

    For every article found on the page the stored read count is looked up
    via ``operator_SQL``: unknown articles are inserted, articles whose count
    changed are updated and included in the report.

    Args:
        html: Markup of the blog's article-list page.

    Returns:
        The report text: a header, the text of the ``blog_statistics``
        element, and one line per article whose read count changed.

    Raises:
        ValueError: If ``html`` is empty or ``None`` (e.g. the download
            failed). The original failed inside BeautifulSoup with an opaque
            error in that case.
    """
    if not html:
        raise ValueError("no HTML to parse (download failed?)")

    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    soup = BeautifulSoup(html)

    # Blog-wide statistics block; tolerate its absence instead of crashing.
    stats_node = soup.find(attrs={'id': 'blog_statistics'})
    comment = stats_node.text if stats_node is not None else ''

    changed_articles = []
    for item in soup.find_all(attrs={'class': 'list_item article_item'}):
        # The tag can be queried directly; re-parsing its string form into a
        # second BeautifulSoup object, as the original did, is unnecessary.
        href = item.a['href']
        article_title = item.find('span', class_='link_title').contents[0].string
        read_text = item.find('span', class_='link_view').contents[1].string

        # The id is the 5th '/'-separated piece of the href.
        article_id = int(str(href).split('/')[4])
        # Drop the first character, then keep what precedes ')'
        # (presumably the text looks like "(123)" -- verify against the page).
        article_read = int(str(read_text[1:]).split(')')[0])

        # Record layout: [id, read count, title] (+ delta when changed).
        article_list = [article_id, article_read, article_title]

        read = operator_SQL(3, article_id)
        if read == -1:
            # Not in the database yet: insert it.
            operator_SQL(1, article_list)
        elif read != article_read:
            # Store the new count together with the difference.
            article_list.append(article_read - read)
            operator_SQL(2, article_list)
            changed_articles.append(article_list)

    parts = [
        "您今天的CSDN博客信息报告如下：",
        str(comment),
        '今日有' + str(len(changed_articles)) + '篇文章的阅读数发生了变化' + '\n',
    ]
    for index, record in enumerate(changed_articles, 1):
        parts.append(
            str(index) + '： ' + record[2] + '总阅读次数为：' + str(record[1])
            + '；比昨天增加了：' + str(record[3]) + '次。' + '\n'
        )
    return ''.join(parts)
def main():
    """Download the article-list page, build the report and e-mail it."""
    # The number after "list/" is the page index; per the author's notes a
    # very large value makes the site list every article on one page.
    url = 'http://blog.csdn.net/rain_web/article/list/500'
    html = download(url)
    if html is None:
        # download() has already printed the error; without markup there is
        # nothing to parse, and passing None on would crash in extract().
        print("download failed, report not sent")
        return
    text = extract(html=html)
    email(text, 'nylrain@163.com')


if __name__ == '__main__':
    main()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

# -*- coding=utf-8 -*-

import sys

reload(sys)

sys.setdefaultencoding( "utf-8" )

import urllib2

import MySQLdb

import time

from bs4 import BeautifulSoup

import smtplib

from email.mime.text import MIMEText

from email.header import Header

def download(url):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        Whatever ``urlopen(url).read()`` yields, or ``None`` when the request
        fails with ``urllib2.URLError``.
    """
    try:
        return urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print("error")
        # Only HTTPError (a URLError subclass) carries ``code``. A plain
        # URLError (DNS failure, refused connection, ...) has just ``reason``,
        # so reading ``e.code`` unconditionally raised AttributeError from
        # inside this handler.
        code = getattr(e, 'code', None)
        if code is not None:
            print(code)  # HTTP status code such as 404
        print(e.reason)
        return None

def email(text, toemail):
    """Send the report ``text`` to ``toemail`` over SMTP-over-SSL.

    Args:
        text: Plain-text body of the message.
        toemail: Recipient address.

    Errors raised by ``smtplib`` during login or delivery are not caught and
    propagate to the caller; the connection is shut down either way.
    """
    sender = "rain@rain1024.com"  # sending mailbox
    password = ""  # mailbox authorization code; must be filled in before use
    smtpserver = 'smtp.exmail.qq.com'
    data_time = time.strftime("%Y-%m-%d")
    subject = data_time + "的CSDN博客报告"

    # 'utf-8' is passed because body and subject contain non-ASCII text.
    msg = MIMEText(text, 'plain', 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['from'] = sender
    msg['to'] = toemail

    # SMTP_SSL already connects when it is given a host, so the second
    # connect() call the original made right after construction is dropped.
    smtp = smtplib.SMTP_SSL(smtpserver, 465)
    try:
        smtp.login(sender, password)
        smtp.sendmail(sender, toemail, msg.as_string())
    finally:
        # Close the session even when login or delivery fails.
        smtp.quit()
    print(data_time + "的邮件发送成功!")

def operator_SQL(flag, update=None):
    """Run one of three operations against the ``csdn_article`` table.

    Args:
        flag: Operation selector.
            1 -- insert a new row; ``update`` is indexed as
                 ``[article_id, article_read, article_title]``.
            2 -- store a new read count and its delta; ``update`` is indexed
                 as ``[article_id, article_read, <unused>, change_read]``.
            3 -- look up the stored read count; ``update`` is the article id.
        update: Payload whose shape depends on ``flag`` (see above).

    Returns:
        For ``flag == 3``: the stored ``article_read`` value, or ``-1`` when
        no row exists for the id. If several rows share the id the first
        row's value is returned (the original returned the row count in that
        case). ``None`` for every other flag.
    """
    try:
        conn = MySQLdb.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            passwd='root',
            db='test',
            charset='utf8',
        )
    except MySQLdb.Error:
        # Fall back to the password-less "test" account. Only database errors
        # trigger the fallback; the original bare ``except:`` also swallowed
        # KeyboardInterrupt and programming errors.
        conn = MySQLdb.connect(
            host='127.0.0.1',
            port=3306,
            user='test',
            passwd='',
            db='test',
            charset='utf8',
        )

    # The nested try/finally guarantees cursor and connection are released on
    # every path. The original returned from the flag == 3 branch before
    # reaching close(), leaking one connection per lookup.
    try:
        cur = conn.cursor()
        try:
            if flag == 1:
                # The new row's first column is "current row count + 1".
                # NOTE(review): this collides if rows are ever deleted;
                # an auto-increment key would be safer -- confirm schema.
                row_count = cur.execute("select * from csdn_article")
                # Values are passed as parameters so quotes in a scraped
                # title cannot break or inject into the statement.
                # Assumes column order (row id, article_id, article_read,
                # article_title, change_read) -- TODO confirm against schema.
                cur.execute(
                    "insert into csdn_article values(%s, %s, %s, %s, %s)",
                    (row_count + 1, update[0], update[1], update[2], 0),
                )
            elif flag == 2:
                cur.execute(
                    "update csdn_article set article_read=%s, change_read=%s"
                    " where article_id=%s",
                    (update[1], update[3], update[0]),
                )
            elif flag == 3:
                cur.execute(
                    "select article_read from csdn_article where article_id=%s",
                    (update,),
                )
                row = cur.fetchone()
                return -1 if row is None else row[0]
        finally:
            cur.close()
        conn.commit()
    finally:
        conn.close()

def extract(html):
    """Parse the article-list page, sync read counts to the DB, build a report.

    For every article found on the page the stored read count is looked up
    via ``operator_SQL``: unknown articles are inserted, articles whose count
    changed are updated and included in the report.

    Args:
        html: Markup of the blog's article-list page.

    Returns:
        The report text: a header, the text of the ``blog_statistics``
        element, and one line per article whose read count changed.

    Raises:
        ValueError: If ``html`` is empty or ``None`` (e.g. the download
            failed). The original failed inside BeautifulSoup with an opaque
            error in that case.
    """
    if not html:
        raise ValueError("no HTML to parse (download failed?)")

    # NOTE(review): no parser is named, so bs4 picks whichever is installed.
    soup = BeautifulSoup(html)

    # Blog-wide statistics block; tolerate its absence instead of crashing.
    stats_node = soup.find(attrs={'id': 'blog_statistics'})
    comment = stats_node.text if stats_node is not None else ''

    changed_articles = []
    for item in soup.find_all(attrs={'class': 'list_item article_item'}):
        # The tag can be queried directly; re-parsing its string form into a
        # second BeautifulSoup object, as the original did, is unnecessary.
        href = item.a['href']
        article_title = item.find('span', class_='link_title').contents[0].string
        read_text = item.find('span', class_='link_view').contents[1].string

        # The id is the 5th '/'-separated piece of the href.
        article_id = int(str(href).split('/')[4])
        # Drop the first character, then keep what precedes ')'
        # (presumably the text looks like "(123)" -- verify against the page).
        article_read = int(str(read_text[1:]).split(')')[0])

        # Record layout: [id, read count, title] (+ delta when changed).
        article_list = [article_id, article_read, article_title]

        read = operator_SQL(3, article_id)
        if read == -1:
            # Not in the database yet: insert it.
            operator_SQL(1, article_list)
        elif read != article_read:
            # Store the new count together with the difference.
            article_list.append(article_read - read)
            operator_SQL(2, article_list)
            changed_articles.append(article_list)

    parts = [
        "您今天的CSDN博客信息报告如下：",
        str(comment),
        '今日有' + str(len(changed_articles)) + '篇文章的阅读数发生了变化' + '\n',
    ]
    for index, record in enumerate(changed_articles, 1):
        parts.append(
            str(index) + '： ' + record[2] + '总阅读次数为：' + str(record[1])
            + '；比昨天增加了：' + str(record[3]) + '次。' + '\n'
        )
    return ''.join(parts)

def main():
    """Download the article-list page, build the report and e-mail it."""
    # The number after "list/" is the page index; per the author's notes a
    # very large value makes the site list every article on one page.
    url = 'http://blog.csdn.net/rain_web/article/list/500'
    html = download(url)
    if html is None:
        # download() has already printed the error; without markup there is
        # nothing to parse, and passing None on would crash in extract().
        print("download failed, report not sent")
        return
    text = extract(html=html)
    email(text, 'nylrain@163.com')


if __name__ == '__main__':
    main()