Seven's blog

你不会找到路,除非你敢于迷路

0%

Python 兴趣小组第十周作业

学习目标

第十周的部分目标在第九周提前完成了,第十周实战章节的教程与我目前的能力不太符合。索性调整了计划,自己设计实现了一个简单的新闻爬取程序。

利用了第三方现有的接口,综合使用了之前学到的许多知识,勉强能让自己满意吧。

学习代码

学习代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# -*- coding: utf-8 -*-
import json

import requests
import logging
import mysql.connector
from datetime import datetime
from apscheduler.schedulers.blocking import BlockingScheduler

# Configure the root logger level.
logging.basicConfig(level='INFO')

# URL of the news API.
REQUEST_URL = 'https://news.topurl.cn/api'
# Interval between API fetches, in seconds.
REQUEST_INTERVAL = 60
# Maximum number of news items returned per category.
CATEGORY_ITEM_MAX_SIZE = 10
# Maximum number of news items to fetch.
# NOTE(review): not referenced anywhere in the visible code.
NEWS_MAX_SIZE = 10
# Freshness window: only news from the past n hours is queried.
PAST_HOURS_OF_NEWS = 23


class New:
    """A single news item.

    The ``id`` attribute is not supplied by the caller: it is the integer
    value of the seven characters that sit right before the last five
    characters of ``url`` (i.e. ``url[-12:-5]``).
    """

    def __init__(self, title, url, category):
        self.title, self.url = title, url
        # Slice the numeric part out of the URL tail and use it as the id.
        id_digits = url[-12:-5]
        self.id = int(id_digits)
        self.category = category

# @staticmethod
# def hash_str(string):
# """
# 计算字符串的 hash 值
# :param string: 待计算 hash 值的字符串
# :return: 整型 hash 值
# """
# if not isinstance(string, str):
# string = str(string)
#
# sha = sha1()
# sha.update(string.encode('utf-8'))
# return int(sha.hexdigest(), 16) % (10 ** 8)


def to_markdown_string(news_category):
    """Render categorized news as markdown text.

    :param news_category: dict mapping a category name to an iterable of
        items that expose ``title`` and ``url`` attributes
    :return: the markdown text, or None when ``news_category`` is not a dict
    """
    if not isinstance(news_category, dict):
        return None

    parts = []
    for heading, items in news_category.items():
        # Each category becomes a level-1 heading ...
        parts.append('# ' + heading + ' \n\n')
        # ... followed by one link bullet per news item.
        parts.extend('- [%s](%s) \n\n' % (item.title, item.url) for item in items)

    return ''.join(parts)


def get_news_from_api():
    """Fetch the news list from the API at ``REQUEST_URL``.

    Network errors, non-OK HTTP responses, bodies that are not valid JSON,
    a payload ``code`` of 403 and payloads without ``data.newsList`` are
    logged as warnings and produce an empty list.

    :return: list of ``New`` objects built from the ``title``, ``url`` and
        ``category`` of each entry; empty when nothing usable was fetched
    """
    try:
        # Without a timeout a stalled connection would block this job forever.
        api = requests.get(REQUEST_URL, timeout=10)
    except requests.RequestException as err:
        logging.warning('接口请求失败:%s', err)
        return []

    logging.info('抓取内容为:' + api.text)
    # HTTP-level failure; this is not the rate limit, so say so in the log.
    if not api.ok:
        logging.warning('接口返回异常,状态码:%s', api.status_code)
        return []

    try:
        response = json.loads(api.text)
    except ValueError:
        logging.warning('接口返回内容不是合法的 JSON')
        return []
    if not isinstance(response, dict):
        logging.warning('接口返回内容格式异常')
        return []

    # The API reports rate limiting through a "code" field in the payload.
    if 403 == response.get('code'):
        logging.warning("访问频率过高被限制")
        return []

    data = response.get('data')
    news_list = data.get('newsList') if isinstance(data, dict) else None
    if not news_list:
        return []
    return [New(i['title'], i['url'], i['category']) for i in news_list]


def init_database():
    """Create the ``morning_new`` database and its ``news`` table if missing.

    The cursor and the connection are closed even when a statement fails.

    :return: None
    """
    news_table_sql = r"""
    create table if not exists `news` (
    `id` bigint(0) not null auto_increment comment 'id',
    `category` varchar(20) not null comment 'category',
    `title` varchar(500) not null default '' comment 'title of the new',
    `url` varchar(1500) not null default '' comment 'url of the new',
    `create_time` timestamp not null default CURRENT_TIMESTAMP comment 'create_time',
    primary key (`id`)
    ) charset=utf8mb4 comment 'news item';
    """
    # NOTE(review): credentials are hard-coded here; consider moving them to
    # configuration before using this outside a local setup.
    # No database is selected at connect time because it may not exist yet.
    conn = mysql.connector.connect(user='root', password='123456', host='127.0.0.1', port=3306)
    try:
        cursor = conn.cursor()
        try:
            # cursor.execute('DROP DATABASE IF EXISTS `morning_new`')
            cursor.execute('CREATE DATABASE if not exists `morning_new`')
            cursor.execute('USE `morning_new`')

            # cursor.execute('DROP TABLE IF EXISTS `news`')
            cursor.execute(news_table_sql)
        finally:
            cursor.close()
    finally:
        conn.close()


def get_mysql_connection():
    """Open a new MySQL connection to the ``morning_new`` database.

    :return: the connection returned by ``mysql.connector.connect``
    """
    settings = {
        'user': 'root',
        'password': '123456',
        'host': '127.0.0.1',
        'port': 3306,
        'database': 'morning_new',
    }
    return mysql.connector.connect(**settings)


def remove_duplicates(news):
    """Drop news items that are already stored or repeated in the batch.

    Items that are not ``New`` instances are ignored. When several items in
    ``news`` share an id, only the first one is kept, so the result can be
    inserted without colliding on the primary key.

    :param news: list of candidate news items
    :return: list of ``New`` items whose id is not yet in the ``news`` table;
        empty when ``news`` is not a list or holds no ``New`` item
    """
    if not isinstance(news, list):
        return []

    # First occurrence wins; dict order keeps the original ordering.
    unique_by_id = {}
    for item in news:
        if isinstance(item, New) and item.id not in unique_by_id:
            unique_by_id[item.id] = item
    # An empty id list would produce the invalid SQL "in ()".
    if not unique_by_id:
        return []

    id_list = list(unique_by_id)
    sql = r"select id from news where id in (%s)" % ','.join(['%s'] * len(id_list))
    conn = get_mysql_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, id_list)
            stored_ids = {row[0] for row in cursor.fetchall()}
        finally:
            cursor.close()
    finally:
        conn.close()

    return [item for new_id, item in unique_by_id.items() if new_id not in stored_ids]


def save_news(news):
    """Insert the not-yet-stored news items into the ``news`` table.

    The list is first passed through ``remove_duplicates``; the remaining
    ``New`` items are written with a single multi-row insert. Nothing is
    executed when there is no row to write.

    :param news: list of news items
    :return: None
    """
    if not isinstance(news, list):
        return None
    # Drop items that are already in the database.
    news = remove_duplicates(news)
    rows = [new for new in news if isinstance(new, New)]
    # Without rows the statement would end in a dangling "values".
    if not rows:
        return None

    placeholders = ', '.join(['(%s, %s, %s, %s)'] * len(rows))
    sql = r'insert into news (id, category, title, url) values ' + placeholders
    param_list = []
    for new in rows:
        param_list.extend((new.id, new.category, new.title, new.url))

    conn = get_mysql_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, param_list)
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()
    # Report the number of rows actually written.
    logging.info('存入 %s 条新闻', len(rows))


def query_categories():
    """List the distinct categories of recently stored news.

    "Recently" means ``create_time`` within the past ``PAST_HOURS_OF_NEWS``
    hours. The cursor and connection are closed even when the query fails.

    :return: list of category values
    """
    sql = r'select distinct category from news where create_time > (current_timestamp - interval %s hour)'
    conn = get_mysql_connection()
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (PAST_HOURS_OF_NEWS, ))
            return [row[0] for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        conn.close()


def query_news():
    """Query the newest recent news, grouped by category.

    For every category returned by ``query_categories`` this fetches at most
    ``CATEGORY_ITEM_MAX_SIZE`` rows created within the past
    ``PAST_HOURS_OF_NEWS`` hours, newest first. The cursor and connection
    are closed even when a query fails.

    :return: dict mapping each category to a list of ``New`` objects
    """
    category_list = query_categories()
    paper = {}
    sql = r"""
    select title, url
    from news
    where category = %s
    and create_time > (current_timestamp - interval %s hour)
    order by create_time desc
    limit %s
    """
    conn = get_mysql_connection()
    try:
        cursor = conn.cursor()
        try:
            for category in category_list:
                cursor.execute(sql, (category, PAST_HOURS_OF_NEWS, CATEGORY_ITEM_MAX_SIZE))
                # Rows are (title, url); the category comes from the loop.
                paper[category] = [New(row[0], row[1], category) for row in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        conn.close()

    return paper


def get_and_save_news():
    """Fetch news from the API and store it in the database.

    :return: None
    """
    # Hand the fetched list straight to the persistence step.
    save_news(get_news_from_api())


def query_and_publish_news():
    """Query the recent news and write it to a timestamped markdown file.

    The file is created in the current working directory and named after the
    current local time (``YYYYmmddHHMMSS.md``).

    :return: None
    """
    paper = query_news()
    file_name = datetime.now().strftime('%Y%m%d%H%M%S') + '.md'
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the Chinese titles.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(to_markdown_string(paper))


def main():
    """Initialize the database and run the scheduled jobs until interrupted.

    Two jobs are registered on a blocking scheduler: fetching news every
    ``REQUEST_INTERVAL`` seconds, and publishing via a cron trigger with
    ``minute="0"``.
    """
    init_database()
    scheduler = BlockingScheduler()
    jobs = (
        # Fetch news.
        (get_and_save_news, 'interval', {'seconds': REQUEST_INTERVAL}),
        # Read and publish news.
        (query_and_publish_news, 'cron', {'minute': "0"}),
    )
    for job_func, trigger, trigger_args in jobs:
        scheduler.add_job(job_func, trigger, **trigger_args)
    try:
        scheduler.start()
    except (KeyboardInterrupt, SystemExit):
        print('Exit')


if __name__ == '__main__':
    # TODO: extract the SQL execution code into a helper function
    main()
微信公众号
扫码关注, 一起进步!