-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbili_reply.py
173 lines (146 loc) · 5.5 KB
/
bili_reply.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import re
import requests
import json
from crawler_target_users_good import bili_action
from crawler_target_users_good import utils
import time
import random
from tenacity import *
# Module-level settings.
# Earlier hard-coded cut-off dates, kept for reference:
# stop_date = '2018-07-10 00:00:00'
# stop_date = '2000-03-15 00:00:00'
# Cut-off date taken from utils.STOP_DATE. parse_reply compares it as a string
# against dates formatted as "%Y-%m-%d %H:%M:%S".
stop_date = utils.STOP_DATE
# Path of the keyword file read by hit_comment (one pattern per line).
keywords_comment_bad = './keywords/keywords_comment_bad'
class Count:
    """Holder for crawler-wide state, kept as class attributes.

    ``act_random`` reads ``count_to_pause`` to decide when to take an extra
    pause (whenever it is a multiple of 10) and increments it after each
    round of actions that finishes without a ``RetryError``.
    """

    # Starts at 1 so the very first call does not trigger the extra pause.
    count_to_pause = 1
def reply_url(pn_r, oid):
    """Build the reply-list API URL for one page of a video's comments.

    Args:
        pn_r: Page number, placed in the ``pn`` query parameter.
        oid: Video id (the "av" number), placed in the ``oid`` query parameter.

    Returns:
        The complete URL as a string.
    """
    url_template = 'https://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn={}&type=1&oid={}&sort=0'
    return url_template.format(pn_r, oid)
# Placeholder endpoint URLs. Both are empty strings and nothing in the
# visible part of this file reads them.
url_dm = ''
url_send_msg = ''
def timeStamp_to_date(timeStamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        timeStamp: Seconds since the epoch, as accepted by ``time.localtime``.

    Returns:
        The timestamp rendered in the machine's local time zone.
    """
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timeStamp))
def hit_comment(content):
    """Tell whether a comment is free of every "bad" keyword pattern.

    The keyword file named by the module-level ``keywords_comment_bad`` is
    read on every call. Each non-blank line is treated as a regular
    expression and searched for in ``content``.

    Args:
        content: Comment text to check.

    Returns:
        False if any keyword pattern matches ``content``, True otherwise.

    Raises:
        OSError: If the keyword file cannot be opened.
        re.error: If a line of the keyword file is not a valid pattern.
    """
    with open(keywords_comment_bad, encoding='utf-8') as f_keys:
        # Blank lines are dropped: an empty pattern matches every string and
        # would otherwise make this function reject all comments.
        keyword_list = [line for line in f_keys.read().splitlines() if line.strip()]
    return not any(re.search(key, content) for key in keyword_list)
def act_random(video_id, rpid, mid):
    """Run the standard actions for a matched comment, plus an occasional extra.

    Always sleeps one second first, and additionally sleeps
    ``utils.RETRY_WAIT_TIME`` whenever ``Count.count_to_pause`` is a multiple
    of 10. It then calls ``bili_action.action_like`` and
    ``bili_action.action_follow`` (presumably liking the comment and following
    its author; confirm in ``bili_action``). A random roll from 1 to 10
    selects one optional extra call: 8 -> ``action_2``, 9 -> ``action_1``,
    10 -> ``action_3``. The counter is incremented only when no
    ``RetryError`` was raised.

    Args:
        video_id: Video id passed to ``action_like`` and ``action_1``.
        rpid: Reply id passed to ``action_like``.
        mid: User id passed to ``action_follow`` and ``action_3``.
    """
    time.sleep(1)
    # Every tenth counted call takes an additional, longer pause.
    if Count.count_to_pause % 10 == 0:
        time.sleep(utils.RETRY_WAIT_TIME)
    try:
        roll = random.randint(1, 10)
        bili_action.action_like(video_id, rpid, '1')
        bili_action.action_follow(mid, '1')
        if roll == 8:
            bili_action.action_2('3')
        elif roll == 9:
            bili_action.action_1(video_id)
        elif roll == 10:
            bili_action.action_3(mid)
        Count.count_to_pause += 1
    except RetryError as err:
        # A failed action is reported and skipped; the counter stays put.
        print(err)
def get_target_user(comment):
    """Decide whether a comment matches the crawler's target-user heuristics.

    The comment is lower-cased and then checked in two stages:

    1. If it contains any of the direct phrases (e.g. '想买', '推荐'), it is
       accepted immediately.
    2. Otherwise the number of matching feature pattern groups (CPU model,
       buy-verbs, screen size/model year, Mac names) is counted. When an
       intent word ('想', '打算' or '入') is present, one matching group is
       enough; without one, at least three groups must match.

    Every accepted comment is printed, prefixed with three tab characters.

    Args:
        comment: Comment text to classify.

    Returns:
        True if the comment is accepted, False otherwise. A miss now
        yields an explicit False rather than an implicit None.
    """
    comment = comment.lower()
    intent_pattern = '想|打算|入'
    feature_patterns = [
        'i[57]',
        '[入买弄搞]',
        '1[3456789][年款的寸吋]|十[三四五六七八九][年款的寸吋]',
        '(mac)|(苹果)|mba|mbp',
    ]
    direct_phrases = ['入手', '想入', '想买', '推荐', '买什么', '怎么选', '买那款', '买哪款']

    if any(re.search(phrase, comment) for phrase in direct_phrases):
        matched = True
    else:
        # Count how many feature groups appear in the comment.
        hits = sum(1 for pattern in feature_patterns if re.search(pattern, comment))
        # An intent word lowers the bar from three groups to one.
        needed = 1 if re.search(intent_pattern, comment) else 3
        matched = hits >= needed

    if matched:
        print('\t\t\t' + comment)
    return matched
def parse_reply(video_id, replies, f_comm):
    """Scan a list of replies, record the matching ones and act on them.

    For each reply newer than ``stop_date`` that passes ``hit_comment`` and
    ``get_target_user``, one tab-separated line (mid, message, date, uname,
    rpid) is written to ``f_comm`` and ``act_random`` is called. Newlines in
    the message are replaced with ``___`` first. Replies with a positive
    ``rcount`` have their nested ``replies`` processed recursively; the
    result of that nested call is ignored.

    Args:
        video_id: Video id forwarded to ``act_random``.
        replies: List of reply dicts from the API response, or None.
        f_comm: Writable text file that receives the matching comments.

    Returns:
        False as soon as a reply older than ``stop_date`` is seen in
        ``replies``; True otherwise, including when ``replies`` is None.
    """
    if replies is None:
        return True

    for reply in replies:
        comment_msg = reply['content']['message']
        rpid = reply['rpid_str']
        date_comm = timeStamp_to_date(reply['ctime'])
        member = reply['member']
        mid = str(member['mid'])
        uname = member['uname']

        # Dates share one fixed-width format, so string order is time order.
        is_recent = date_comm > stop_date
        if is_recent and hit_comment(comment_msg) is True:
            comment_msg = re.sub('\n', '___', comment_msg)
            if get_target_user(comment_msg):
                fields = [mid, comment_msg, date_comm, uname, rpid]
                f_comm.write('\t'.join(fields) + '\n')
                act_random(video_id, rpid, mid)

        # Descend into second-level replies when the reply has any.
        if reply['rcount'] > 0:
            parse_reply(video_id, reply['replies'], f_comm)

        # A reply older than the cut-off tells the caller to stop crawling.
        if date_comm < stop_date:
            return False

    return True
@retry(stop=stop_after_attempt(3), wait=wait_fixed(utils.RETRY_WAIT_TIME))
def get_reply(video_id):
    """Crawl a video's comment pages and save the matching comments.

    Pages are fetched one at a time from ``reply_url`` and handed to
    ``parse_reply``, which writes matches to
    ``write_to_local/comments_hit_<video_id>.txt``. Crawling stops when
    ``parse_reply`` returns False or when the last page is reached (the
    reported comment count divided by 20 per page).

    The function is retried up to 3 times with a fixed wait of
    ``utils.RETRY_WAIT_TIME`` between attempts. Each attempt reopens the
    output file in ``'w+'`` mode, which truncates it.

    Args:
        video_id: Video id (the "av" number) whose comments are crawled.
    """
    comment_detail_path = 'write_to_local/comments_hit_{}.txt'.format(video_id)
    # The with-block closes and flushes the file even when a request or a
    # parsing step raises; otherwise each retry attempt would leak a handle.
    with open(comment_detail_path, 'w+', encoding='utf-8') as f_comm:
        page_r = 1
        while True:
            print('\t\t当前正在爬第{}页'.format(page_r))
            # NOTE(review): no timeout is set, so a stalled connection
            # blocks indefinitely; consider passing timeout=...
            res_reply = requests.get(reply_url(page_r, video_id))
            res_reply_json = json.loads(res_reply.text)
            data = res_reply_json['data']
            reply_ori = data['replies']
            # Total number of comments, used to derive the page count.
            count_reply = data['page']['count']
            pages = count_reply / 20
            flag = parse_reply(video_id, reply_ori, f_comm)
            if flag is False or page_r >= pages:
                break
            page_r = page_r + 1
            # Pause between pages.
            time.sleep(1)
if __name__ == '__main__':
    # Video ids tried during development: '45261119', '32142910',
    # '55270114' (a Mac-related video), '9212564' (a testv "worth buying?"
    # video). Only the id below is in effect.
    video_id = '31141936'  # id used for testing
    # The crawl itself is currently disabled:
    # get_reply(video_id)
    # Smoke-check the classifier on one sample comment.
    print(get_target_user('请问一个问题,目前在慢慢学习PS和视频剪辑,手头的电脑很卡,试试用苹果电脑 13寸 16g 512,不知道上i7会不会有点浪费?个人原因不是很上15寸……'))