请注意,本文编写于 1260 天前,最后修改于 1260 天前,其中某些信息可能已经过时。
时间紧任务重,这里主要记录整个项目进度和具体构思。首先明确任务目标与需求:
本部分主要目的有两个:
主要使用:requests伪造api请求,正则表达式提取字段,sqlite3数据库存储
涉及python文件:bilbliapi.py database.py
本部分目的主要目的有两个:
主要使用:AGNES聚类算法,sqlite3数据库存储
涉及python文件:datamake.py AGNES.py
本部分主要目的有三个:
涉及python文件:main.py useraner.py using.py
准备使用bilibili为基础,做一个视频推荐系统。
本小节是一个爬虫系统,主要是用于爬取用户关注的up主。哔哩哔哩每一个用户都拥有一个唯一的uid,我们可以从这个uid入手,定位对应的用户。其次获取每一个用户的关注列表,返回数据通过正则表达式进行处理,以备后续使用。
import requests
import urllib
from urllib import request
import urllib.parse
import urllib.request
import re
def requests_GET(url,head): #GET请求封装模块
join_in = requests.get(url,headers=head)
#return_json = join_in.json()
return join_in
def get_AV_data():
'''
获取指定信息,API直接使用
'''
# 请求头
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
"Host": "api.bilibili.com",
"Referer": "https://www.bilibili.com/",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
}
# 刚刚获取的api
api_url = "https://api.bilibili.com/x/web-interface/online"
# 开始请求
res = requests.get(api_url, headers=headers)
online_dic = res.json()
print(online_dic)
#print("最新投稿:%d" % online_dic['data']['all_count'])
#print("在线人数:%d" % online_dic['data']['web_online'])
def followings(id):
'''
爬取关注up主,返回列表 0.up主uid 1.up主名字 2.up主简介 3.bilibili官方认证
'''
#global user
#if id == 0:
# return
following_up = []
i = 0
result = []
ref_url = "https://space.bilibili.com/"+str(id)+"/#/fans/follow"
head = {
'Accept': '*/*',
#'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'DNT': '1',
'Host': 'api.bilibili.com',
'Pragma': 'no-cache',
'Referer': ref_url,
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36'
}
while 1:
i += 1
if i >= 6:
break
url = "https://api.bilibili.com/x/relation/followings?vmid=" + \
str(id)+"&pn="+str(i) + \
"&ps=20&order=desc&jsonp=jsonp&callback=__jp5"
try:
'''
爬取关注up主信息,并正则表达式提取关键信息
'''
r = requests.get(url, headers=head, timeout=10)
#r.encoding='utf-8'
a = r.content
r2=str(a,'utf-8')
#print(r2)
#print(type(r2))
#my_mid = re.split('mid', r2)
pattern_mid = '"mid":(.*?),"'
pattern_uname = '"uname":"(.*?)"'
pattern_sign = '"sign":"(.*?)"'
pattern_desc = '"desc":"(.*?)"'
#my_mid = re.search('(?P<province>\d{3})(?P<city>\d{3})(?P<born_year>\d{4})',s)
up_mid = re.findall(pattern_mid,r2) #up主唯一标识
up_uname = re.findall(pattern_uname,r2) #up主名字
up_sign = re.findall(pattern_sign,r2) #up主简介
up_desc = re.findall(pattern_desc,r2) #up主名牌
#print(up_mid)
#print(up_uname)
#print(up_sign)
#print(up_desc)
'''
处理每轮产生结果,生成字典,每一项存放一个up主
'''
for H in range(0,len(up_mid)):
up_list = []
up_list.append(up_mid[H])
up_list.append(up_uname[H])
up_list.append(up_sign[H])
up_list.append(up_desc[H])
following_up.append(up_list)
except Exception as e:
print(e)
if following_up != []:
return following_up
#save(result, id)
None1 = []
return None1
if __name__ == '__main__':
#get_api()
print(followings(9212842))
本环节主要用于存储数据,由于我们后续可能要构建机器学习的算法,所以我们需要大量的数据,并且数据需要按规则存储,以便后续选择使用。
import sqlite3
import requests
import time
import bilbliapi
user = [] #保证收录的up信息只出现一次
def create():
'''
创建数据库
id 主键值用户唯一id
mid 关注up的id
uname up主的名字
sign up主的简介
desc up主的官方认证
'''
global conn
conn = sqlite3.connect('data.db')
print("Opened database successfully")
conn.execute("""
create table if not exists user(
id INTEGER PRIMARY KEY ,
mid int DEFAULT NULL,
name varchar DEFAULT NULL,
sign varchar DEFAULT NULL,
desc varchar DEFAULT NULL)""")
conn.execute("""
create table if not exists relation(
id INTEGER PRIMARY KEY ,
master int,
following int
)""")
conn.commit()
print("Table created successfully")
conn.commit()
def save(id,result):
'''
添加数据进入数据库
'''
# 将数据保存至本地
global conn
command1 = "insert into user \
(mid,name,sign,desc) values (?,?,?,?);"
command2 = "insert into relation\
(master,following)values(?,?)"
if result == []:
return
for row in result:
try:
temp = (id, row[0])
if row[0] not in user:
user.append(row[0])
conn.execute(command1, row)
conn.execute(command2, temp)
else:
conn.execute(command2, temp)
except Exception as e:
print(e)
print("insert error!")
conn.rollback()
conn.commit()
return
def use_database():
'''
使用爬虫信息备用,创建数据库
9208637
'''
num = 100000
now_num = 9208637
create()
for i in range(0,num):
uid = now_num - i
data = bilbliapi.followings(uid)
if data != []:
save(uid ,data)
print("\r已爬取uid为{0}的用户".format(uid))
conn.close()
if __name__ == "__main__":
use_database()
本文件下主要用于处理数据,包括各种数据库的构建,其中包含核心量化算法。在本实验中设计了三个量化算法。
$$ X = \frac{R + V}{F} $$
$$ Y = \frac{G}{F} $$
$$ X = F\times\frac{G}{V+R} $$
$$ Y = F\times\frac{G}{N} $$
$$ X = \sqrt[4]{F\times\frac{G}{V+R}} $$
$$ Y = \sqrt[4]{F\times\frac{G}{N}} $$
AGNES聚类算法实现,参考周志华《机器学习》,手动实现
提取数据库数据,主要用于2类up主的推荐,原因是第2,3,4类up主的画像相对比较固定,接受人群比较单一
算法流程:
用户协同主要针对1类up主,原因是大up主用户粘合度高,产出稳定,视频种类更加固定且丰富,适合用户协同
算法流程:
启动主函数
import sqlite3
import requests
import time
import bilbliapi
import AGNES
import math
import useraner
import using
def useraner_data_F(mid):
'''
项协同的推荐算法,主要入口
'''
for i in mid:
useraner.take_database(i)
def using_data_F(mid):
'''
用户协同的推荐算法,程序入口
'''
using.using_database(mid)
def using_agnes_data():
'''
基于聚类算法的推荐入口
算法流程:
1.查找数据库,查找出全部一类up主
2.根据mid查找up主相关信息,输出up主相关信息
'''
conn = sqlite3.connect('data.db')
data = conn.execute("SELECT user.mid,user.name,user.desc from user,datausing1 WHERE datausing1.group_num == 1 and datausing1.mid = user.mid GROUP BY user.mid").fetchall()
tplt = "{0:{3}^10}\t{1:{3}^10}\t{2:^10}"
print(tplt.format("up主mid", "up主id", "up获得头衔", chr(12288)))
for i in data:
print(tplt.format(i[0], i[1], i[2], chr(12288)))
return
def initialization(uid): #推荐uid
'''
初始化函数主要为后续两种算法处理数据
'''
A = bilbliapi.followings(uid)
O = ['63231', '116683', '122879', '208259', '375375', '466272', '546195', '562197', '730732', '777536', '883968', '1420982', '1532165', '1577804', '2200736', '2206456', '2374194', '2920960', '3345720', '3353026', '4162287', '4474705', '5294454', '5870268', '5970160', '6574487', '7788379', '8047632', '9008159', '9617619', '9824766', '10558098',
'11688464', '13354765', '14110780', '14583962', '14804670', '17409016', '18202105', '19577966', '20165629', '26366366', '27756469', '29329085', '32786875', '35789774', '37090048', '37663924', '44688866', '51896064', '62540916',
'67141499', '89847338', '90361813', '99157282', '113362335', '161775300', '163637592', '174501086', '176037767', '196356191', '212535360', '233114659', '250858633', '254463269', '280793434', '290526283', '319341196', '321173469', '326499679', '339680732', '353539995', '393166851', '401742377', '414336759', '419738808', '437316738', '455876411', '456664753', '472747194', '474702359', '517327498', '519872016', '587618113', '591856754', '598464467', '605544858', '641591444', '642389251', '1202762767', '1868902080', '2026561407']
up_mid = []
for i in A:
up_mid.append(i[0])
useraner_data = [] #项协同
using_data = [] #用户协同
for i in up_mid:
if any(i in j for j in O):
using_data.append(i)
else:
useraner_data.append(i)
useraner_data_F(useraner_data)
using_data_F(using_data)
2
def muen():
'''
主菜单界面,模式选择等功能
'''
while True:
print("欢迎来的到bilibili推荐热门up系统 ----by ljzjsc")
print("本程序提供两种推荐方式,一种是基于聚类算法的up热门推荐,适用于新来b站的朋友")
print("一种是基于用户协同的推荐方式")
print("1.聚类算法推荐")
print("2.用户协同算法推荐")
num = input("请输入你的选择:")
if(int(num) == 1):
using_agnes_data()
elif(int(num) == 2):
uid = input("请输入uid:")
initialization(uid)
#print("2")
else:
continue
if __name__ == "__main__":
#initialization(1384117815)
#muen()
'''
P = []
conn = sqlite3.connect('data.db')
command1 = "SELECT datausing1.mid from datausing1 where datausing1.group_num ==1 GROUP BY datausing1.mid"
up = conn.execute(command1).fetchall()
for i in up:
P.append(i[0])
print(P)'''
muen()
全部评论 (暂无评论)
info 还没有任何评论,你来说两句呐!