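# Scrape data from a web page by selecting its HTML tags.
# Requires the lxml parser: pip install lxml
# (requests, beautifulsoup4 and pyperclip are imported below as well).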
import requests
import bs4
import ctypes
import tkinter
import tkinter.ttk
import tkinter.scrolledtext
import pyperclip
import lxml
# HTTP headers sent with every page request.  Only a browser-like User-Agent
# is set; the other headers a browser would send (accept, accept-encoding,
# accept-language, cookie, ...) are left out and can be added here if a
# request turns out to need them.
headers = {
    # The platform token needs the "; " separator between its fields
    # ("Windows NT 10.0; WOW64"); without it the token is malformed.
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/75.0.3770.100 '
        'YaBrowser/19.7.0.1635 Yowser/2.5 Safari/537.36'
    ),
}
def _extract_video_id(text):
    """Return the video id contained in *text* (a pasted URL or a bare id)."""
    text = text.strip()
    if 'http' not in text:
        return text
    # Drop the query string and fragment, then look at the path segments.
    address = text.split('?')[0].split('#')[0]
    segments = [part for part in address.split('/') if part]
    if 'video' in segments[:-1]:
        # .../video/<id>[/...]: the id is the segment right after "video".
        return segments[segments.index('video') + 1]
    # No "/video/" part: fall back to the last path segment instead of
    # indexing a fixed position, which raised IndexError on shorter URLs.
    return segments[-1] if segments else ''


def get_video_info():
    """Fetch the statistics of the video named in the entry box and show them.

    The entry may hold a full video URL or a bare video id.  The id is
    normalized (surrounding whitespace removed, URL reduced to its id) and
    written back to the entry when it differs from what was typed.  The video
    page is then downloaded and the play count, danmaku count, likes, coins,
    favourites, shares and the third ``span`` of the view box (shown next to
    the "时间" hint) are copied into the module globals ``view``, ``dm``,
    ``like``, ``coin``, ``collect``, ``share`` and ``time`` and into the
    matching labels.

    Does nothing when the entry is empty.

    Raises:
        requests.RequestException: the page could not be fetched within the
            10 second timeout or the connection failed.
        IndexError: one of the CSS selectors matched nothing (for example the
            page layout differs from what the selectors expect).
    """
    global view, dm, time, like, coin, collect, share

    typed = GetBVString.get()
    bv_string = _extract_video_id(typed)
    if not bv_string:
        return  # nothing to look up
    if bv_string != typed:
        # Show the normalized id; 'end' is the documented "whole content"
        # end index for an entry widget.
        GetBVString.delete(0, tkinter.END)
        GetBVString.insert(0, bv_string)

    # A timeout keeps the GUI from freezing forever on a stalled connection.
    response = requests.get(
        f'https://www.bilibili.com/video/{bv_string}',
        headers=headers,
        timeout=10,
    )
    soup = bs4.BeautifulSoup(response.text, "lxml")

    # View box: play count, danmaku count and the third span.
    view = soup.select('#viewbox_report >div >span.view')[0].text.replace('播放 · ', '')
    dm = soup.select('#viewbox_report >div >span.dm')[0].text.replace('弹幕', '')
    time = soup.select('#viewbox_report >div >span:nth-child(3)')[0].text

    # Toolbar counters; the replace() calls remove the newline padding that
    # surrounds the numbers in the page markup.
    like = soup.select('#arc_toolbar_report >div.ops >span.like')[0].text.replace('\n ', '')
    coin = (
        soup.select('#arc_toolbar_report >div.ops >span.coin')[0]
        .text.replace('\n ', '').replace('\n ', '')
    )
    collect = soup.select('#arc_toolbar_report >div.ops >span.collect')[0].text.replace('\n ', '')
    share = soup.select('#arc_toolbar_report >div.ops >span.share')[0].text.replace('\n ', '')

    viewLabel.configure(text=view)
    dmLabel.configure(text=dm)
    likeLabel.configure(text=like)
    coinLabel.configure(text=coin)
    collectLabel.configure(text=collect)
    shareLabel.configure(text=share)
    timeLabel.configure(text=time)
def paste():
    """Replace the contents of the entry box with the clipboard text."""
    # 'end' is the documented index for "after the last character"; the
    # empty string used before is not one of the documented entry indices.
    GetBVString.delete(0, tkinter.END)
    GetBVString.insert(0, pyperclip.paste())
def clear():
    """Empty the entry box."""
    # 'end' is the documented index for "after the last character"; the
    # empty string used before is not one of the documented entry indices.
    GetBVString.delete(0, tkinter.END)
# ---- Main window -----------------------------------------------------------
window = tkinter.Tk()

# High-DPI setup.  ctypes.windll and shcore.dll exist only on (recent)
# Windows, so the calls are best-effort: anywhere else Tk keeps its default
# scaling instead of the script crashing at start-up.
try:
    ctypes.windll.shcore.SetProcessDpiAwareness(1)
    ScaleFactor = ctypes.windll.shcore.GetScaleFactorForDevice(0)
except (AttributeError, OSError):
    ScaleFactor = None
if ScaleFactor:
    window.tk.call('tk', 'scaling', ScaleFactor / 72)

window.title('视频数据监控')

# ---- Input row: entry box plus action buttons ------------------------------
GetBVString = tkinter.ttk.Entry(window)
confirm = tkinter.ttk.Button(window, text="获取", command=get_video_info)
# The buttons get their own names so that they do not overwrite the
# paste()/clear() callbacks they are bound to.
paste_button = tkinter.ttk.Button(window, text="粘贴", command=paste)
clear_button = tkinter.ttk.Button(window, text="清空", command=clear)

# ---- Caption labels (left column) ------------------------------------------
viewLabelHint = tkinter.ttk.Label(window, text='播放:')
dmLabelHint = tkinter.ttk.Label(window, text='弹幕:')
timeLabelHint = tkinter.ttk.Label(window, text='时间:')
likeLabelHint = tkinter.ttk.Label(window, text='点赞:')
coinLabelHint = tkinter.ttk.Label(window, text='投币:')
collectLabelHint = tkinter.ttk.Label(window, text='收藏:')
shareLabelHint = tkinter.ttk.Label(window, text='分享:')

# ---- Value labels (right column), filled in by get_video_info() ------------
viewLabel = tkinter.ttk.Label(window, text='')
dmLabel = tkinter.ttk.Label(window, text='')
likeLabel = tkinter.ttk.Label(window, text='')
coinLabel = tkinter.ttk.Label(window, text='')
collectLabel = tkinter.ttk.Label(window, text='')
shareLabel = tkinter.ttk.Label(window, text='')
timeLabel = tkinter.ttk.Label(window, text='')

# ---- Layout ----------------------------------------------------------------
GetBVString.grid(column=0, row=0, columnspan=2)
confirm.grid(column=2, row=0)
paste_button.grid(column=3, row=0)
clear_button.grid(column=4, row=0)

# One grid row per statistic, starting at row 1: caption right-aligned in
# column 0, value left-aligned in column 1.
stat_rows = (
    (viewLabelHint, viewLabel),
    (dmLabelHint, dmLabel),
    (likeLabelHint, likeLabel),
    (coinLabelHint, coinLabel),
    (collectLabelHint, collectLabel),
    (shareLabelHint, shareLabel),
    (timeLabelHint, timeLabel),
)
for row, (hint_label, value_label) in enumerate(stat_rows, start=1):
    hint_label.grid(column=0, row=row, sticky='E')
    value_label.grid(column=1, row=row, sticky='W')

window.mainloop()
爬虫流程
其实把网络爬虫抽象开来看,它无外乎包含如下几个步骤:
模拟请求网页。模拟浏览器,打开目标网站。
获取数据。打开网站之后,就可以自动化地获取我们所需要的网站数据。
保存数据。拿到数据之后,需要持久化到本地文件或者数据库等存储设备中。
那么我们该如何使用 Python 来编写自己的爬虫程序呢?在这里我要重点介绍一个 Python 库:Requests。
Requests 使用
Requests 库是 Python 中发起 HTTP 请求的库,使用非常方便简单。
模拟发送 HTTP 请求
发送 GET 请求
当我们用浏览器打开豆瓣首页时,其实发送的最原始的请求就是 GET 请求:
import requests
# Issue a plain GET request for the Douban home page.
douban_home = 'http://www.douban.com'
res = requests.get(douban_home)
# Print the response object first, then its class, each on its own line.
print(res, type(res), sep='\n')
>>>
<Response [200]>
<class 'requests.models.Response'>