爬虫弹幕可以下载吗

Python017

爬虫弹幕可以下载吗,第1张

可以。准备:Python 3 环境,需要安装 BeautifulSoup、selenium 包和 phantomjs。原理:通过 aid 下载 bilibili 番剧弹幕——先通过 aid 获取 cid(例如 http://www.bilibili.com/widget/getPageList?aid=9654289),再从弹幕地址 http://comment.bilibili.com/{cid}.xml 下载弹幕(其中 {cid} 替换为上一步获取到的 cid)。

# code at 2021-10-1

# Fetch the data by selecting HTML tags

# Requires the lxml package (pip install lxml)

import requests

import bs4

import ctypes

import tkinter

import tkinter.ttk

import tkinter.scrolledtext

import pyperclip

import lxml

# HTTP request headers sent with every page fetch.
# Only a browser-like User-Agent is sent; the tokens inside the parentheses
# must be separated by "; " to form a well-formed User-Agent value.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/75.0.3770.100 YaBrowser/19.7.0.1635 Yowser/2.5 Safari/537.36',
    # Optional headers, left disabled:
    #   'authority': 'developer.mozilla.org'
    #   'pragma': 'no-cache'
    #   'cache-control': 'no-cache'
    #   'upgrade-insecure-requests': '1'
    #   'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,'
    #             'image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
    #   'accept-encoding': 'gzip, deflate, br'
    #   'accept-language': 'zh-CN,zh-TW;q=0.9,zh;q=0.8,en-US;q=0.7,en;q=0.6'
    #   'cookie': <your cookie>
}

# Most recently scraped statistics for the video, kept at module level.
# A ``global`` statement at module scope has no effect, so the names are bound
# explicitly here; each one holds '' until a value is assigned elsewhere.
view = ''     # play count
dm = ''       # danmaku (bullet comment) count
time = ''     # text of the third span under #viewbox_report
like = ''     # like count
coin = ''     # coin count
collect = ''  # favourite count
share = ''    # share count

def get_video_info():
    """Fetch the statistics of the video named in the entry box and display them.

    Reads the text of the ``GetBVString`` entry. If it contains ``'http'`` it is
    treated as a full URL: the query string is dropped, the fifth
    ``/``-separated segment is taken as the video id, and the entry is rewritten
    to show just that id. Otherwise the (whitespace-stripped) text is used as
    the id directly.

    The page ``https://www.bilibili.com/video/<id>`` is then downloaded and the
    play, danmaku, like, coin, favourite, share and time fields are extracted
    with CSS selectors, stored in the module-level variables of the same names
    and written to the corresponding result labels.

    Raises:
        IndexError: if the URL has fewer than five ``/``-separated segments or
            a selector matches nothing on the downloaded page.
        requests.RequestException: if the download fails or times out.
    """
    global view, dm, time, like, coin, collect, share

    half_url = GetBVString.get()
    if 'http' in half_url:
        bv_string = half_url.strip().split('?')[0].split('/')[4]
        # 'end' is the Tk index for the end of the entry text; an empty string
        # is not a valid entry index.
        GetBVString.delete(0, 'end')
        GetBVString.insert(0, bv_string)
    else:
        # Stray whitespace around a bare id would corrupt the request URL.
        bv_string = half_url.strip()

    # A timeout keeps this GUI callback from blocking forever on a dead connection.
    response = requests.get(
        f'https://www.bilibili.com/video/{bv_string}',
        headers=headers,
        timeout=10,
    )
    soup = bs4.BeautifulSoup(response.text, "lxml")

    view = soup.select('#viewbox_report >div >span.view')[0].text.replace('播放 · ', '')
    dm = soup.select('#viewbox_report >div >span.dm')[0].text.replace('弹幕', '')
    like = soup.select('#arc_toolbar_report >div.ops >span.like')[0].text.replace('\n    ', '')
    coin = (
        soup.select('#arc_toolbar_report >div.ops >span.coin')[0].text
        .replace('\n      ', '')
        .replace('\n    ', '')
    )
    collect = soup.select('#arc_toolbar_report >div.ops >span.collect')[0].text.replace('\n    ', '')
    share = soup.select('#arc_toolbar_report >div.ops >span.share')[0].text.replace('\n      ', '')
    time = soup.select('#viewbox_report >div >span:nth-child(3)')[0].text

    viewLabel.configure(text=view)
    dmLabel.configure(text=dm)
    likeLabel.configure(text=like)
    coinLabel.configure(text=coin)
    collectLabel.configure(text=collect)
    shareLabel.configure(text=share)
    timeLabel.configure(text=time)

def paste():
    """Replace the contents of the ``GetBVString`` entry with the clipboard text."""
    # 'end' is the Tk index for the end of the entry text; an empty string is
    # not a valid entry index.
    GetBVString.delete(0, 'end')
    GetBVString.insert(0, pyperclip.paste())

def clear():
    """Remove all text from the ``GetBVString`` entry."""
    # 'end' is the Tk index for the end of the entry text; an empty string is
    # not a valid entry index.
    GetBVString.delete(0, 'end')

# ---------------------------------------------------------------------------
# Build the main window and start the Tk event loop.
# ---------------------------------------------------------------------------
window = tkinter.Tk()

# High-DPI handling relies on the Windows-only ``ctypes.windll`` / shcore API.
# On platforms where it is unavailable, Tk's default scaling is left untouched
# instead of crashing at start-up.
try:
    ctypes.windll.shcore.SetProcessDpiAwareness(1)
    ScaleFactor = ctypes.windll.shcore.GetScaleFactorForDevice(0)
except (AttributeError, OSError):
    ScaleFactor = None
if ScaleFactor is not None:
    window.tk.call('tk', 'scaling', ScaleFactor / 72)

window.title('视频数据监控')

# Input row: entry for the URL / video id plus three action buttons.
GetBVString = tkinter.ttk.Entry(window)
confirm = tkinter.ttk.Button(window, text="获取", command=get_video_info)
# The buttons get their own names so that ``paste`` and ``clear`` keep
# referring to the callback functions rather than being rebound to widgets.
paste_button = tkinter.ttk.Button(window, text="粘贴", command=paste)
clear_button = tkinter.ttk.Button(window, text="清空", command=clear)

# Caption labels (left column).
viewLabelHint = tkinter.ttk.Label(window, text='播放:')
dmLabelHint = tkinter.ttk.Label(window, text='弹幕:')
timeLabelHint = tkinter.ttk.Label(window, text='时间:')
likeLabelHint = tkinter.ttk.Label(window, text='点赞:')
coinLabelHint = tkinter.ttk.Label(window, text='投币:')
collectLabelHint = tkinter.ttk.Label(window, text='收藏:')
shareLabelHint = tkinter.ttk.Label(window, text='分享:')

# Value labels (right column); get_video_info() fills in their text.
viewLabel = tkinter.ttk.Label(window, text='')
dmLabel = tkinter.ttk.Label(window, text='')
likeLabel = tkinter.ttk.Label(window, text='')
coinLabel = tkinter.ttk.Label(window, text='')
collectLabel = tkinter.ttk.Label(window, text='')
shareLabel = tkinter.ttk.Label(window, text='')
timeLabel = tkinter.ttk.Label(window, text='')

# Layout: row 0 holds the input controls.
GetBVString.grid(column=0, row=0, columnspan=2)
confirm.grid(column=2, row=0)
paste_button.grid(column=3, row=0)
clear_button.grid(column=4, row=0)

# Rows 1-7 hold one caption/value pair each, in display order.
_label_rows = (
    (viewLabelHint, viewLabel),
    (dmLabelHint, dmLabel),
    (likeLabelHint, likeLabel),
    (coinLabelHint, coinLabel),
    (collectLabelHint, collectLabel),
    (shareLabelHint, shareLabel),
    (timeLabelHint, timeLabel),
)
for _row, (_hint, _value) in enumerate(_label_rows, start=1):
    _hint.grid(column=0, row=_row, sticky='E')
    _value.grid(column=1, row=_row, sticky='W')

window.mainloop()

爬虫流程

其实把网络爬虫抽象开来看,它无外乎包含如下几个步骤:

模拟请求网页。模拟浏览器,打开目标网站。

获取数据。打开网站之后,就可以自动化地获取我们所需要的网站数据。

保存数据。拿到数据之后,需要持久化到本地文件或者数据库等存储设备中。

那么我们该如何使用 Python 来编写自己的爬虫程序呢,在这里我要重点介绍一个 Python 库:Requests。

Requests 使用

Requests 库是 Python 中发起 HTTP 请求的库,使用非常方便简单。

模拟发送 HTTP 请求

发送 GET 请求

当我们用浏览器打开豆瓣首页时,其实发送的最原始的请求就是 GET 请求:

import requests

# Send a plain GET request to the Douban home page.
res = requests.get('http://www.douban.com')

# Show the response object and then its type, one per line.
print(res, type(res), sep='\n')

>>>

<Response [200]>

<class 'requests.models.Response'>