From 705cf09741635cfca718dae2e2d269028c274ae4 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 00:43:57 +0800 Subject: [PATCH 01/10] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=97=A0=E7=94=A8?= =?UTF-8?q?=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proxy_script.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) delete mode 100644 proxy_script.py diff --git a/proxy_script.py b/proxy_script.py deleted file mode 100644 index 5bf8232..0000000 --- a/proxy_script.py +++ /dev/null @@ -1,34 +0,0 @@ -# ! IMPORT ! make sure you ran mitmproxy with this script, -# eg: `/path/to/mitmproxy -s mitmproxy.py` -import time -from mitmproxy import http -import re -import requests - -session = requests.session() - - -class Writer: - def response(self, flow: http.HTTPFlow): - # /aweme/v1/web/user/profile/other/ 他人主页获取他人信息 - if '/aweme/v1/web/user/profile/other' in flow.request.path: - response_json_content = flow.response.content - session.post("http://127.0.0.1:5000/user_info", headers={ - "X-MITM-TS": str(time.time()), - "X_REFERER": flow.request.url - }, data=response_json_content, timeout=(1, 1)) - - def websocket_message(self, flow: http.HTTPFlow): - re_c = re.search('webcast\d-ws-web-.*\.douyin\.com', flow.request.host) - if re_c: - message = flow.websocket.messages[-1] - if message.from_client: - return - content = message.content - session.post("http://127.0.0.1:5000/message", headers={ - "X-MITM-TS": str(time.time()), - "X_REFERER": flow.request.url - }, data=content, timeout=(1, 1)) - - -addons = [Writer()] From 0edb398e20914043027ece93bc557572ad6ebaa3 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 00:46:07 +0800 Subject: [PATCH 02/10] Bump mitmproxy and protobuf version --- proxy/manager.py | 26 ++++++++++++++------------ requirements.txt | 5 ++--- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/proxy/manager.py b/proxy/manager.py index d8f4b3a..bd340ba 100644 --- a/proxy/manager.py +++ b/proxy/manager.py @@ -18,17 +18,20 @@ _manager: "Optional[ProxyManager]" = None class ProxyManager: def __init__(self): self._mitm_instance = None - self._loop: "Optional[asyncio.AbstractEventLoop]" = None + self._loop: "asyncio.AbstractEventLoop" = asyncio.new_event_loop() opts = Options( listen_host=config()['mitm']['host'], listen_port=config()['mitm']['port'], ) - self._mitm_instance = DumpMaster(options=opts) - self._load_addon() - opts.update_defer( - flow_detail=0, - termlog_verbosity="error", - ) + async def _init_mitm_instance(): + self._mitm_instance = DumpMaster(options=opts) + self._load_addon() + opts.update_defer( + flow_detail=0, + termlog_verbosity="error", + ) + _loop = asyncio.get_event_loop() + _loop.run_until_complete(_init_mitm_instance()) self._thread = None def __del__(self): @@ -45,13 +48,12 @@ class ProxyManager: self._mitm_instance.addons.add(DanmakuWebsocketAddon(MESSAGE_QUEUE)) def _start(self): - loop = asyncio.new_event_loop() - self._loop = loop - asyncio.set_event_loop(loop) - self._mitm_instance.run() + asyncio.set_event_loop(self._loop) + if self._mitm_instance: + self._loop.run_until_complete(self._mitm_instance.run()) def start_loop(self): - self._thread = threading.Thread(target=self._start) + self._thread = threading.Thread(target=self._start, args=()) self._thread.start() def join(self): diff --git a/requirements.txt b/requirements.txt index 7b7733b..21ac01a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ colorama==0.4.4 selenium==4.1.0 -requests==2.27.1 -mitmproxy~=7.0.4 -protobuf<3.19 \ No newline at end of file +mitmproxy~=8.1.0 +protobuf~=3.20.1 \ No newline at end of file From 6bbb5cddb2675a5aabaa6ed22aceadc4c0359176 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 00:46:32 +0800 Subject: [PATCH 03/10] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=97=A0=E7=94=A8?= =?UTF-8?q?=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/settings.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config/settings.yml b/config/settings.yml index 01a3469..ddfa482 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -32,9 +32,5 @@ live: users: - MS4wLjABAAAAzBItqEvCjPryxn_Y6w6LtRBFDOVNfjvYSJg8VVZFwlw -http: - host: 127.0.0.1 - port: 5000 - api: userinfo: 'https://live.douyin.com/webcast/user/?aid=6383&target_uid=' \ No newline at end of file From 2c652189faf629c61da612cb41c2f7498acf673d Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 01:11:18 +0800 Subject: [PATCH 04/10] =?UTF-8?q?=E5=85=BC=E5=AE=B9=E6=9C=AA=E8=AE=BE?= =?UTF-8?q?=E7=BD=AE=E7=9A=84=E6=83=85=E5=86=B5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- browser/manager.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/browser/manager.py b/browser/manager.py index aa1942e..751559e 100644 --- a/browser/manager.py +++ b/browser/manager.py @@ -1,10 +1,10 @@ import threading +from typing import TYPE_CHECKING from urllib.parse import urlparse -from config.helper import config -from browser.edge import EdgeDriver from browser.chrome import ChromeDriver -from typing import TYPE_CHECKING +from browser.edge import EdgeDriver +from config.helper import config if TYPE_CHECKING: from typing import Type, Optional, List @@ -27,10 +27,11 @@ class BrowserManager(): self._tabs: "List[TabInfo]" = [] def init_browser(self): - _users = config()['live']['users'] + _live_config = config().get("live", {}) + _users = _live_config.get("users", []) if type(_users) is not list: _users = [_users] - _rooms = config()['live']['rooms'] + _rooms = _live_config.get("rooms", []) if type(_rooms) is not list: _rooms = [_rooms] for _user in _users: From 16ad5bd828c27ef8c9edeb0169fbbc5b276270b7 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 01:11:34 +0800 Subject: [PATCH 05/10] =?UTF-8?q?=E6=B8=85=E9=99=A4=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=E7=9A=84=E6=88=BF=E9=97=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/settings.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/config/settings.yml b/config/settings.yml index ddfa482..e727802 100644 --- a/config/settings.yml +++ b/config/settings.yml @@ -27,10 +27,9 @@ output: live: rooms: - - "585723119943" - - "583853809376" + - "" users: - - MS4wLjABAAAAzBItqEvCjPryxn_Y6w6LtRBFDOVNfjvYSJg8VVZFwlw + - "" api: userinfo: 'https://live.douyin.com/webcast/user/?aid=6383&target_uid=' \ No newline at end of file From dc373acc27903a0e5bb59fc0f571b3b6e393c174 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 01:12:49 +0800 Subject: [PATCH 06/10] =?UTF-8?q?README.md=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 63 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 310e8cc..36f7f06 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,48 @@ # 抖音web直播间([live.douyin.com](https://live.douyin.com))弹幕抓取 -## 近期工作内容比较繁重,大概率会摸一段时间 (from:q792602257) +> ### 近期工作内容比较繁重,大概率会摸一段时间 (from:q792602257) +> +> ### 实现功能: +> 1. 使用新版mitmproxy,使mitmproxy进程跑在主进程里,兼容Python3.10 +> 2. 数据无磁盘IO,通过Queue请求传输proto数据,如果对弹幕发送时间要求较高的,可以使用消息对象中的时间 +> 3. 修改输出为组件化,后期通过配置进行启用或禁用,开发者也可以自行编写对应的保存逻辑 +> 4. 自动打开配置的房间及用户首页 +> +> ### 对其中的修改: +> 1. 删除了mongo相关内容(以后补吧……,重写一个也不麻烦) +> +> ### 待实现功能(咕): +> 1. 未开播时,自动刷新页面进行重新检测 +> 2. 下播事件触发及对应动作 +> 3. 上播事件触发及自动打开对应的房间 +> 4. 录播支持 +> 5. 异步输出支持 -## 实现功能: - 1. 使用旧版(7.0.4)的mitmproxy,使mitmproxy进程跑在主进程里(主要是Queue这种方式需要) - 2. 数据无磁盘IO,通过Queue请求传输proto数据,如果对弹幕发送时间要求较高的,可以使用消息对象中的时间 - 3. 修改输出为组件化,后期通过配置进行启用或禁用,开发者也可以自行编写对应的保存逻辑 - 4. 自动打开配置的房间及用户首页 - -## 对其中的修改: -1. 删除了mongo相关内容(以后补吧……,重写一个也不麻烦) +### **如何配置** +1. 首先配置`config/settings.yml`中`webdriver.use`将要使用到的浏览器(现仅支持`chrome`及`edge`浏览器) +2. 下载对应浏览器`WebDriver`驱动 + - [Edge浏览器](https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/) + - [Chrome浏览器](https://chromedriver.chromium.org/downloads "官网下载") [国内镜像](https://registry.npmmirror.com/binary.html?path=chromedriver/ "淘宝镜像") +3. 配置`WebDriver`驱动可执行文件目录 + - Edge浏览器请配置于`webdriver.edge.bin` + - Chrome浏览器请配置于`webdriver.chrome.bin` +4. 配置浏览器是否需要无头`headless`模式 + - `webdriver.headless`设置为True,浏览器打开后不会显示窗口,适合Linux服务器等无需显示的情况 + - `webdriver.headless`设置为False,浏览器打开后会显示窗口,更适合需要自己手动操作浏览器等其他需要显示情况 +5. 配置输出插件(`output.use`),可以自由搭配使用 + - `print`:控制台打印的组件,收到弹幕信息会在控制台中输出 + - `xml`:B站弹幕姬相兼容的弹幕格式,适用于后期与视频叠加或分析 + - `debug`:开发或测试使用,会保存所有未处理的消息类型,及保留报错信息,方便后期维护排查 +6. 配置默认需要打开的房间及用户主页 + - `live.rooms`: 填写房间号(链接地址最后一串数字),或者完整链接地址 + - `live.users`: 填写用户加密ID(用户首页链接地址最后一串字符串),或者完整链接地址 *(暂无任何用途)* -## 待实现功能(咕): -1. 未开播时,自动刷新页面进行重新检测 -2. 下播事件触发及对应动作 -3. 上播事件触发及自动打开对应的房间 -4. 修改README -5. 录播支持 -6. 异步输出支持 +## 运行步骤: - -## 改版后,运行步骤: - -1. 下载edge浏览器的**webDriver**驱动: - - 下载地址:https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/ - - 下载后在{ **/config/settings.yml** },配置**webdriver - edge**的路径,直接放在运行目录,就不用修改配置文件 -3. 使用 [ **requirements.txt** ] 下的包,新版的**不支持mitmproxy8**,只**支持mitmproxy7**的版本。 -4. 新版不需要在额外先启动mitmproxy,直接运行**main.py**就行 +1. 安装依赖 `pip install -r requirements.txt` +2. 按照上述步骤进行配置 +3. 运行`main.py` ## **屏幕效果截图** From e2f582204a8d6a4e30b5f2fdaf162787abfa89f4 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 09:20:31 +0800 Subject: [PATCH 07/10] =?UTF-8?q?mitmproxy=E5=AE=9E=E4=BE=8B=E5=8C=96?= =?UTF-8?q?=E6=96=B9=E6=B3=95=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- proxy/manager.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/proxy/manager.py b/proxy/manager.py index bd340ba..b30e025 100644 --- a/proxy/manager.py +++ b/proxy/manager.py @@ -30,19 +30,18 @@ class ProxyManager: flow_detail=0, termlog_verbosity="error", ) - _loop = asyncio.get_event_loop() - _loop.run_until_complete(_init_mitm_instance()) + self._loop.run_until_complete(_init_mitm_instance()) self._thread = None def __del__(self): self.terminate() def terminate(self): + if self._mitm_instance: + self._mitm_instance.shutdown() if self._loop: if self._loop.is_running(): self._loop.stop() - if self._mitm_instance: - self._mitm_instance.shutdown() def _load_addon(self): self._mitm_instance.addons.add(DanmakuWebsocketAddon(MESSAGE_QUEUE)) From ee45407f0e693e63733064ac329821eabb2b3afc Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 09:47:18 +0800 Subject: [PATCH 08/10] =?UTF-8?q?output=E6=9E=90=E6=9E=84=E6=96=B9?= =?UTF-8?q?=E6=B3=95=E6=9B=B4=E6=96=B0=EF=BC=8C=E9=80=80=E5=87=BA=E9=80=BB?= =?UTF-8?q?=E8=BE=91=E8=A1=A5=E5=85=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 5 ++++- output/manager.py | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 43dbecd..56c492c 100644 --- a/main.py +++ b/main.py @@ -23,4 +23,7 @@ if __name__ == '__main__': signal.signal(signal.SIGTERM, terminate) signal.signal(signal.SIGINT, terminate) output_manager.start_loop() - proxy_manager.join() + try: + proxy_manager.join() + finally: + terminate() diff --git a/output/manager.py b/output/manager.py index 40a5c4c..367dfd0 100644 --- a/output/manager.py +++ b/output/manager.py @@ -3,19 +3,19 @@ import threading from typing import TYPE_CHECKING from config.helper import config -from messages.fansclub import FansclubMessage -from proxy.queues import MESSAGE_QUEUE from messages.chat import ChatMessage from messages.control import ControlMessage +from messages.fansclub import FansclubMessage from messages.gift import GiftMessage from messages.like import LikeMessage from messages.member import MemberMessage from messages.roomuserseq import RoomUserSeqMessage from messages.social import SocialMessage +from output.debug import DebugWriter from output.print import Print from output.xml import XMLWriter -from output.debug import DebugWriter from protobuf import message_pb2, wss_pb2 +from proxy.queues import MESSAGE_QUEUE if TYPE_CHECKING: from typing import Type, Optional, List @@ -122,7 +122,7 @@ class OutputManager(): self.decode_payload(message) def terminate(self): - if self._should_exit: + if not self._should_exit.is_set(): self._should_exit.set() MESSAGE_QUEUE.put(None) From 931ed17c287f1e5cd53a3ba5fc9e22751d59977b Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 09:48:02 +0800 Subject: [PATCH 09/10] =?UTF-8?q?README=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.md b/README.md index 36f7f06..9bddf01 100644 --- a/README.md +++ b/README.md @@ -52,3 +52,31 @@ ![enter image description here](https://github.com/gll19920817/tiktok_live/blob/main/WX20211129-144919@2x.png?raw=true) +### 配置文件说明 + +- `mitm`:mitmproxy相关配置 + - `host`:mitmproxy监听地址,_无特殊要求不建议修改_ + - `port`:mitmproxy监听端口,_无特殊要求不建议修改_ +- `webdriver`:浏览器WebDriver相关配置 + - `headless`:是否开启无头模式,`True/False` + - `use`:使用哪个浏览器,`chrome/edge` + - `edge`:Edge浏览器相关配置,用谷歌可以不管这个 + - `bin`:webdriver可执行文件路径 + - `chrome`:Chrome浏览器相关配置,用Edge的可以不管这个 + - `bin`:webdriver可执行文件路径 + - `no_sandbox`:是否添加`--no-sandbox`启动参数,用于root用户启动浏览器,`True/False` +- `output`:输出相关配置 + - `use`:使用的输出模块,为一个数组,`print/xml/debug` + - `xml`:XML输出模块相关配置 + - `save_path`:_预留内容,实际没有作用_ + - `file_pattern`:xml文件名称格式,现在也只有默认的这个,待后续开发 + - `debug`:Debug输出模块相关配置 + - `save_path`:保存路径相关配置 + - `error`:如果遇见错误,将错误存储在这个路径下 + - `debug`:如果遇见未处理的消息类型,将该消息存储在这个路径下 + - `known`:_预留内容,实际没有作用_ +- `live`:直播间相关配置 + - `rooms`:房间号(链接地址最后一串数字),或者完整链接地址,为一个数组 + - `users`:用户加密ID(用户首页链接地址最后一串字符串),或者完整链接地址 ,为一个数组 +- `api`:这个现在暂时没啥用了…… + - `userinfo`:…… From f08cf605789f06038c933f231852efe600c6561e Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sun, 12 Jun 2022 09:49:14 +0800 Subject: [PATCH 10/10] =?UTF-8?q?=E9=99=8D=E4=BD=8E=E8=A6=81=E6=B1=82?= =?UTF-8?q?=E4=BB=A5=E6=94=AF=E6=8C=81python3.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 21ac01a..973e513 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ colorama==0.4.4 selenium==4.1.0 -mitmproxy~=8.1.0 -protobuf~=3.20.1 \ No newline at end of file +mitmproxy~=8.0.0 +protobuf<3.20 \ No newline at end of file