# -*- encoding: utf-8 -*-
'''
@File    : spider.py
@Time    : 2023-06-18 17:44:21, Sunday
@Author  : erma0
@Version : V3
@Link    : https://erma0.cn
@Desc    : Douyin spider core code
'''
import datetime
import os
import random
import re
import subprocess
import time
from threading import Lock
from typing import List
from urllib.parse import quote, unquote, urlparse

import ujson as json
import zmysql
from loguru import logger
from playwright.sync_api import Error, Route, TimeoutError

from browser import Browser, BrowserContext

version = 'V3.230622'

banner = rf'''
 ____                    _         ____        _     _
|  _ \  ___  _   _ _   _(_)_ __   / ___| _ __ (_) __| | ___ _ __
| | | |/ _ \| | | | | | | | '_ \  \___ \| '_ \| |/ _` |/ _ \ '__|
| |_| | (_) | |_| | |_| | | | | |  ___) | |_) | | (_| |  __/ |
|____/ \___/ \__,_|\__, |_|_| |_| |____/| .__/|_|\__,_|\___|_|
                   |___/                |_|
{version}
Github: https://github.com/erma0/douyin
'''
print(banner)


class Douyin(object):

    def __init__(self,
                 context: BrowserContext,
                 url: str = '',
                 num: int = -1,
                 type: str = 'post',
                 down_path: str = '下载',  # default download folder ("下载" = "Download")
                 path_type: str = 'id',
                 msToken: bool = False,
                 author='',
                 mid=0):
        """
        Initialization.
        type = ['post', 'like', 'music', 'search', 'follow', 'fans', 'collection', 'video', 'favorite', 'id']
        Files/folders are named by id by default; with path_type='title' the nickname/title is used instead,
        which may break incremental collection of a user's posts: the post-list request can arrive before the
        nickname is known, and that leads to duplicate collection.
        """
        self.context = context
        self.num = num
        self.type = type
        self.down_path = down_path
        self.path_type = path_type
        self.msToken = msToken
        self.url = url.strip() if url else ''
        self.author = author
        self.mid = mid
        self.has_more = True
        if not os.path.exists(self.down_path):
            os.makedirs(self.down_path)
        self.pageDown = 0
        self.pageDownMax = 5  # retry limit
        self.results = []  # results of this run
        self.results_old = []  # results saved by the previous run
        self.lock = Lock()
        self.init_()  # initialize URL-related parameters
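
    # Illustrative usage (the profile URL below is a placeholder, not taken from this file):
    #   Douyin(context=edge.context, url='https://www.douyin.com/user/...', num=10).run()
    # run() opens the page, hooks the matching list API, keeps loading until done, then save() writes the results.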

    @staticmethod
    def str2path(str: str):
        """
        Convert a string into a valid Windows file name.
        """
        # illegal characters
        lst = ['\r', '\n', '\\', '/', ':', '*', '?', '"', '<', '>', '|']
        # lst = ['\r', '\n', '\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ', '^']
        # option 1: replace illegal characters
        for key in lst:
            str = str.replace(key, '_')
        # option 2: strip illegal characters
        # str = str.translate(None, ''.join(lst))
        # path + file name is limited to 255 characters and CJK characters count double, so cap at 80
        if len(str) > 80:
            str = str[:80]
        return str.strip()
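
    # Example: str2path('title: part 1/2') -> 'title_ part 1_2' (':' and '/' each become '_').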

    @staticmethod
    def quit(str):
        """
        Exit the program immediately.
        """
        logger.error(str)
        exit()

    def url2redirect(self, url):
        """
        Resolve the 302 redirect target in the browser.
        """
        r = self.context.new_page()
        r.goto(url, wait_until='domcontentloaded')
        url = r.url
        r.close()
        return url

    @staticmethod
    def url2redirect_requests(url):
        """
        Resolve the 302 redirect target with requests.
        """
        import requests
        r = requests.get(url, allow_redirects=False)
        u = r.headers.get('Location', url)
        return u

    @staticmethod
    def filter_emoji(desstr, restr=''):
        # Strip emoji; call this while building file names if emoji should be removed
        try:
            res = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            res = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return res.sub(restr, desstr)

    def _append_user(self, user_list: List[dict]):
        if not user_list:
            logger.error("Empty result for this request")
            return
        with self.lock:  # lock to avoid unexpected conflicts
            if self.has_more:
                for item in user_list:
                    if item['is_red_uniqueid']:  # exact match
                        info: dict = item['user_info']
                        for key in list(info.keys()):
                            if not info[key]:
                                info.pop(key)
                        self.results.append(info)  # keep the result
                        logger.info(f'Collecting, {len(self.results)} results so far')
                        break
                self.has_more = False  # only the first page is needed

    def _append_users(self, user_list: List[dict]):
        if not user_list:
            logger.error("Empty result for this request")
            return
        with self.lock:  # lock to avoid unexpected conflicts
            if self.num < 0 or len(self.results) < self.num:
                for item in user_list:
                    if self.num > 0 and len(self.results) >= self.num:
                        self.has_more = False
                        logger.info(f'Reached the collection limit: {len(self.results)}')
                        return
                    info = {}
                    info['nickname'] = self.str2path(item['nickname'])
                    info['signature'] = self.str2path(item['signature'])
                    info['avatar'] = item['avatar_larger']['url_list'][0]
                    for i in [
                            'sec_uid', 'uid', 'short_id', 'unique_id', 'unique_id_modify_time', 'aweme_count', 'favoriting_count',
                            'follower_count', 'following_count', 'constellation', 'create_time', 'enterprise_verify_reason',
                            'is_gov_media_vip', 'live_status', 'total_favorited', 'share_qrcode_uri'
                    ]:
                        if item.get(i):
                            info[i] = item[i]
                    room_id = item.get('room_id')
                    if room_id:  # live room
                        info['live_room_id'] = room_id
                        info['live_room_url'] = [
                            f'http://pull-flv-f26.douyincdn.com/media/stream-{room_id}.flv',
                            f'http://pull-hls-f26.douyincdn.com/media/stream-{room_id}.m3u8'
                        ]
                    music_count = item['original_musician']['music_count']
                    if music_count:  # original musician
                        info['original_musician'] = item['original_musician']
                    self.results.append(info)  # keep the result
                logger.info(f'Collecting, {len(self.results)} results so far')
            else:
                self.has_more = False
                logger.info(f'Reached the collection limit: {len(self.results)}')

    def _append_awemes(self, aweme_list: List[dict]):
        """
        Store the collected items.
        """
        if not aweme_list:
            logger.error("Empty result for this request")
            return
        with self.lock:  # lock to avoid unexpected conflicts
            if self.num < 0 or len(self.results) < self.num:
                for item in aweme_list:
                    # ===== limit the number of results =====
                    if self.num > 0 and len(self.results) >= self.num:
                        self.has_more = False
                        logger.info(f'Reached the collection limit: {len(self.results)}')
                        return
                    # ===== incremental collection =====
                    _time = item.get('create_time', item.get('createTime'))
                    _is_top = item.get('is_top', item.get('tag', {}).get('isTop'))
                    if self.results_old:
                        old = self.results_old[0]['time']
                        if _time <= old:  # item is older than the newest one from the last run: skip pinned items, otherwise stop
                            if _is_top:
                                continue
                            if self.has_more:
                                self.has_more = False
                            logger.success(f'Incremental collection finished, newest item from the previous run: {old}')
                            return
                    # ===== keep the result =====
                    _type = item.get('aweme_type', item.get('awemeType'))
                    info = item.get('statistics', item.get('stats', {}))
                    for i in [
                            'playCount', 'downloadCount', 'forwardCount', 'collectCount', "digest", "exposure_count",
                            "live_watch_count", "play_count", "download_count", "forward_count", "lose_count",
                            "lose_comment_count"
                    ]:
                        if not info.get(i):
                            info.pop(i, '')
                    info.pop('aweme_id', '')
                    if _type <= 66 or _type in [69, 107]:  # video (77 = Xigua video)
                        play_addr = item['video'].get('play_addr')
                        if play_addr:
                            download_addr = item['video']['play_addr']['url_list'][-1]
                        else:
                            download_addr = f"https:{item['video']['playApi']}"
                        info['download_addr'] = download_addr
                    elif _type == 68:  # image post
                        info['download_addr'] = [images.get('url_list', images.get('urlList'))[-1] for images in item['images']]
                    elif _type == 101:  # live stream
                        continue
                    else:  # other types
                        info['download_addr'] = '其他类型作品'
                        logger.info(f'Unrecognized aweme type: {_type}')
                        with open(f'{_type}.json', 'w', encoding='utf-8') as f:  # dump the unrecognized type
                            json.dump(item, f, ensure_ascii=False)  # keep Chinese unescaped
                        continue
                    info['id'] = item.get('aweme_id', item.get('awemeId'))
                    info['time'] = _time
                    desc = self.str2path(item.get('desc'))
                    info['desc'] = desc
                    music = item.get('music')
                    if music:
                        info['music_title'] = self.str2path(music['title'])
                        info['music_url'] = music.get('play_url', music.get('playUrl'))['uri']
                    cover = item['video'].get('origin_cover')
                    if cover:
                        info['cover'] = item['video']['origin_cover']['url_list'][-1]
                    else:
                        info['cover'] = f"https:{item['video']['originCover']}"
                    tags = item.get('text_extra', item.get('textExtra'))
                    if tags:
                        info['tags'] = [{
                            'tag_id': hashtag.get('hashtag_id', hashtag.get('hashtagId')),
                            'tag_name': hashtag.get('hashtag_name', hashtag.get('hashtagName'))
                        } for hashtag in tags]
                    self.results.append(info)  # keep the result
                logger.info(f'Collecting, {len(self.results)} results so far')
            else:
                self.has_more = False
                logger.info(f'Reached the collection limit: {len(self.results)}')
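
    # A saved record roughly looks like this (field values are purely illustrative):
    #   {'id': '7233...', 'time': 1687058661, 'desc': '...', 'download_addr': 'https://...',
    #    'cover': 'https://...', 'music_title': '...', 'tags': [...], plus any non-empty statistics fields}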

    def download(self):
        """
        After collection finishes, download everything that was collected.
        """
        if os.path.exists(self.aria2_conf):
            logger.info('Start downloading')
            # command = f'aria2c -c --console-log-level warn -d {self.down_path} -i {self.aria2_conf}'
            command = ['aria2c', '-c', '--console-log-level', 'warn', '-d', self.down_path, '-i', self.aria2_conf]
            subprocess.run(command)  # with shell=True a single command string would need escaping
        else:
            logger.error('No downloadable config file was found')
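
    # save() below writes self.aria2_conf in aria2c's input-file ("-i") format: a URL line followed by
    # tab-indented per-download options such as dir= and out=, plus, when msToken is enabled, a user-agent=
    # line and a header=Cookie:msToken=... line.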

    def save(self):
        if self.results:
            logger.success(f'Collection finished, {len(self.results)} results in this run')
            if self.type in ['post', 'like', 'music', 'search', 'collection', 'video', 'favorite']:  # video lists become an aria2 input file
                self.msToken = [_['value'] for _ in self.context.cookies() if _['name'] == 'msToken'] if self.msToken else None
                _ = []
                with open(self.aria2_conf, 'w', encoding='utf-8') as f:
                    for line in self.results:  # only write download entries for this run's results
                        filename = f'{line["id"]}_{line["desc"]}'
                        if isinstance(line["download_addr"], list):
                            down_path = self.down_path.replace(line["id"], filename) if self.type == 'video' else os.path.join(
                                self.down_path, filename)
                            [
                                _.append(f'{addr}\n\tdir={down_path}\n\tout={line["id"]}_{index + 1}.jpeg\n')
                                for index, addr in enumerate(line["download_addr"])
                            ]
                        elif isinstance(line["download_addr"], str):
                            if self.msToken:  # attach msToken when downloads come back as 0 KB
                                _.append(
                                    f'{line["download_addr"]}\n\tdir={self.down_path}\n\tout={filename}.mp4\n\tuser-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36\n\theader=Cookie:msToken={self.msToken[0]}\n'
                                )
                            else:
                                _.append(f'{line["download_addr"]}\n\tdir={self.down_path}\n\tout={filename}.mp4\n')  # downloads fine without a token
                        else:
                            logger.error("Invalid download address")
                    f.writelines(_)
            elif self.type in ['follow', 'fans', 'id']:  # user lists are saved as profile links
                with open(self.aria2_conf, 'w', encoding='utf-8') as f:
                    f.writelines([
                        f"https://www.douyin.com/user/{line.get('sec_uid', 'None')}\n" for line in self.results
                        if line.get('sec_uid', None)
                    ])
            with open(f'{self.down_path}.json', 'w', encoding='utf-8') as f:  # save all data to file, including old data
                if self.type == 'post':  # only profile posts need to be sorted by time
                    self.results.sort(key=lambda item: item['id'], reverse=True)
                    self.results.extend(self.results_old)
                json.dump(self.results, f, ensure_ascii=False)
        else:
            logger.info("No results were collected in this run")

    def save1(self):
        db = zmysql.TiYoumeiDb()
        db.insert("youmei_videos", ["offset", "title", "video_url", "author", "platform", "publish_time", "create_time", "synced"],
                  [[x['id'], x['desc'], x['download_addr'], self.author, '抖音',
                    datetime.datetime.fromtimestamp(x['time']).strftime("%Y-%m-%d %H:%M:%S"),
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 0] for x in self.results if int(x['id']) > self.mid])
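
    # handle() is registered in page_init() via self.page.route(self.hookURL, self.handle): every list-API
    # response matching hookURL is fetched, parsed and appended to the results, then fulfilled so the page
    # still renders normally; once has_more is False, further matching requests are simply aborted.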

    def handle(self, route: Route):
        try:
            if self.has_more:
                if self.pageDown > 0:
                    self.pageDown = 0
                response = route.fetch()
                if int(response.headers.get('content-length', 1)) > 0:
                    resj = response.json()
                    if self.has_more:
                        self.has_more = resj.get('has_more', True)
                    if self.type == 'follow':
                        info = resj.get('followings')
                        self._append_users(info)
                    elif self.type == 'fans':
                        info = resj.get('followers')
                        self._append_users(info)
                    elif self.type == 'id':
                        info = resj.get('user_list')
                        self._append_user(info)
                    elif self.type == 'search':
                        info = []
                        for item in resj.get('data'):
                            if item['type'] == 1:  # 1 = post, 16 = mix, 76 = wiki, 77 = Toutiao article, 996 = hot list, 997 = micro headline
                                _info = item['aweme_info']
                                info.append(_info)
                            elif item['type'] == 16:
                                _info = item['aweme_mix_info']['mix_items']
                                info.extend(_info)
                            elif item['type'] == 996:
                                _info = item['sub_card_list'][0]['hotspot_info']['hotspot_items']
                                info.extend(_info)
                            else:
                                pass
                        self._append_awemes(info)
                    else:
                        info = resj.get('aweme_list')
                        self._append_awemes(info)
                route.fulfill(response=response)
            else:
                route.abort()
        except KeyError as err:
            logger.error(f'Error: {err}')
            with open('error.json', 'w', encoding='utf-8') as f:  # dump the response that could not be parsed
                json.dump(response.text(), f, ensure_ascii=False)
        except Error as err:
            msg = err.message.split("\n")[0]
            logger.info(f'Browser has been closed: {msg}')
            # logger.info(f'Playwright Error: {msg}')
        except Exception as err:
            logger.error(f'Error: {err}')

    def init_(self):
        if not self.url:  # with no URL given, collect the logged-in account by default
            if self.type == 'favorite':
                self.url = 'https://www.douyin.com/user/self?showTab=favorite_collection'
            elif self.type == 'like':
                self.url = 'https://www.douyin.com/user/self?showTab=like'
            elif self.type in ['post', 'follow', 'fans']:  # on the command line, post requires a URL
                self.url = 'https://www.douyin.com/user/self'
            else:
                self.quit('Please provide a URL')
        hostname = urlparse(self.url).hostname
        if self.type == 'id':  # search for a user ID
            self.url = f'https://www.douyin.com/search/{self.url}?type=user'
        elif self.url.isdigit():  # numeric ID: a single post
            self.url = f'https://www.douyin.com/video/{self.url}'
        elif hostname and hostname.endswith('douyin.com'):  # a douyin.com link
            if hostname == 'v.douyin.com':
                # self.url = self.url2redirect(self.url)
                self.url = self.url2redirect_requests(self.url)
        else:  # keyword search
            self.url = f'https://www.douyin.com/search/{quote(self.url)}'
        *_, _type, self.id = unquote(urlparse(self.url).path.strip('/')).split('/')
        hookURL = '/aweme/v[123]/web/'
        if _type in ['video', 'note']:  # auto-detected: single post
            self.type = 'video'
            hookURL = 'a single post needs no hookURL'
        if _type == 'search':
            if self.type == 'id':  # search for a user ID
                hookURL += 'discover/search'
            else:
                self.type = 'search'  # general or video search
                hookURL += '(general/search|search/item)'
        elif _type == 'music':  # auto-detected: music
            self.type = 'music'
            hookURL += 'music'
        elif _type == 'collection':  # auto-detected: collection (mix)
            self.type = 'collection'
            hookURL += 'mix/aweme'
        elif _type == 'user':  # profile link
            if self.type == 'post' or self.url.endswith('?showTab=post'):
                self.type = 'post'
                hookURL += 'aweme/post'
            elif self.type == 'like' or self.url.endswith('?showTab=like'):
                self.type = 'like'
                hookURL += 'aweme/favorit'
                if not self.url.endswith('showTab=like'):
                    self.url = f'https://www.douyin.com/user/{self.id}?showTab=like'
            elif self.type == 'favorite' or self.url.endswith('?showTab=favorite_collection'):
                self.type = 'favorite'
                hookURL += 'aweme/listcollection'
                self.url = 'https://www.douyin.com/user/self?showTab=favorite_collection'  # ignore the given URL when collecting favorites
            elif self.type == 'follow':
                hookURL += 'user/following'
            elif self.type == 'fans':
                hookURL += 'user/follower'
            else:  # fallback
                pass
        self.hookURL = re.compile(hookURL, re.S)
        if self.path_type == 'id':
            self.down_path = os.path.join(self.down_path, self.str2path(f'{self.type}_{self.id}'))
            self.aria2_conf = f'{self.down_path}.txt'
            if self.type == 'post':  # profile posts can be collected incrementally, so load the old data first
                if os.path.exists(f'{self.down_path}.json') and not self.results_old:
                    with open(f'{self.down_path}.json', 'r', encoding='utf-8') as f:
                        self.results_old = json.load(f)
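
    # page_init() parses the RENDER_DATA script embedded in the page (URL-encoded JSON) to get the
    # nickname / mix name / music title used when path_type='title', and, for 'post' and 'video' pages,
    # the first batch of items that is server-rendered and therefore never reaches the hooked list API.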

    def page_init(self):
        self.page = self.context.new_page()
        self.page.set_default_timeout(0)
        if self.has_more:
            self.page.route(self.hookURL, self.handle)
        self.page.goto(self.url)
        render_data: dict = json.loads(unquote(self.page.locator('id=RENDER_DATA').inner_text()))
        _app = render_data.pop('app', None)
        self.client_data = _app if _app else render_data.pop('1', None)
        self._location = render_data.pop('_location', None)
        self.render_data = render_data.popitem()[1] if render_data else None
        if self.type in ['post', 'like', 'follow', 'fans', 'favorite']:
            if self.render_data:
                self.info = self.render_data['user']  # kept for reference
                self.title = self.info['user']['nickname']
            if self.type == 'follow':  # open the following list
                self.page.locator('[data-e2e="user-info-follow"]').click()
                self.page.locator('[data-e2e="user-fans-container"]').click()
            elif self.type == 'fans':  # open the fans list
                self.page.locator('[data-e2e="user-info-fans"]').click()
                self.page.locator('[data-e2e="user-fans-container"]').click()
        elif self.type == 'id':
            self.title = self.id
        elif self.type == 'search':
            self.title = self.id
            if self.render_data:
                self.info = self.render_data['defaultSearchParams']
                # self.title = self.info['keyword']
        elif self.type == 'collection':
            if self.render_data:
                self.info = self.render_data['aweme']['detail']['mixInfo']
                self.title = self.info['mixName']
        elif self.type == 'music':  # focus the scroll list
            if self.render_data:
                self.info = self.render_data['musicDetail']
                self.title = self.info['title']
            self.page.locator('[data-e2e="scroll-list"]').last.click()
        elif self.type == 'video':
            if self.render_data:
                self.info = self.render_data['aweme']['detail']
            self.title = self.id
        else:  # fallback
            pass
        if self.path_type == 'title':
            self.down_path = os.path.join(self.down_path, self.str2path(f'{self.type}_{self.title}'))
            self.aria2_conf = f'{self.down_path}.txt'
            if self.type == 'post':  # profile posts can be collected incrementally, so load the old data first
                if os.path.exists(f'{self.down_path}.json') and not self.results_old:
                    with open(f'{self.down_path}.json', 'r', encoding='utf-8') as f:
                        self.results_old = json.load(f)
        # has_more controls whether the initial page data (render-data) is extracted, but opening the profile
        # immediately hooks one request, which may flip has_more to 0; that must not block the extraction below
        if self.has_more is not False:
            if self.type == 'post' and self.render_data.get('post', None):  # the post page needs extraction
                # sort newest to oldest and ignore pinned posts (this is typically used to grab the latest posts)
                if self.has_more:
                    self.has_more = self.render_data['post']['hasMore']
                render_data_ls = self.render_data['post']['data']
                render_data_ls.sort(key=lambda item: item.get('aweme_id', item.get('awemeId')), reverse=True)
                self._append_awemes(render_data_ls)
            elif self.type == 'video' and self.render_data.get('aweme', None):  # the video page needs extraction
                render_data_ls = [self.render_data['aweme']['detail']]
                self._append_awemes(render_data_ls)
                self.has_more = False
            else:  # fallback
                pass

    def page_next(self):  # load more data
        if self.type == 'collection':
            self.page.get_by_role("button", name="点击加载更多").click()
        else:
            self.page.keyboard.press('End')
        # logger.info("Loading")

    def run(self):
        """
        Start collecting.
        """
        self.page_init()
        while self.has_more and self.pageDown <= self.pageDownMax:
            try:
                with self.page.expect_request_finished(lambda request: self.hookURL.search(request.url), timeout=3000):
                    self.page_next()  # load the next batch of data
                    # print('next page')
            except TimeoutError:  # retry
                self.pageDown += 1
                logger.error("Retry + 1")
        self.save()  # save the results
        # self.save1()
        self.page.close()
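
# Collection flow: page_init() opens the target page and hooks the matching list API; run() keeps calling
# page_next() (press End to scroll, or click "点击加载更多" for collections) until has_more is False or the
# retry limit is hit; save() then writes the download/link list for aria2 and a JSON dump of all results.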


def test():
    edge = Browser(channel='chromium', headless=True)
    # a = Douyin(
    #     context=edge.context,
    #     url='https://v.douyin.com/U3eAtXx/'
    #     # url='https://www.douyin.com/user/MS4wLjABAAAA1UojDGpM_JuQ91nbVjo6jLfJSpQ5hswNRBaAndW_5spMTAUJ4xjhOKtOW0f5IDa8'
    #     # url='https://www.douyin.com/user/MS4wLjABAAAAtSPIL_StfoqgclIO3YGO_wnQeGsRQuFP7hA3j6tUv2sXA2oGfVm9fwCLq8bmurs3?showTab=post'
    # )  # posts
    # a = Douyin(
    #     context=edge.context,
    #     url='https://www.douyin.com/user/MS4wLjABAAAAtSPIL_StfoqgclIO3YGO_wnQeGsRQuFP7hA3j6tUv2sXA2oGfVm9fwCLq8bmurs3?showTab=like'
    # )  # likes
    a = Douyin(context=edge.context, url='https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4', num=11)  # posts
    # a = Douyin(context=edge.context, url='https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4')  # posts
    # a = Douyin(context=edge.context, url='https://v.douyin.com/UhYnoMS/')  # single post
    # a = Douyin(context=edge.context, url='7233251303269453089')  # single post, numeric ID, image post
    # a = Douyin(context=edge.context, url='https://v.douyin.com/BK2VMkG/')  # image-post profile
    # a = Douyin(context=edge.context, url='https://v.douyin.com/BGPBena/', type='music')  # music
    # a = Douyin(context=edge.context, url='https://v.douyin.com/BGPBena/', num=11)  # music
    # a = Douyin(context=edge.context, url='https://www.douyin.com/search/%E4%B8%8D%E8%89%AF%E4%BA%BA', num=30)  # search
    # a = Douyin(context=edge.context, url='https://www.douyin.com/search/%E4%B8%8D%E8%89%AF%E4%BA%BA', type='search')  # search
    # a = Douyin(context=edge.context, url='不良人', num=11)  # keyword search
    # a = Douyin(context=edge.context, url='不良人', type='search', num=11)  # keyword search
    # a = Douyin(context=edge.context, url='https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4?showTab=like')  # long link + likes
    # a = Douyin(context=edge.context, url='https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4', type='like')  # long link + likes
    # a = Douyin(context=edge.context, url='https://v.douyin.com/BGf3Wp6/', type='like')  # short link + likes (own private account, requires login)
    # a = Douyin(context=edge.context, url='https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4', type='fans')  # fans
    # a = Douyin(context=edge.context, url='https://www.douyin.com/user/MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4', type='follow')  # following
    # a = Douyin(context=edge.context, url='https://www.douyin.com/collection/7018087406876231711')  # collection
    # a = Douyin(context=edge.context, url='https://www.douyin.com/collection/7018087406876231711', type='collection')  # collection
    # a = Douyin(context=edge.context, type='like')  # logged-in account's likes
    # a = Douyin(context=edge.context, type='favorite')  # logged-in account's favorites
    # a = Douyin(context=edge.context, url='xinhuashe', type='id')  # user ID search
    a.run()
    # a.download()
    # python ./douyin.py -u https://v.douyin.com/BGf3Wp6/ -t like
    edge.stop()


def run1():
    edge = Browser(channel='chromium', headless=True)
    db = zmysql.TiYoumeiDb()
    mid = int(db.query_one("select max(offset) mo from youmei_videos")['mo'])
    for x in db.query("select * from videos_account where platform = '抖音'"):
        print(x)
        a = Douyin(context=edge.context, url=x['url'], num=15, author=x['nick_name'], mid=mid)
        a.run()
    edge.stop()


if __name__ == "__main__":
    time.sleep(random.randint(1, 180))
    run1()