07用Playwright爬取v2ray订阅地址
书中没有 Playwright 的实例,刚好顺手写了一段爬取 v2ray 订阅地址的小代码。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import asyncio
import datetime
import logging
from typing import Optional, Tuple

from playwright.async_api import ElementHandle, TimeoutError, async_playwright
# Passed to chromium.launch(); True runs the browser without a visible window.
headless = True

# Site whose first listed article is inspected for today's subscription post.
URL_MIBEI = 'https://www.mibei77.com/'

# Seconds to wait for a selector after navigation (multiplied to ms at use).
TIMEOUT = 10

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s - %(message)s",
)

# Shared Playwright handles: assigned in main() and read by the helper
# coroutines through module globals.
browser = None
page = None
async def goto_page(url: str, selector: str) -> Optional["ElementHandle"]:
    """Navigate the shared page to *url* and wait for *selector* to appear.

    Args:
        url: Address to open in the module-level ``page``.
        selector: Selector that must match before the page counts as ready.

    Returns:
        Whatever ``page.wait_for_selector`` returns (the matched element
        handle), or ``None`` when navigation or the wait raises Playwright's
        ``TimeoutError``. Callers rely on the handle, not on a boolean.
    """
    logging.info('goto_page %s', url)
    try:
        # Without wait_until='domcontentloaded' the navigation always timed out.
        await page.goto(url, timeout=20 * 1000, wait_until='domcontentloaded')
        return await page.wait_for_selector(selector, timeout=TIMEOUT * 1000)
    except TimeoutError:
        # logging.exception logs at ERROR level and attaches the traceback.
        logging.exception("error goto_page!")
        return None
async def parse_v2ray_url(item_url: str) -> Optional[str]:
    """Open an article page and read the v2ray subscription link from it.

    Args:
        item_url: URL of the article expected to contain the link.

    Returns:
        The ``innerText`` of the paragraph that directly follows the paragraph
        containing the "v2ray订阅链接" label, or ``None`` when the article page
        could not be loaded.
    """
    if not await goto_page(item_url, '#post-body p'):
        return None
    # The '+p' adjacent-sibling combinator picks the <p> right after the label.
    return await page.eval_on_selector(
        'p:has-text("v2ray订阅链接")+p', 'n=>n.innerText'
    )
async def parse_detail_url() -> Tuple[Optional[str], Optional[str]]:
    """Find today's article on the landing page.

    Loads ``URL_MIBEI``, takes the first article element and checks whether
    its publication timestamp contains today's date.

    Returns:
        ``(published_time, href)`` for the first article when it was published
        today and has a link; ``(None, None)`` otherwise (page not loaded,
        timestamp missing, article not from today, or no link).
    """
    article = await goto_page(URL_MIBEI, 'div article:first-child')
    if not article:
        return (None, None)

    published_time = await article.eval_on_selector(
        "time.published", 'n=>n.getAttribute("datetime")'
    )
    # NOTE(review): today's date uses the local clock; the site's timestamp may
    # be in another timezone - confirm the comparison holds around midnight.
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    # getAttribute yields null (None here) when the attribute is absent; guard
    # it so the substring test below cannot raise TypeError.
    if not published_time or today not in published_time:
        return (None, None)

    href = await article.eval_on_selector("h2 a", 'n=>n.href')
    return (published_time, href) if href else (None, None)
async def main() -> None:
    """Launch Chromium, look up today's subscription link and log it.

    Assigns the module-level ``browser`` and ``page`` that the helper
    coroutines read. Any exception raised while scraping is logged rather
    than propagated. The page and browser are closed in ``finally`` blocks so
    that cleanup also runs on cancellation or when opening the page fails.
    """
    global browser, page
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            try:
                published_time, url = await parse_detail_url()
                if url:
                    # Replace the article URL with the link found inside it.
                    url = await parse_v2ray_url(url)
                logging.info("time=%s, url= %s", published_time, url)
            except Exception:
                # Top-level boundary: record the traceback and fall through
                # to cleanup instead of crashing the script.
                logging.exception("main!")
            finally:
                await page.close()
        finally:
            await browser.close()
# Script entry point: only runs the scraper when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
This post is licensed under CC BY 4.0 by the author.