Post

07用Playwright爬取v2ray订阅地址

Playwright书中没有实例,刚好顺手写了一段爬取v2ray订阅地址的小代码。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import asyncio
import datetime
import logging

from playwright.async_api import ElementHandle, TimeoutError, async_playwright

# Passed to chromium.launch(); True runs the browser without a window.
headless = True

# Landing page whose first article is inspected for today's post.
URL_MIBEI = 'https://www.mibei77.com/'

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s - %(message)s",
)

# Selector wait budget in seconds (goto_page multiplies it by 1000 for ms).
TIMEOUT = 10

# Shared Playwright handles; main() assigns them before any helper runs.
browser = None
page = None
async def goto_page(url: str, selector: str) -> "ElementHandle | None":
    """Navigate the shared page to *url* and wait for *selector*.

    Args:
        url: Address to open in the module-level ``page``.
        selector: Selector that must appear before the call returns.

    Returns:
        The element handle produced by ``page.wait_for_selector``, or
        ``None`` when navigation or the wait hits the Playwright
        ``TimeoutError`` imported at the top of the file. Callers use the
        result both as a truthiness flag and as a handle to query further,
        so it is not a ``bool``.
    """
    logging.info('goto_page %s', url)
    try:
        # Without wait_until='domcontentloaded' the navigation always
        # timed out, so do not wait for the full load event.
        await page.goto(url, timeout=20 * 1000, wait_until='domcontentloaded')
        return await page.wait_for_selector(selector, timeout=TIMEOUT * 1000)
    except TimeoutError:
        # logging.exception records the traceback at ERROR level.
        logging.exception("error goto_page!")
        return None
async def parse_v2ray_url(item_url: str) -> "str | None":
    """Open an article page and read the v2ray subscription link from it.

    Args:
        item_url: Address of the article to open.

    Returns:
        The ``innerText`` of the paragraph immediately following the one
        containing "v2ray订阅链接", or ``None`` when ``goto_page`` could not
        load the article.
    """
    if not await goto_page(item_url, '#post-body p'):
        return None
    # Adjacent-sibling selector: the link sits in the <p> right after the
    # paragraph that carries the label text.
    return await page.eval_on_selector(
        'p:has-text("v2ray订阅链接")+p', 'n=>n.innerText'
    )

async def parse_detail_url() -> "tuple[str | None, str | None]":
    """Find today's article on the landing page.

    Loads ``URL_MIBEI``, takes the first article and checks whether its
    ``time.published`` element carries today's date.

    Returns:
        ``(published_time, href)`` for the first article when it was
        published today and has a link; ``(None, None)`` otherwise (page
        not loaded, missing ``datetime`` attribute, older article, or
        empty link).
    """
    article = await goto_page(URL_MIBEI, 'div article:first-child')
    if not article:
        return (None, None)

    published_time = await article.eval_on_selector(
        "time.published", 'n=>n.getAttribute("datetime")'
    )
    # NOTE(review): "today" comes from the local clock; confirm it matches
    # the timezone the site uses in its datetime attribute.
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    # getAttribute yields null (None here) when the attribute is absent;
    # guard before the substring test, which would raise TypeError on None.
    if not published_time or today not in published_time:
        return (None, None)

    href = await article.eval_on_selector("h2 a", 'n=>n.href')
    return (published_time, href) if href else (None, None)
    
async def main() -> None:
    """Launch Chromium, look up today's article and log its v2ray link.

    Assigns the module-level ``browser`` and ``page`` that the helper
    coroutines read. Errors raised while scraping are logged rather than
    propagated. The browser is closed inside the ``async_playwright``
    context, while the Playwright driver is still running.
    """
    global browser
    global page
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=headless)
        try:
            page = await browser.new_page()
            try:
                published_time, url = await parse_detail_url()
                if url:
                    url = await parse_v2ray_url(url)
                    logging.info("time=%s, url= %s", published_time, url)
            except Exception:
                # Top-level boundary: record the traceback and fall
                # through to cleanup.
                logging.exception("main!")
        finally:
            # Must run before the context manager exits and stops the
            # driver. Closing the browser also closes its pages.
            await browser.close()

# Script entry point: run the scraper only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
This post is licensed under CC BY 4.0 by the author.