1.说明
在爬取某个网站的时候遇到加密参数,由于js代码经过混淆编译不好破解,所以采用Selenium的方式获取参数。但是我们通过Selenium获取到的数据基本上都是基于页面的,对于网站发起的异步请求,我们可以从浏览器日志中提取。

2.设置driver参数
我们首先要通过Options对象(比如说ChromeOptions)开启对浏览器日志的监控。旧版本的Selenium是通过DesiredCapabilities设置的,下面是新版本的写法:
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

options = ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--allow-running-insecure-content")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--disable-single-click-autofill")
# The original switch ended in "[8]", a footnote marker pasted along with the
# switch name; it has been removed so the argument is a plain switch.
options.add_argument("--disable-autofill-keyboard-accessory-view")
options.add_argument("--disable-full-form-autofill-ios")

# Performance-log settings passed inside the Chrome options: record Network
# domain events, skip Page domain events.
options.add_experimental_option('perfLoggingPrefs', {
    'enableNetwork': True,
    'enablePage': False,
})

# Ask the driver to collect both the browser console log and the performance
# log (the latter is where the Network.* events end up).
options.set_capability("goog:loggingPrefs", {
    'browser': 'ALL',
    'performance': 'ALL',
})

# NOTE(review): this top-level capability looks redundant with the
# 'perfLoggingPrefs' experimental option above -- confirm against the
# ChromeDriver capability documentation before relying on it.
options.set_capability("goog:perfLoggingPrefs", {
    'enableNetwork': True,
    'enablePage': False,
    'enableTimeline': False
})
3.请求网页
现在实例化一个driver,发起一个网页请求,我这里使用WebDriverWait显式等待的方式等待某个元素出现,你也可以隐式等待或者直接sleep,如果你不等待,异步请求还没加载完就开始获取,你可能会拿不到想要的数据

# Launch Chrome through the given chromedriver binary with the options above.
driver = Chrome(
    service=Service(executable_path=executable_path),
    options=options,
)

# Register a script that runs on every new document and makes
# navigator.webdriver read as undefined.
stealth_script = {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""}
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", stealth_script)

driver.get(page_url)

# Explicit wait: poll every 0.5s, for at most 15s, until the target element is
# present, so the page's asynchronous requests have a chance to be issued.
item_present = expected_conditions.presence_of_element_located((By.CLASS_NAME, "item "))
try:
    WebDriverWait(driver, 15, 0.5).until(item_present)
except Exception as e:
    # Best effort: report the failure and carry on with whatever was loaded.
    print("WebDriverWait.until timeout error: {}".format(e))

# Snapshot of the rendered page source.
html = driver.execute_script("return document.documentElement.outerHTML")
4.处理日志
访问一下driver的log_types属性可以获取到所有日志类型,遍历它,通过get_log()方法获取对应的日志,之后再过滤出自己想要的日志就行。
比如说,我这里是过滤出所有Network.requestWillBeSent的日志,即浏览器即将发出请求时记录的数据,因为我需要该请求的请求头;如果是响应类型的日志(Network.responseReceived),它只包含响应头。具体支持的类型可以参考谷歌devtools的文档。
如果需要过滤出Ajax(XHR)请求,可以根据日志的params里的type字段进行判断(XHR请求的type为"XHR");其他类型的请求(例如Fetch、Document、Script等)也可以通过这个字段来区分。
import json
import os.path

# Collected signature headers, keyed by the last path segment of the request
# URL (query string stripped).
sign_dict = dict()

for log_type in driver.log_types:
    for row_log in driver.get_log(log_type):
        # Only performance-log entries carry a JSON document in 'message';
        # entries from other log types (e.g. the browser console) are plain
        # text and are skipped quietly.
        try:
            message_log = json.loads(row_log['message'])['message']
        except (ValueError, KeyError, TypeError):
            continue
        if not isinstance(message_log, dict):
            continue

        # Keep only outgoing-request events ...
        if message_log.get('method') != 'Network.requestWillBeSent':
            continue
        params = message_log.get('params') or {}
        # ... that were issued as XHR.
        if str(params.get('type') or '').upper() != 'XHR':
            continue

        request = params.get('request') or {}
        headers = request.get('headers')
        if not headers:
            continue
        x_sign = headers.get('X-Sign')
        if not x_sign:
            continue

        # Without a URL there is nothing to key the entry on.
        req_url = request.get('url')
        if not req_url:
            continue

        x_app_id = headers.get('X-AppID')
        x_ts = headers.get('X-Ts')
        print("success:", x_sign, x_app_id, x_ts)

        key = os.path.split(req_url.split("?")[0])[1]
        sign_dict[key] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
注意,如果你想要响应体,Network.responseReceived类型的日志的response字段里是没有响应体的,你需要用params字段里的requestId调用Network.getResponseBody来获取,参考代码如下:
import json
import os.path

from selenium.common.exceptions import WebDriverException

# Decoded JSON response bodies, keyed by the last path segment of the response
# URL (query string stripped). A body that is not valid JSON is stored as None.
res_body_dict = dict()

for log_type in driver.log_types:
    for row_log in driver.get_log(log_type):
        # Only performance-log entries carry a JSON document in 'message';
        # entries from other log types are plain text and are skipped quietly.
        try:
            message_log = json.loads(row_log['message'])['message']
        except (ValueError, KeyError, TypeError):
            continue
        if not isinstance(message_log, dict):
            continue

        # Keep only response events for XHR requests.
        if message_log.get('method') != 'Network.responseReceived':
            continue
        params = message_log.get('params') or {}
        if str(params.get('type') or '').upper() != 'XHR':
            continue

        request_id = params.get("requestId")
        if not request_id:
            continue
        req_url = (params.get('response') or {}).get('url')
        if not req_url:
            continue
        key = os.path.split(req_url.split("?")[0])[1]

        # The log entry has no body; it must be fetched by request id. The
        # command fails when the browser no longer holds the body, so one bad
        # entry must not abort the whole scan.
        try:
            content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
        except WebDriverException as e:
            print("Network.getResponseBody error: {}, requestId: {}".format(e, request_id))
            continue

        body = None
        try:
            body = json.loads(content["body"])
        except (ValueError, KeyError, TypeError) as e:
            print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
        res_body_dict[key] = body
5.完整代码
上面的完整参考代码如下
import json
import os.path

from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait


def get_selenium_driver(executable_path=r"E:\webdriver\ChromeDriver.exe"):
    """Create a Chrome driver with browser and performance logging enabled.

    Args:
        executable_path: Path to the chromedriver binary.

    Returns:
        A started ``Chrome`` driver. The caller owns it and must call
        ``quit()`` when done.
    """
    options = ChromeOptions()
    options.add_argument("--no-sandbox")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--disable-single-click-autofill")
    # The original switch ended in "[8]", a footnote marker pasted along with
    # the switch name; it has been removed so the argument is a plain switch.
    options.add_argument("--disable-autofill-keyboard-accessory-view")
    options.add_argument("--disable-full-form-autofill-ios")

    # Record Network domain events in the performance log, skip Page events.
    options.add_experimental_option('perfLoggingPrefs', {
        'enableNetwork': True,
        'enablePage': False,
    })
    options.set_capability("goog:loggingPrefs", {
        'browser': 'ALL',
        'performance': 'ALL',
    })
    # NOTE(review): this top-level capability looks redundant with the
    # 'perfLoggingPrefs' experimental option above -- confirm against the
    # ChromeDriver capability documentation before relying on it.
    options.set_capability("goog:perfLoggingPrefs", {
        'enableNetwork': True,
        'enablePage': False,
        'enableTimeline': False
    })

    service = Service(executable_path=executable_path)
    driver = Chrome(service=service, options=options)
    # Runs on every new document and makes navigator.webdriver read as
    # undefined. Registered once here, so callers need not repeat it.
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": """Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"""},
    )
    return driver


def _load_page(driver, page_url):
    """Open *page_url* and wait up to 15s for the target element to appear."""
    driver.get(page_url)
    wait = WebDriverWait(driver, 15, 0.5)
    try:
        wait.until(expected_conditions.presence_of_element_located((By.CLASS_NAME, "item ")))
    except TimeoutException as e:
        # Best effort: continue with whatever requests were captured so far.
        print("WebDriverWait.until timeout error: {}".format(e))


def _iter_xhr_events(driver, method):
    """Yield the ``params`` dict of every logged XHR event named *method*."""
    for log_type in driver.log_types:
        for row_log in driver.get_log(log_type):
            # Only performance-log entries carry a JSON document in
            # 'message'; other log types are plain text and skipped quietly.
            try:
                message_log = json.loads(row_log['message'])['message']
            except (ValueError, KeyError, TypeError):
                continue
            if not isinstance(message_log, dict):
                continue
            if message_log.get('method') != method:
                continue
            params = message_log.get('params') or {}
            if str(params.get('type') or '').upper() != 'XHR':
                continue
            yield params


def _endpoint_key(url):
    """Return the last path segment of *url* with the query string removed."""
    return os.path.split(url.split("?")[0])[1]


def get_sign_by_selenium(page_url):
    """Load *page_url* and collect the signature headers of its XHR requests.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict mapping the last URL path segment of each signed XHR request
        to ``{"X-AppID": ..., "X-Sign": ..., "X-Ts": ...}``. Requests without
        an ``X-Sign`` header are ignored.
    """
    driver = get_selenium_driver()
    try:
        _load_page(driver, page_url)

        sign_dict = dict()
        for params in _iter_xhr_events(driver, 'Network.requestWillBeSent'):
            request = params.get('request') or {}
            headers = request.get('headers')
            if not headers:
                continue
            x_sign = headers.get('X-Sign')
            if not x_sign:
                continue
            # Without a URL there is nothing to key the entry on.
            req_url = request.get('url')
            if not req_url:
                continue

            x_app_id = headers.get('X-AppID')
            x_ts = headers.get('X-Ts')
            print("success:", x_sign, x_app_id, x_ts)
            sign_dict[_endpoint_key(req_url)] = {"X-AppID": x_app_id, "X-Sign": x_sign, "X-Ts": x_ts}
        return sign_dict
    finally:
        # Always shut the browser down, even when something above raised.
        driver.quit()


def get_unisat_data_by_selenium(page_url):
    """Load *page_url* and collect the JSON bodies of its XHR responses.

    Args:
        page_url: URL of the page to open.

    Returns:
        A dict mapping the last URL path segment of each XHR response to its
        decoded JSON body, or to ``None`` when the body is not valid JSON.
        Responses whose body cannot be fetched are skipped.
    """
    driver = get_selenium_driver()
    try:
        _load_page(driver, page_url)

        res_body_dict = dict()
        for params in _iter_xhr_events(driver, 'Network.responseReceived'):
            request_id = params.get("requestId")
            if not request_id:
                continue
            req_url = (params.get('response') or {}).get('url')
            if not req_url:
                continue
            key = _endpoint_key(req_url)

            # The log entry has no body; it must be fetched by request id.
            # The command fails when the browser no longer holds the body, so
            # one bad entry must not abort the whole scan.
            try:
                content = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': request_id})
            except WebDriverException as e:
                print("Network.getResponseBody error: {}, requestId: {}".format(e, request_id))
                continue

            body = None
            try:
                body = json.loads(content["body"])
            except (ValueError, KeyError, TypeError) as e:
                print("get_unisat_data_by_selenium() json loads error: {}, content:{}".format(e, content))
            res_body_dict[key] = body
        return res_body_dict
    finally:
        # Always shut the browser down, even when something above raised.
        driver.quit()


if __name__ == '__main__':
    url = "https://unisat.io/brc20?q=bc1pkmnh3nj89uns3yp2mtqqxjns65vy6ca6n5jvp4s8ua8nke69cnjs987vtp"
    print("get_sign_by_selenium(url):", get_sign_by_selenium(url))
    # Alternative entry point that returns the response bodies instead:
    # print("get_unisat_data_by_selenium(url):", get_unisat_data_by_selenium(url))
附:关于selenium的使用可以参考之前的文章
【测试】Selenium的使用(常用属性方法、元素等待、操作cookie、操作元素、无头模式、获取HTML源码)
【测试】selenium反爬操作
【测试】修改selenium选项配置参数优化性能
【测试】在Linux(CentOS、Ubuntu)无界面服务器使用selenium
【测试】Selenium操作Cookie