Recently I have been learning to write Python crawlers. Some websites require encrypted data to be submitted when crawling them, so I am recording the process here.
This is for my own study and reference.
1. Target website
- The China Air Quality Online Monitoring and Analysis Platform provides PM2.5 and weather data for 367 cities across the country, including AQI, PM2.5, PM10, SO2, NO2, O3, CO, temperature, humidity, wind level, wind direction, and monitoring items such as satellite cloud maps.
- website link: https://www.aqistudy.cn/html/city_detail.php?v=1.10
2. Parse web pages
After opening the URL, press F12 to open the developer tools and click the query button; the following request will appear:
Searching for the keyword names, we find the encrypted parameters and the names of the submitted parameters.
Locate key parameters
However, a global search finds only part of the encryption code in this JS file.
As shown in the figure, the code is emitted through an eval call; de-obfuscating it yields the JS source.
After many tests, I found that the function names and the secret key in the JS file that submits the request change dynamically. Later,
the source of that file was found on the homepage, as shown in the figure:
After combining two pieces of js code, the complete encryption and decryption code can be obtained, and the encryption function name, the submitted parameter name and the decryption function can be extracted with regular expressions.
3. Specific implementation
"""
===================================
    -*- coding:utf-8 -*-
    Author     :GadyPu
    E_mail     :Gadypy@gmail.com
    Time       :2020/8/18 0010 01 pm:31
    FileName   :go_aqi.py
===================================
"""
import calendar
import re
import json
import execjs
import requests
import warnings
from lxml import etree

# NOTE: silences every warning, including the InsecureRequestWarning that
# requests emits because all calls below use verify=False.
warnings.filterwarnings('ignore')


class GetWeather(object):
    """Query the aqistudy.cn API, whose request/response payloads are encrypted.

    The encryption function name, the POST parameter name and the decode
    function name are extracted with regular expressions from a script that
    the detail page references, then called through an ``execjs`` context
    built from the local ``gogo.js`` plus that downloaded script.
    """

    # Seconds to wait for each HTTP request so that a stalled connection
    # cannot hang the script forever.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        self.url = 'https://www.aqistudy.cn/html/city_detail.php?v=1.10'
        self.api = 'https://www.aqistudy.cn'
        self.headers = {
            'Host': 'www.aqistudy.cn',
            'Origin': 'https://www.aqistudy.cn',
            'Referer': 'https://www.aqistudy.cn/html/city_detail.php?v=1.10',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; ZTE BA520 Build/MRA58K; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/55.0.2883.77 Mobile Safari/537.36',
        }
        self.req_url = 'https://www.aqistudy.cn/apinew/aqistudyapi.php'
        # Filled in by init_js_code(); they stay None if initialisation fails.
        self.js_code = None
        self.js_ctx = None
        self.encrypt_func = None
        self.encrypt_param = None
        self.decode_func = None

    def init_js_code(self):
        """Download the site's script and build the JS execution context.

        On success ``encrypt_func``, ``encrypt_param``, ``decode_func``,
        ``js_code`` and ``js_ctx`` are set.  Any failure (network error,
        changed page layout, missing ``gogo.js`` ...) is printed and
        swallowed, in which case ``js_ctx`` may still be ``None``.
        """
        try:
            response = requests.get(
                url=self.url,
                headers=self.headers,
                verify=False,
                timeout=self.REQUEST_TIMEOUT,
            )
            html = etree.HTML(response.text)
            js_path = html.xpath('/html/body/script[2]/@src')[0]
            # The first two characters of the src attribute are dropped before
            # joining with the site root (presumably a leading '..' -- verify
            # against the live page).
            response = requests.get(
                url=self.api + js_path[2:],
                headers=self.headers,
                verify=False,
                timeout=self.REQUEST_TIMEOUT,
            )
            script = response.text

            # Name of the function that encrypts the request parameters.
            pat_f = r'var param = (.*)\(.*\)'
            self.encrypt_func = re.findall(pat_f, script)[0]

            # Name of the POST field that carries the encrypted payload.
            pat_p = r'(?<=\{).+(?=\})'
            self.encrypt_param = re.search(pat_p, script)[0].split(':')[0].strip()

            # Name of the function that decodes the response; the last match
            # is the one used.
            pat_d = r'data = (.*)\(.*\)'
            self.decode_func = re.findall(pat_d, script)[-1]

            # Combine the local JS with the downloaded script.
            with open('gogo.js', 'r', encoding='utf-8') as fp:
                self.js_code = fp.read()
            self.js_code += script
            self.js_ctx = execjs.compile(self.js_code)
        except Exception as e:
            # Best effort: report and let the caller check js_ctx.
            print(e)

    def get_weather_data(self, method, obj):
        """POST one encrypted query and return the decoded response text.

        :param method: API method name, e.g. ``'GETCITYWEATHER'``.
        :param obj: query dict passed to the JS encryption function.
        :return: whatever the JS decode function returns for the response body.
        """
        param = {
            self.encrypt_param: self.js_ctx.call(self.encrypt_func, method, obj)
        }
        print(param)
        response = requests.post(
            url=self.req_url,
            headers=self.headers,
            data=param,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        )
        return self.js_ctx.call(self.decode_func, response.text)

    def run(self, method, obj, months: list):
        """Print ``[max, min, mean]`` of the ``temp`` field for each window.

        :param method: API method name forwarded to :meth:`get_weather_data`.
        :param obj: base query; it is copied per window and never modified.
        :param months: iterable of ``(start_date, end_date)`` pairs formatted
            as ``YYYY-MM-DD``.
        """
        self.init_js_code()
        if self.js_ctx is None:
            # init_js_code() has already printed the underlying error.
            print('JS context could not be initialised; nothing was fetched.')
            return

        for mon in months:
            # NOTE(review): only the window starting on 2020-08-01 is
            # processed, exactly as in the original code; this looks like a
            # leftover debugging filter -- confirm before removing.
            if mon[0] != '2020-08-01':
                continue

            query = {
                **obj,
                "startTime": f"{mon[0]} 00:00:00",
                "endTime": f"{mon[1]} 00:00:00",
            }
            js_data = self.get_weather_data(method, query)
            rows = json.loads(js_data)['result']['data']['rows']
            if not rows:
                # max()/min() and the mean below are undefined for no data.
                print(f"{mon[0]} - {mon[1]}: no rows returned")
                continue

            temps = [float(row['temp']) for row in rows]
            # The extremes are reported exactly as they appear in the payload.
            max_per_month = max(rows, key=lambda x: float(x['temp']))['temp']
            min_per_month = min(rows, key=lambda x: float(x['temp']))['temp']
            print([max_per_month, min_per_month, round(sum(temps) / len(temps), 1)])


if __name__ == '__main__':
    obj = {"city": "Urumqi", "type": "DAY", "startTime": "2020-08-01 00:00:00", "endTime": "2020-08-14 00:00:00"}
    # 'GETCITYWEATHER' to get weather data such as temperature, humidity, wind strength
    # 'GETDETAIL' to get pm2.5, co, so2...
    method = "GETCITYWEATHER"

    year = 2020
    last_month, last_day = 8, 18  # data is requested up to 2020-08-18
    months = []
    for month in range(1, last_month + 1):
        # monthrange() accounts for the leap day (February 2020 has 29 days).
        if month == last_month:
            end_day = last_day
        else:
            end_day = calendar.monthrange(year, month)[1]
        months.append((f"{year}-{month:02d}-01", f"{year}-{month:02d}-{end_day:02d}"))

    d = GetWeather()
    d.run(method, obj, months)