Assignment 1
(1) Crawl the 7-day weather forecast for a given set of cities from the China Meteorological Network (http://www.weather.com.cn) and save it in a database.
Idea:
1. First, build a class that creates the database and writes the crawled data into it (a minimal schema sketch follows this list)
2. Then build a class that crawls the forecast data
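The design point worth noting in the database class is the composite primary key on (wCity, wDate), which prevents the same city's forecast for the same date from being stored twice. Below is a minimal sketch of just that part, separate from the full code; the sample row values are made up for illustration and an in-memory database is used so it does not touch weathers.db.

import sqlite3

# Sketch only: same schema as the weathers table below, but in memory
con = sqlite3.connect(":memory:")
cursor = con.cursor()
cursor.execute("create table if not exists weathers ("
               "wCity varchar(16), wDate varchar(16), wWeather varchar(64), wTemp varchar(32), "
               "constraint pk_weather primary key (wCity, wDate))")
# "insert or ignore" silently skips a duplicate (city, date) pair instead of raising
cursor.execute("insert or ignore into weathers values (?, ?, ?, ?)",
               ("Beijing", "30 Sep", "Sunny", "25C/18C"))  # made-up sample values
con.commit()
con.close()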
Implementation code:
# 031804127wl

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16), wDate varchar(16), wWeather varchar(64), "
                                "wTemp varchar(32), constraint pk_weather primary key(wCity, wDate))")
        except Exception:
            # Table already exists: clear out the old rows instead
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity, wDate, wWeather, wTemp) values (?, ?, ?, ?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.cityCode = {"Beijing": "101010100", "Shanghai": "101020100",
                         "Guangzhou": "101280101", "Shenzhen": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            # Detect the page encoding (utf-8 or gbk) before parsing
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    # Save the record to the database, as the task requires
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        # self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["Beijing", "Shanghai", "Guangzhou", "Shenzhen"])
print("completed")
Result screenshot:
(2) Experimental experience:
I learned how to create a database and store data in it, which was also a review of some material from last semester.
Assignment 2:
(1) Use the requests and BeautifulSoup libraries to crawl stock-related information.
Idea:
1. Use the browser's built-in packet capture (developer) tools to find the URL used to load the stock list
2. Analyze the returned data and extract the fields we need (a rough field-mapping sketch follows this list)
3. Standardize the output format
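In the request URL, pn appears to be the page number and pz the page size (20 per page). The JSON objects returned by the Eastmoney interface use short field names such as f2 and f3; the mapping below is my own reading, inferred from how the fields are used in the code that follows, not official documentation.

# Rough mapping of the Eastmoney JSON field names to their meanings,
# inferred from how the crawler below uses them (not an official reference)
FIELD_MEANINGS = {
    "f12": "stock code",
    "f14": "stock name",
    "f2": "latest price",
    "f3": "change (%)",
    "f4": "change amount",
    "f5": "volume",
    "f6": "turnover",
    "f7": "amplitude (%)",
    "f10": "volume ratio",
    "f15": "high",
    "f16": "low",
    "f17": "open",
    "f18": "previous close",
}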
Implementation code:
# 031804127wl

import re
import requests
import json
import pandas as pd

# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Keep column names aligned with the data beneath them
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Allow up to 5000 characters per output line
pd.set_option('display.width', 5000)


def HTML(url):
    gupiao_list = []
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.149 Safari/537.36"}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        print("wrong: " + str(e))
        return None
    # Use a regular expression to pull the JSON array out of the JSONP response
    pat = re.compile(r"\[\{.*?\}\]")
    data = pat.findall(html)
    js = json.loads(data[0])  # parse the JSON text into Python objects
    gupiao_list.append(("Code", "Name", "Latest price", "Change (%)", "Change amount", "Volume", "Turnover",
                        "Amplitude", "High", "Low", "Open", "Previous close", "Volume ratio"))
    for i in range(len(js)):
        diezhangfu = str(js[i]["f3"]) + "%"
        zhenfu = str(js[i]["f7"]) + "%"
        gupiao_list.append((js[i]["f12"], js[i]["f14"], js[i]["f2"], diezhangfu, js[i]["f4"], js[i]["f5"],
                            js[i]["f6"], zhenfu, js[i]["f15"], js[i]["f16"], js[i]["f17"], js[i]["f18"],
                            js[i]["f10"]))
    # Put the list into a DataFrame so the output is neatly aligned
    df = pd.DataFrame(gupiao_list)
    return df


def main():
    # Crawl the contents of the first two pages
    for i in range(1, 3):
        url = ("http://28.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240023072405517482908_1601430405294"
               "&pn=" + str(i) +
               "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
               "&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23"
               "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,"
               "f25,f22,f11,f62,f128,f136,f115,f152&_=1601430405304")
        print(HTML(url))


main()
Result screenshot:
(2) Experimental experience
I learned how to use the packet capture tool to obtain web page data and got a preliminary understanding of the json library. Because the final output contains a lot of data, I used the pandas library to standardize it, and picked up a way to make the output look neater (a toy example follows).
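The two unicode display options matter mainly when the frame contains Chinese text: they make full-width characters count as two cells so the columns stay aligned. A toy example, with made-up column names and values:

import pandas as pd

# Treat ambiguous- and East-Asian-width characters as wide so Chinese text lines up
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.width', 5000)

# Made-up data purely to show the alignment effect
df = pd.DataFrame({"名称": ["平安银行", "贵州茅台"], "最新价": [10.5, 1680.0]})
print(df)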
Assignment 3
(1) Select stocks whose codes end in a chosen 3-digit number (the last three digits of the student number), then fetch and print their stock information.
Idea:
The last three digits of my student number are 127, so I simply output the stock information for codes ending in 127 (a minimal sketch of the filter follows this paragraph). Since only a few stocks match, there is no need for the pandas library; I format the output manually instead. Because a stock's position in the list changes over time, if no data is printed I just change the page-number range.
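The filter itself is just a string comparison on the stock code. A minimal sketch under the assumption of 6-digit codes; the codes here are made up, while the real program reads them from the crawled JSON:

# Made-up codes purely to illustrate the suffix check
codes = ["600127", "000001", "300127"]
for code in codes:
    if code.endswith("127"):  # equivalent to code[3:] == "127" for 6-digit codes
        print(code)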
Implementation code:
# 031804127wl

import re
import requests
import json


def HTML(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.149 Safari/537.36"}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        print("wrong: " + str(e))
        return
    # Use a regular expression to pull the JSON array out of the JSONP response
    pat = re.compile(r"\[\{.*?\}\]")
    data = pat.findall(html)
    js = json.loads(data[0])  # parse the JSON text into Python objects
    for i in range(len(js)):
        diezhangfu = str(js[i]["f3"]) + "%"
        zhenfu = str(js[i]["f7"]) + "%"
        daima = str(js[i]["f12"])
        # Only output stocks whose code ends in 127
        if daima[3:] == '127':
            print("Code: " + str(js[i]["f12"]) + "\t" + "Name: " + str(js[i]["f14"]) + "\t"
                  + "Latest price: " + str(js[i]["f2"]) + "\t" + "Change: " + diezhangfu + "\t"
                  + "Change amount: " + str(js[i]["f4"]) + "\t" + "Volume: " + str(js[i]["f5"]) + "\t"
                  + "Turnover: " + str(js[i]["f6"]) + "\t" + "Amplitude: " + zhenfu + "\t"
                  + "High: " + str(js[i]["f15"]) + "\t" + "Low: " + str(js[i]["f16"]) + "\t"
                  + "Open: " + str(js[i]["f17"]) + "\t" + "Previous close: " + str(js[i]["f18"]) + "\t"
                  + "Volume ratio: " + str(js[i]["f10"]))


def main():
    # Crawl pages 30 to 40
    for i in range(30, 41):
        url = ("http://28.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240023072405517482908_1601430405294"
               "&pn=" + str(i) +
               "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
               "&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23"
               "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,"
               "f25,f22,f11,f62,f128,f136,f115,f152&_=1601430405304")
        HTML(url)


main()
Result screenshot:
(2) Experimental experience
This assignment mainly modifies the output of the previous one.