Second Assignment

Assignment 1

(1) Crawl the 7-day weather forecast for a set of given cities from the China Meteorological Network (http://www.weather.com.cn) and save it in the database.

Idea:

1. First, create a class that sets up the database and writes the data into it

2. Then create a class for crawling the data (a bare outline of this two-class structure is sketched right after this list)
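Before the full listing, here is a bare outline of that two-class structure, with most method bodies omitted; the real implementation below fills them in.

class WeatherDB:
    def openDB(self):                                # open weathers.db, create the table if needed
        ...

    def insert(self, city, date, weather, temp):     # write one forecast row
        ...

    def closeDB(self):                               # commit and close the connection
        ...


class WeatherForecast:
    def forecastCity(self, city):                    # download and parse one city's 7-day page
        ...

    def process(self, cities):                       # drive the whole job
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        self.db.closeDB()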

Implementation code:

# 031804127wl

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16), wDate varchar(16), wWeather varchar(64), "
                                "wTemp varchar(32), constraint pk_weather primary key(wCity, wDate))")
        except:
            # The table already exists: clear the old rows instead
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity, wDate, wWeather, wTemp) values (?, ?, ?, ?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.cityCode = {"Beijing": "101010100", "Shanghai": "101020100", "Guangzhou": "101280101", "Shenzhen": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)  # save the row to the database
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        # self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["Beijing", "Shanghai", "Guangzhou", "Shenzhen"])
print("completed")

Result screenshot:

(2) Experimental experience:

I learned how to create a database and store data in it, which was also a review of material learned last semester.
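For reference, once weathers.db has been written, the saved rows can be read back with a few lines of sqlite3. This is just a minimal sketch, reusing the table and column names from the listing above:

import sqlite3

con = sqlite3.connect("weathers.db")
cursor = con.cursor()
cursor.execute("select wCity, wDate, wWeather, wTemp from weathers")
print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
for wCity, wDate, wWeather, wTemp in cursor.fetchall():
    print("%-16s%-16s%-32s%-16s" % (wCity, wDate, wWeather, wTemp))
con.close()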

 

Assignment 2:

(1) Use the requests and Beautiful Soup libraries to crawl stock-related information.

Idea:

1. Use the browser's built-in packet-capture tool (developer tools) to find the URL used to load the stock list

2. Analyze the returned data and extract the stock records (see the JSON-extraction sketch after this list)

3. Standardize the output format
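The response captured with the developer tools is not plain JSON: it is wrapped in a jQuery callback, which is why the listing below first pulls the array of stock records out with a regular expression and only then calls json.loads. A minimal sketch on a made-up, shortened response string in the same shape (the callback name and field values are invented for illustration):

import re
import json

# Made-up sample in the same jQuery-callback shape as the real response
sample = 'jQuery1124_0({"data":{"total":2,"diff":[{"f12":"600001","f14":"DemoA","f2":10.5},{"f12":"600127","f14":"DemoB","f2":7.8}]}});'

pat = re.compile(r"\[\{.*?\}\]")              # grab the JSON array of stock records
records = json.loads(pat.findall(sample)[0])  # parse it into a list of dicts
for rec in records:
    print(rec["f12"], rec["f14"], rec["f2"])  # code, name, latest price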

Implementation code:

# 031804127wl

import re
import requests
import json
import pandas as pd

# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Keep column names aligned with the data (handle wide CJK characters)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
# Allow up to 5000 characters per output line
pd.set_option('display.width', 5000)


def HTML(url):
    gupiao_list = []
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.149 Safari/537.36"}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        print("wrong: " + str(e))
        return None  # nothing to parse
    # Use a regular expression to pull the JSON array of stock records out of the jQuery callback wrapper
    pat = re.compile(r"\[\{.*?\}\]")
    data = pat.findall(html)
    js = json.loads(data[0])  # parse the extracted JSON
    gupiao_list.append(("code", "name", "Latest price", "Change %", "Change amount", "Volume", "Turnover",
                        "Amplitude", "High", "Low", "Open today", "Previous close", "Volume ratio"))
    for i in range(len(js)):
        diezhangfu = str(js[i]["f3"]) + "%"
        zhenfu = str(js[i]["f7"]) + "%"
        gupiao_list.append((js[i]["f12"], js[i]["f14"], js[i]["f2"], diezhangfu, js[i]["f4"], js[i]["f5"], js[i]["f6"],
                            zhenfu, js[i]["f15"], js[i]["f16"], js[i]["f17"], js[i]["f18"], js[i]["f10"]))
    # Normalize the list into a tabular format
    df = pd.DataFrame(gupiao_list)
    return df


def main():
    # Crawl the contents of the first two pages
    for i in range(1, 3):
        url = ("http://28.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240023072405517482908_1601430405294"
               "&pn=" + str(i) +
               "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
               "&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23"
               "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,"
               "f25,f22,f11,f62,f128,f136,f115,f152&_=1601430405304")
        print(HTML(url))


main()

Result screenshot:

(2) Experimental experience

I learned to use the packet-capture tool to obtain web-page data and gained a preliminary understanding of the json library. Because the final output contains a large amount of data, I used the pandas library to standardize it, and picked up a way to make the output look cleaner.
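One optional refinement, for reference: instead of appending the header tuple as the first data row, the same names can be passed to pandas as real column labels. A small sketch with a couple of made-up rows (all values are invented for illustration):

import pandas as pd

pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.width', 5000)

columns = ["code", "name", "Latest price", "Change %", "Change amount", "Volume", "Turnover",
           "Amplitude", "High", "Low", "Open today", "Previous close", "Volume ratio"]
rows = [
    # made-up sample values, in the same order as the columns
    ("600001", "DemoA", 10.50, "1.15%", 0.12, 120000, 1260000, "2.86%", 10.60, 10.30, 10.40, 10.38, 1.12),
    ("600127", "DemoB", 7.80, "-0.51%", -0.04, 80000, 624000, "2.55%", 7.90, 7.70, 7.85, 7.84, 0.93),
]
df = pd.DataFrame(rows, columns=columns)  # the header becomes real column labels
print(df)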

 

Assignment 3

(1) Select stocks according to a self-chosen 3-digit number plus the last three digits of the student number, and print the corresponding stock information.

Idea:

The last three digits of my student number are 127, so I simply output the stock information whose code ends in 127. Since only a few stocks match, there is no need for the pandas library; the output format is standardized manually instead. Because the ranking of stocks changes constantly, if no data is printed, just change the page-number range.
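The listing below checks the suffix with daima[3:] == '127', which works because A-share codes are six digits long. An equivalent and slightly more general check is str.endswith, sketched here on a few made-up codes:

# Minimal sketch of the suffix filter; the codes are invented for illustration
codes = ["600127", "000001", "300127", "688001"]
matched = [c for c in codes if c.endswith("127")]
print(matched)  # ['600127', '300127']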

Implementation code:

# 031804127wl

import re
import requests
import json


def HTML(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/80.0.3987.149 Safari/537.36"}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
    except Exception as e:
        print("wrong: " + str(e))
        return  # nothing to parse
    # Use a regular expression to pull the JSON array of stock records out of the jQuery callback wrapper
    pat = re.compile(r"\[\{.*?\}\]")
    data = pat.findall(html)
    js = json.loads(data[0])  # parse the extracted JSON
    for i in range(len(js)):
        diezhangfu = str(js[i]["f3"]) + "%"
        zhenfu = str(js[i]["f7"]) + "%"
        daima = js[i]["f12"]
        # Only output stocks whose code ends with 127
        if daima[3:] == '127':
            print("code:" + str(js[i]["f12"]) + "\t" + "name:" + str(js[i]["f14"]) + "\t"
                  + "Latest price:" + str(js[i]["f2"]) + "\t" + "Change %:" + str(diezhangfu) + "\t"
                  + "Change amount:" + str(js[i]["f4"]) + "\t" + "Volume:" + str(js[i]["f5"]) + "\t"
                  + "Turnover:" + str(js[i]["f6"]) + "\t" + "Amplitude:" + str(zhenfu) + "\t"
                  + "High:" + str(js[i]["f15"]) + "\t" + "Low:" + str(js[i]["f16"]) + "\t"
                  + "Open today:" + str(js[i]["f17"]) + "\t" + "Previous close:" + str(js[i]["f18"]) + "\t"
                  + "Volume ratio:" + str(js[i]["f10"]))
        else:
            continue


def main():
    # Crawl the contents of pages 30-40
    for i in range(30, 41):
        url = ("http://28.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240023072405517482908_1601430405294"
               "&pn=" + str(i) +
               "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3"
               "&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23"
               "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,"
               "f25,f22,f11,f62,f128,f136,f115,f152&_=1601430405304")
        HTML(url)


main()

Result screenshot:

(2) Experimental experience

This assignment simply modifies the output of the previous one.

 
