First assignment, consisting of three small tasks

Assignment 1

1) Experiment content: use the requests and BeautifulSoup libraries to crawl the given website (http://www.shanghairanking.cn/rankings/bcur/2020) and print the crawled university ranking information to the screen.

The code is as follows:

import requests
from bs4 import BeautifulSoup
import bs4
# General code framework for fetching a page
def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""
# Put the crawled info into the list
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    for tr in soup.find('tbody').children:
        if isinstance(tr, bs4.element.Tag):  # skip NavigableString children, keep only <tr> tags
            tds = tr('td')  # shorthand for tr.find_all('td')
            ulist.append([tds[0], tds[1], tds[2], tds[3], tds[4]])

# Printing method: print the first num entries
def printUnivList(ulist, num):
    print("{:^10}\t{:^10}\t{:^10}\t{:^10}\t{:^10}".format("Ranking", "School name", "Province/city", "School type", "Total score"))
    for i in range(num):
        u = ulist[i]
        print("{:^10}\t{:^10}\t{:^10}\t{:^10}\t{:^10}".format(u[0].text.strip(),u[1].text.strip(),u[2].text.strip(),u[3].text.strip(),u[4].text.strip()))


#Main function
def main():
    uinfo = []
    url = 'http://www.shanghairanking.cn/rankings/bcur/2020'  # URL to crawl
    html = getHTMLText(url)
    fillUnivList(uinfo,html)
    printUnivList(uinfo,15)
main()

The results are as follows:

2) Experience

The main takeaway from this experiment is that the code should be kept well organized, with the work split into separate functions so they are easy to call. It is also important to study the page source when crawling; for example, in this experiment the line breaks before and after the extracted text had to be stripped.
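For reference, here is a minimal sketch (using a made-up table row, not the actual page source) of how tr('td') and .text.strip() behave in this situation:

from bs4 import BeautifulSoup

# Hypothetical row in the style of the ranking table; the real markup may differ.
row_html = "<tr><td>\n 1 \n</td><td>\n Tsinghua University \n</td></tr>"
tr = BeautifulSoup(row_html, "html.parser").tr
tds = tr('td')  # same as tr.find_all('td')
print([td.text for td in tds])          # the text still carries the surrounding line breaks
print([td.text.strip() for td in tds])  # ['1', 'Tsinghua University']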

Assignment 2

1) Experiment content: use the requests and re libraries to design a targeted price-comparison crawler for an online mall of your choice, search the mall with the keyword "schoolbag", and crawl the product names and prices from the result pages.

The code is as follows:

import requests
import re

headers = {
    'authority': 's.taobao.com',
    'cache-control': 'max-age=0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'sec-fetch-site': 'none',
    'referer': 'https://s.taobao.com/search?q=shubao&commend=all&ssid=s5-e&search_type=mall&sourceId=tb.index&area=c2c&spm=a1z02.1.6856637.d4910789',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': 'thw=cn; cna=Vzk9FtjcI20CAd9Vz/wYMTMu; t=20e35e562420e4844071fdb958fb7c9a; hng=CN%7Czh-CN%7CCNY%7C156; miid=791733911206048212; tracknick=chia_jia; tg=0; cookie2=1f0898f4d5e217732638dedf9fe15701; v=0; _tb_token_=ebe5eeed35f33; enc=0hYktGOhhe0QclVgvyiraV50UAu2nXH2DGGiUhLzUiXhhwjN3%2BmWuY8a%2Bg%2B13VWtqA42kqOMQxOCBM%2F9y%2FMKEA%3D%3D; alitrackid=www.taobao.com; _samesite_flag_=true; sgcookie=ErzxRE%2F%2Fujbceh7Nk8tsW; unb=2925825942; uc3=lg2=U%2BGCWk%2F75gdr5Q%3D%3D&id2=UUGgqLe1BUBPyw%3D%3D&nk2=AHLe94pmu18%3D&vt3=F8dBxd9lptyvS0VrdSI%3D; csg=2ff7a88b; lgc=chia_jia; cookie17=UUGgqLe1BUBPyw%3D%3D; dnk=chia_jia; skt=fffc9202f189ba15; existShop=MTU4NDgwNjA5OA%3D%3D; uc4=nk4=0%40AhyIi%2BV%2FGWSNaFwor7d%2Fi8aNNg%3D%3D&id4=0%40U2OXkqaj%2BLnczzIixfRAeE2zi2mx; _cc_=U%2BGCWk%2F7og%3D%3D; _l_g_=Ug%3D%3D; sg=a24; _nk_=chia_jia; cookie1=VW7ubnoPKm6ZWpbFap8xTV%2BlfhUdVkTTn8y8%2Fh5pWuE%3D; tfstk=c-PRBp27QijoTXTB7NX0R8OD7-jGZX8-xUip9wQB5nAdp0OdilPg_WDhF2--MZC..; uc1=cookie16=V32FPkk%2FxXMk5UvIbNtImtMfJQ%3D%3D&cookie21=WqG3DMC9Fb5mPLIQo9kR&cookie15=URm48syIIVrSKA%3D%3D&existShop=false&pas=0&cookie14=UoTUPvg16Dl1fw%3D%3D&tag=8&lng=zh_CN; mt=ci=-1_1; lastalitrackid=buyertrade.taobao.com; JSESSIONID=46486E97DAB8DFE70AF7B618C2AE4309; l=dB_cmQBHqVHSpFQ9BOfwIH3Ssv7t4IdbzrVy9-oE7ICP991H5uC5WZ4ztBTMCnGVn6rkJ3JsgJC4BKm92ydZGhZfP3k_J_xmed8h.; isg=BAwM2R7Qvilbman-Lz4SX8Sj3Wo-RbDvFjL_wmbMw7da8a77j1WsfldLkflJv-hH',
}

# General code framework for fetching a page
def getHTMLText_031804139(url):
    try:
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""  # return an empty string on failure

# Parse each fetched page; this is the core of the program
def parsePage_031804139(ilt, html):
    try:
        prices = re.findall(r'"view_price":"[\d.]*"', html)  # commodity price fields
        titles = re.findall(r'"raw_title":".*?"', html)      # commodity name fields
        for i in range(len(prices)):
            price = eval(prices[i].split(':')[1])  # strip the surrounding quotes
            title = eval(titles[i].split(':')[1])
            ilt.append([price, title])
    except:
        print("")

# Output the Taobao product information to the screen
def printGoodsList_031804139(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("Serial number", "Price", "Trade name"))
    count = 0
    for goods in ilt:
        count += 1
        print(tplt.format(count, goods[0], goods[1]))


def main():
    goods = 'Whitening Essence'  # search keyword (the assignment uses "schoolbag")
    url = "https://s.taobao.com/search?q=" + goods
    depth = 8  # number of result pages to crawl
    infoList = []  # collected product information
    for i in range(depth):
        try:
            start_url = url + "&s=" + str(44 * i)  # each result page holds 44 items
            html = getHTMLText_031804139(start_url)
            parsePage_031804139(infoList, html)
        except:
            continue  # if one page fails, move on to the next page
    printGoodsList_031804139(infoList)

main()

Experimental results:

2) Experience

The core of this experiment is designing regular expressions to filter the information, since regular expressions fit this kind of crawler very well; in addition, request headers (including cookies) must be supplied in order to crawl Taobao.
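To make the regular expressions concrete, here is a minimal sketch that applies the same two patterns to a hypothetical fragment of the JSON embedded in a search result page (the real page layout may differ and requires valid cookies to retrieve):

import re

# Hypothetical fragment; the field names follow the "view_price"/"raw_title" keys used above.
html = '{"raw_title":"student schoolbag","view_price":"59.00","nick":"someshop"}'
prices = re.findall(r'"view_price":"[\d.]*"', html)
titles = re.findall(r'"raw_title":".*?"', html)
print(prices[0].split(':')[1])        # "59.00"  (still quoted)
print(eval(titles[0].split(':')[1]))  # student schoolbag (eval strips the quotes)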

Assignment 3

1) Experiment content: crawl the given web page (http://xcb.fzu.edu.cn/html/2019ztjy), or a web page of your own choice, and download all of its JPG-format image files.

The code is as follows:

import requests
from bs4 import BeautifulSoup
import os
# General code framework for fetching a page
def getHTMLText(url):
    try:
        r = requests.get(url, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return ""
# Collect the src of every JPG-format image on the page
def fillUnivList(ulist, html):
    soup = BeautifulSoup(html, "html.parser")
    pic_all = soup.find_all("img")
    for pic in pic_all:
        pic = pic["src"]
        if pic.lower().endswith(('.jpg', '.jpeg')):  # keep only JPG/JPEG files
            ulist.append(pic)
#Printing method
def printUnivList(ulist):
    print(ulist)

# Download each picture and write it into the File directory
def writein_kata(pic_list):
    os.makedirs("./File", exist_ok=True)  # make sure the target directory exists
    for pics in pic_list:
        if pics[0] == '/':  # turn a root-relative path into a full URL
            pics = 'http://xcb.fzu.edu.cn' + pics
        with open(os.path.join("./File", os.path.basename(pics)), 'wb') as f:
            f.write(requests.get(pics).content)

#Main function
def main():
    uinfo = []
    url = 'http://xcb.fzu.edu.cn'
    html = getHTMLText(url)
    fillUnivList(uinfo,html)
    printUnivList(uinfo)
    writein_kata(uinfo)

main()

Experimental results:

2) Experience

In fact, crawling the pictures has a lot in common with the previous two assignments; it largely follows the same pattern, but some details, such as handling relative image URLs, still need to be checked carefully.
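For the relative-path detail mentioned above, urllib.parse.urljoin is a more general way to normalize img src values than prepending the site root by hand; a minimal sketch with hypothetical src values:

from urllib.parse import urljoin

base = 'http://xcb.fzu.edu.cn/'
# Hypothetical src values; the actual page may mix absolute and relative paths.
for src in ['/upload/banner.jpg', 'images/logo.JPG', 'http://example.com/pic.jpg']:
    print(urljoin(base, src))  # each becomes a full, downloadable URL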
