The fourth assignment (data collection and fusion)

1. Homework 1

Requirements: Crawl the information and images of selected products from Jingdong Mall (JD.com)

Become proficient with Selenium for locating HTML elements, crawling Ajax-loaded page data, waiting for HTML elements, and so on.
Use the Selenium framework to crawl product information and images from Jingdong Mall; a short wait-and-locate sketch follows before the full program.
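
As a quick illustration, here is a minimal sketch of the explicit-wait technique the requirement names, using the same JD search box (id "key") that the full program locates below. It is illustrative only and not part of the assignment code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("http://www.jd.com")
# Explicitly wait up to 10 seconds for the Ajax-rendered search box to appear,
# instead of guessing a fixed delay with time.sleep()
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "key")))
keyInput = driver.find_element_by_id("key")  # old-style Selenium 3 locator, as in the program below
driver.quit()
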
Code implementation and results:

import scrapy
from work4_Scrapy.items import *
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
import pymysql

class work5_spider1_JD(scrapy.Spider):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

    imagePath = "download"
    name="work5_spider1_JD"
    # Initialize the database table, browser, and image path, then send the search keyword in the browser
    def startUp(self,url,key):
        # Initialize the browser, set up a headless browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

        self.threads = []
        self.No = 0
        self.imgNo = 0
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql',
                                           db='scrapy_db')  # host, user name, password, and database name
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database")
            # self.con = sqlite3.connect("phones.db")
            # self.cursor = self.con.cursor()
            try:
                # delete table if there is one
                self.cursor.execute("drop table phones")
                self.connect.commit()
            except:
                pass
            try:
                # create a new table; specify utf-8 so Chinese text inserts correctly (see Experience below)
                sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), mNote varchar(1024), mFile varchar(256)) default charset=utf8"
                self.cursor.execute(sql)
                self.connect.commit()
            except:
                pass

        except Exception as err:
            print(err)

        try:
            if not os.path.exists(work5_spider1_JD.imagePath):
                os.mkdir(work5_spider1_JD.imagePath)
            images = os.listdir(work5_spider1_JD.imagePath)
            for img in images:
                s = os.path.join(work5_spider1_JD.imagePath, img)
                os.remove(s)
        except Exception as err:
            print(err)

        # Open the url in the Selenium-driven browser
        self.driver.get(url)
        # locate the search box
        keyInput=self.driver.find_element_by_id("key")
        # type the keyword and press Enter to submit the search
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            self.cursor.close()
            self.connect.close()
            self.driver.close()

        except Exception as err:
            print(err)

    def insertDB(self,mNo,mMark,mPrice,mNote,mFile):
        try:
            print("insert data into database")
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (%s,%s,%s,%s,%s)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
            self.connect.commit()
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql',
                                           db='scrapy_db')  # host, user name, password, and database name
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database,ready to view")
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            self.cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones  order by mNo")
            # will return all results as a 2D tuple like (('id','title'),('id','title')),
            rows = self.cursor.fetchall()
            for row in rows:
                print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))

            self.connect.close()
        except Exception as err:
            print(err)

    # store pictures
    def download(self,src1,src2,mFile):
        print("thread task start")
        data = None
        if src1:
            print(src1+"  src1")
            try:
                # Use the Request class to construct requests.
                req = urllib.request.Request(src1, headers=work5_spider1_JD.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
                print("data       "+data)
            except:
                pass
        if not data and src2:
            print(src2 + "  src2")
            try:
                req = urllib.request.Request(src2, headers=work5_spider1_JD.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
                print("data       " + data)
            except:
                pass
        if data:
            print("download begin", mFile)
            fobj = open(os.path.join(work5_spider1_JD.imagePath, mFile), "wb")
            fobj.write(data)
            fobj.close()
            print("download finish", mFile)

    # Crawl data
    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            # Get the url of the image and store it as src1 or src2
            for li in lis:
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""

                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"

                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    # strip JD's "爱心东东" promotional tag and commas from the brand and note
                    mark = mark.replace("爱心东东\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("爱心东东\n", "")
                    note = note.replace(",", "")
                except:
                    note = ""
                    mark = ""
                self.No = self.No + 1
                no=str(self.No)
                # Zero-pad the serial number so all numbers have the same length
                while(len(no)<6):
                    no="0"+no
                print(no,mark,price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    print("join thread")
                    # Multithreading call download() to download pictures
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.setDaemon(False)
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.insertDB(no, mark, price, note, mFile)
                # Fetch the data of the next page until the last page
            # try:
            #     self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            # except:
            #     nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
            #     time.sleep(10)
            #     nextPage.click()
            #     self.processSpider()
        except Exception as err:
            print(err)
    # Execute crawler code that crawls web pages
    def executeSpider(self,url,key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")


url = "http://www.jd.com"
spider = work5_spider1_JD()
while True:
    print("1.Crawling")
    print("2.show")
    print("3.quit")
    s = input("please choose(1,2,3):")
    if s == "1":
        spider.executeSpider(url, "Tree")
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break

Result: (screenshot omitted)

Experience:
Although I referred heavily to the teacher's code, I still ran into unexpected errors and spent a long time on this; carefully analyzing the teacher's code really pays off. I changed the storage to a MySQL database, and specified utf-8 when creating the table, otherwise inserting Chinese characters fails.
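
A minimal sketch of that utf-8 fix, assuming the same local MySQL setup as the program (host, user, password, and database name are the same placeholders); declaring the charset on the connection as well is my own addition:

import pymysql

# Declare utf-8 on both the connection and the table; if either is missing,
# inserting Chinese text can fail or come back garbled.
connect = pymysql.connect(host='localhost', user='root', passwd='mysql',
                          db='scrapy_db', charset='utf8mb4')
cursor = connect.cursor()
cursor.execute("create table if not exists phones ("
               "mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), "
               "mNote varchar(1024), mFile varchar(256)) default charset=utf8mb4")
connect.commit()
connect.close()
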

2. Homework 2

Requirement: Crawl stock website data

Become proficient with Selenium for locating HTML elements, crawling Ajax-loaded page data, waiting for HTML elements, and so on.
Use the Selenium framework + MySQL database storage technical route to crawl the stock data of the "Shanghai and Shenzhen A shares", "Shanghai A shares" and "Shenzhen A shares" boards (a sketch of visiting the three boards follows).
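
Before the full program, a minimal sketch of how the three boards could be visited in turn. The hs_a_board URL fragment comes from the program below; sh_a_board and sz_a_board are assumed to follow the same naming pattern and would need to be verified against the site:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

BASE = "http://quote.eastmoney.com/center/gridlist.html#"
boards = ["hs_a_board", "sh_a_board", "sz_a_board"]  # the last two fragment names are assumptions

driver = webdriver.Chrome()
for board in boards:
    driver.get(BASE + board)
    driver.refresh()  # only the URL fragment changes, so force a reload
    # wait until the quote table has rendered before scraping
    WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(
        (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr/td")))
    print(board, driver.current_url)
driver.quit()
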
Code implementation and results:


from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
import pymysql
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By




def createTable():
    try:
        # delete table if there is one
        cursor.execute("drop table stocks")
        connect.commit()
    except:
        pass
    try:
        # create a new table: num, name, new_price, new_change, new_change_num, money
        # specify utf-8 so the Chinese stock names insert correctly
        sql = "create table stocks (num varchar(32), name varchar(256), new_price varchar(32), new_change varchar(1024), new_change_num varchar(256), money varchar(1024)) default charset=utf8"
        cursor.execute(sql)
        connect.commit()
    except:
        pass


def insertIntoDB(num,name,new_price,new_change,new_change_num,money):
    try:
        sql = "insert into stocks (num,name,new_price,new_change,new_change_num,money) values (%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, (num,name,new_price,new_change,new_change_num,money))
        connect.commit()
    except Exception as err:
        print(err)



page_count = 0  # number of pages crawled so far (renamed from "sum", which shadows the Python builtin)
def spider():
    global page_count
    page_count = page_count + 1

    locator = (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr/td")
    # Wait for the form to load before crawling the website information
    WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(locator))
    print(driver.current_url)
    print("The information is loaded and can be crawled")
    trs = driver.find_elements_by_xpath("//table[@id='table_wrapper-table']/tbody/tr")
    count=0
    # Only take the first three rows of each page
    for tds in trs:
        count+=1
        if(count>3):
            break
        td = tds.find_elements_by_xpath("./td")
        # td[1]=stock code, td[2]=name, td[4]=latest price, td[5]/td[6]=change, td[7]=volume (td[3] is a links column, skipped)
        # these correspond to item["f12"], item["f14"], item["f2"], item["f3"], item["f4"], item["f5"] in the site's Ajax data
        num = td[1].text
        name = td[2].text
        new_price = td[4].text
        new_change_num = td[5].text
        new_change = td[6].text
        money = td[7].text
        print(num)
        insertIntoDB(num,name,new_price,new_change,new_change_num,money)
    # Get the next-page button and click it
    try:
        driver.find_element_by_xpath("/html/body/div[@class='page-wrapper']/div[@id='page-body']/div[@id='body-main']/div[@id='table_wrapper']/div[@class='listview full']/div[@class='dataTables_wrapper']/div[@id='main-table_paginate']/a[@class='next paginate_button disabled']")
    except:
        nextPage = driver.find_element_by_xpath("/html/body/div[@class='page-wrapper']/div[@id='page-body']/div[@id='body-main']/div[@id='table_wrapper']/div[@class='listview full']/div[@class='dataTables_wrapper']/div[@id='main-table_paginate']/a[@class='next paginate_button']")
        # Only crawl the first three pages
        if(page_count<3):
            nextPage.click()
            spider()


chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')

url="http://quote.eastmoney.com/center/gridlist.html#hs_a_board"

driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(url)
# establish a connection with the database
connect = pymysql.connect(host='localhost', user='root', passwd='mysql',db='scrapy_db')  # host, user name, password, and database name
cursor = connect.cursor()
createTable()
spider()

# Close the database connection after crawling
connect.close()
driver.close()

Result: (screenshot omitted)

Experience:
With the teacher's example above as a reference, I decided to write this one myself. Unsurprisingly the code was fragile at first, but it worked in the end.
At first, clicking the next-page button jumped to the next page for a split second and then returned to the original page. After several print statements and careful observation of the browser, it turned out that after click() the recursive call ran driver.get(url) again, so the browser reloaded the original page. A simplified sketch of this pitfall follows.
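
A minimal sketch of that pitfall and its fix, with simplified function bodies (the next-button XPath is shortened from the program above):

# Buggy pattern: each recursive call re-opens the start URL,
# so the navigation produced by the click is immediately undone.
def spider_buggy(driver, url):
    driver.get(url)  # bug: reloads page 1 even after pagination
    # ... scrape the current page ...
    driver.find_element_by_xpath("//a[@class='next paginate_button']").click()
    spider_buggy(driver, url)

# Fixed pattern: call driver.get(url) once outside, then recurse
# without re-opening the URL (termination check omitted for brevity).
def spider_fixed(driver):
    # ... scrape the current page ...
    driver.find_element_by_xpath("//a[@class='next paginate_button']").click()
    spider_fixed(driver)
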

3. Homework 3

Requirements: Crawl MOOC course data

Become proficient with Selenium for locating HTML elements, simulating user login, crawling Ajax-loaded page data, waiting for HTML elements, and so on (a hedged login sketch follows).
Use the Selenium framework + MySQL to crawl course information from the Chinese MOOC site (course number, course name, school name, lecturer, team members, number of participants, course progress, course introduction).
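
The program below does not actually log in. As a hedged sketch, simulated login with Selenium generally follows the pattern shown here; every locator is a hypothetical placeholder and would have to be read off the real icourse163.org login dialog (NetEase sites typically embed the login form in an iframe):

from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get("https://www.icourse163.org")
time.sleep(2)

driver.find_element_by_xpath("//div[@class='unlogin']").click()  # placeholder: open the login dialog
time.sleep(1)
# the login form is usually iframe-embedded, so switch into the frame first
driver.switch_to.frame(driver.find_element_by_tag_name("iframe"))
driver.find_element_by_name("email").send_keys("user@example.com")  # placeholder credentials
driver.find_element_by_name("password").send_keys("secret")         # placeholder credentials
driver.find_element_by_id("submitBtn").click()                      # placeholder locator
driver.switch_to.default_content()  # leave the iframe before scraping
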
Code implementation and results:

import scrapy
from work4_Scrapy.items import *
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
import pymysql

class work5_spider3_mooc(scrapy.Spider):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}

    name="work5_spider3_mooc"
    # Initialize the database table and browser, then open the start page
    def startUp(self,url):
        # Initialize the browser, set up a headless browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)

        self.No = 0
        self.pagesum = 0
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql',
                                           db='scrapy_db')  # host, user name, password, and database name
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database")

            try:
                # delete table if there is one
                self.cursor.execute("drop table Mooc_course")
                self.connect.commit()
            except:
                pass
            try:
                # create a new table; specify utf-8 so the Chinese course information inserts correctly
                sql = "create table Mooc_course (Id varchar(32), cCourse varchar(256), cCollege varchar(256), cTeacher varchar(256), cTeam varchar(256)," \
                      "cCount varchar(256), cProcess varchar(256), cBrief varchar(1024)) default charset=utf8"
                self.cursor.execute(sql)
                self.connect.commit()
            except:
                pass

        except Exception as err:
            print(err)

        # Open the url in the Selenium-driven browser
        self.driver.get(url)
        time.sleep(1)
        self.driver.maximize_window()


    def closeUp(self):
        try:
            self.cursor.close()
            self.connect.close()
            self.driver.close()

        except Exception as err:
            print(err)

    def insertDB(self,Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief):
        try:
            print("insert data into database")
            sql = "insert into Mooc_course (Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief) values (%s,%s,%s,%s,%s,%s,%s,%s)"
            self.cursor.execute(sql, (Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief))
            self.connect.commit()
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql',
                                           db='scrapy_db')  # host, user name, password, and database name
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database,ready to view")
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            self.cursor.execute("select Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief from Mooc_course  order by Id")
            # will return all results as a 2D tuple like (('id','title'),('id','title')),
            rows = self.cursor.fetchall()
            for row in rows:
                print("%-8s %-8s %-8s %-8s %-8s %-8s %-8s %-16s" % (row[0], row[1], row[2], row[3], row[4],row[5], row[6], row[7]))

            self.connect.close()
        except Exception as err:
            print(err)


    # Crawl data
    def processSpider(self):
        self.pagesum+=1
        try:
            print(self.driver.current_url)
            # All courses
            self.driver.maximize_window()
            items=self.driver.find_elements_by_xpath("//div[@class='_1aoKr']//div[@class='_1gBJC']//div[@class='_2mbYw']")
            count=0
            for item in items:
                count+=1
                if(count>10):
                    break
                # Click the image to jump to the page, .// crawl under the current path
                item.click()
                # Get the handles of all tabs currently open in the browser
                win = self.driver.window_handles
                # switch to the newly opened tab
                self.driver.switch_to.window(win[-1])
                # Wait two seconds for the page to load
                time.sleep(2)
                self.No+=1
                Id=str(self.No)
                cCourse= self.driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text
                cCollege = self.driver.find_element_by_xpath("//img[@class='u-img']").get_attribute("alt")
                cTeacher=self.driver.find_element_by_xpath("//div[@class='um-list-slider_con']/div[1]//h3[@class='f-fc3']").text
                cCount = self.driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text
                cProcess = self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']//span[position()=2]").text
                cBrief = self.driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text
                print(Id)
                Team = self.driver.find_elements_by_xpath("//div[@class='um-list-slider_con_item']//h3[@class='f-fc3']")
                cTeam=""
                for team in Team:
                    cTeam+=team.text+","
                self.insertDB(Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief)

                # close current page
                self.driver.close()
                time.sleep(2)
                # back to previous page
                self.driver.switch_to.window(self.driver.window_handles[0])

            # Crawl the next page (first 3 pages only)
            if(self.pagesum<3):
                try:
                    nextpage = self.driver.find_element_by_xpath("//a[@class='_3YiUU ']")
                    time.sleep(3)
                    nextpage.click()
                    self.processSpider()
                except:
                    # on the last page the next button is disabled (class '_3YiUU _1BSqy'), so stop
                    self.driver.find_element_by_xpath("//a[@class='_3YiUU _1BSqy']")



        except Exception as err:
            print(err)
    # Execute crawler code that crawls web pages
    def executeSpider(self,url):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()

        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")


url = "https://www.icourse163.org/channel/2001.htm"
spider = work5_spider3_mooc()
while True:
    print("1.Crawling")
    print("2.show")
    print("3.quit")
    s = input("please choose(1,2,3):")
    if s == "1":
        spider.executeSpider(url)
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break

Result: (screenshot omitted)

Experience:
The code looks simple, the principle is simple, and the function is simple, but it is really not easy to write.
When inserting into the database I got AttributeError: 'XXX' object has no attribute 'translate'. It turned out the inserted value's type did not match the database column type; the fix was to take .text from the acquired element so that strings, not WebElement objects, are inserted.
When calling find_element_by_xpath() on an element that has already been obtained, start the expression with .// so the search stays under the current element.
Don't forget time.sleep() while the page is loading.
When clicking an element, sometimes "element click intercepted: Element ... is not clickable at point (271, 126). Other element would receive the click: ..." appears. This means the target element is covered by another element; either locate and click the element with JavaScript, or call driver.maximize_window(). I used the latter, which only works some of the time. A sketch of the JavaScript approach follows.
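
A minimal sketch of that JavaScript fallback (the XPath argument is a placeholder). execute_script() clicks the element directly in the page, bypassing the overlay that intercepted the native click:

from selenium.common.exceptions import ElementClickInterceptedException

def safe_click(driver, xpath):
    element = driver.find_element_by_xpath(xpath)
    try:
        # try the normal Selenium click first
        element.click()
    except ElementClickInterceptedException:
        # scroll the element into view and click it via JavaScript
        driver.execute_script("arguments[0].scrollIntoView();", element)
        driver.execute_script("arguments[0].click();", element)

# usage: safe_click(driver, "//div[@class='_2mbYw']")  # placeholder XPath
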
