1. Homework 1
Requirements: Crawl the information and pictures of selected products from Jingdong Mall (JD.com).
Become proficient in using Selenium to locate HTML elements, crawl Ajax-loaded web page data, wait for HTML elements, etc.
Use the Selenium framework to crawl product information and images from Jingdong Mall.
Code implementation and results:
from work4_Scrapy.items import *
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
import pymysql


class work5_spider1_JD(scrapy.Spider):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    imagePath = "download"
    name = "work5_spider1_JD"

    # Initialize the database table, browser, and image path, then open the site and submit the search key
    def startUp(self, url, key):
        # Initialize the browser, set up a headless browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.threads = []
        self.No = 0
        self.imgNo = 0
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql', db='scrapy_db')
            # The last three arguments are the MySQL user name, password, and database name.
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database")
            # self.con = sqlite3.connect("phones.db")
            # self.cursor = self.con.cursor()
            try:
                # delete table if there is one
                self.cursor.execute("drop table phones")
                self.connect.commit()
            except:
                pass
            try:
                # create new table
                sql = "create table phones (mNo varchar(32) primary key, mMark varchar(256),mPrice varchar(32),mNote varchar(1024),mFile varchar(256))"
                self.cursor.execute(sql)
                self.connect.commit()
            except:
                pass
        except Exception as err:
            print(err)
        try:
            # Create the image folder if needed, then empty it
            if not os.path.exists(work5_spider1_JD.imagePath):
                os.mkdir(work5_spider1_JD.imagePath)
            images = os.listdir(work5_spider1_JD.imagePath)
            for img in images:
                s = os.path.join(work5_spider1_JD.imagePath, img)
                os.remove(s)
        except Exception as err:
            print(err)
        # Open the url in the browser created by selenium
        self.driver.get(url)
        # get the search box
        keyInput = self.driver.find_element_by_id("key")
        # Send the keyword and the Enter key to the website
        keyInput.send_keys(key)
        keyInput.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            self.cursor.close()
            self.connect.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, mNo, mMark, mPrice, mNote, mFile):
        try:
            print("insert data into database")
            sql = "insert into phones (mNo,mMark,mPrice,mNote,mFile) values (%s,%s,%s,%s,%s)"
            self.cursor.execute(sql, (mNo, mMark, mPrice, mNote, mFile))
            self.connect.commit()
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql', db='scrapy_db')
            # The last three arguments are the MySQL user name, password, and database name.
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database, ready to view")
            print("%-8s%-16s%-8s%-16s%s" % ("No", "Mark", "Price", "Image", "Note"))
            self.cursor.execute("select mNo,mMark,mPrice,mFile,mNote from phones order by mNo")
            # fetchall() returns all results as a tuple of tuples like (('id','title'),('id','title'))
            rows = self.cursor.fetchall()
            for row in rows:
                print("%-8s %-16s %-8s %-16s %s" % (row[0], row[1], row[2], row[3], row[4]))
            self.connect.close()
        except Exception as err:
            print(err)

    # store pictures
    def download(self, src1, src2, mFile):
        print("thread task start")
        data = None
        if src1:
            print(src1 + " src1")
            try:
                # Use the Request class to construct the request
                req = urllib.request.Request(src1, headers=work5_spider1_JD.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
                print("data length", len(data))  # data is bytes, so print its length instead of concatenating
            except:
                pass
        if not data and src2:
            print(src2 + " src2")
            try:
                req = urllib.request.Request(src2, headers=work5_spider1_JD.headers)
                resp = urllib.request.urlopen(req, timeout=10)
                data = resp.read()
                print("data length", len(data))
            except:
                pass
        if data:
            print("download begin", mFile)
            fobj = open(work5_spider1_JD.imagePath + "\\" + mFile, "wb")
            fobj.write(data)
            fobj.close()
            print("download finish", mFile)

    # Crawl data
    def processSpider(self):
        try:
            time.sleep(1)
            print(self.driver.current_url)
            lis = self.driver.find_elements_by_xpath("//div[@id='J_goodsList']//li[@class='gl-item']")
            # Get the url of the image and store it as src1 or src2
            for li in lis:
                try:
                    src1 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("src")
                except:
                    src1 = ""
                try:
                    src2 = li.find_element_by_xpath(".//div[@class='p-img']//a//img").get_attribute("data-lazy-img")
                except:
                    src2 = ""
                try:
                    price = li.find_element_by_xpath(".//div[@class='p-price']//i").text
                except:
                    price = "0"
                try:
                    note = li.find_element_by_xpath(".//div[@class='p-name p-name-type-2']//em").text
                    mark = note.split(" ")[0]
                    mark = mark.replace("love stuff\n", "")
                    mark = mark.replace(",", "")
                    note = note.replace("love stuff\n", "")
                    note = note.replace(",", "")
                except:
                    note = ""
                    mark = ""
                self.No = self.No + 1
                no = str(self.No)
                # Pad the number so all serial numbers have the same length
                while len(no) < 6:
                    no = "0" + no
                print(no, mark, price)
                if src1:
                    src1 = urllib.request.urljoin(self.driver.current_url, src1)
                    p = src1.rfind(".")
                    mFile = no + src1[p:]
                elif src2:
                    src2 = urllib.request.urljoin(self.driver.current_url, src2)
                    p = src2.rfind(".")
                    mFile = no + src2[p:]
                if src1 or src2:
                    print("join thread")
                    # Call download() in a new thread to download the picture
                    T = threading.Thread(target=self.download, args=(src1, src2, mFile))
                    T.setDaemon(False)
                    T.start()
                    self.threads.append(T)
                else:
                    mFile = ""
                self.insertDB(no, mark, price, note, mFile)
            # Fetch the data of the next page until the last page
            # try:
            #     self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next disabled']")
            # except:
            #     nextPage = self.driver.find_element_by_xpath("//span[@class='p-num']//a[@class='pn-next']")
            #     time.sleep(10)
            #     nextPage.click()
            #     self.processSpider()
        except Exception as err:
            print(err)

    # Execute the crawler
    def executeSpider(self, url, key):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url, key)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        for t in self.threads:
            t.join()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")


url = "http://www.jd.com"
spider = work5_spider1_JD()
while True:
    print("1.Crawling")
    print("2.show")
    print("3.quit")
    s = input("please choose(1,2,3):")
    if s == "1":
        spider.executeSpider(url, "Tree")
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break
result:


Experience:
Although I referred to the teacher's code a lot, unexpected errors still came up and it took a long time to finish; carefully analyzing the teacher's code really pays off. I changed the storage to a MySQL database, and the table must be created with the utf-8 character set, otherwise inserting Chinese characters fails.
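A minimal sketch of that character-set fix with pymysql (the table and sample data here are illustrative, not necessarily the exact DDL used above); the key points are passing charset='utf8mb4' to the connection and declaring a default charset on the table:

import pymysql

# Hypothetical connection parameters; adjust host/user/passwd/db to your own setup.
connect = pymysql.connect(host='localhost', user='root', passwd='mysql',
                          db='scrapy_db', charset='utf8mb4')
cursor = connect.cursor()

# Declaring the character set on the table keeps Chinese text from failing on insert.
cursor.execute(
    "create table if not exists phones ("
    "mNo varchar(32) primary key, mMark varchar(256), mPrice varchar(32), "
    "mNote varchar(1024), mFile varchar(256)"
    ") default charset=utf8mb4"
)
connect.commit()

# Inserting a row containing Chinese characters now succeeds.
cursor.execute("insert into phones (mNo,mMark,mPrice,mNote,mFile) values (%s,%s,%s,%s,%s)",
               ("000001", "华为", "5999", "华为手机示例", "000001.jpg"))
connect.commit()
connect.close()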
2. Homework 2
Requirement: Crawl stock data from a securities website.
Become proficient in using Selenium to locate HTML elements, crawl Ajax-loaded web page data, wait for HTML elements, etc.
Use the Selenium framework + MySQL database storage technology route to crawl the stock data of "Shanghai and Shenzhen A-shares", "Shanghai A-shares", and "Shenzhen A-shares".
Code implementation and results:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import time
import pymysql
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


def createTable():
    try:
        # delete table if there is one
        cursor.execute("drop table stocks")
        connect.commit()
    except:
        pass
    try:
        # Create a new table: num, name, new_price, new_change, new_change_num, money
        sql = "create table stocks (num varchar(32), name varchar(256),new_price varchar(32),new_change varchar(1024),new_change_num varchar(256),money varchar(1024))"
        cursor.execute(sql)
        connect.commit()
    except:
        pass


def insertIntoDB(num, name, new_price, new_change, new_change_num, money):
    try:
        sql = "insert into stocks (num,name,new_price,new_change,new_change_num,money) values (%s,%s,%s,%s,%s,%s)"
        cursor.execute(sql, (num, name, new_price, new_change, new_change_num, money))
        connect.commit()
    except Exception as err:
        print(err)


sum = 0


def spider():
    global sum
    # sum records how many pages have been crawled
    sum = sum + 1
    locator = (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr/td")
    # Wait for the table to load before crawling the page information
    WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(locator))
    print(driver.current_url)
    print("The information is loaded and can be crawled")
    trs = driver.find_elements_by_xpath("//table[@class='table_wrapper-table']/tbody/tr")
    count = 0
    # Only take the first few rows of each page (break once count reaches 4)
    for tds in trs:
        count += 1
        if count == 4:
            break
        td = tds.find_elements_by_xpath("./td")
        # Serial number, name, latest price, change percentage, change amount, volume
        # item["f12"], item["f14"], item["f2"], item["f3"], item["f4"], item["f5"]
        num = td[1].text
        name = td[2].text
        new_price = td[4].text
        new_change_num = td[5].text
        new_change = td[6].text
        money = td[7].text
        print(num)
        insertIntoDB(num, name, new_price, new_change, new_change_num, money)
    # Get the next page's button and click it
    try:
        # If the "next page" button is disabled, this is the last page
        driver.find_element_by_xpath("/html/body/div[@class='page-wrapper']/div[@id='page-body']/div[@id='body-main']/div[@id='table_wrapper']/div[@class='listview full']/div[@class='dataTables_wrapper']/div[@id='main-table_paginate']/a[@class='next paginate_button disabled']")
    except:
        nextPage = driver.find_element_by_xpath("/html/body/div[@class='page-wrapper']/div[@id='page-body']/div[@id='body-main']/div[@id='table_wrapper']/div[@class='listview full']/div[@class='dataTables_wrapper']/div[@id='main-table_paginate']/a[@class='next paginate_button']")
        # Only crawl three pages
        if sum <= 3:
            nextPage.click()
            spider()


chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get(url)
# Establish a connection with the database
connect = pymysql.connect(host='localhost', user='root', passwd='mysql', db='scrapy_db')
# The last three arguments are the MySQL user name, password, and database name.
cursor = connect.cursor()
createTable()
spider()
# Close the database connection after crawling
connect.close()
driver.close()
result:

Experience:
With the teacher's example above as a reference, I decided to write this one myself. Unsurprisingly, the code turned out to be very fragile, but it worked.
At first, clicking the next-page button jumped to the next page for a split second and then returned to the original page. After adding several print statements and watching the browser carefully, it turned out that after click() the function returned and driver.get(url) was executed again, so the browser reloaded the original page.
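One way to avoid that reload, sketched below assuming the old Selenium 3 find_element_by_* API used throughout this report: call driver.get() exactly once, click the next-page button, and wait for the Ajax table to refresh instead of navigating again. The XPath class names are the ones from the spider above; waiting for the old first row to go stale is just one way to detect the refresh.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
# driver.get() is called only once; pagination is done purely by clicking
driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")

for page in range(3):
    # Wait until the data table is present before reading it
    WebDriverWait(driver, 10, 0.5).until(EC.presence_of_element_located(
        (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr/td")))
    rows = driver.find_elements_by_xpath("//table[@id='table_wrapper-table']/tbody/tr")
    print("page", page + 1, "rows:", len(rows))
    # Same pagination-button class as in the spider above; when disabled its class changes
    buttons = driver.find_elements_by_xpath("//a[@class='next paginate_button']")
    if not buttons:
        break  # no enabled "next" button: last page reached
    buttons[0].click()
    # Wait for the old first row to become stale, i.e. for the table to actually refresh
    WebDriverWait(driver, 10).until(EC.staleness_of(rows[0]))

driver.close()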
3. Homework 3
Requirements: Crawl MOOC course data.
Become proficient in using Selenium to locate HTML elements, simulate user login, crawl Ajax-loaded web page data, wait for HTML elements, etc.
Use the Selenium framework + MySQL to crawl course information from the Chinese MOOC site icourse163 (course number, course name, school name, lecturer, team members, number of participants, course progress, course introduction).
Code implementation and results:
from work4_Scrapy.items import *
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import urllib.request
import threading
import sqlite3
import os
import datetime
from selenium.webdriver.common.keys import Keys
import time
import pymysql


class work5_spider3_mooc(scrapy.Spider):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
    name = "work5_spider3_mooc"

    # Initialize the database table and browser, then open the start url
    def startUp(self, url):
        # Initialize the browser, set up a headless browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        self.No = 0
        self.pagesum = 0
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql', db='scrapy_db')
            # The last three arguments are the MySQL user name, password, and database name.
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database")
            try:
                # delete table if there is one
                self.cursor.execute("drop table Mooc_course")
                self.connect.commit()
            except:
                pass
            try:
                # create new table
                sql = "create table Mooc_course (Id varchar(32), cCourse varchar(256),cCollege varchar(256),cTeacher varchar(256),cTeam varchar(256)," \
                      "cCount varchar(256),cProcess varchar(256),cBrief varchar(1024))"
                self.cursor.execute(sql)
                self.connect.commit()
            except:
                pass
        except Exception as err:
            print(err)
        # Open the url in the browser created by selenium
        self.driver.get(url)
        time.sleep(1)
        self.driver.maximize_window()

    def closeUp(self):
        try:
            self.cursor.close()
            self.connect.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief):
        try:
            print("insert data into database")
            sql = "insert into Mooc_course (Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief) values (%s,%s,%s,%s,%s,%s,%s,%s)"
            self.cursor.execute(sql, (Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
            self.connect.commit()
        except Exception as err:
            print(err)

    def showDB(self):
        try:
            self.connect = pymysql.connect(host='localhost', user='root', passwd='mysql', db='scrapy_db')
            # The last three arguments are the MySQL user name, password, and database name.
            # get cursor
            self.cursor = self.connect.cursor()
            print("Successfully connected to the database, ready to view")
            print("%-8s%-16s%-16s%-16s%-16s%-8s%-16s%s" % ("Id", "Course", "College", "Teacher", "Team", "Count", "Process", "Brief"))
            self.cursor.execute("select Id,cCourse,cCollege,cTeacher,cTeam,cCount,cProcess,cBrief from Mooc_course order by Id")
            # fetchall() returns all results as a tuple of tuples like (('id','title'),('id','title'))
            rows = self.cursor.fetchall()
            for row in rows:
                print("%-8s %-8s %-8s %-8s %-8s %-8s %-8s %-16s" % (row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))
            self.connect.close()
        except Exception as err:
            print(err)

    # Crawl data
    def processSpider(self):
        self.pagesum += 1
        try:
            print(self.driver.current_url)
            # All course cards on the current page
            self.driver.maximize_window()
            items = self.driver.find_elements_by_xpath("//div[@class='_1aoKr']//div[@class='_1gBJC']//div[@class='_2mbYw']")
            count = 0
            for item in items:
                count += 1
                if count > 10:
                    break
                # Click the course card to open its detail page in a new tab
                item.click()
                # Get the handles of all windows currently opened by the browser
                win = self.driver.window_handles
                # switch to the newly opened page
                self.driver.switch_to.window(win[-1])
                # Wait two seconds for the page to load
                time.sleep(2)
                self.No += 1
                Id = str(self.No)
                cCourse = self.driver.find_element_by_xpath("//span[@class='course-title f-ib f-vam']").text
                cCollege = self.driver.find_element_by_xpath("//img[@class='u-img']").get_attribute("alt")
                cTeacher = self.driver.find_element_by_xpath("//div[@class='um-list-slider_con']/div[1]//h3[@class='f-fc3']").text
                cCount = self.driver.find_element_by_xpath("//span[@class='course-enroll-info_course-enroll_price-enroll_enroll-count']").text
                cProcess = self.driver.find_element_by_xpath("//div[@class='course-enroll-info_course-info_term-info_term-time']//span[position()=2]").text
                cBrief = self.driver.find_element_by_xpath("//div[@class='course-heading-intro_intro']").text
                print(Id)
                Team = self.driver.find_elements_by_xpath("//div[@class='um-list-slider_con_item']//h3[@class='f-fc3']")
                cTeam = ""
                for team in Team:
                    cTeam += team.text + ","
                self.insertDB(Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
                # close current page
                self.driver.close()
                time.sleep(2)
                # back to the course-list page
                self.driver.switch_to.window(self.driver.window_handles[0])
            # Crawl the next page
            if self.pagesum < 3:
                try:
                    nextpage = self.driver.find_element_by_xpath("//a[@class='_3YiUU ']")
                    time.sleep(3)
                    nextpage.click()
                    self.processSpider()
                except:
                    self.driver.find_element_by_xpath("//a[@class='_3YiUU _1BSqy']")
        except Exception as err:
            print(err)

    # Execute the crawler
    def executeSpider(self, url):
        starttime = datetime.datetime.now()
        print("Spider starting......")
        self.startUp(url)
        print("Spider processing......")
        self.processSpider()
        print("Spider closing......")
        self.closeUp()
        print("Spider completed......")
        endtime = datetime.datetime.now()
        elapsed = (endtime - starttime).seconds
        print("Total ", elapsed, " seconds elapsed")


url = "https://www.icourse163.org/channel/2001.htm"
spider = work5_spider3_mooc()
while True:
    print("1.Crawling")
    print("2.show")
    print("3.quit")
    s = input("please choose(1,2,3):")
    if s == "1":
        spider.executeSpider(url)
        continue
    elif s == "2":
        spider.showDB()
        continue
    elif s == "3":
        break
result:

Experience:
The code looks simple, the principle is simple, and the function is simple, but it is really not easy to write.
When inserting into the database, AttributeError: 'XXX' object has no attribute 'translate' appeared; it turned out that the inserted data type was inconsistent with the database column type because I was passing the element itself, and taking .text from the acquired element (so a plain string is inserted) fixed it.
When calling find_element_by_xpath() on an element that has already been obtained, use .// so the search stays relative to that element.
Don't forget time.sleep() when loading a page.
When clicking an element, element click intercepted sometimes appears (one common workaround is shown in the sketch after this list).
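A small hedged sketch tying these notes together (the URL and XPath values are placeholders, not the real icourse163 selectors, and scrolling the element into view before clicking is just one common remedy for element click intercepted, not necessarily the one used above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.example.com/courses")  # placeholder URL

# Explicit wait instead of a bare time.sleep(): block until the cards exist
WebDriverWait(driver, 10, 0.5).until(
    EC.presence_of_element_located((By.XPATH, "//div[@class='card']")))  # placeholder class

cards = driver.find_elements_by_xpath("//div[@class='card']")
for card in cards:
    # .// keeps the search relative to this card instead of the whole document
    title = card.find_element_by_xpath(".//h3").text  # .text is a property, not .text(); it returns a str for the DB
    # Scroll the element into view before clicking; this often avoids "element click intercepted"
    driver.execute_script("arguments[0].scrollIntoView();", card)
    card.click()
    print(title)
    break  # sketch only: handle one card

driver.close()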