python动态数据抓取介绍
用到的工具
Selenium
介绍
Selenium [1] 是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括IE(7, 8, 9, 10, 11),Mozilla Firefox,Safari,Google Chrome,Opera等。这个工具的主要功能包括:测试与浏览器的兼容性——测试你的应用程序看是否能够很好地工作在不同浏览器和操作系统之上。测试系统功能——创建回归测试检验软件功能和用户需求。支持自动录制动作和自动生成 .Net、Java、Perl等不同语言的测试脚本。
为什么要用这个工具
两个原因:1. 由JS异步加载的数据,用传统的方式无法获取到;2. 有些网站对JS请求的接口做了IP认证,如果请求不是来自当前服务器就无法获取数据。
原理
其实就是模拟了浏览器的浏览行为
延伸思考
怎么能避免目标网站的屏蔽
用法介绍:
Selenium 文档http://selenium-python-zh.readthedocs.io/en/latest/index.html
代码示例:
from concurrent.futures import ThreadPoolExecutor
from contextlib import closing
import time

from bs4 import BeautifulSoup
import pymysql
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def waitTime(driver, times):
    """Wait for the initial listing (``ul#list > li``) to be present.

    Args:
        driver: Selenium WebDriver showing the list page.
        times: Maximum number of seconds to wait.

    Returns:
        dict: ``{'timeout': 0 or 1, 'error': None or the timeout message}``.
    """
    # Delegates to dWaits so that only TimeoutException is absorbed; any
    # other exception now propagates instead of being silently discarded.
    return dWaits(driver, times, "//ul[@id='list']/li")


def waitTimePage(driver, times):
    """Wait after a pagination link has been clicked.

    Args:
        driver: Selenium WebDriver showing the list page.
        times: Maximum number of seconds to wait.

    Returns:
        dict: ``{'timeout': 0 or 1, 'error': None or the timeout message}``.
    """
    # NOTE(review): the ``r`` in front of the quoted class name is not valid
    # XPath, so this locator probably never matches and the call may simply
    # run until the timeout. It is left untouched because the pagination loop
    # may be relying on that delay -- confirm the intended element (likely
    # "//div[@class='layui-layer-content']") before correcting it.
    return dWaits(driver, times, "//div[@class=r'layui-layer-content']")


def dWaits(driver, times, path):
    """Wait until at least one element matching an XPath is present.

    Args:
        driver: Selenium WebDriver to poll.
        times: Maximum number of seconds to wait.
        path: XPath expression of the element(s) to wait for.

    Returns:
        dict: ``'timeout'`` is 0 when the element appeared and 1 when the
        wait timed out; ``'error'`` is None or the timeout message.
    """
    is_timeout = 0
    error = None
    try:
        # Only read the page content once the awaited element is present.
        WebDriverWait(driver, times).until(
            EC.presence_of_all_elements_located((By.XPATH, path)),
            message="Time out!!",
        )
    except TimeoutException as er:
        error = er.msg
        is_timeout = 1
    return {'timeout': is_timeout, 'error': error}


def getMyslq():
    """Open and return a new PyMySQL connection to the ``gather`` database.

    The caller owns the connection and is responsible for closing it.
    """
    db = pymysql.connect(
        host='192.168.0.11',
        port=3306,
        user='root',
        password="root",
        database="gather",
        charset='utf8',
    )
    return db


def insertData(data):
    """Insert every URL that is not yet stored in ``midou_list``.

    Args:
        data: Iterable of URL strings.

    Returns:
        The cursor that was used. It is already closed when returned (its
        connection is closed here as well); it is still returned so existing
        callers that do ``insertData(...).close()`` keep working.
    """
    db = getMyslq()
    # closing() guarantees the connection opened for this call is released;
    # previously every call left one connection open.
    with closing(db):
        cursor = db.cursor()
        try:
            for item in data:
                # Parameterized queries: scraped URLs must never be spliced
                # into the SQL text.
                found = cursor.execute(
                    "select * from `midou_list` where url = %s", (item,)
                )
                if found == 0:
                    print("---")
                    cursor.execute(
                        "INSERT INTO `midou_list` (`url`) VALUES (%s)", (item,)
                    )
            db.commit()
        finally:
            cursor.close()
    return cursor


def getUrl(driver):
    """Collect the link of every list item on the current page and store them.

    Args:
        driver: Selenium WebDriver showing a list page.

    Returns:
        list: The ``href`` values that were collected. If the list container
        itself goes stale, whatever was collected so far is returned and
        nothing is written to the database.
    """
    url_list = []
    try:
        li_lists = driver.find_element_by_id('list').find_elements_by_tag_name('li')
        for item in li_lists:
            try:
                url = item.find_element_by_tag_name('a').get_attribute('href')
            except StaleElementReferenceException:
                # This item was re-rendered while we were reading it; skip it.
                continue
            url_list.append(url)
    except StaleElementReferenceException:
        return url_list
    insertData(url_list).close()
    return url_list


def creatWeb(path=r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'):
    """Create a headless Chrome WebDriver.

    Args:
        path: Location of ``chromedriver.exe``. The executable has to be
            downloaded separately; it is not part of an automatic install.
            An IE or Firefox driver could be used instead.

    Returns:
        A ``selenium.webdriver.Chrome`` instance.
    """
    options = Options()
    options.add_argument('-headless')
    return webdriver.Chrome(executable_path=path, chrome_options=options)


def start():
    """Crawl every page of the listing and store the detail-page URLs.

    Flow: wait for the first page to load, read the total page count from
    the "last page" link, then click through each remaining page.
    """
    driver = creatWeb()
    try:
        driver.get('http://www.midooroom.com/list.html')
        wait_resutl = waitTime(driver, 10)
        if int(wait_resutl['timeout']) == 0:
            getUrl(driver)
            totals = driver.find_element_by_xpath(
                "//div/a[@title='尾页']"
            ).get_attribute('data-page')
            # Page 1 is already handled above; walk pages 2..totals.
            for i in range(2, int(totals) + 1):
                driver.find_element_by_xpath(
                    "//div/a[@data-page='" + str(i) + "']"
                ).click()
                print("click----" + str(i))
                waitTimePage(driver, 10)
                getUrl(driver)
        else:
            print(wait_resutl['error'])
    finally:
        # Always shut the browser down, even when a step above raises.
        driver.quit()


# Step 1 (run when the URL list needs to be collected):
# start()


def getDetail(url, cursor, db):
    """Scrape one detail page and store the result on its ``midou_list`` row.

    Args:
        url: Detail-page URL; also the key of the row to update.
        cursor: Open database cursor used for the UPDATE.
        db: Connection the cursor belongs to; committed after the UPDATE.

    Returns:
        str: Always the empty string.
    """
    web = creatWeb()
    try:
        web.get(url)
        web_time_out = dWaits(web, 12, "//div[@id='xiangQing']/div[@class='huxing']")
        if web_time_out['timeout'] == 0:
            try:
                title = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='huxing']").text
                price = web.find_element_by_xpath("//div[@id='xiangQing']/h4/strong").text
                bianhao = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='xiangxi']/p[1]").text
                loucheng = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='xiangxi']/p[2]").text
                mianji = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='xiangxi']/p[3]").text
                chaoxiang = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='xiangxi']/p[4]").text
                jushi = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='xiangxi']/p[5]").text
                xiaoqu = web.find_element_by_xpath("//div[@id='xiangQing']/div[@class='xiangxi']/p[6]").text
                # Key order is kept as-is because str(text) is what gets stored.
                text = {
                    'title': title,
                    'price': price,
                    'bianhao': bianhao,
                    'loucheng': loucheng,
                    'chaoxiang': chaoxiang,
                    'jushi': jushi,
                    'xiaoqu': xiaoqu,
                    'mianji': mianji,
                }
                img_list = [
                    imgli.find_element_by_tag_name('img').get_attribute("bimg")
                    for imgli in web.find_elements_by_xpath("//ul[@id='smallImg']/li")
                ]
                status = ' 未租'
                # Parameterized: the scraped text can contain quotes, which
                # would break (or inject into) a concatenated statement.
                cursor.execute(
                    "UPDATE `midou_list` SET `status`=%s,`text`=%s,`rsync`=0,`img`=%s"
                    " WHERE (`url`=%s)",
                    (status, str(text), str(img_list), url),
                )
                db.commit()
                print(url)
            except StaleElementReferenceException:
                print('访问元素错误')
        else:
            print('页面加载超时')
    finally:
        # One browser is started per URL, so it must be shut down on every path.
        web.quit()
    return ""


# Step 2: fill in the details of every stored URL that has no images yet.
db = getMyslq()
try:
    cursor = db.cursor()
    cursor.execute('SELECT * FROM `midou_list` where ISNULL(img)')
    for row in cursor.fetchall():
        # presumably column 1 of `midou_list` is `url` -- verify against the schema
        getDetail(row[1], cursor, db)
finally:
    db.close()
python静态的html爬取笔记(一)http://suiyidian.cn/post-166.html
本文由 kevin 创作,采用 知识共享署名4.0 国际许可协议进行许可。
本站文章除注明转载/出处外,均为本站原创或翻译,转载前请务必署名。