python动态数据抓取介绍

用到的工具

Selenium 

介绍

Selenium 是一个用于 Web 应用程序测试的工具。Selenium 测试直接运行在浏览器中,就像真正的用户在操作一样。支持的浏览器包括 IE(7, 8, 9, 10, 11)、Mozilla Firefox、Safari、Google Chrome、Opera 等。这个工具的主要功能包括:测试与浏览器的兼容性——测试你的应用程序是否能够很好地工作在不同浏览器和操作系统之上;测试系统功能——创建回归测试,检验软件功能和用户需求。它还支持自动录制动作,并自动生成 .NET、Java、Perl 等不同语言的测试脚本。

为什么要用这个工具

两个原因:
1. 页面数据由 JS 异步加载,用传统的方式(直接请求页面 HTML)无法获取到数据;
2. 有些网站对 JS 请求的接口做了 IP 认证,如果请求不是来自当前服务器,就无法获取数据。

原理
其实就是模拟了浏览器的浏览行为

延伸思考

怎么能避免目标网站的屏蔽

用法介绍:

Selenium 文档:http://selenium-python-zh.readthedocs.io/en/latest/index.html

代码示例:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
import pymysql
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import time

def waitTime(driver,times):
    """Wait for the initial listing to be present before the page is scraped.

    Polls the current page until ``//ul[@id='list']/li`` matches at least
    one element, or until ``times`` seconds have elapsed.

    Args:
        driver: Selenium WebDriver whose current page is polled.
        times: Maximum number of seconds to wait.

    Returns:
        dict: ``{'timeout': 0, 'error': None}`` when the elements appeared,
        or ``{'timeout': 1, 'error': <message>}`` when the wait timed out.

    Raises:
        Any exception other than ``TimeoutException`` raised while waiting
        propagates to the caller.
    """
    is_timeout = 0
    error = None
    try:
        WebDriverWait(driver, times).until(
            EC.presence_of_all_elements_located((By.XPATH, "//ul[@id='list']/li")),
            message="Time out!!",
        )
    except TimeoutException as er:
        error = er.msg
        is_timeout = 1
    # The return is deliberately NOT inside a ``finally`` block: returning
    # from ``finally`` discards any in-flight exception, so a crashed browser
    # session would be reported as a successful (non-timeout) wait.
    return {'timeout':is_timeout,'error':error}



def waitTimePage(driver,times):
    """Wait after a pagination click until the layer element is present.

    Polls the current page until a ``div`` whose ``class`` attribute equals
    ``layui-layer-content`` is present (presumably the layui popup layer shown
    while the next page loads -- confirm against the target site), or until
    ``times`` seconds have elapsed.

    Args:
        driver: Selenium WebDriver whose current page is polled.
        times: Maximum number of seconds to wait.

    Returns:
        dict: ``{'timeout': 0, 'error': None}`` when the element appeared,
        or ``{'timeout': 1, 'error': <message>}`` when the wait timed out.
    """
    is_timeout = 0
    error = None
    try:
        # The XPath previously read ``@class=r'layui-layer-content'``; the
        # stray ``r`` (a Python raw-string prefix pasted inside the literal)
        # made the expression invalid, so the lookup raised an
        # invalid-selector error instead of waiting.
        WebDriverWait(driver, times).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@class='layui-layer-content']")),
            message="Time out!!",
        )
    except TimeoutException as er:
        error = er.msg
        is_timeout = 1

    return {'timeout':is_timeout,'error':error}


def dWaits(driver,times,path):
    """Wait until the XPath ``path`` matches at least one element.

    Args:
        driver: Selenium WebDriver whose current page is polled.
        times: Maximum number of seconds to wait.
        path: XPath expression that must match before the wait succeeds.

    Returns:
        dict: ``{'timeout': 0, 'error': None}`` on success, or
        ``{'timeout': 1, 'error': <message>}`` when the wait timed out.
    """
    locator = (By.XPATH, path)
    try:
        WebDriverWait(driver, times).until(
            EC.presence_of_all_elements_located(locator),
            message="Time out!!",
        )
    except TimeoutException as exc:
        # Only a timeout is translated into a result; anything else propagates.
        return {'timeout': 1, 'error': exc.msg}
    return {'timeout': 0, 'error': None}


def getMyslq(host='192.168.0.11', port=3306, user='root', password="root",
             database="gather", charset='utf8'):
    """Open and return a new pymysql connection.

    Every setting defaults to the value that used to be hard-coded, so
    existing ``getMyslq()`` calls behave exactly as before; callers can now
    point the scraper at another server or schema without editing this
    function.

    Args:
        host: MySQL server host name or address.
        port: MySQL server TCP port.
        user: Login user name.
        password: Login password.
        database: Schema to select after connecting.
        charset: Connection character set.

    Returns:
        The open pymysql connection; the caller is responsible for closing it.
    """
    db = pymysql.connect(host=host, port=port, user=user, password=password,
                         database=database, charset=charset)
    return db

def insertData(data):
    """Insert every URL from ``data`` that is not yet stored in ``midou_list``.

    Args:
        data: Iterable of URL strings. Falsy entries (``None`` or an empty
            string) are skipped.

    Returns:
        The cursor used for the queries. The caller is expected to close it.
    """
    db = getMyslq()
    cursor = db.cursor()
    for item in data:
        if not item:
            # A missing href would otherwise be written as a bogus row.
            continue

        # The URLs come from a scraped page, so they are passed as query
        # parameters instead of being concatenated into the SQL text; a quote
        # in a URL can then neither break nor alter the statement.
        if cursor.execute("select * from `midou_list` where url = %s", (item,)) == 0:
            print("---")
            cursor.execute("INSERT INTO `midou_list` (`url`) VALUES (%s)", (item,))
            db.commit()
    # NOTE(review): the connection opened above is never closed here because
    # the cursor is handed back to the caller; consider returning nothing and
    # closing both cursor and connection inside this function.
    return cursor

def getUrl(driver):
    """Collect the listing links on the current page and store the new ones.

    Reads the ``href`` of the first ``<a>`` inside every ``<li>`` of the
    element with id ``list`` and passes the collected URLs to ``insertData``.

    Args:
        driver: Selenium WebDriver positioned on a listing page.

    Returns:
        list: The URLs that were collected (empty when the list element went
        stale before it could be read).
    """
    url_list = []
    try:
        # ``find_element(By...)`` is used instead of the ``find_element_by_*``
        # helpers: it is available in Selenium 3 and is the only form that
        # still exists in Selenium 4.
        li_lists = driver.find_element(By.ID, 'list').find_elements(By.TAG_NAME, 'li')
    except StaleElementReferenceException:
        return url_list

    for item in li_lists:
        try:
            url = item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        except StaleElementReferenceException:
            # The page re-rendered this entry while we were reading it; skip it.
            continue
        if url:
            # Anchors without an href yield None, which cannot be stored.
            url_list.append(url)

    if url_list:
        # Avoid opening a database connection when nothing was collected.
        insertData(url_list).close()
    return url_list

def creatWeb(path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'):
    """Start a headless Chrome session and return its WebDriver.

    Args:
        path: Location of the ``chromedriver`` executable. It has to be
            downloaded separately; it is not part of a normal Chrome install.
            The default is written as a raw string because the plain literal
            contained invalid escape sequences (``\\P``, ``\\G``, ...), which
            newer Python versions warn about; the resulting value is
            unchanged.

    Returns:
        A ``webdriver.Chrome`` instance running with the ``-headless`` switch.
    """
    options = Options()
    options.add_argument('-headless')
    # NOTE(review): ``executable_path`` / ``chrome_options`` are the Selenium 3
    # spellings used throughout this script; newer Selenium releases expect
    # ``service=`` / ``options=`` instead.
    return webdriver.Chrome(executable_path=path,chrome_options=options)


# dWaits(driver,10,"//div[@class=r'dingYou']/div[@id='xiangQing']/h4")

def start():
    """Crawl every page of the listing and store the listing URLs.

    Flow: load the first page -> wait for the list -> store its links ->
    read the total page count from the "last page" link -> click through
    pages 2..N, storing the links of each.
    """
    driver = creatWeb()
    # try/finally guarantees the headless browser is shut down even when a
    # lookup or click raises; otherwise the Chrome process would be leaked.
    try:
        driver.get('http://www.midooroom.com/list.html')

        wait_result = waitTime(driver,10)
        if int(wait_result['timeout']) != 0:
            print(wait_result['error'])
            return

        getUrl(driver)
        totals = driver.find_element(By.XPATH, "//div/a[@title='尾页']").get_attribute('data-page')
        # Page 1 has already been handled above, so start clicking at page 2.
        for i in range(2, int(totals) + 1):
            driver.find_element(By.XPATH, "//div/a[@data-page='"+str(i)+"']").click()
            print("click----"+str(i))
            # Best effort: the result is ignored and the page is scraped even
            # if this wait times out.
            waitTimePage(driver,10)
            getUrl(driver)
    finally:
        driver.quit()

#采集列表
# start()


def getDetail(url,cursor,db):
    """Scrape one listing's detail page and store the result in ``midou_list``.

    Opens ``url`` in a fresh headless browser, reads the title, price, the six
    detail paragraphs and the image list, then updates the row whose ``url``
    column equals ``url``.

    Args:
        url: Address of the detail page; also the key of the row to update.
        cursor: Open database cursor used to run the UPDATE (expected to use
            the ``%s`` parameter style, as pymysql cursors do).
        db: Connection the cursor belongs to; committed after the update.

    Returns:
        str: Always the empty string.
    """
    web = creatWeb()
    # try/finally guarantees the browser is closed even when loading the page
    # or reading an element raises.
    try:
        web.get(url)
        web_time_out = dWaits(web, 12, "//div[@id='xiangQing']/div[@class='huxing']")
        if web_time_out['timeout'] != 0:
            print('页面加载超时')
            return ""

        def text_of(xpath):
            # Text content of the first element matching ``xpath``.
            return web.find_element(By.XPATH, xpath).text

        try:
            title = text_of("//div[@id='xiangQing']/div[@class='huxing']")
            price = text_of("//div[@id='xiangQing']/h4/strong")
            bianhao = text_of("//div[@id='xiangQing']/div[@class='xiangxi']/p[1]")
            loucheng = text_of("//div[@id='xiangQing']/div[@class='xiangxi']/p[2]")
            mianji = text_of("//div[@id='xiangQing']/div[@class='xiangxi']/p[3]")
            chaoxiang = text_of("//div[@id='xiangQing']/div[@class='xiangxi']/p[4]")
            jushi = text_of("//div[@id='xiangQing']/div[@class='xiangxi']/p[5]")
            xiaoqu = text_of("//div[@id='xiangQing']/div[@class='xiangxi']/p[6]")
            # Key order is kept as-is because the dict is stored as str(text).
            text= {'title':title,'price':price,'bianhao':bianhao,'loucheng':loucheng,'chaoxiang':chaoxiang,'jushi':jushi,'xiaoqu':xiaoqu,'mianji':mianji}
            img_list = [
                imgli.find_element(By.TAG_NAME, 'img').get_attribute("bimg")
                for imgli in web.find_elements(By.XPATH, "//ul[@id='smallImg']/li")
            ]
            status = ' 未租'
            # Scraped text is passed as query parameters: concatenating it
            # into the statement broke (or altered) the SQL as soon as a page
            # contained a double quote.
            cursor.execute(
                "UPDATE `midou_list` SET `status`=%s,`text`=%s,`rsync`=0,`img`=%s WHERE (`url`=%s)",
                (status, str(text), str(img_list), url),
            )
            db.commit()
            print(url)
        except StaleElementReferenceException:
            print('访问元素错误')
    finally:
        web.quit()
    return ""

# Scrape the detail page of every stored listing that has no images yet
# (rows whose `img` column is NULL).
db = getMyslq()
try:
    cursor = db.cursor()
    cursor.execute('SELECT * FROM `midou_list` where ISNULL(img)')
    # row[1] is handed to getDetail as the page URL -- presumably the second
    # column of `midou_list` is `url`; verify against the table schema.
    for row in cursor.fetchall():
        getDetail(row[1],cursor,db)
finally:
    # Close the connection even when a detail page fails to scrape.
    db.close()

 

python静态的html爬取笔记(一)http://suiyidian.cn/post-166.html

标签: selenium, 爬虫, python

添加新评论