爬虫阶段总结

文章列表

爬虫阶段总结

获取数据

requests

步骤

引入模块
```
import requests
```

请求网络数据

response = requests.get('目标网页')

设置解码方式
```
response.encoding = 'utf-8'
```

获取请求结果

# 获取请求结果的文本数据
response.text
# 获取二进制格式的请求结果（图片,视频,音频）
response.content
# 下载图片
response = requests.get('图片地址')
result = response.content
# 将数据保存到文件里
with open('files/a.jpg', 'wb') as f:f.write(result)
# 获取请求结果json转换的结果
response.json()

浏览器伪装

header = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'
}
response = requests.get('目标网页',headers = header)

登录反爬

检查->network->all->找到网页请求->复制cookie的值

header = {'user-agent': 'user-agent的值','cookie': 'cookie的值'
}
response = requests.get('目标网页',headers = header)

代理ip

# 给proxies赋值
proxies = {'https': '180.105.81.232:4515'
}
response = requests.get('https://www.zhihu.com', headers=header, proxies=proxies)

selenium

基础语法

from selenium.webdriver import Chrome# 创建浏览器对象
b = Chrome()
# 打开网页
b.get('https://cd.zu.ke.com/zufang/pg2/#contentList')
# 获取网页源代码
print(b.page_source)

利用循环获取多个网页内容

from selenium.webdriver import Chrome# 创建浏览器对象
b = Chrome()
# 找到多页规律利用循环获取多页内容
for x in range(0, 100):
    b.get(f'https://cd.zu.ke.com/zufang/pg{x}/#contentList')
    print(b.page_source)

点击翻页按钮，再刷新后获取网页源代码

# 点击翻页按钮，再刷新后获取网页源代码
b = Chrome()
b.get(f'https://cd.zu.ke.com/zufang/#contentList')
for _ in range(5):
    b.page_source
    # 点击下一页按钮
    # 通过class属性名获取标签
    c = b.find_element(By.CLASS_NAME,'next')
    c.click()

输入框输入内容

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
# 创建浏览器对象
b = Chrome()
# 打开网页
b.get('https://www.cnki.net/')
# 获取输入框对象
input_ = b.find_element(By.ID, 'txt_SearchText')
# 输入框输入文字
input_.send_keys('数据分析\n')

点击标签-标签.click()

# 获取点击对象
title = b.find_elements(By.CLASS_NAME, 'fz14')
for x in range(20):
    # 点击
    title[x].click()

切换标签页

# 切换到最新的标签页
b.switch_to.window(b.window_handles[-1])
# 关闭标签页
b.close()
# 返回第一个标签页
b.switch_to.window(b.window_handles[0])

鼠标滚动

from selenium.webdriver import Chrome
import time
b = Chrome()
b.get('https://www.cnki.net/')
# 单次滚动
b.execute_script('window.scrollBy(0,200)')
# 多次滚动
for x in range(8):
    b.execute_script('window.scrollBy(0,200)')
    time.sleep(2)

selenium获取标签

-关键字	-含义	说明
By.ID	通过ID属性值获取标签
By.CLASS_NAME	通过class属性名获取标签
By.CSS_SELECTOR	通过css选择器获取标签
By.LINK_TEXT	通过a标签的标签内容获取标签	只有a标签有用，必须全部匹配
By.NAME	通过name属性值获取标签
By.TAG_NAME
By.PARTIAL_LINK_TEXT	获取包含str的a标签	部分匹配
By.XPATH

登录反爬

获取cookie

from selenium.webdriver import Chrome# 创建浏览器打开需要自动登录的网站
b = Chrome()
b.get('https://www.taobao.com')
# 留足够长的时间，人工完成登录（必须保证b指定的网址能看到登录成功以后的信息）
input('是否已登录')
# 获取登录成功之后的cookie信息保存到本地文件
result = b.get_cookies()
with open('files/taobao.txt', 'w') as f:f.write(str(result))

使用cookie

from selenium.webdriver import Chrome# 创建浏览器打开需要自动登录的网站
b = Chrome()
b.get('https://www.taobao.com')
# 获取本地cookie文件里的cookie
with open('files/taobao.txt','r') as f:result = eval(f.read())
# 添加cookie
for item in result:b.add_cookie(item)
# 重新打开网页
b.get('https://www.taobao.com')
input('end')

代理ip

解析数据

正则

bs4

步骤

引入模块
```
from bs4 import BeautifulSoup
```

创建soup对象

f = open('files/data.html', encoding='utf-8')
soup = BeautifulSoup(f.read(), 'lxml')
f.close()

获取标签

# 获取整个网页中选择器选中的所有标签，返回值是一个列表，列表中的元素是标签对象（找不到返回空列表）
soup对象.select(css选择器)
# 获取整个网页中选择器选中的第一个标签，返回值是标签对象(找不到返回None)
soup对象.select_one(css选择器)
# 获取指定标签中css选择器选中的所有标签
标签对象.select(css选择器)  
# 获取指定标签中css选择器选中的第一个标签
标签对象.select_one(css选择器)
result = soup.select('p')

获取标签内容和属性

# 获取标签内容
标签对象.text  
# 获取标签指定属性的值
标签对象.attrs[属性名]
print(p1.text)      # '我是段落5'
print(a1.text)      # '我是超链接3'
print(a1.attrs['href'])

xpath

通过路径来获取标签

# 1.绝对路径： /开头，从根节点开始层层往下写路径
# 2.相对路径： ./..开头，从当前路径开始写 ..表示上层路径
# 3.全路径 ： 以  // 开头的路径
html = open('files/data.html').read()

语法

引用

from lxml import etree
html = open('files/data.html').read()
# 解析对象是html
# 根据数据创建树结构
root1 = etree.HTML(html)
# 解析对象是xml
root1= etree.XML()

基础语法结构

<!DOCTYPE html>
<html lang="en">
<head><meta charset="UTF-8"><title>Title</title>
</head>
<body>
<div><p><span>我是span1</span><a href="">我是超连接1</a></p><p><span>我是span2</span><a href="">我是超连接2</a></p><a href="https://www.baidu.com">我是超链接2</a><p>我是段落2</p><a href="https://www.taobao.com">我是超链接4</a><div><div><a href="">我超链接3</a></div><div><p>我是段落3</p></div></div>
</div>
</body>
</html>

节点对象.xpath(路径)  -  根据路径获取所有符合条件的标签，返回值是列表

相对路径

# 相对路径写法  -  用谁调用xpath，路径中的 . 就代表谁
root1.xpath('./body/div/p')

绝对路径

# 绝对路径写法
root1.xpath('/html/body')
# 获取标签内容
root1.xpath('/html/body/div/p/text()')
# 获取标签属性
root1.xpath('/html/body/div/p/@href')

全路径

root1.xpath('//div/p') # 找符合这个结构所有的p标签
# 获取标签内容
root1.xpath('//div/p/text()')
# 获取标签属性
root1.xpath('//div/p/@href')

加谓语（条件）

位置相关谓语

[n]

# 符合条件的有多个，取第一个
root1.xpath('//div/p[1]/text()')
# 取第二个
root1.xpath('//div/p[2]/text()')
# 取最后一个
root1.xpath('//div/p[last()]/text()')
# 获取位置大于n的数据
root1.xpath('//div/p[position()>n]/text()')
# 获取倒数第二个
root1.xpath('//div/p[last()-1]/text()')

属性相关谓语

[@属性 = 属性值]

# 通过属性值对标签进行筛选
# [@属性 = 属性值]
# 找到符合id为name且符合div/p结构的标签
root1.xpath('//div/p[@id = "name"]/text()')
root1.xpath('//span[@class = "a2"]/*[@class = "c2"]/text()')
root1.xpath('//span[2]/*[@class = "c2"]/text()')

通配符

在xpath中通过通配符表示任意标签和任意属性

# 表示获取div下所有id值为name的标签内容
root1.xpath('//div/*[@id = "name"]/text()')
# 表示获取div下所有id值为name的标签所有属性
root1.xpath('//div/p[@id = "name"]/@*')

爬虫阶段总结

获取数据

requests

selenium

解析数据

正则

bs4

xpath

相关问题

公告

标签