HTTP 请求与爬虫
大约 11 分钟 · 约 3291 字
HTTP 请求与爬虫
简介
Python 的 requests 库是最流行的 HTTP 客户端,简洁的 API 让发送 HTTP 请求变得轻松。结合 BeautifulSoup,可以快速构建网页爬虫,获取互联网数据。
requests 库的设计哲学是"HTTP for Humans"——它将 HTTP 协议的复杂性隐藏在简洁的 API 之下。一个 GET 请求只需要一行代码 requests.get(url),但它背后处理了连接池管理、内容编码解码、Cookie 持久化、SSL 验证等大量底层细节。从工程角度看,理解这些底层行为对于编写健壮的网络请求代码至关重要。
特点
requests 基础
HTTP 请求
# pip install requests
import requests
# GET 请求
response = requests.get('https://httpbin.org/get',
params={'keyword': 'python', 'page': 1},
headers={'User-Agent': 'MyApp/1.0'},
timeout=10
)
print(response.status_code) # 200
print(response.json()) # 解析 JSON 响应
print(response.headers) # 响应头
# POST 请求(表单)
response = requests.post('https://httpbin.org/post',
data={'username': 'admin', 'password': '123456'}
)
# POST 请求(JSON)
response = requests.post('https://httpbin.org/post',
json={'name': '张三', 'age': 30},
headers={'Content-Type': 'application/json'}
)
# PUT/DELETE
response = requests.put('https://api.example.com/users/1',
json={'name': '李四'})
response = requests.delete('https://api.example.com/users/1')
# 下载文件
response = requests.get('https://example.com/file.pdf', stream=True)
with open('file.pdf', 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
# 上传文件
files = {'file': open('report.pdf', 'rb')}
response = requests.post('https://api.example.com/upload', files=files)深入 Response 对象
import requests

response = requests.get('https://httpbin.org/json')

# Status code
print(response.status_code)            # 200
print(response.ok)                     # True for any status < 400
print(response.is_redirect)            # False
print(response.is_permanent_redirect)  # False

# Response body
print(response.text)               # body decoded to str using response.encoding
print(response.content)            # raw bytes
print(response.json())             # parsed JSON (raises on invalid JSON)
print(response.url)                # final URL (after following redirects)
# NOTE: the original comments here were swapped.
print(response.encoding)           # encoding declared by the HTTP headers
print(response.apparent_encoding)  # encoding detected from the body content

# Response headers
print(response.headers['Content-Type'])
print(response.headers.get('X-Custom-Header', 'default'))

# Cookies
print(response.cookies.get('session_id'))

# Information about the request that produced this response
print(response.request.method)   # GET
print(response.request.headers)  # request headers
print(response.request.url)      # request URL
print(response.elapsed.total_seconds())  # elapsed time in seconds

# Manual status check
if response.status_code == 200:
    data = response.json()

# Preferred: raise_for_status() turns 4xx/5xx into HTTPError
try:
    response.raise_for_status()
    data = response.json()
except requests.HTTPError as e:
    print(f"请求失败: {e.response.status_code}")
会话管理
# Session 保持 Cookie
session = requests.Session()
# 登录
login_resp = session.post('https://api.example.com/login',
json={'username': 'admin', 'password': '123456'})
# 后续请求自动携带 Cookie
profile = session.get('https://api.example.com/profile')
orders = session.get('https://api.example.com/orders')
# 设置公共 Headers
session.headers.update({
'Authorization': 'Bearer your-token',
'User-Agent': 'MyApp/1.0'
})
# 重试策略
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
retry_strategy = Retry(
total=3,
backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)生产级 Session 封装
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Any, Optional
import logging
import time
logger = logging.getLogger(__name__)
class HttpClient:
    """Production-grade HTTP client.

    Features:
    - automatic retries with backoff
    - connection-pool management
    - request/response logging
    - timeout control
    - optional client-side rate limiting
    """

    def __init__(
        self,
        base_url: str = "",
        timeout: int = 30,
        max_retries: int = 3,
        retry_backoff: float = 1.0,
        rate_limit: float = 0,  # max requests per second; 0 disables limiting
    ):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.rate_limit = rate_limit
        self._last_request_time = 0.0
        self.session = requests.Session()

        # Retry transient failures (connection errors + the listed statuses).
        retry_strategy = Retry(
            total=max_retries,
            backoff_factor=retry_backoff,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
        )
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=10,
            pool_maxsize=100,
        )
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

        # Default headers for a JSON API.
        self.session.headers.update({
            "Content-Type": "application/json",
            "Accept": "application/json",
        })

    def _rate_limit_wait(self) -> None:
        """Sleep just long enough to honour the configured rate limit."""
        if self.rate_limit > 0:
            now = time.time()
            elapsed = now - self._last_request_time
            min_interval = 1.0 / self.rate_limit
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            self._last_request_time = time.time()

    def request(
        self,
        method: str,
        path: str,
        params: Optional[dict] = None,
        json_data: Optional[dict] = None,
        data: Optional[dict] = None,
        headers: Optional[dict] = None,
        timeout: Optional[int] = None,
    ) -> dict:
        """Send an HTTP request and return the parsed JSON response body.

        Raises:
            requests.HTTPError: for 4xx/5xx responses (after retries).
        """
        url = f"{self.base_url}/{path.lstrip('/')}" if self.base_url else path
        timeout = timeout or self.timeout
        self._rate_limit_wait()

        logger.info(f"HTTP {method} {url}")
        response = self.session.request(
            method=method,
            url=url,
            params=params,
            json=json_data,
            data=data,
            headers=headers,
            timeout=timeout,
        )
        logger.info(
            f"HTTP {response.status_code} {url} "
            f"({response.elapsed.total_seconds():.3f}s)"
        )
        response.raise_for_status()
        return response.json()

    def get(self, path: str, **kwargs) -> dict:
        return self.request("GET", path, **kwargs)

    def post(self, path: str, **kwargs) -> dict:
        return self.request("POST", path, **kwargs)

    def put(self, path: str, **kwargs) -> dict:
        return self.request("PUT", path, **kwargs)

    def delete(self, path: str, **kwargs) -> dict:
        return self.request("DELETE", path, **kwargs)

    def close(self) -> None:
        """Close the session and release pooled connections."""
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
# Usage example: context manager guarantees the session is closed
with HttpClient(base_url="https://httpbin.org", timeout=10) as client:
    result = client.get("/get", params={"key": "value"})
    print(result)
异常处理
from requests.exceptions import RequestException, Timeout, ConnectionError

def safe_request(url: str, retries: int = 3) -> dict:
    """GET `url` and return its JSON body, retrying transient failures.

    Timeouts and connection errors are retried up to `retries` times;
    HTTP 4xx/5xx and any other request exception abort immediately.
    Returns {} when no attempt succeeds.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # 4xx/5xx -> HTTPError
            return response.json()
        except Timeout:
            print(f"请求超时,第 {attempt+1} 次重试")
        except ConnectionError:
            print(f"连接失败,第 {attempt+1} 次重试")
        except requests.HTTPError as e:
            # Client/server errors are not retried here.
            print(f"HTTP 错误: {e.response.status_code}")
            break
        except RequestException as e:
            print(f"请求异常: {e}")
            break
    return {}
完善的异常处理策略
import requests
from requests.exceptions import (
RequestException, Timeout, ConnectionError,
HTTPError, SSLError, TooManyRedirects, ChunkedEncodingError
)
import logging
import time
logger = logging.getLogger(__name__)
class APIError(Exception):
    """Application-level API error.

    Attributes:
        status_code: HTTP status code associated with the failure (0 if N/A).
        response: parsed response body, or {} when unavailable.
    """

    def __init__(self, message: str, status_code: int = 0, response: dict = None):
        super().__init__(message)
        self.status_code = status_code
        # Avoid the mutable-default pitfall: normalise None to a fresh dict.
        self.response = response or {}
def robust_request(
url: str,
method: str = "GET",
max_retries: int = 3,
backoff_factor: float = 1.0,
timeout: tuple = (5, 30), # (连接超时, 读取超时)
) -> requests.Response:
"""健壮的 HTTP 请求函数
Args:
url: 请求 URL
method: HTTP 方法
max_retries: 最大重试次数
backoff_factor: 退避因子
timeout: (连接超时, 读取超时) 元组
Returns:
Response 对象
Raises:
APIError: 请求失败
"""
last_exception = None
for attempt in range(1, max_retries + 1):
try:
response = requests.request(
method=method,
url=url,
timeout=timeout,
)
response.raise_for_status()
return response
except Timeout as e:
last_exception = e
logger.warning(f"超时 (尝试 {attempt}/{max_retries}): {url}")
if attempt < max_retries:
time.sleep(backoff_factor * attempt)
except ConnectionError as e:
last_exception = e
logger.warning(f"连接失败 (尝试 {attempt}/{max_retries}): {url}")
if attempt < max_retries:
time.sleep(backoff_factor * attempt)
except HTTPError as e:
status = e.response.status_code
if status == 429: # Too Many Requests
retry_after = int(e.response.headers.get("Retry-After", 5))
logger.warning(f"速率限制,等待 {retry_after}s")
if attempt < max_retries:
time.sleep(retry_after)
continue
elif status >= 500: # 服务器错误,可重试
last_exception = e
logger.warning(f"服务器错误 {status} (尝试 {attempt}/{max_retries})")
if attempt < max_retries:
time.sleep(backoff_factor * attempt)
continue
else:
# 4xx 客户端错误,不应重试
raise APIError(
f"客户端错误: {status} - {e.response.text[:200]}",
status_code=status,
)
except SSLError as e:
raise APIError(f"SSL 错误: {e}")
except (TooManyRedirects, ChunkedEncodingError) as e:
raise APIError(f"请求异常: {e}")
except RequestException as e:
last_exception = e
logger.warning(f"请求失败 (尝试 {attempt}/{max_retries}): {e}")
raise APIError(f"请求失败,已重试 {max_retries} 次: {last_exception}")BeautifulSoup 爬虫
HTML 解析
# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup
import requests
# 获取网页
response = requests.get('https://news.example.com',
headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'lxml')
# 提取标题
title = soup.find('h1').text
print(title)
# 提取所有链接
links = soup.find_all('a')
for link in links:
href = link.get('href')
text = link.text.strip()
if href:
print(f"{text}: {href}")
# CSS 选择器
articles = soup.select('.article-list .item')
for article in articles:
title = article.select_one('.title').text
author = article.select_one('.author').text
date = article.select_one('.date').text
print(f"[{date}] {title} - {author}")
# 提取表格数据
table = soup.find('table')
rows = table.find_all('tr')
for row in rows[1:]: # 跳过表头
cols = [td.text.strip() for td in row.find_all('td')]
print(cols)BeautifulSoup 深入使用
from bs4 import BeautifulSoup, Tag, NavigableString

html = """
<div class="container">
<h1 id="main-title">文章标题</h1>
<div class="meta">
<span class="author">作者A</span>
<span class="date">2024-01-15</span>
<span class="tags">
<a href="/tag/python">Python</a>
<a href="/tag/web">Web</a>
</span>
</div>
<div class="content">
<p>第一段内容</p>
<p>第二段内容 <strong>加粗文字</strong></p>
<ul>
<li>列表项 1</li>
<li>列表项 2</li>
</ul>
<a href="https://example.com" data-id="123">外部链接</a>
</div>
</div>
"""
soup = BeautifulSoup(html, "lxml")

# 1. find vs find_all
title = soup.find("h1")                   # first matching Tag
title_by_id = soup.find(id="main-title")  # lookup by id
all_links = soup.find_all("a")            # list of all matches

# 2. CSS selectors (recommended)
container = soup.select_one(".container")  # first match
tags = soup.select(".tags a")              # every match
second_p = soup.select(".content p:nth-of-type(2)")

# 3. Attribute access
link = soup.select_one(".content a")
print(link["href"])         # https://example.com
print(link.get("data-id"))  # 123
print(link.get("class"))    # None (this <a> has no class attribute)

# 4. Text extraction
h1 = soup.find("h1")
print(h1.string)  # "文章标题" (only when there is a single NavigableString child)
print(h1.text)    # "文章标题" (recursively gathers all descendant text)
print(h1.get_text(strip=True))  # strip surrounding whitespace

# 5. Navigating parents and siblings
first_p = soup.select_one(".content p")
print(first_p.parent["class"])          # ['content']
print(first_p.find_next_sibling())      # the second <p>
print(first_p.find_previous_sibling())  # None (nothing precedes the first <p>)

# 6. Regex matching on attributes
import re
external_links = soup.find_all("a", href=re.compile(r"^https?://"))
print([link["href"] for link in external_links])

# 7. Modifying the DOM
for p in soup.select(".content p"):
    p["class"] = "paragraph"
new_tag = soup.new_tag("div", attrs={"class": "footer"})
new_tag.string = "页脚"
# BUG FIX: soup.find(".container") searches for a tag NAMED ".container" and
# returns None (then .append raises AttributeError); a CSS class lookup needs
# select_one (or find("div", class_="container")).
soup.select_one(".container").append(new_tag)
结构化爬取
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
import json
import time
@dataclass
class Article:
    """A single scraped article record."""

    title: str    # article headline
    author: str   # author display name
    date: str     # publication date as displayed on the page
    content: str  # summary / body text
def scrape_articles(base_url: str, max_pages: int = 5) -> list[Article]:
    """Scrape paginated article listings starting at `base_url`.

    Stops early when a page yields no items; a failure on one page is
    logged and the next page is attempted.
    """
    articles = []
    session = requests.Session()
    session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; Bot/1.0)'
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        try:
            response = session.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'lxml')
            items = soup.select('.article-item')
            if not items:
                break  # ran out of pages
            for item in items:
                article = Article(
                    title=item.select_one('.title').text.strip(),
                    author=item.select_one('.author').text.strip(),
                    date=item.select_one('.date').text.strip(),
                    content=item.select_one('.summary').text.strip(),
                )
                articles.append(article)
            time.sleep(1)  # politeness delay between pages
        except Exception as e:
            print(f"爬取第 {page} 页失败: {e}")
            continue
    return articles
# 保存结果
# articles = scrape_articles("https://example.com/articles")
# with open("articles.json", "w", encoding="utf-8") as f:
# json.dump([a.__dict__ for a in articles], f, ensure_ascii=False, indent=2)
反爬虫应对策略
import requests
import random
import time
from fake_useragent import UserAgent
# 1. Randomised User-Agent
ua = UserAgent()

def get_with_random_ua(url: str) -> requests.Response:
    """Send a GET request with a randomly chosen User-Agent header."""
    headers = {
        "User-Agent": ua.random,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    return requests.get(url, headers=headers, timeout=10)
# 2. Proxy IP rotation
class ProxyRotator:
    """Round-robin rotator over a pool of proxy URLs, skipping failed ones."""

    def __init__(self, proxies: list[str]):
        self.proxies = proxies
        self._index = 0       # next position in the round-robin cycle
        self._failed = set()  # proxy URLs marked as dead

    def get_proxy(self) -> dict[str, str]:
        """Return the next usable proxy as a requests-style proxies dict.

        Raises:
            RuntimeError: when every proxy has been marked failed.
        """
        for _ in range(len(self.proxies)):
            proxy = self.proxies[self._index % len(self.proxies)]
            self._index += 1
            if proxy not in self._failed:
                return {"http": proxy, "https": proxy}
        raise RuntimeError("所有代理均不可用")

    def mark_failed(self, proxy: dict[str, str]):
        """Mark a proxy (in the dict form returned by get_proxy) as unusable."""
        self._failed.add(proxy.get("http", proxy.get("https", "")))
# 3. Random delay (avoids a detectably regular request cadence)
def random_delay(min_seconds: float = 1.0, max_seconds: float = 3.0):
    """Sleep for a random duration in [min_seconds, max_seconds]."""
    time.sleep(random.uniform(min_seconds, max_seconds))
# 4. Request-rate throttling
import threading

class RateLimiter:
    """Thread-safe limiter enforcing a maximum request rate."""

    def __init__(self, max_per_second: float):
        self.min_interval = 1.0 / max_per_second  # seconds between requests
        self._lock = threading.Lock()
        self._last_time = 0.0  # timestamp of the most recent permitted call

    def wait(self):
        """Block until at least min_interval has elapsed since the last call."""
        with self._lock:
            elapsed = time.time() - self._last_time
            if elapsed < self.min_interval:
                time.sleep(self.min_interval - elapsed)
            self._last_time = time.time()
# 5. Cookie / Session management
def login_and_scrape(login_url: str, credentials: dict):
    """Log in via an HTML form (including its CSRF token), then scrape.

    Returns the dashboard page HTML fetched with the authenticated session.
    """
    session = requests.Session()

    # Fetch the login page first to pick up the CSRF token from the form
    login_page = session.get(login_url)
    soup = BeautifulSoup(login_page.text, "lxml")
    csrf_token = soup.find("input", {"name": "csrf_token"})["value"]

    # Submit the login form
    session.post(login_url, data={
        **credentials,
        "csrf_token": csrf_token,
    })

    # Subsequent requests automatically carry the auth cookies
    response = session.get("https://example.com/dashboard")
    return response.text
优点
缺点
总结
HTTP 请求用 requests 库:GET/POST/PUT/DELETE 方法、Session 管理 Cookie、重试策略处理异常。爬虫用 requests + BeautifulSoup:requests 获取 HTML、BeautifulSoup 解析提取数据、CSS 选择器定位元素。反爬应对:设置 User-Agent、添加延迟、使用代理。JS 渲染页面用 Selenium 或 Playwright。爬虫需遵守 robots.txt 和法律法规。
关键知识点
- requests.Session 复用 TCP 连接,大幅减少连接建立开销
- timeout 参数应始终设置,推荐使用 (connect_timeout, read_timeout) 元组形式
- raise_for_status() 可以将 4xx/5xx 状态码转为异常
- 重试策略应区分可重试错误(5xx、超时)和不可重试错误(4xx)
- BeautifulSoup 的 CSS 选择器比 find/find_all 更灵活强大
项目落地视角
- 统一虚拟环境、依赖锁定、格式化和日志方案。
- 把入口、配置、业务逻辑和工具函数拆开,避免单文件膨胀。
- 对网络请求、文件读写和数据处理结果做异常与样本校验。
- 明确项目入口、配置管理、依赖管理、日志和测试策略。
- HTTP 客户端封装为可测试的类,mock 外部 API 进行单元测试
常见误区
- 把临时脚本直接当生产代码使用。
- 忽略依赖版本、编码、路径和时区差异。
- 只会写 happy path,没有补超时、重试和资源释放。
- 把 notebook 或脚本风格直接带入长期维护项目。
- 不设置 timeout 导致请求无限等待
- 使用 requests.get() 而非 Session,无法复用连接
- 爬虫不遵守 robots.txt,存在法律风险
进阶路线
- 学习 httpx 库,支持 HTTP/2 和异步请求
- 掌握 Scrapy 框架构建大型爬虫项目
- 学习 Playwright/Selenium 处理 JS 渲染页面
- 研究分布式爬虫架构
适用场景
- 当你准备把《HTTP 请求与爬虫》真正落到项目里时,最适合先在一个独立模块或最小样例里验证关键路径。
- 适合脚本自动化、数据处理、Web 开发和测试工具建设。
- 当需求强调快速迭代和丰富生态时,Python 往往能快速起步。
落地建议
- 统一使用虚拟环境与依赖锁定,避免环境漂移。
- 对核心函数补类型注解、异常处理和日志,减少"脚本黑盒"。
- 一旦脚本进入生产链路,及时补测试和监控。
- HTTP 客户端统一封装,集中管理认证、重试、日志和监控
排错清单
- 先确认当前解释器、虚拟环境和依赖版本是否正确。
- 检查编码、路径、时区和第三方库行为差异。
- 排查同步阻塞、数据库连接未释放或网络请求无超时。
- 确认 SSL 证书验证是否正确配置
- 检查 User-Agent 是否被目标网站拒绝
复盘问题
- 如果把《HTTP 请求与爬虫》放进你的当前项目,最先要验证的输入、输出和失败路径分别是什么?
- 《HTTP 请求与爬虫》最容易在什么规模、什么边界条件下暴露问题?你会用什么指标或日志去确认?
- 相比默认实现或替代方案,采用《HTTP 请求与爬虫》最大的收益和代价分别是什么?
