HTTP 请求与爬虫
大约 11 分钟 · 约 3291 字
HTTP 请求与爬虫
简介
Python 的 requests 库是最流行的 HTTP 客户端,简洁的 API 让发送 HTTP 请求变得轻松。结合 BeautifulSoup,可以快速构建网页爬虫,获取互联网数据。
requests 库的设计哲学是"HTTP for Humans"——它将 HTTP 协议的复杂性隐藏在简洁的 API 之下。一个 GET 请求只需要一行代码 requests.get(url),但它背后处理了连接池管理、内容编码解码、Cookie 持久化、SSL 验证等大量底层细节。从工程角度看,理解这些底层行为对于编写健壮的网络请求代码至关重要。
特点
requests 基础
HTTP 请求
# pip install requests
import requests
# GET 请求
response = requests.get('https://httpbin.org/get',
params={'keyword': 'python', 'page': 1},
headers={'User-Agent': 'MyApp/1.0'},
timeout=10
)
print(response.status_code) # 200
print(response.json()) # 解析 JSON 响应
print(response.headers) # 响应头
# POST 请求(表单)
response = requests.post('https://httpbin.org/post',
data={'username': 'admin', 'password': '123456'}
)
# POST 请求(JSON)
response = requests.post('https://httpbin.org/post',
json={'name': '张三', 'age': 30},
headers={'Content-Type': 'application/json'}
)
# PUT/DELETE
response = requests.put('https://api.example.com/users/1',
json={'name': '李四'})
response = requests.delete('https://api.example.com/users/1')
# 下载文件
response = requests.get('https://example.com/file.pdf', stream=True)
with open('file.pdf', 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
# 上传文件
files = {'file': open('report.pdf', 'rb')}
response = requests.post('https://api.example.com/upload', files=files)深入 Response 对象
import requests

response = requests.get('https://httpbin.org/json')

# Status code
print(response.status_code)            # 200
print(response.ok)                     # True for any status < 400
print(response.is_redirect)            # False
print(response.is_permanent_redirect)  # False

# Response body
print(response.text)               # body decoded to str using response.encoding
print(response.content)            # raw bytes
print(response.json())             # parsed JSON (raises on invalid JSON)
print(response.url)                # final URL (after following redirects)
# NOTE: the original comments here were swapped.
print(response.encoding)           # encoding declared by the HTTP headers
print(response.apparent_encoding)  # encoding detected from the body content

# Response headers
print(response.headers['Content-Type'])
print(response.headers.get('X-Custom-Header', 'default'))

# Cookies
print(response.cookies.get('session_id'))

# Information about the request that produced this response
print(response.request.method)   # GET
print(response.request.headers)  # request headers
print(response.request.url)      # request URL
print(response.elapsed.total_seconds())  # elapsed time in seconds

# Manual status check
if response.status_code == 200:
    data = response.json()

# Preferred: raise_for_status() turns 4xx/5xx into HTTPError
try:
    response.raise_for_status()
    data = response.json()
except requests.HTTPError as e:
    print(f"请求失败: {e.response.status_code}")
会话管理
# Session 保持 Cookie
session = requests.Session()
# 登录
login_resp = session.post('https://api.example.com/login',
json={'username': 'admin', 'password': '123456'})
# 后续请求自动携带 Cookie
profile = session.get('https://api.example.com/profile')
orders = session.get('https://api.example.com/orders')
# 设置公共 Headers
session.headers.update({
'Authorization': 'Bearer your-token',
'User-Agent': 'MyApp/1.0'
})
# 重试策略
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
retry_strategy = Retry(
total=3,
backoff_factor=1,
status_forcelist=[429, 500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)生产级 Session 封装
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from typing import Any, Optional
import logging
import time
logger = logging.getLogger(__name__)
class HttpClient:
    """Production-grade HTTP client.

    Features:
    - automatic retries with backoff
    - connection-pool management
    - request/response logging
    - timeout control
    - optional client-side rate limiting
    """

    def __init__(
        self,
        base_url: str = "",
        timeout: int = 30,
        max_retries: int = 3,
        retry_backoff: float = 1.0,
        rate_limit: float = 0,  # max requests per second; 0 disables limiting
    ):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.rate_limit = rate_limit
        self._last_request_time = 0.0
        self.session = requests.Session()

        # Retry transient failures (connection errors + the listed statuses).
        retry_strategy = Retry(
            total=max_retries,
            backoff_factor=retry_backoff,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET", "POST", "PUT", "DELETE", "PATCH"],
        )
        adapter = HTTPAdapter(
            max_retries=retry_strategy,
            pool_connections=10,
            pool_maxsize=100,
        )
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

        # Default headers for a JSON API.
        self.session.headers.update({
            "Content-Type": "application/json",
            "Accept": "application/json",
        })

    def _rate_limit_wait(self) -> None:
        """Sleep just long enough to honour the configured rate limit."""
        if self.rate_limit > 0:
            now = time.time()
            elapsed = now - self._last_request_time
            min_interval = 1.0 / self.rate_limit
            if elapsed < min_interval:
                time.sleep(min_interval - elapsed)
            self._last_request_time = time.time()

    def request(
        self,
        method: str,
        path: str,
        params: Optional[dict] = None,
        json_data: Optional[dict] = None,
        data: Optional[dict] = None,
        headers: Optional[dict] = None,
        timeout: Optional[int] = None,
    ) -> dict:
        """Send an HTTP request and return the parsed JSON response body.

        Raises:
            requests.HTTPError: for 4xx/5xx responses (after retries).
        """
        url = f"{self.base_url}/{path.lstrip('/')}" if self.base_url else path
        timeout = timeout or self.timeout
        self._rate_limit_wait()

        logger.info(f"HTTP {method} {url}")
        response = self.session.request(
            method=method,
            url=url,
            params=params,
            json=json_data,
            data=data,
            headers=headers,
            timeout=timeout,
        )
        logger.info(
            f"HTTP {response.status_code} {url} "
            f"({response.elapsed.total_seconds():.3f}s)"
        )
        response.raise_for_status()
        return response.json()

    def get(self, path: str, **kwargs) -> dict:
        return self.request("GET", path, **kwargs)

    def post(self, path: str, **kwargs) -> dict:
        return self.request("POST", path, **kwargs)

    def put(self, path: str, **kwargs) -> dict:
        return self.request("PUT", path, **kwargs)

    def delete(self, path: str, **kwargs) -> dict:
        return self.request("DELETE", path, **kwargs)

    def close(self) -> None:
        """Close the session and release pooled connections."""
        self.session.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
# Usage example: context manager guarantees the session is closed
with HttpClient(base_url="https://httpbin.org", timeout=10) as client:
    result = client.get("/get", params={"key": "value"})
    print(result)
异常处理
from requests.exceptions import RequestException, Timeout, ConnectionError

def safe_request(url: str, retries: int = 3) -> dict:
    """GET `url` and return its JSON body, retrying transient failures.

    Timeouts and connection errors are retried up to `retries` times;
    HTTP 4xx/5xx and any other request exception abort immediately.
    Returns {} when no attempt succeeds.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()  # 4xx/5xx -> HTTPError
            return response.json()
        except Timeout:
            print(f"请求超时,第 {attempt+1} 次重试")
        except ConnectionError:
            print(f"连接失败,第 {attempt+1} 次重试")
        except requests.HTTPError as e:
            # Client/server errors are not retried here.
            print(f"HTTP 错误: {e.response.status_code}")
            break
        except RequestException as e:
            print(f"请求异常: {e}")
            break
    return {}
完善的异常处理策略
import requests
from requests.exceptions import (
RequestException, Timeout, ConnectionError,
HTTPError, SSLError, TooManyRedirects, ChunkedEncodingError
)
import logging
import time
logger = logging.getLogger(__name__)
class APIError(Exception):
    """Application-level API error.

    Attributes:
        status_code: HTTP status code associated with the failure (0 if N/A).
        response: parsed response body, or {} when unavailable.
    """

    def __init__(self, message: str, status_code: int = 0, response: dict = None):
        super().__init__(message)
        self.status_code = status_code
        # Avoid the mutable-default pitfall: normalise None to a fresh dict.
        self.response = response or {}
def robust_request(
url: str,
method: str = "GET",
max_retries: int = 3,
backoff_factor: float = 1.0,
timeout: tuple = (5, 30), # (连接超时, 读取超时)
) -> requests.Response:
"""健壮的 HTTP 请求函数
Args:
url: 请求 URL
method: HTTP 方法
max_retries: 最大重试次数
backoff_factor: 退避因子
timeout: (连接超时, 读取超时) 元组
Returns:
Response 对象
Raises:
APIError: 请求失败
"""
last_exception = None
for attempt in range(1, max_retries + 1):
try:
response = requests.request(
method=method,
url=url,
timeout=timeout,
)
response.raise_for_status()
return response
except Timeout as e:
last_exception = e
logger.warning(f"超时 (尝试 {attempt}/{max_retries}): {url}")
if attempt < max_retries:
time.sleep(backoff_factor * attempt)
except ConnectionError as e:
last_exception = e
logger.warning(f"连接失败 (尝试 {attempt}/{max_retries}): {url}")
if attempt < max_retries:
time.sleep(backoff_factor * attempt)
except HTTPError as e:
status = e.response.status_code
if status == 429: # Too Many Requests
retry_after = int(e.response.headers.get("Retry-After", 5))
logger.warning(f"速率限制,等待 {retry_after}s")
if attempt < max_retries:
time.sleep(retry_after)
continue
elif status >= 500: # 服务器错误,可重试
last_exception = e
logger.warning(f"服务器错误 {status} (尝试 {attempt}/{max_retries})")
if attempt < max_retries:
time.sleep(backoff_factor * attempt)
continue
else:
# 4xx 客户端错误,不应重试
raise APIError(
f"客户端错误: {status} - {e.response.text[:200]}",
status_code=status,
)
except SSLError as e:
raise APIError(f"SSL 错误: {e}")
except (TooManyRedirects, ChunkedEncodingError) as e:
raise APIError(f"请求异常: {e}")
except RequestException as e:
last_exception = e
logger.warning(f"请求失败 (尝试 {attempt}/{max_retries}): {e}")
raise APIError(f"请求失败,已重试 {max_retries} 次: {last_exception}")BeautifulSoup 爬虫
HTML 解析
# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup
import requests
# 获取网页
response = requests.get('https://news.example.com',
headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, 'lxml')
# 提取标题
title = soup.find('h1').text
print(title)
# 提取所有链接
links = soup.find_all('a')
for link in links:
href = link.get('href')
text = link.text.strip()
if href:
print(f"{text}: {href}")
# CSS 选择器
articles = soup.select('.article-list .item')
for article in articles:
title = article.select_one('.title').text
author = article.select_one('.author').text
date = article.select_one('.date').text
print(f"[{date}] {title} - {author}")
# 提取表格数据
table = soup.find('table')
rows = table.find_all('tr')
for row in rows[1:]: # 跳过表头
cols = [td.text.strip() for td in row.find_all('td')]
print(cols)BeautifulSoup 深入使用
from bs4 import BeautifulSoup, Tag, NavigableString

html = """
<div class="container">
<h1 id="main-title">文章标题</h1>
<div class="meta">
<span class="author">作者A</span>
<span class="date">2024-01-15</span>
<span class="tags">
<a href="/tag/python">Python</a>
<a href="/tag/web">Web</a>
</span>
</div>
<div class="content">
<p>第一段内容</p>
<p>第二段内容 <strong>加粗文字</strong></p>
<ul>
<li>列表项 1</li>
<li>列表项 2</li>
</ul>
<a href="https://example.com" data-id="123">外部链接</a>
</div>
</div>
"""
soup = BeautifulSoup(html, "lxml")

# 1. find vs find_all
title = soup.find("h1")                   # first matching Tag
title_by_id = soup.find(id="main-title")  # lookup by id
all_links = soup.find_all("a")            # list of all matches

# 2. CSS selectors (recommended)
container = soup.select_one(".container")  # first match
tags = soup.select(".tags a")              # every match
second_p = soup.select(".content p:nth-of-type(2)")

# 3. Attribute access
link = soup.select_one(".content a")
print(link["href"])         # https://example.com
print(link.get("data-id"))  # 123
print(link.get("class"))    # None (this <a> has no class attribute)

# 4. Text extraction
h1 = soup.find("h1")
print(h1.string)  # "文章标题" (only when there is a single NavigableString child)
print(h1.text)    # "文章标题" (recursively gathers all descendant text)
print(h1.get_text(strip=True))  # strip surrounding whitespace

# 5. Navigating parents and siblings
first_p = soup.select_one(".content p")
print(first_p.parent["class"])          # ['content']
print(first_p.find_next_sibling())      # the second <p>
print(first_p.find_previous_sibling())  # None (nothing precedes the first <p>)

# 6. Regex matching on attributes
import re
external_links = soup.find_all("a", href=re.compile(r"^https?://"))
print([link["href"] for link in external_links])

# 7. Modifying the DOM
for p in soup.select(".content p"):
    p["class"] = "paragraph"
new_tag = soup.new_tag("div", attrs={"class": "footer"})
new_tag.string = "页脚"
# BUG FIX: soup.find(".container") searches for a tag NAMED ".container" and
# returns None (then .append raises AttributeError); a CSS class lookup needs
# select_one (or find("div", class_="container")).
soup.select_one(".container").append(new_tag)
结构化爬取
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
import json
import time
@dataclass
class Article:
    """A single scraped article record."""

    title: str    # article headline
    author: str   # author display name
    date: str     # publication date as displayed on the page
    content: str  # summary / body text
def scrape_articles(base_url: str, max_pages: int = 5) -> list[Article]:
    """Scrape paginated article listings starting at `base_url`.

    Stops early when a page yields no items; a failure on one page is
    logged and the next page is attempted.
    """
    articles = []
    session = requests.Session()
    session.headers['User-Agent'] = 'Mozilla/5.0 (compatible; Bot/1.0)'
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        try:
            response = session.get(url, timeout=10)
            soup = BeautifulSoup(response.text, 'lxml')
            items = soup.select('.article-item')
            if not items:
                break  # ran out of pages
            for item in items:
                article = Article(
                    title=item.select_one('.title').text.strip(),
                    author=item.select_one('.author').text.strip(),
                    date=item.select_one('.date').text.strip(),
                    content=item.select_one('.summary').text.strip(),
                )
                articles.append(article)
            time.sleep(1)  # politeness delay between pages
        except Exception as e:
            print(f"爬取第 {page} 页失败: {e}")
            continue
    return articles
# 保存结果
# articles = scrape_articles("https://example.com/articles")
# with open("articles.json", "w", encoding="utf-8") as f:
# json.dump([a.__dict__ for a in articles], f, ensure_ascii=False, indent=2)
反爬虫应对策略
import requests
import random
import time
from fake_useragent import UserAgent
# 1. Randomised User-Agent
ua = UserAgent()

def get_with_random_ua(url: str) -> requests.Response:
    """Send a GET request with a randomly chosen User-Agent header."""
    headers = {
        "User-Agent": ua.random,
        "Accept": "text/html,application/xhtml+xml",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
    }
    return requests.get(url, headers=headers, timeout=10)
# 2. Proxy IP rotation
class ProxyRotator:
    """Round-robin rotator over a pool of proxy URLs, skipping failed ones."""

    def __init__(self, proxies: list[str]):
        self.proxies = proxies
        self._index = 0       # next position in the round-robin cycle
        self._failed = set()  # proxy URLs marked as dead

    def get_proxy(self) -> dict[str, str]:
        """Return the next usable proxy as a requests-style proxies dict.

        Raises:
            RuntimeError: when every proxy has been marked failed.
        """
        for _ in range(len(self.proxies)):
            proxy = self.proxies[self._index % len(self.proxies)]
            self._index += 1
            if proxy not in self._failed:
                return {"http": proxy, "https": proxy}
        raise RuntimeError("所有代理均不可用")

    def mark_failed(self, proxy: dict[str, str]):
        """Mark a proxy (in the dict form returned by get_proxy) as unusable."""
        self._failed.add(proxy.get("http", proxy.get("https", "")))
# 3. Random delay (avoids a detectably regular request cadence)
def random_delay(min_seconds: float = 1.0, max_seconds: float = 3.0):
    """Sleep for a random duration in [min_seconds, max_seconds]."""
    time.sleep(random.uniform(min_seconds, max_seconds))
# 4. Request-rate throttling
import threading

class RateLimiter:
    """Thread-safe limiter enforcing a maximum request rate."""

    def __init__(self, max_per_second: float):
        self.min_interval = 1.0 / max_per_second  # seconds between requests
        self._lock = threading.Lock()
        self._last_time = 0.0  # timestamp of the most recent permitted call

    def wait(self):
        """Block until at least min_interval has elapsed since the last call."""
        with self._lock:
            elapsed = time.time() - self._last_time
            if elapsed < self.min_interval:
                time.sleep(self.min_interval - elapsed)
            self._last_time = time.time()
# 5. Cookie / Session management
def login_and_scrape(login_url: str, credentials: dict):
    """Log in via an HTML form (including its CSRF token), then scrape.

    Returns the dashboard page HTML fetched with the authenticated session.
    """
    session = requests.Session()

    # Fetch the login page first to pick up the CSRF token from the form
    login_page = session.get(login_url)
    soup = BeautifulSoup(login_page.text, "lxml")
    csrf_token = soup.find("input", {"name": "csrf_token"})["value"]

    # Submit the login form
    session.post(login_url, data={
        **credentials,
        "csrf_token": csrf_token,
    })

    # Subsequent requests automatically carry the auth cookies
    response = session.get("https://example.com/dashboard")
    return response.text
优点
缺点
总结
HTTP 请求用 requests 库:GET/POST/PUT/DELETE 方法、Session 管理 Cookie、重试策略处理异常。爬虫用 requests + BeautifulSoup:requests 获取 HTML、BeautifulSoup 解析提取数据、CSS 选择器定位元素。反爬应对:设置 User-Agent、添加延迟、使用代理。JS 渲染页面用 Selenium 或 Playwright。爬虫需遵守 robots.txt 和法律法规。
关键知识点
- requests.Session 复用 TCP 连接,大幅减少连接建立开销
- timeout 参数应始终设置,推荐使用 (connect_timeout, read_timeout) 元组形式
- raise_for_status() 可以将 4xx/5xx 状态码转为异常
- 重试策略应区分可重试错误(5xx、超时)和不可重试错误(4xx)
- BeautifulSoup 的 CSS 选择器比 find/find_all 更灵活强大
项目落地视角
- 统一虚拟环境、依赖锁定、格式化和日志方案。
- 把入口、配置、业务逻辑和工具函数拆开,避免单文件膨胀。
- 对网络请求、文件读写和数据处理结果做异常与样本校验。
- 明确项目入口、配置管理、依赖管理、日志和测试策略。
- HTTP 客户端封装为可测试的类,mock 外部 API 进行单元测试
常见误区
- 把临时脚本直接当生产代码使用。
- 忽略依赖版本、编码、路径和时区差异。
- 只会写 happy path,没有补超时、重试和资源释放。
- 把 notebook 或脚本风格直接带入长期维护项目。
- 不设置 timeout 导致请求无限等待
- 使用 requests.get() 而非 Session,无法复用连接
- 爬虫不遵守 robots.txt,存在法律风险
进阶路线
- 学习 httpx 库,支持 HTTP/2 和异步请求
- 掌握 Scrapy 框架构建大型爬虫项目
- 学习 Playwright/Selenium 处理 JS 渲染页面
- 研究分布式爬虫架构
适用场景
- 当你准备把《HTTP 请求与爬虫》真正落到项目里时,最适合先在一个独立模块或最小样例里验证关键路径。
- 适合脚本自动化、数据处理、Web 开发和测试工具建设。
- 当需求强调快速迭代和丰富生态时,Python 往往能快速起步。
落地建议
- 统一使用虚拟环境与依赖锁定,避免环境漂移。
- 对核心函数补类型注解、异常处理和日志,减少"脚本黑盒"。
- 一旦脚本进入生产链路,及时补测试和监控。
- HTTP 客户端统一封装,集中管理认证、重试、日志和监控
排错清单
- 先确认当前解释器、虚拟环境和依赖版本是否正确。
- 检查编码、路径、时区和第三方库行为差异。
- 排查同步阻塞、数据库连接未释放或网络请求无超时。
- 确认 SSL 证书验证是否正确配置
- 检查 User-Agent 是否被目标网站拒绝
复盘问题
- 如果把《HTTP 请求与爬虫》放进你的当前项目,最先要验证的输入、输出和失败路径分别是什么?
- 《HTTP 请求与爬虫》最容易在什么规模、什么边界条件下暴露问题?你会用什么指标或日志去确认?
- 相比默认实现或替代方案,采用《HTTP 请求与爬虫》最大的收益和代价分别是什么?
