Python-第17篇《简单的网页爬虫》

导语

每天睁开眼三件事：看汇率、看股价、看政策新闻。手动刷新十几个网站？太累了！今天咱们造一个“财经资讯雷达”——自动从新闻网站抓取最新财经头条，提取关键信息，保存成表格慢慢看。学会这招，你也能爬天气、爬电影评分、爬招聘数据，互联网就是你的数据库！

本篇目标

理解网页的基本结构（HTML）
使用requests库获取网页内容
用BeautifulSoup解析和提取信息
批量获取财经新闻并保存到CSV
了解爬虫的“礼貌原则”

一、准备工作：安装库

# 在命令行运行
# pip install requests beautifulsoup4 pandas

import csv
import os
import time
from datetime import datetime
from typing import Optional
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

二、认识网页的“骨架”

网页本质上是个“文本文件”，用HTML标签搭建结构：

<html>
  <head><title>财经新闻</title></head>
  <body>
    <div class="news-list">
      <div class="news-item">
        <h3><a href="链接">新闻标题</a></h3>
        <p class="summary">新闻摘要</p>
        <span class="time">发布时间</span>
      </div>
    </div>
  </body>
</html>

咱们要做的，就是按图索骥：找到news-item这个“盒子”，掏出里面的标题、链接、时间。

# ========== 观察网页结构 ==========
def view_html_structure(url):
    """Download a page and print the beginning of its HTML (learning aid).

    Args:
        url: Address of the page to download.

    Returns:
        The full page text decoded as UTF-8, or None when the request fails
        (connection problem, timeout, invalid URL, or a 4xx/5xx status).
    """
    try:
        response = requests.get(url, timeout=5)
        # Treat 4xx/5xx answers as failures so that an error page is never
        # printed and returned as if it were the real page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 获取失败: {e}")
        return None

    response.encoding = 'utf-8'

    # Print only the first 1000 characters; a whole page is far too long.
    print("网页前1000个字符：")
    print(response.text[:1000])
    print("\n..." + "="*50 + "...")

    return response.text

# 测试一下（新浪财经「国内财经」频道）
# test_url = "https://finance.sina.com.cn/china/"
# html = view_html_structure(test_url)

三、获取网页内容

# ========== 财经新闻爬虫核心 ==========
class FinanceNewsSpider:
    """Scrape finance headlines from a list page, print them, and save them.

    Headlines parsed by :meth:`fetch_news` accumulate in ``self.news_list``
    as dicts with the keys '序号', '标题', '发布时间', '链接', '摘要' and
    '抓取时间'.
    """

    def __init__(self):
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.news_list = []

    @staticmethod
    def _resolve_link(base_url: str, href: str) -> str:
        """Return ``href`` as an absolute URL, resolved against ``base_url``.

        Handles absolute, root-relative, page-relative and protocol-relative
        ("//host/path") hrefs.
        """
        return urljoin(base_url, href)

    def _parse_item(self, item, number: int, base_url: str) -> dict:
        """Build one news dict from a parsed list-item tag.

        Args:
            item: The BeautifulSoup tag of one list entry.
            number: Value stored under '序号'.
            base_url: URL of the page, used to make the link absolute.

        Raises:
            ValueError: If the entry has no ``<a>`` tag with an href.
        """
        link_tag = item.find('a')
        href = link_tag.get('href') if link_tag is not None else None
        if not href:
            raise ValueError("缺少链接")

        # The page does not always show a publish time; fall back to "now".
        time_tag = item.find('span', class_='time')
        if time_tag:
            pub_time = time_tag.get_text(strip=True)
        else:
            pub_time = datetime.now().strftime('%m-%d %H:%M')

        # Only some entries carry a summary paragraph.
        summary_tag = item.find('p', class_='summary')
        summary = summary_tag.get_text(strip=True) if summary_tag else ""

        return {
            '序号': number,
            '标题': link_tag.get_text(strip=True),
            '发布时间': pub_time,
            '链接': self._resolve_link(base_url, href),
            '摘要': summary,
            '抓取时间': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

    def fetch_news(self, url: str, max_news: int = 10):
        """Fetch a news list page and append its headlines to ``news_list``.

        Args:
            url: Address of the list page.
            max_news: Maximum number of list entries to read.

        Returns:
            True when the page was downloaded (even if no entry matched),
            False when the HTTP request failed.
        """
        print(f"\n🌐 正在访问: {url}")

        # 1. Download the page; 4xx/5xx answers count as failures.
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ 爬取失败: {e}")
            return False

        # 2. Parse the HTML, letting requests guess the text encoding.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # 3. Select the entries. The selector matches the Sina Finance list
        #    layout; other sites need a different selector.
        news_items = soup.select('ul.list_009 li')
        print(f"📰 找到 {len(news_items)} 条新闻，抓取前{max_news}条...")

        added = 0
        for i, item in enumerate(news_items[:max_news], 1):
            try:
                # Number entries by their position in the accumulated list so
                # repeated calls never produce duplicate '序号' values.
                news_item = self._parse_item(
                    item, len(self.news_list) + 1, response.url
                )
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                # One malformed entry must not abort the rest of the page.
                print(f"  ⚠️ 解析第{i}条失败: {e}")
                continue

            self.news_list.append(news_item)
            added += 1
            print(f"  [{news_item['序号']}] {news_item['标题'][:30]}...")

        # Report what this call added, not the accumulated total.
        print(f"✅ 成功抓取 {added} 条新闻")
        return True

    def save_to_csv(self, filename: Optional[str] = None):
        """Write the collected headlines to a CSV file.

        Args:
            filename: Target path; when None a timestamped name is generated.

        Returns:
            The path written, or None when there is nothing to save.
        """
        if not self.news_list:
            print("⚠️ 暂无数据可保存")
            return

        if filename is None:
            filename = f"财经新闻_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

        df = pd.DataFrame(self.news_list)
        # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
        df.to_csv(filename, index=False, encoding='utf-8-sig')

        print(f"✅ 已保存到: {filename}")
        print(f"📊 文件大小: {os.path.getsize(filename)} 字节")

        return filename

    def show_news(self):
        """Print every collected headline to the console."""
        if not self.news_list:
            print("⚠️ 暂无新闻")
            return

        print("\n" + "=" * 60)
        print("最新财经新闻")
        print("=" * 60)

        for news in self.news_list:
            print(f"\n【{news['序号']}】{news['标题']}")
            print(f"      发布时间: {news['发布时间']}")
            if news['摘要']:
                print(f"      摘要: {news['摘要'][:60]}...")
            print(f"      链接: {news['链接']}")

        print("\n" + "=" * 60)

# 测试爬虫
if __name__ == "__main__":
    # Demo run: scrape up to 8 headlines from the Sina Finance page, then
    # print and export them, but only when the fetch itself succeeded.
    url = "https://finance.sina.com.cn/china/"
    spider = FinanceNewsSpider()

    if spider.fetch_news(url=url, max_news=8):
        spider.show_news()
        spider.save_to_csv()

四、针对东方财富网的适配

不同网站结构不同，咱们以东方财富网为例，展示如何“对症下药”：

# ========== 东方财富网新闻爬虫 ==========
def fetch_eastmoney_news(max_news=8):
    """Scrape headline links from the Eastmoney home page.

    Args:
        max_news: Maximum number of list entries to read from the page.

    Returns:
        A list of dicts with the keys '序号', '标题', '来源', '链接' and
        '抓取时间'. Entries without a link are skipped, and '序号' numbers
        the returned entries consecutively. An empty list is returned when
        the HTTP request fails.
    """
    url = "https://www.eastmoney.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ 抓取失败: {e}")
        return []

    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    news_list = []
    # The selector targets list entries inside <div class="newlist">.
    for item in soup.select('div.newlist li')[:max_news]:
        link_tag = item.find('a')
        href = link_tag.get('href') if link_tag is not None else None
        if not href:
            # Not a headline entry (no anchor or no href): skip it.
            continue

        news_list.append({
            '序号': len(news_list) + 1,
            '标题': link_tag.get_text(strip=True),
            '来源': '东方财富',
            # urljoin resolves root-relative, page-relative and
            # protocol-relative ("//host/path") hrefs correctly; plain string
            # concatenation would produce broken URLs for the latter two.
            '链接': urljoin(response.url, href),
            '抓取时间': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        })

    return news_list

# 测试
# eastmoney_news = fetch_eastmoney_news(5)
# print(eastmoney_news)

五、完整财经新闻聚合器

整合多个来源，打造“财经资讯雷达”：

# ========== 财经新闻聚合器 - 完整版 ==========
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

class FinanceNewsAggregator:
    """Collect finance headlines from several sites into one list.

    Every fetch_* method appends dicts to ``self.all_news`` with the keys
    '序号', '标题', '来源', '发布时间', '链接', '摘要' and '关键词'.
    """

    def __init__(self):
        # Browser-like User-Agent header sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.all_news = []

    @staticmethod
    def _absolute_link(base_url: str, href: str) -> str:
        """Return ``href`` as an absolute URL, resolved against ``base_url``."""
        # Imported locally so the class needs nothing beyond the imports
        # listed directly above it.
        from urllib.parse import urljoin
        return urljoin(base_url, href)

    def _fetch_source(self, url, selector, source, label, max_news, pub_time_of):
        """Download one site and append its headlines to ``all_news``.

        Args:
            url: Page to download.
            selector: CSS selector matching one element per headline.
            source: Value stored under '来源'.
            label: Short site name used in the console messages.
            max_news: Maximum number of matched elements to read.
            pub_time_of: Callable taking a matched element and returning the
                string stored under '发布时间'.

        Returns:
            True when the page was downloaded, False when the request failed.
        """
        try:
            resp = requests.get(url, headers=self.headers, timeout=10)
            # Treat 4xx/5xx answers as failures instead of parsing them.
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ {label}失败: {e}")
            return False

        resp.encoding = 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')

        added = 0
        for item in soup.select(selector)[:max_news]:
            link_tag = item.find('a')
            href = link_tag.get('href') if link_tag is not None else None
            if not href:
                # An entry without a link must not abort the whole source.
                continue

            title = link_tag.get_text(strip=True)
            self.all_news.append({
                '序号': len(self.all_news) + 1,
                '标题': title,
                '来源': source,
                '发布时间': pub_time_of(item),
                '链接': self._absolute_link(resp.url, href),
                '摘要': '',
                '关键词': self._extract_keywords(title)
            })
            added += 1

        print(f"✅ {label}: {added}条")
        return True

    def fetch_sina_finance(self, max_news=10):
        """Fetch headlines from Sina Finance; return True on success."""
        def span_text(item):
            # The publish time, when shown, is the text of the first <span>.
            time_tag = item.find('span')
            return time_tag.get_text(strip=True) if time_tag else ''

        return self._fetch_source(
            url="https://finance.sina.com.cn/china/",
            selector='ul.list_009 li',
            source='新浪财经',
            label='新浪',
            max_news=max_news,
            pub_time_of=span_text,
        )

    def fetch_eastmoney(self, max_news=10):
        """Fetch headlines from Eastmoney; return True on success."""
        return self._fetch_source(
            url="https://www.eastmoney.com/",
            selector='div.newlist li',
            source='东方财富',
            label='东财',
            max_news=max_news,
            # No publish time is read from this page; record the fetch time.
            pub_time_of=lambda _item: datetime.now().strftime('%m-%d %H:%M'),
        )

    def _extract_keywords(self, title: str):
        """Return the watched keywords found in ``title``.

        The result is a comma-joined string in watch-list order, or '—' when
        none of the keywords occurs in the title.
        """
        keywords = ['股市', '央行', '利率', '汇率', '房价', '通胀', 'GDP', '就业']
        found = [kw for kw in keywords if kw in title]
        return ','.join(found) if found else '—'

    def fetch_all(self, max_per_source=8):
        """Discard earlier results and fetch every source once."""
        print("\n" + "=" * 40)
        print("开始抓取财经新闻...")
        print("=" * 40)

        self.all_news = []  # drop the results of any earlier run

        self.fetch_sina_finance(max_per_source)
        time.sleep(1)  # polite one-second pause between the two sites
        self.fetch_eastmoney(max_per_source)

        print(f"\n🎉 总计抓取 {len(self.all_news)} 条新闻")

    def show_dashboard(self):
        """Print per-source counts, the top keywords, and the headline list."""
        if not self.all_news:
            print("⚠️ 暂无数据")
            return

        print("\n" + "=" * 70)
        print("📊 财经新闻聚合 - 实时更新")
        print("=" * 70)

        # Headline count per source.
        df = pd.DataFrame(self.all_news)
        source_stats = df['来源'].value_counts()

        print("\n来源统计:")
        for source, count in source_stats.items():
            print(f"  {source}: {count}条")

        # Keyword frequency; '—' marks headlines without any keyword.
        keywords = []
        for kw in df['关键词']:
            if kw != '—':
                keywords.extend(kw.split(','))
        # Explicit dtype keeps an empty keyword list well-defined in pandas.
        kw_series = pd.Series(keywords, dtype='object').value_counts()

        print("\n今日热点关键词:")
        if kw_series.empty:
            print("  （无）")
        for kw, count in kw_series.head(5).items():
            print(f"  {kw}: {count}次")

        # Headlines in the order they were fetched.
        print("\n" + "-" * 70)
        print("新闻列表（按抓取顺序）:")
        print("-" * 70)

        for news in self.all_news:
            print(f"\n{news['序号']}. [{news['来源']}] {news['标题']}")
            if news['关键词'] != '—':
                print(f"   🔑 {news['关键词']}")
            print(f"   🔗 {news['链接']}")

    def save_to_csv(self):
        """Write all collected headlines to a timestamped CSV file.

        Returns:
            The file name written, or None when there is nothing to save.
        """
        if not self.all_news:
            print("⚠️ 无数据可保存")
            return

        # Second resolution: two saves within the same minute must not
        # silently overwrite each other.
        filename = f"财经新闻聚合_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df = pd.DataFrame(self.all_news)
        # utf-8-sig writes a BOM so spreadsheet tools detect the encoding.
        df.to_csv(filename, index=False, encoding='utf-8-sig')

        print(f"\n✅ 已保存: {filename} ({len(df)}条)")
        return filename

def main():
    """Run the interactive console menu until the user chooses to quit."""
    aggregator = FinanceNewsAggregator()

    rule = "=" * 45
    menu_lines = (
        "\n" + rule,
        "财经新闻聚合器",
        rule,
        "1. 抓取新闻",
        "2. 显示新闻",
        "3. 保存到CSV",
        "4. 退出",
        rule,
    )
    # Menu choice -> action; "4" (quit) is handled separately below.
    actions = {
        "1": lambda: aggregator.fetch_all(max_per_source=8),
        "2": aggregator.show_dashboard,
        "3": aggregator.save_to_csv,
    }

    while True:
        for line in menu_lines:
            print(line)

        choice = input("请选择: ").strip()

        if choice == "4":
            print("👋 再见！")
            return

        handler = actions.get(choice)
        if handler is None:
            print("请输入1-4！")
        else:
            handler()


if __name__ == "__main__":
    main()

六、爬虫的“礼貌原则”

# ========== 爬虫礼仪 ==========
"""
🤝 爬虫社交指南：

1. 控制频率
   time.sleep(1)  # 每次请求暂停1-2秒
   不要像机关枪一样不停请求

2. 设置User-Agent
   告诉网站"我是友好的浏览器访问"
   而不是"我是恶意机器人"

3. 遵守robots.txt
   在网站域名后加/robots.txt查看规则
   例如：https://www.sina.com.cn/robots.txt

4. 只爬取公开信息
   不要尝试登录后的内容
   不要爬取个人信息

5. 数据本地化使用
   抓取的数据仅用于学习分析
   不要商业发布或二次传播

❌ 绝对不能做的事：
- 爬取需要登录的页面
- 高频访问导致网站卡顿
- 爬取个人敏感信息
- 违反网站服务条款
"""

七、知识点加油站

7.1 CSS选择器速查

soup.select('div.news')          # every <div> whose class list contains "news"
soup.select('#header')           # the element with id="header"
soup.select('a[href]')           # every <a> that has an href attribute
soup.select('li:nth-child(2)')   # every <li> that is the 2nd child of its parent
soup.select('a[href*="sina"]')   # every <a> whose href contains "sina"

7.2 常见的反爬策略

策略	应对方法
需要登录	不碰，只爬公开信息
IP封禁	放慢速度，不要高频
动态加载	需要selenium（本教程不涉及）
验证码	手动识别或使用OCR（高级）

八、总结

✅ 用requests获取网页内容
✅ 用BeautifulSoup解析HTML
✅ 按标签和class提取数据
✅ 多网站新闻聚合
✅ 爬虫的礼貌原则

下篇预告

第18篇《数据可视化入门》——把枯燥的数字变成漂亮的图表，让领导眼前一亮！

🤖 Powered by Kimi K2 Thinking 💻 内容经葵葵🌻审核与修改