# 豆瓣读书排行榜 — Douban Books Top 250 scraper
# Stdlib
import time  # request throttling

# Third-party
import requests  # HTTP client
from bs4 import BeautifulSoup  # HTML parsing
import pandas as pd  # tabular storage / Excel export
from fake_useragent import UserAgent  # randomized User-Agent strings (anti-bot)
def get_douban_books(pages=10):
    """Scrape the Douban Books Top 250 chart.

    Args:
        pages: number of 25-item pages to fetch (default 10 == the full list).

    Returns:
        list[dict]: one dict per book with keys
        '书名', '基本信息', '评分', '评价人数', '一句话评价'.
    """
    books_data = []
    ua = UserAgent()  # anti-scraping measure: rotate User-Agent strings

    for page in range(pages):
        url = f'https://book.douban.com/top250?start={page * 25}'
        # Build headers inside the loop so each request really gets a fresh
        # random User-Agent (the original built the dict once, so every
        # request reused the same UA despite the stated intent).
        headers = {
            'User-Agent': ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }

        try:
            # timeout added so a stalled connection cannot hang the scraper
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # non-2xx status -> HTTPError

            soup = BeautifulSoup(response.text, 'html.parser')
            books = soup.select('tr.item')  # each book row is <tr class="item">

            for book in books:
                try:
                    # Title lives in the <a title="..."> attribute.
                    title = book.select_one('div.pl2 a').get('title')
                    info = book.select_one('p.pl').text.strip()
                    rating = book.select_one('span.rating_nums').text.strip()

                    # "(12345人评价)" -> keep only the digits.
                    raw_people = book.select_one('span.pl').text
                    digits = ''.join(filter(str.isdigit, raw_people))
                    # Guard against an empty digit string (int('') raises);
                    # mirrors the movies scraper's handling for consistency.
                    rating_people = int(digits) if digits else 0

                    # The one-line quote is optional on the page.
                    quote = book.select_one('span.inq')
                    quote = quote.text if quote else ''

                    books_data.append({
                        '书名': title,
                        '基本信息': info,
                        '评分': float(rating),
                        '评价人数': rating_people,
                        '一句话评价': quote,
                    })
                except Exception as e:
                    print(f'解析单本书籍信息出错: {e}')
                    continue

            # Throttle between pages to avoid tripping rate limits.
            time.sleep(2)
        except Exception as e:
            print(f'爬取第{page + 1}页时出错: {e}')
            continue

        print(f'成功爬取第{page + 1}页')

    return books_data
def save_to_excel(books_data, filename='douban_books.xlsx'):
    """Persist scraped book records to an Excel file.

    Args:
        books_data: list of per-book dicts (see get_douban_books).
        filename: output path for the .xlsx file.
    """
    # Convert the list of dicts to a DataFrame, one row per book.
    df = pd.DataFrame(books_data)
    df.to_excel(filename, index=False, engine='openpyxl')
    # Fix: the success message previously did not interpolate the filename.
    print(f'数据已保存到 {filename}')
if __name__ == '__main__':
    print('开始爬取豆瓣图书排行榜...')
    books_data = get_douban_books(pages=10)
    save_to_excel(books_data)

# ---------------------------------------------------------------------------
# 豆瓣电影 Top 250 — Douban Movies Top 250 scraper (second script in this file;
# note its save_to_excel below shadows the books version once defined).
# ---------------------------------------------------------------------------
# Stdlib
import time

# Third-party
import requests
from bs4 import BeautifulSoup
import pandas as pd
from fake_useragent import UserAgent
def clean_text(text):
    """Collapse all runs of whitespace (spaces, tabs, newlines) to single
    spaces and trim the ends; falsy input (None, '') yields ''."""
    if not text:
        return ''
    # str.split() with no arguments already discards leading/trailing
    # whitespace, so joining the pieces normalizes the whole string.
    return ' '.join(text.split())
def get_douban_movies(pages=10):
    """Scrape the Douban Movies Top 250 chart.

    Args:
        pages: number of 25-item pages to fetch (default 10 == the full list).

    Returns:
        list[dict]: one dict per movie with keys '中文名', '外文名',
        '导演演员', '年份地区类型', '评分', '评价人数', '一句话评价'.
    """
    movies_data = []
    ua = UserAgent()  # anti-scraping measure: rotate User-Agent strings

    for page in range(pages):
        url = f'https://movie.douban.com/top250?start={page * 25}'
        # Built per-iteration so every request carries a fresh random UA
        # (original built the dict once before the loop).
        headers = {
            'User-Agent': ua.random,
            # Fix: the original Accept value contained a stray space
            # ("image/web p"), producing a malformed media type.
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }

        try:
            # timeout added so a stalled connection cannot hang the scraper
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            movies = soup.select('div.item')

            for movie in movies:
                try:
                    # Chinese title; hoisted so the element is queried once.
                    title_el = movie.select_one('.title')
                    title = title_el.text.strip() if title_el else '未知'

                    # Foreign / alternate title is optional.
                    foreign_title = movie.select_one('.other')
                    foreign_name = clean_text(foreign_title.text) if foreign_title else ''

                    # Detail paragraph: "director/actors / year / region / genre".
                    info_element = movie.select_one('div.bd p')
                    if info_element:
                        info_parts = clean_text(info_element.text).split('/')
                        # First segment holds director and lead actors.
                        director_actor = info_parts[0] if info_parts else ''
                        # Remaining segments hold year / region / genre.
                        year_region_type = ' / '.join(info_parts[1:]) if len(info_parts) > 1 else ''
                    else:
                        director_actor = ''
                        year_region_type = ''

                    rating = movie.select_one('.rating_num')
                    rating = float(rating.text.strip()) if rating else 0.0

                    # "12345人评价" -> digits only; 0 when missing or empty.
                    people_el = movie.select_one('.star span:last-child')
                    if people_el:
                        digits = ''.join(filter(str.isdigit, people_el.text))
                        rating_people = int(digits) if digits else 0
                    else:
                        rating_people = 0

                    # Optional one-line quote.
                    quote = movie.select_one('.quote .inq')
                    quote = quote.text.strip() if quote else ''

                    movies_data.append({
                        '中文名': title,
                        '外文名': foreign_name,
                        '导演演员': director_actor,
                        '年份地区类型': year_region_type,
                        '评分': rating,
                        '评价人数': rating_people,
                        '一句话评价': quote,
                    })
                except Exception as e:
                    print(f'解析单部电影信息出错: {e}')
                    continue

            print(f'成功爬取第{page + 1}页')
            time.sleep(2)  # throttle between pages
        except Exception as e:
            print(f'爬取第{page + 1}页时出错: {e}')
            continue

    return movies_data
def save_to_excel(movies_data, filename='douban_movies.xlsx'):
    """Persist scraped movie records to an Excel file.

    Args:
        movies_data: list of per-movie dicts (see get_douban_movies).
        filename: output path for the .xlsx file.
    """
    df = pd.DataFrame(movies_data)
    df.to_excel(filename, index=False, engine='openpyxl')
    # Fix: the success message previously did not interpolate the filename.
    print(f'数据已保存到 {filename}')
def main():
    """Entry point: scrape the Top 250, sort by rating (descending),
    and export the result to Excel."""
    print('开始爬取豆瓣电影Top250...')
    movies_data = get_douban_movies(pages=10)

    # Guard clause: bail out early when the scrape produced nothing.
    if not movies_data:
        print('未获取到数据')
        return

    # In-place descending sort by rating before export.
    movies_data.sort(key=lambda m: m['评分'], reverse=True)
    save_to_excel(movies_data, 'douban_movies_top250.xlsx')
# Run the movie scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()