Common building blocks for a simple Python web scraper
# Download a zip and save it to a zip folder
Install the dependency: `pip install requests`
```python
import requests
import os

def download_zip(url, save_path):
    # Send a GET request
    response = requests.get(url)
    # Check that the status code is 200 (success)
    if response.status_code == 200:
        # Create the save directory if it does not exist
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # Derive the file name from the URL
        filename = url.split('/')[-1]
        # Build the full save path
        file_path = os.path.join(save_path, filename)
        # Write the response body to the file
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"File saved to {file_path}")
    else:
        print("Download failed; check the URL or network connection.")

# Usage example
url = "https://example.com/path/to/file.zip"
save_folder = "zip_files"
download_zip(url, save_folder)
```
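`download_zip` buffers the whole archive in memory via `response.content`. For large files, a streamed variant keeps memory use flat; a minimal sketch (the function name, chunk size, and timeout are my own choices, not from the original):

```python
import os
import requests

def download_zip_streamed(url, save_path, chunk_size=8192):
    # stream=True defers downloading the body until we iterate over it
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()  # raise on HTTP errors instead of printing
        os.makedirs(save_path, exist_ok=True)
        file_path = os.path.join(save_path, url.split('/')[-1])
        with open(file_path, 'wb') as f:
            # Write chunk by chunk instead of buffering the whole file
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    return file_path
```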
# Create an Excel file and save it after each inserted row
Install the dependencies: `pip install pandas openpyxl`
```python
import pandas as pd

def insert_data_to_excel(data, excel_file):
    # Try to read the existing Excel file
    try:
        existing_df = pd.read_excel(excel_file)
    except FileNotFoundError:
        # If the file does not exist, start from an empty DataFrame
        existing_df = pd.DataFrame()
    # Append the new row to the DataFrame
    new_row = pd.DataFrame([data])
    updated_df = pd.concat([existing_df, new_row], ignore_index=True)
    # Save the updated DataFrame back to the Excel file
    updated_df.to_excel(excel_file, index=False)
    print(f"Data saved to {excel_file}")

# Usage example
excel_file = "data.xlsx"
data = {
    "url": "http://d1-mb.mobanwang.com/myup200701/200701054.rar",
    # more columns can be added here
}
insert_data_to_excel(data, excel_file)
```
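Note that this approach re-reads and rewrites the entire workbook on every insert, so each call gets slower as the file grows. If that becomes a problem, openpyxl can append a single row in place; a minimal sketch, assuming a dict with a stable key order (the function name and header handling are my own additions, not from the original):

```python
import os
from openpyxl import Workbook, load_workbook

def append_row_to_excel(data, excel_file):
    # Open the existing workbook, or create one with a header row
    if os.path.exists(excel_file):
        wb = load_workbook(excel_file)
        ws = wb.active
    else:
        wb = Workbook()
        ws = wb.active
        ws.append(list(data.keys()))  # header row, written once
    ws.append(list(data.values()))    # append the new data row
    wb.save(excel_file)
```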
# Fixing garbled Chinese when extracting content
Install the dependencies: `pip install requests beautifulsoup4 chardet`
```python
import requests
from bs4 import BeautifulSoup
import chardet

def get_html_content(url):
    # Send a GET request
    response = requests.get(url)
    # Check that the status code is 200 (success)
    if response.status_code == 200:
        # Detect the encoding automatically; fall back to UTF-8 if detection fails
        encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
        # Decode the raw bytes into a string
        content = response.content.decode(encoding, errors='replace')
        return content
    else:
        print("Request failed; check the URL or network connection.")
        return None

def extract_text(url):
    html_content = get_html_content(url)
    if html_content:
        # Parse the HTML from the correctly decoded string
        soup = BeautifulSoup(html_content, 'html.parser')
        # Extract all visible text
        text = soup.get_text()
        print(text)

# Usage example
url = "https://example.com"
extract_text(url)
```
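requests can also do this detection on its own: `Response.apparent_encoding` runs charset detection over the body bytes, and assigning it to `response.encoding` makes `response.text` decode correctly without importing chardet. A minimal sketch of that variant (the function name is my own):

```python
import requests
from bs4 import BeautifulSoup

def extract_text_apparent(url):
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # apparent_encoding is detected from the body bytes; setting it
    # overrides the (often wrong) encoding declared in the headers
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()
```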