Common building blocks for a simple Python web scraper
# Download a zip and save it to a zip folder
Install the dependency: `pip install requests`
```python
import requests
import os

def download_zip(url, save_path):
    # Send a GET request
    response = requests.get(url)
    # Check that the status code is 200 (success)
    if response.status_code == 200:
        # Create the save directory if it does not exist
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # Derive the file name from the URL
        filename = url.split('/')[-1]
        # Build the full save path
        file_path = os.path.join(save_path, filename)
        # Write the response body to the file
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"File saved to {file_path}")
    else:
        print("Download failed; check the URL or network connection.")

# Usage example
url = "https://example.com/path/to/file.zip"
save_folder = "zip_files"
download_zip(url, save_folder)
```
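`download_zip` buffers the whole archive in memory via `response.content`. For large files, a streamed variant keeps memory use flat; a minimal sketch (the function name, chunk size, and timeout are my own choices, not from the original):

```python
import os
import requests

def download_zip_streamed(url, save_path, chunk_size=8192):
    # stream=True defers downloading the body until we iterate over it
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()  # raise on HTTP errors instead of printing
        os.makedirs(save_path, exist_ok=True)
        file_path = os.path.join(save_path, url.split('/')[-1])
        with open(file_path, 'wb') as f:
            # Write chunk by chunk instead of buffering the whole file
            for chunk in response.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    return file_path
```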
# Create an Excel file and save it after each inserted row
Install the dependencies: `pip install pandas openpyxl`
```python
import pandas as pd

def insert_data_to_excel(data, excel_file):
    # Try to read the existing Excel file
    try:
        existing_df = pd.read_excel(excel_file)
    except FileNotFoundError:
        # If the file does not exist, start from an empty DataFrame
        existing_df = pd.DataFrame()
    # Append the new row to the DataFrame
    new_row = pd.DataFrame([data])
    updated_df = pd.concat([existing_df, new_row], ignore_index=True)
    # Save the updated DataFrame back to the Excel file
    updated_df.to_excel(excel_file, index=False)
    print(f"Data saved to {excel_file}")

# Usage example
excel_file = "data.xlsx"
data = {
    "url": "http://d1-mb.mobanwang.com/myup200701/200701054.rar",
    # more columns can be added here
}
insert_data_to_excel(data, excel_file)
```
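Note that this approach re-reads and rewrites the entire workbook on every insert, so each call gets slower as the file grows. If that becomes a problem, openpyxl can append a single row in place; a minimal sketch, assuming a dict with a stable key order (the function name and header handling are my own additions, not from the original):

```python
import os
from openpyxl import Workbook, load_workbook

def append_row_to_excel(data, excel_file):
    # Open the existing workbook, or create one with a header row
    if os.path.exists(excel_file):
        wb = load_workbook(excel_file)
        ws = wb.active
    else:
        wb = Workbook()
        ws = wb.active
        ws.append(list(data.keys()))  # header row, written once
    ws.append(list(data.values()))    # append the new data row
    wb.save(excel_file)
```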
# Fixing garbled Chinese when extracting content
Install the dependencies: `pip install requests beautifulsoup4 chardet`
```python
import requests
from bs4 import BeautifulSoup
import chardet

def get_html_content(url):
    # Send a GET request
    response = requests.get(url)
    # Check that the status code is 200 (success)
    if response.status_code == 200:
        # Detect the encoding automatically; fall back to UTF-8 if detection fails
        encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
        # Decode the raw bytes into a string
        content = response.content.decode(encoding, errors='replace')
        return content
    else:
        print("Request failed; check the URL or network connection.")
        return None

def extract_text(url):
    html_content = get_html_content(url)
    if html_content:
        # Parse the HTML from the correctly decoded string
        soup = BeautifulSoup(html_content, 'html.parser')
        # Extract all visible text
        text = soup.get_text()
        print(text)

# Usage example
url = "https://example.com"
extract_text(url)
```
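requests can also do this detection on its own: `Response.apparent_encoding` runs charset detection over the body bytes, and assigning it to `response.encoding` makes `response.text` decode correctly without importing chardet. A minimal sketch of that variant (the function name is my own):

```python
import requests
from bs4 import BeautifulSoup

def extract_text_apparent(url):
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # apparent_encoding is detected from the body bytes; setting it
    # overrides the (often wrong) encoding declared in the headers
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()
```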