Web Crawler: Scraping Information from a Certain Website

It can fetch the details of any film that matches your search term and download its images 😋. If you want to use it, follow the tutorial below 💃.

Usage

1· Set up a Python environment

Any environment that can run Python will do.

2· Import the required packages

from urllib import parse
import time
import random
import csv
from bs4 import BeautifulSoup  # third-party package, install with: pip install bs4 (same applies below)
import requests                # pip install requests
from lxml import etree         # pip install lxml
import pandas as pd            # pip install pandas
import os
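
If you want to confirm that the third-party packages above are installed before running the crawler, a quick check such as the following should work (the names used are simply the import names listed above):

import importlib

for package in ('bs4', 'requests', 'lxml', 'pandas'):
    try:
        importlib.import_module(package)
    except ImportError:
        print('Missing package:', package, '- install it with: pip install', package)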

3· Main program

class Crawler(object):
    # Initialize the url attribute
    def __init__(self):
        self.url = 'https://www.avgc1.com/zh/search/{}?{}'

    # Fetch the HTML of a page
    def get_html(self, url):

        html = requests.get(
            url = url,
            headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0'}
        ).text

        return html

    # First-level (search results) page parser
    def parse_html_1(self, html):
        # Extract the title and actors for each result card
        soup = BeautifulSoup(html, 'html.parser')
        # Adjust the selectors to match the actual HTML structure
        movies = soup.find_all('div', class_='v-card-text pa-3')  # cards holding the title and actors
        movies_data = []

        for movie in movies:
            # Title and actors live inside the same container
            title_element = movie.find('h5')
            title = title_element.get_text(strip=True) if title_element else "未知标题"

            # Collect every actor name found in the card
            name_elements = movie.find_all('span', class_='v-btn__content')
            name = [name_element.get_text().strip() for name_element in name_elements]

            movies_data.append({
                'title': title,
                'actors': name  # may contain several actors
            })

        self.save_html(movies_data)

        # Collect the links to the second-level (detail) pages
        parse_html = etree.HTML(html)
        href_list = parse_html.xpath('//div[contains(@class, "grid")]//a[contains(@href, "/zh/movies/")]/@href')

        number_list = []
        category_list = []
        views_list = []
        url_list = []
        img_list = []

        for href in href_list:
            # Visit each detail page and store its fields in the lists above
            url_2 = 'https://www.avgc1.com{}'.format(href)
            html_2 = self.get_html(url_2)
            number, category, views, img = self.parse_html_2(html_2)

            number_list.append(number if number else "无")
            category_list.append(category if category else "无")
            views_list.append(views if views else "无")
            img_list.append(img)
            url_list.append(url_2)

        self.save_html_2(number_list, category_list, views_list, url_list)
        self.get_image(img_list, movies_data)

    # Second-level (detail) page parser
    def parse_html_2(self, html_2):
        # Extract the extra fields from the detail page
        parse_html = etree.HTML(html_2)
        number = parse_html.xpath('//div[@class="mt-4 d-flex ga-2 align-center"]/h2/text()')[0]  # xpath returns a list, so index [0] to get the value
        category = ', '.join(parse_html.xpath('//div[@class="mt-4 flex flex-wrap ga-2 align-center"]/a[contains(@href, "/zh/genre/")]/@title'))  # join all genres into one string
        views = parse_html.xpath('//div[@class="mt-4 d-flex ga-2 align-center"]/span[contains(., "浏览量:")]/following-sibling::span/text()')[0]
        img = parse_html.xpath('//meta[@name="twitter:image"]/@content')[0]

        return number, category, views, img

    # Save the first-level data to a CSV file
    def save_html(self, movies_data):
        # Open the file in append mode
        with open('search.csv', 'a', newline='', encoding="utf-8-sig") as f:
            # Create a csv writer
            writer = csv.writer(f)
            writer.writerow(['标题', '演员', '链接', '类型', '番号', '播放量'])

            for movie in movies_data:

                title = movie['title']
                if movie['actors']:
                    writer.writerow([title, ', '.join(movie['actors'])])
                else:
                    writer.writerow([title, '未知演员'])

    # Rewrite the CSV file, adding the detail-page data
    def save_html_2(self, number_list, category_list, views_list, url_list):

        df = pd.read_csv('search.csv', encoding='utf-8-sig')

        # Update the last N1 rows in place with loc
        N1 = len(url_list)
        N2 = len(df['标题'])
        df.loc[N2 - N1: N2, '链接'] = url_list
        df.loc[N2 - N1: N2, '类型'] = category_list
        df.loc[N2 - N1: N2, '番号'] = number_list
        df.loc[N2 - N1: N2, '播放量'] = views_list

        # Write the result back to the file
        df.to_csv('search.csv', index=False, encoding='utf-8-sig')

    # Download the images
    def get_image(self, img_list, movies_data):

        title_list = []
        for movie in movies_data:

            title_list.append(movie['title'])

        for i, img_url in enumerate(img_list):

            filename = '{}.jpg'.format(title_list[i])
            self.save_image(img_url, filename)

    # Save a single image to disk
    def save_image(self, img_url, filename):

        folder_path = r'D:\王沛杰\爬虫\imgs'
        # Make sure the folder exists
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        try:

            # Prepend the folder path to the file name
            filename = os.path.join(folder_path, filename)

            with requests.Session() as session:
                img = session.get(url = img_url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0'}).content
            with open(filename, 'wb') as f:
                f.write(img)
            print(filename, '下载成功')

        except requests.exceptions.RequestException as e:
            print(f'下载失败: {e}')
        except OSError as e:
            print(f'文件操作失败: {e}')

    # Entry point
    def run(self):

        word = input('请输入搜索内容:')
        begin = int(input('输入起始页:'))
        stop = int(input('输入终止页:'))

        for page in range(begin, stop + 1):
            # Build the URL for this page
            params = {
                'page': str(page)
            }
            word2 = parse.quote(word)
            result = parse.urlencode(params)
            url = self.url.format(word2, result)

            # Send the request and parse the results
            html = self.get_html(url)
            self.parse_html_1(html)

            # Progress message
            print('第%d页抓取成功' % page)
            # Sleep a random 1 to 2 seconds after each page
            time.sleep(random.randint(1, 2))

# Start the crawler when run as a script
if __name__ == '__main__':

    start = time.time()
    spider = Crawler()
    spider.run()
    end = time.time()

    print('执行时间: %.2f' % (end - start))

You need to change the image save path in save_image() to wherever you want the files to go.
If you do not want to download the images, comment out the call to self.get_image(img_list, movies_data) in the first-level parsing function; a minimal sketch of both changes follows.
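
The path shown below is only a placeholder, substitute your own directory:

# in save_image(): point folder_path at your own directory
folder_path = r'C:\my_crawler\imgs'  # placeholder path, change it to yours

# in parse_html_1(): skip the image download by commenting out this call
# self.get_image(img_list, movies_data)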

4· Final result

(screenshot: 微信图片_20251006160549_45_25.png)

Result screenshot 😋
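
If you would rather inspect the results in code than open the CSV in a spreadsheet, a minimal sketch that only assumes the search.csv file produced by the crawler above:

import pandas as pd

df = pd.read_csv('search.csv', encoding='utf-8-sig')
# Columns written by the crawler: 标题, 演员, 链接, 类型, 番号, 播放量
print(df.head())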