class Crawler(object):
    """Scrape avgc1.com search results.

    For each search page: save titles/actors to ``search.csv``, then visit
    every movie's detail page to fill in number/category/views and download
    the cover image.
    """

    def __init__(self):
        # Search URL template: first {} = URL-quoted keyword,
        # second {} = urlencoded query string (page number).
        self.url = 'https://www.avgc1.com/zh/search/{}?{}'

    def get_html(self, url):
        """Fetch *url* with a desktop browser User-Agent and return the body text."""
        html = requests.get(
            url=url,
            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0'},
        ).text
        return html

    def parse_html_1(self, html):
        """Parse one search-result page.

        Saves title/actor rows, then crawls every linked detail page for
        number/category/views and downloads each cover image.
        """
        soup = BeautifulSoup(html, 'html.parser')
        movies_data = []
        for movie in soup.find_all('div', class_='v-card-text pa-3'):
            title_element = movie.find('h5')
            title = title_element.get_text(strip=True) if title_element else '未知标题'
            # Comprehension replaces the manual index-counter loop.
            actors = [span.get_text().strip()
                      for span in movie.find_all('span', class_='v-btn__content')]
            movies_data.append({'title': title, 'actors': actors})
        self.save_html(movies_data)

        parse_html = etree.HTML(html)
        href_list = parse_html.xpath('//div[contains(@class, "grid")]//a[contains(@href, "/zh/movies/")]/@href')
        number_list = []
        category_list = []
        views_list = []
        url_list = []
        img_list = []
        for href in href_list:
            url_2 = 'https://www.avgc1.com{}'.format(href)
            html_2 = self.get_html(url_2)
            number, category, views, img = self.parse_html_2(html_2)
            # BUG FIX: the original `list.append(x) if x else "无"` never
            # appended anything when x was falsy, desynchronizing the four
            # parallel lists. Substitute the placeholder value instead.
            number_list.append(number if number else '无')
            category_list.append(category if category else '无')
            views_list.append(views if views else '无')
            img_list.append(img)
            url_list.append(url_2)
        self.save_html_2(number_list, category_list, views_list, url_list)
        self.get_image(img_list, movies_data)

    @staticmethod
    def _first(values, default=''):
        """Return the first item of an xpath result list, or *default* when empty."""
        return values[0] if values else default

    def parse_html_2(self, html_2):
        """Extract (number, category, views, cover-image URL) from a detail page.

        BUG FIX: the original indexed ``[0]`` unconditionally and raised
        IndexError whenever a field was absent; missing fields now yield ''.
        """
        parse_html = etree.HTML(html_2)
        number = self._first(parse_html.xpath('//div[@class="mt-4 d-flex ga-2 align-center"]/h2/text()'))
        category = ', '.join(parse_html.xpath('//div[@class="mt-4 flex flex-wrap ga-2 align-center"]/a[contains(@href, "/zh/genre/")]/@title'))
        views = self._first(parse_html.xpath('//div[@class="mt-4 d-flex ga-2 align-center"]/span[contains(., "浏览量:")]/following-sibling::span/text()'))
        img = self._first(parse_html.xpath('//meta[@name="twitter:image"]/@content'))
        return number, category, views, img

    def save_html(self, movies_data):
        """Append one row per movie to search.csv.

        BUG FIX: the original wrote the 6-column header on *every* call
        (once per crawled page), injecting header rows into the data;
        the header is now written only when the file is new/empty.
        """
        need_header = (not os.path.exists('search.csv')
                       or os.path.getsize('search.csv') == 0)
        with open('search.csv', 'a', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            if need_header:
                writer.writerow(['标题', '演员', '链接', '类型', '番号', '播放量'])
            for movie in movies_data:
                actors = ', '.join(movie['actors']) if movie['actors'] else '未知演员'
                writer.writerow([movie['title'], actors])

    def save_html_2(self, number_list, category_list, views_list, url_list):
        """Fill link/category/number/views into the last len(url_list) CSV rows.

        BUG FIX: ``df.loc[N2-N1:N2, …]`` is end-inclusive and therefore
        selected N1+1 rows for N1 values; we now target exactly the last
        ``len(url_list)`` row labels.
        """
        df = pd.read_csv('search.csv', encoding='utf-8-sig')
        n = len(url_list)
        if n == 0:
            return  # nothing crawled on this page; avoid an empty-slice write
        tail = df.index[-n:]
        df.loc[tail, '链接'] = url_list
        df.loc[tail, '类型'] = category_list
        df.loc[tail, '番号'] = number_list
        df.loc[tail, '播放量'] = views_list
        df.to_csv('search.csv', index=False, encoding='utf-8-sig')

    def get_image(self, img_list, movies_data):
        """Download every cover image, naming each file after its movie title."""
        for movie, img_url in zip(movies_data, img_list):
            if img_url:  # skip detail pages with no twitter:image meta tag
                self.save_image(img_url, '{}.jpg'.format(movie['title']))

    def save_image(self, img_url, filename):
        """Save one cover image into the target folder; errors are printed, not raised."""
        folder_path = r'D:\王沛杰\爬虫\imgs'
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        # BUG FIX: titles may contain characters Windows forbids in file
        # names; strip them so open() does not fail.
        filename = ''.join(c for c in filename if c not in '\\/:*?"<>|')
        try:
            filename = os.path.join(folder_path, filename)
            with requests.Session() as session:
                img = session.get(
                    url=img_url,
                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0'},
                ).content
            with open(filename, 'wb') as f:
                f.write(img)
            print(filename, '下载成功')
        except requests.exceptions.RequestException as e:
            print(f'下载失败: {e}')
        except OSError as e:
            print(f'文件操作失败: {e}')

    def run(self):
        """Interactive entry point: prompt for a keyword and page range, crawl each page."""
        word = input('请输入搜索内容:')
        begin = int(input('输入起始页:'))
        stop = int(input('输入终止页:'))
        for page in range(begin, stop + 1):
            params = {'page': str(page)}
            word2 = parse.quote(word)
            result = parse.urlencode(params)
            url = self.url.format(word2, result)
            html = self.get_html(url)
            self.parse_html_1(html)
            print('第%d页抓取成功' % page)
            # Polite random delay between pages to avoid hammering the server.
            time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Time the whole crawl and report elapsed seconds on exit.
    started_at = time.time()
    crawler = Crawler()
    crawler.run()
    finished_at = time.time()
    print('执行时间: %.2f' % (finished_at - started_at))