The final crawling result is shown in the screenshot above. Note that the domain is blocked by the GFW, so the site can only be reached via its IP address; the code already handles this.
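The "handling" is a plain string substitution: every scraped link has the blocked domain swapped for the server's IP before it is requested. A minimal sketch of the idea (the helper name here is illustrative; the full crawler below performs the same substitution inline):

BLOCKED_DOMAIN = 'http://www.missdica.com'
MIRROR_IP = 'http://115.68.13.42'

def to_ip_url(link):
    # Swap the blocked domain for the directly reachable IP.
    return link.replace(BLOCKED_DOMAIN, MIRROR_IP)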
# -*- coding:utf-8 -*-
import os
#
# http://115.68.13.42/studio_md
import requests
from bs4 import BeautifulSoup

HEADERS = {
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'Referer': 'http://www.a6d.cn/'
}

DIR_PATH = "H:/图片/missdica.com.gallery"

def get_url_source_code(url):
    # Fetch a page and decode its body as UTF-8.
    source = requests.get(url=url, headers=HEADERS, timeout=10)
    html = source.content
    html_doc = str(html, 'utf-8')
    return html_doc
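
# Note (not in the original): str(source.content, 'utf-8') assumes the page is
# UTF-8. requests can also decode for you via source.text, which honours the
# charset declared in the response headers.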

def save_pic(url, path):
    try:
        img_name = os.path.join(path, str(url).split('/')[-1])
        print('[S] Saving image to: ' + path + ' \r\nURL: ' + url)
        durl = url
        # '_S' marks a thumbnail; strip it to download the full-size image.
        if '_S' in img_name:
            durl = str(url).replace('_S', '')
            img_name = img_name.replace('_S', '')
        if os.path.isfile(img_name):
            print('[F] File already exists, skipping')
            return False
        req = requests.get(durl, headers=HEADERS, timeout=10)
        img = req.content
        if 'found on this server' in req.text:
            print('[E] File not found on the server, skipping; removing local file')
            try:
                os.remove(img_name)
            except OSError:
                pass
            return False
        with open(img_name, 'wb') as f:
            f.write(img)
        print('[S] Image downloaded successfully')
    except Exception as e:
        print('[E] Image download failed: ' + str(e))
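
# Note (not in the original): for large files, requests.get(durl, stream=True)
# combined with req.iter_content(chunk_size=8192) would avoid holding the whole
# image in memory; the gallery images here are small, so req.content is fine.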

def mark_dir(folder_name):
    """
    Create the folder if it does not exist yet; skip it if it already does.
    """
    print('[C] Creating directory: ' + folder_name)
    path = os.path.join(DIR_PATH, folder_name)
    if not os.path.exists(path):  # check whether the folder already exists
        os.makedirs(path)
    os.chdir(path)
    return path
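
# Note (not in the original): on Python 3.2+ the existence check can be folded
# into a single call: os.makedirs(path, exist_ok=True).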

def get_all_title_and_links(page_url):
    print('-' * 70)
    print('[A] Parsing all sub-page entries......')
    # e.g. http://115.68.13.42/index.php?mid=studio_md&page=2
    html_doc = get_url_source_code(page_url)
    bs = BeautifulSoup(html_doc, "html.parser")
    zzr = bs.find_all('a', class_='title black')
    ll = []
    for z in zzr:
        url = str(z.get("href")).replace('http://www.missdica.com', 'http://115.68.13.42')
        # Strip whitespace and characters that are not allowed in Windows paths.
        name = str(z.get_text()).replace(' ', '')\
            .replace('\r', '').replace('\n', '')\
            .replace('\t', '').replace('\'', '')\
            .replace('"', '').replace('?', '')\
            .replace('.', '').replace(':', '')\
            .replace('\\', '').replace('~', '').replace('^', '').replace('*', '')
        i = {
            'url': url,
            'name': name
        }
        print('[*] Name: ' + name + ' URL: ' + url)
        ll.append(i)
    print('[A] Total entries: ' + str(len(ll)) + ', parsing complete')
    print('-' * 70)
    return ll

def download_all_image_in_links(url, name):
    print('_' * 70)
    print('[A] Parsing image addresses......')
    html_doc = get_url_source_code(url)
    soup = BeautifulSoup(html_doc, "html.parser")
    # The post body lives in <div class="xe_content"> (e.g. document_2501334_962455).
    content = soup.find('div', class_='xe_content')
    urls = []
    surls = content.find_all('img')
    folder = mark_dir(name)
    for u in surls:
        org_link = str(u.get("src"))
        # Turn relative links into absolute ones pointing at the IP.
        if not org_link.startswith('http://'):
            if org_link.startswith('/'):
                org_link = 'http://115.68.13.42' + org_link
            else:
                org_link = 'http://115.68.13.42/' + org_link
        link = org_link.replace('http://www.missdica.com', 'http://115.68.13.42')\
            .replace('http://missdica.com', 'http://115.68.13.42')
        if link not in urls:
            urls.append(link)
            save_pic(link, folder)
    print('_' * 70)
    print('[A] Image download finished, ' + str(len(urls)) + ' images on this page')
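
# Note (not in the original): `urls` is only used for de-duplication; a set
# would make the membership check O(1), though pages here are small.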

if __name__ == '__main__':
    print('*' * 80)
    print('Korean beauties image downloader')
    print('http://www.missdica.com')
    print('by: obaby')
    print('http://www.obaby.org.cn')
    print('http://www.h4ck.org.cn')
    print('*' * 80)
    for i in range(1, 248):
        print('[S] Starting download of page', i)
        # Alternative board:
        # page_list = get_all_title_and_links('http://115.68.13.42/index.php?mid=studio_md&page=' + str(i))
        page_list = get_all_title_and_links('http://115.68.13.42/index.php?mid=gal_event&page=' + str(i))
        for p in page_list:
            download_all_image_in_links(p['url'], p['name'])
        print('[S] Page', i, 'download complete')
        print('_' * 100)
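
To crawl a different board, change the mid query parameter in the list URL (the commented-out line above uses studio_md instead of gal_event) and adjust the hard-coded range of 247 pages to match.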