A quick experiment with crawling the photo galleries on the 27270 site.

Environment:
python3
Libraries:
urllib
BeautifulSoup
lxml

The goal is to download the images on the site's static pages. The flow: first scrape a proxy pool from an IP-proxy listing site, then extract the image links from the static listing pages, and finally download the images. Two pieces are needed:
1. A crawler that downloads IP proxies
2. A downloader that imitates a browser (a minimal sketch follows this list)
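A minimal sketch of these two steps together; the proxy address below is a placeholder, and in the full code the pool is scraped from a listing site instead:

import random
import urllib.request

proxies = ['http://1.2.3.4:8080']  # step 1: the scraped proxy pool (placeholder entry)
opener = urllib.request.build_opener(
    urllib.request.ProxyHandler({'http': random.choice(proxies)}))
opener.addheaders = [  # step 2: present a browser-like User-Agent
    ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko')]
html = opener.open('http://www.27270.com/').read()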
Page analysis
<body>
  ... (other page markup omitted)
  <div>
    ... (other page markup omitted)
    <div class="MeinvTuPianBox">
      <ul>
        ... (other page markup omitted)
        <li> <a href="*****" title="******" class="MMPic" target="_blank"><i><img src="*****" width="190" height="280" alt="*****" /></i></a>
        ... (other page markup omitted)
        </li>
        ... (other page markup omitted)
      </ul>
    </div>
  </div>
</body>
The snippet above shows how the elements nest, which pins down the path to the target <img>:
body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img
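The selector can be sanity-checked in isolation before wiring it into the crawler. The fragment below is a stand-in with dummy attribute values:

from bs4 import BeautifulSoup

fragment = '''
<body><div><div class="MeinvTuPianBox"><ul>
  <li><a href="page.html" title="t" class="MMPic" target="_blank">
    <i><img src="pic.jpg" width="190" height="280" alt="name" /></i></a></li>
</ul></div></div></body>'''

soup = BeautifulSoup(fragment, 'lxml')
for img in soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img'):
    print(img.get('src'), img.get('alt'))  # -> pic.jpg name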
The full code:
from urllib.request import urlopen
import urllib.request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import os
import sys
import time
import http.cookiejar
import random

base_url = 'http://www.27270.com/'  # listing pages live under ent/meinvtupian/list_11_%s.html
base_dir = ''
proxy_ip = []
#class myThread(threading.Thread):
#    def __init__(self, start, end):
#        threading.Thread.__init__(self)
#        # NOTE: assigning self.start here shadows threading.Thread.start(),
#        # so this class cannot actually be started; see the fixed sketch at
#        # the end of this post
#        self.start = start
#        self.end = end
#
#    def run(self):
#        print("Starting thread: " + self.name)
#        get_url_list(self.start, self.end)
#        print("Exiting thread: " + self.name)
# Build the IP proxy pool by scraping a proxy listing site
def getProxyIp():
    proxy = []
    for i in range(1, 3):
        header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                                'AppleWebKit/537.36 (KHTML, like Gecko) '
                                'Ubuntu Chromium/44.0.2403.89 '
                                'Chrome/44.0.2403.89 '
                                'Safari/537.36'}
        req = urllib.request.Request(url='http://www.xicidaili.com/nt/{0}'.format(i), headers=header)
        r = urllib.request.urlopen(req)
        soup = BeautifulSoup(r, 'html.parser', from_encoding='utf-8')
        table = soup.find('table', attrs={'id': 'ip_list'})
        tr = table.find_all('tr')[1:]
        # Parse each row into the proxy's address and port
        for item in tr:
            tds = item.find_all('td')
            addr = "{0}:{1}".format(tds[1].get_text().lower(), tds[2].get_text())
            proxy.append("http://" + addr)
    return proxy
# Pick a random proxy from the pool
def getIP():
    return random.choice(proxy_ip)
def makeMyOpener(head={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        }):
    proxy_addr = getIP()
    print(proxy_addr)
    cj = http.cookiejar.CookieJar()
    # The proxy must be installed via ProxyHandler; appending it to the
    # request headers as ('http', proxy), as the original code did, has
    # no effect on how the request is routed
    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http': proxy_addr}),
        urllib.request.HTTPCookieProcessor(cj))
    opener.addheaders = list(head.items())
    return opener
# Download one image into base_dir/<page index>/<file_name>
def download(url, file_name, index):
    dir_path = base_dir + str(index) + '/'
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)
    path = dir_path + file_name
    try:
        with urlopen(url, timeout=30) as r:
            content = r.read()
        with open(path, 'wb') as f:
            f.write(content)
        # time.sleep(1)
    except OSError as e:
        # Skip images that fail, but say why instead of failing silently
        print('download failed: {0} ({1})'.format(url, e))
def get_url_list(index, end):
    # 'end' is only meaningful for the threaded variants below; the
    # single-threaded crawl stops when a listing page has no images left
    try:
        oper = makeMyOpener()
        url = 'http://www.27270.com/ent/meinvtupian/list_11_%s.html' % index
        html = oper.open(url)
        # Method 1: bsObj.findAll('img') would grab every image on the page
        # Method 2: a CSS selector pinned to the structure analysed above
        soup = BeautifulSoup(html, 'lxml')
        girl_list = soup.select('body > div > div.MeinvTuPianBox > ul > li > a.MMPic > i > img')
        if not girl_list:
            print('Finished crawling all pages')
            sys.exit(0)
        # Method 3: XPath via lxml, e.g.
        #   sel.xpath("//div[@class='MeinvTuPianBox']/ul/li/a[@class='MMPic']/i/img")
        # Method 4: regular expressions (omitted here)
        mm_down = []
        mm_names = []
        for photo in girl_list:
            mm_down.append(photo.get('src'))
            mm_names.append(photo.get('alt'))
        for girl, name in zip(mm_down, mm_names):
            download(girl, name + '.jpg', index)
            print(girl + name)
        index = index + 1
        get_url_list(index, end)  # recurse into the next listing page
    except HTTPError as e:
        print('HTTPError ' + str(e.code))
        get_url_list(index, end)  # retry the same page with another random proxy
    except URLError as e:
        print('URLError ' + str(e.reason))
        get_url_list(index, end)
if __name__ == '__main__':
    proxy_ip = getProxyIp()
    base_dir = 'E:/cache-work/python3/images1/'
    if not os.path.isdir(base_dir):
        os.makedirs(base_dir)
    get_url_list(163, 100)  # start from listing page 163; 'end' is unused here
"""
try:
_thread.start_new_thread( get_url_list, ( 1,35, ) )
_thread.start_new_thread(get_url_list, ( 35,70, ) )
_thread.start_new_thread( get_url_list, ( 70,110, ) )
_thread.start_new_thread( get_url_list, ( 110,150, ) )
_thread.start_new_thread( get_url_list, ( 150,500,) )
except:
print ("Error: 无法启动线程")
while 1:
pass
"""
"""
thread1= myThread( 1,35)
thread2= myThread(35,70)
thread3= myThread(70,110)
thread4= myThread(110,150)
thread5= myThread(150,1000)
thread1.start()
thread2.start()
thread3.start()
thread4.start()
thread5.start()
"""
# 创建两个线程
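The commented-out myThread class cannot work as written: assigning self.start in __init__ overwrites threading.Thread's own start() method, so thread1.start() would try to call an integer. A fixed sketch of the threaded variant; the page ranges are illustrative, and it assumes get_url_list stops at its end argument:

import threading

class CrawlThread(threading.Thread):
    def __init__(self, first, last):
        threading.Thread.__init__(self)
        # named 'first'/'last' because 'start' is taken by Thread.start()
        self.first = first
        self.last = last

    def run(self):
        print("Starting thread: " + self.name)
        get_url_list(self.first, self.last)
        print("Exiting thread: " + self.name)

threads = [CrawlThread(1, 35), CrawlThread(35, 70)]
for t in threads:
    t.start()
for t in threads:
    t.join()  # wait for every range to finish instead of busy-waiting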