爬取后发现有些女优的图片并没有存放到新建的文件夹里,检查后发现居然有些 title 属性为空。
最主要的问题是没有使用代理,IP 被封了。
将同一个女优的图片放到同一个文件夹,用 `threading.Lock()` 防止新建文件夹时出错;但把锁注释掉后仍能正常运行,有待观察。
import os
import re
import threading
from queue import Empty, Queue
from threading import Lock
from urllib import request

import requests
from lxml import etree
# Site root; relative links scraped from pages are appended to this prefix.
base = 'http://nanrenvip.xyz'

# HTTP headers passed to requests.get() by the page-fetching workers.
# NOTE(review): the User-Agent value looks like a redacted placeholder --
# confirm and supply a real browser string before running.
headers = {}
headers['User-Agent'] = '~~~~~~~~~~~~~~~'
headers['Referer'] = 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-0.html'
class Producer(threading.Thread):
    """Stage 1 worker: turn listing-page URLs into performer records.

    Takes listing-page URLs from ``pages``, extracts each performer's name
    and detail-page link, and puts ``{'name': ..., 'url': ...}`` dicts onto
    ``women_pages``.  The name is used downstream as the per-performer
    folder name and is carried through to the image stage.
    """

    # Seconds to wait for the HTTP response; without a timeout a stalled
    # connection would block this worker forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, pages, women_pages, *args, **kwargs):
        """Store the input queue (``pages``) and output queue (``women_pages``).

        Extra positional/keyword arguments are forwarded to ``Thread``.
        """
        super().__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages

    def run(self):
        """Process listing pages until ``pages`` is exhausted."""
        while True:
            # get_nowait() makes "is there work?" and "take it" one atomic
            # step.  A separate empty() check followed by a blocking get()
            # lets another worker grab the last URL in between, leaving this
            # thread blocked forever.
            try:
                url = self.pages.get_nowait()
            except Empty:
                break
            # One bad page must not kill the worker and strand the
            # remaining URLs.
            try:
                self.get_women(url)
            except Exception as exc:
                print('listing page failed: %s (%s)' % (url, exc))

    def get_women(self, url):
        """Fetch one listing page and enqueue a record per performer found.

        Only the first 15 ``<li>`` entries of the list box are considered.
        Entries whose link lacks a ``title`` or ``href`` attribute are
        skipped.
        """
        response = requests.get(url, headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.content.decode('utf-8'))
        entries = html.xpath(
            '//div[@class="list_box"]//div[@class="list_l"]//li')[:15]
        for entry in entries:
            titles = entry.xpath('./a/@title')
            hrefs = entry.xpath('./a/@href')
            if not titles or not hrefs:
                # Malformed entry: nothing to follow or to name a folder by.
                continue
            self.women_pages.put({'name': titles[0], 'url': base + hrefs[0]})
class Producer_2(threading.Thread):
    """Stage 2 worker: expand a performer record into its list of works.

    Takes ``{'name', 'url'}`` records from ``women_pages``, fetches the
    detail page, and puts one ``{'url', 'title', 'name'}`` dict per linked
    work onto ``avs``.  The title is later used to name the image file.

    Known limitation: the exit test only looks at queue emptiness, so a
    worker may still exit early while an upstream worker is in the middle of
    a request and has not yet enqueued its results.
    """

    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Seconds to wait for new work before re-checking the exit condition.
    POLL_INTERVAL = 1
    # Captures (href, link text) for every anchor in the serialized list.
    _LINK_RE = re.compile(r'<a href="(.*?)">(.*?)</a>', re.DOTALL)

    def __init__(self, pages, women_pages, avs, *args, **kwargs):
        """Store the upstream queues and the output queue ``avs``.

        Extra positional/keyword arguments are forwarded to ``Thread``.
        """
        super().__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages
        self.avs = avs

    def run(self):
        """Process performer records until all upstream queues are empty."""
        while True:
            if self.pages.empty() and self.women_pages.empty():
                break
            # A bounded wait instead of a blocking get(): if the queue is
            # drained between the check above and this call, the worker
            # loops back and re-evaluates the exit test rather than hanging.
            try:
                women = self.women_pages.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            # Keep the worker alive when a single page fails.
            try:
                self.get_av_list(women)
            except Exception as exc:
                print('detail page failed: %s (%s)' % (women, exc))

    def get_av_list(self, women):
        """Fetch one performer's detail page and enqueue each linked work.

        Pages without a ``zp_list`` container are skipped.
        """
        response = requests.get(women['url'], headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.content.decode('utf-8'))
        sections = html.xpath('//div[@class="zp_list"]')
        if not sections:
            return
        # The links are pulled out of the serialized markup with a regex.
        markup = etree.tostring(sections[0], encoding='utf-8').decode('utf-8')
        for href, title in self._LINK_RE.findall(markup):
            self.avs.put(
                {'url': base + href, 'title': title, 'name': women['name']})
class Producer_3(threading.Thread):
    """Stage 3 worker: resolve the image URL for a single work.

    Takes ``{'url', 'title', 'name'}`` records from ``avs``, fetches the
    work's page, and puts ``{'name', 'title', 'src'}`` onto ``imgs``.

    Known limitation: the exit test only looks at queue emptiness, so a
    worker may still exit early while an upstream worker is in the middle of
    a request and has not yet enqueued its results.
    """

    # Seconds to wait for the HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Seconds to wait for new work before re-checking the exit condition.
    POLL_INTERVAL = 1
    # Captures the value of each data-original attribute in the markup.
    _SRC_RE = re.compile(r'data-original="(.*?)"', re.DOTALL)

    def __init__(self, pages, women_pages, avs, imgs, *args, **kwargs):
        """Store the upstream queues and the output queue ``imgs``.

        Extra positional/keyword arguments are forwarded to ``Thread``.
        """
        super().__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages
        self.avs = avs
        self.imgs = imgs

    def run(self):
        """Process work records until all upstream queues are empty."""
        while True:
            if (self.pages.empty() and self.women_pages.empty()
                    and self.avs.empty()):
                break
            # Bounded wait so a drained queue leads back to the exit test
            # instead of blocking this thread forever.
            try:
                av = self.avs.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            # Keep the worker alive when a single page fails.
            try:
                self.get_imgs(av)
            except Exception as exc:
                print('work page failed: %s (%s)' % (av, exc))

    def get_imgs(self, av):
        """Fetch one work page and enqueue its first image source.

        Pages without an ``artCon`` container, or without any
        ``data-original`` attribute inside it, are skipped.
        """
        response = requests.get(av['url'], headers=headers,
                                timeout=self.REQUEST_TIMEOUT)
        html = etree.HTML(response.content.decode('utf-8'))
        containers = html.xpath('//div[@class="artCon"]')
        if not containers:
            return
        markup = etree.tostring(containers[0], encoding='utf-8').decode('utf-8')
        sources = self._SRC_RE.findall(markup)
        if not sources:
            return
        # Only the first match is used; it is appended to the site root.
        self.imgs.put({
            'name': av['name'],
            'title': av['title'],
            'src': base + sources[0],
        })
class Consumer(threading.Thread):
    """Final stage worker: download images into per-performer folders.

    Takes ``{'name', 'title', 'src'}`` records from ``imgs`` and saves each
    image as ``vip/<name>/<title>.jpg``.  ``lock`` serializes folder
    creation across consumers.

    Known limitation: the exit test only looks at queue emptiness, so a
    worker may still exit early while an upstream worker is in the middle of
    a request and has not yet enqueued its results.
    """

    # Seconds to wait for new work before re-checking the exit condition.
    POLL_INTERVAL = 1
    # Path separators, characters rejected by common filesystems, and
    # control characters; each is replaced before building a path.
    _UNSAFE_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self, pages, women_pages, avs, imgs, lock, *args, **kwargs):
        """Store the upstream queues, the input queue ``imgs`` and ``lock``.

        Extra positional/keyword arguments are forwarded to ``Thread``.
        """
        super().__init__(*args, **kwargs)
        self.pages = pages
        self.women_pages = women_pages
        self.avs = avs
        self.imgs = imgs
        self.lock = lock

    @classmethod
    def _safe_component(cls, text, fallback):
        """Return ``text`` made usable as one path component, or ``fallback``."""
        cleaned = cls._UNSAFE_CHARS.sub('_', text).strip(' .')
        return cleaned or fallback

    def run(self):
        """Download images until every queue in the pipeline is empty."""
        while True:
            if (self.pages.empty() and self.women_pages.empty()
                    and self.avs.empty() and self.imgs.empty()):
                break
            # Bounded wait so a drained queue leads back to the exit test
            # instead of blocking this thread forever.
            try:
                img = self.imgs.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            # Keep the worker alive when a single item fails.
            try:
                self.download(img)
            except Exception as exc:
                print('download failed: %s (%s)' % (img, exc))

    def download(self, img):
        """Save ``img['src']`` to ``vip/<name>/<title>.jpg``.

        Name and title are sanitized first: an empty name would otherwise
        drop files straight into ``vip/``, and a title containing a path
        separator would point at a directory that does not exist.  Download
        errors are reported on stdout and do not propagate.
        """
        src = img['src']
        name = self._safe_component(img['name'], 'unknown')
        title = self._safe_component(img['title'], 'untitled')
        folder = os.path.join('vip', name)
        # "with" guarantees the lock is released even if makedirs raises;
        # a manual acquire/release pair would leave it held and stall every
        # other consumer.
        with self.lock:
            os.makedirs(folder, exist_ok=True)
        try:
            request.urlretrieve(src, os.path.join(folder, title + '.jpg'))
        except Exception as exc:
            # Best-effort: report the failing URL and the reason, move on.
            print(src, exc)
def main():
    """Build the pipeline queues, seed the listing URLs and start all workers.

    Listing pages with indices 0 through 54 are queued, then 2 ``Producer``,
    10 ``Producer_2``, 10 ``Producer_3`` and 10 ``Consumer`` threads are
    started in that order.  The function returns without joining them.
    """
    listing_template = 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-{}.html'

    # Bounded queues linking the stages:
    # pages -> women_pages -> avs -> imgs.
    pages = Queue(60)
    women_pages = Queue(1000)
    avs = Queue(100000)
    imgs = Queue(100000)
    # Shared by all consumers to serialize folder creation.
    lock = threading.Lock()

    for page_no in range(55):
        pages.put(listing_template.format(page_no))

    # (thread count, factory) per stage, in start-up order.
    stages = (
        (2, lambda: Producer(pages, women_pages)),
        (10, lambda: Producer_2(pages, women_pages, avs)),
        (10, lambda: Producer_3(pages, women_pages, avs, imgs)),
        (10, lambda: Consumer(pages, women_pages, avs, imgs, lock)),
    )
    for count, make_worker in stages:
        for _ in range(count):
            make_worker().start()
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()