经验首页 前端设计 程序设计 Java相关 移动开发 数据库/运维 软件/图像 大数据/云计算 其他经验
当前位置:技术经验 » 程序设计 » Python » 查看文章
爬虫:爬取男人团女优们的封面
来源:cnblogs  作者:xxxxf  时间:2018/11/17 14:56:33  对本文有异议

爬取后发现有一些女优并没有新建文件夹存放,居然有些title属性为空。

最主要的是没有使用代理,IP被封了。。。。。。。。。

将同一个女优的图片放到同一个文件夹,用

  1. threading.Lock()

防止新建文件夹错误,但注释掉后还能正常运行,有待观察

 

import os
import re
import threading
from queue import Empty, Queue
from threading import Lock
from urllib import request

import requests
from lxml import etree
    base = 'http://nanrenvip.xyz'
    headers = {
    'User-Agent': '~~~~~~~~~~~~~~~',
    'Referer': 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-0.html'
    }


    class Producer(threading.Thread):
    """在女优列表中提取每人的姓名和链接,name 用于分文件夹, 最后一直传递到img中"""
    def __init__(self, pages, women_pages, *args, **kwargs):
    super(Producer, self).__init__(*args, **kwargs)
    self.pages = pages
    self.women_pages = women_pages

    def run(self):
    while True:
    if self.pages.empty():
    break
    url = self.pages.get()
    self.get_women(url)

    def get_women(self, url):
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    box = html.xpath('//div[@class="list_box"]//div[@class="list_l"]//li')[:15]
    for each in box:
    name = each.xpath('./a/@title')[0]
    her_url = each.xpath('./a/@href')[0]
    women = {'name': name, 'url': base + her_url}
    self.women_pages.put(women)


    class Producer_2(threading.Thread):
    """获取女优详情页所有番号链接以及标题, 标题用来命名图片"""
    def __init__(self, pages, women_pages, avs, *args, **kwargs):
    super(Producer_2, self).__init__(*args, **kwargs)
    self.pages = pages
    self.women_pages = women_pages
    self.avs = avs

    def run(self):
    while True:
    if self.pages.empty() and self.women_pages.empty():
    break
    women = self.women_pages.get()
    self.get_av_list(women)

    def get_av_list(self, women):
    url = women['url']
    name = women['name']
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    lst = html.xpath('//div[@class="zp_list"]')[0]
    text = etree.tostring(lst, encoding='utf-8').decode('utf-8')
    avs = re.findall(r'<a href="(.*?)">(.*?)</a>', text, re.DOTALL)
    for each in avs:
    her_url = base + each[0]
    her_title = each[1]
    av_list = {'url': her_url, 'title': her_title, 'name': name}
    self.avs.put(av_list)


    class Producer_3(threading.Thread):
    """获取该番号的图片src, 加上name title"""
    def __init__(self, pages, women_pages, avs, imgs, *args, **kwargs):
    super(Producer_3, self).__init__(*args, **kwargs)
    self.pages = pages
    self.women_pages = women_pages
    self.avs = avs
    self.imgs = imgs

    def run(self):
    while True:
    if self.pages.empty() and self.women_pages.empty() and self.avs.empty():
    break
    av = self.avs.get()
    self.get_imgs(av)

    def get_imgs(self, av):
    url = av['url']
    name = av['name']
    title = av['title']
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    tar = html.xpath('//div[@class="artCon"]')[0]
    text = etree.tostring(tar, encoding='utf-8').decode('utf-8')
    src = re.findall(r'data-original="(.*?)"', text, re.DOTALL)[0]
    src = base + src
    img = {}
    img['name'] = name
    img['title'] = title
    img['src'] = src
    self.imgs.put(img)


    class Consumer(threading.Thread):
    """下载图片, lock 防止出现新建文件夹时错误"""
    def __init__(self, pages, women_pages, avs, imgs, lock, *args, **kwargs):
    super(Consumer, self).__init__(*args, **kwargs)
    self.pages = pages
    self.women_pages = women_pages
    self.avs = avs
    self.imgs = imgs
    self.lock = lock

    def run(self):
    while True:
    if self.pages.empty() and self.women_pages.empty() and self.avs.empty() and self.imgs.empty():
    break
    img = self.imgs.get()
    self.download(img)

    def download(self, img):
    src = img['src']
    name = img['name']
    title = img['title']
    self.lock.acquire()
    if not os.path.exists('vip/'+name):
    os.makedirs('vip/'+name)
    self.lock.release()
    try:
    request.urlretrieve(src, './vip/%s/%s.jpg' % (name, title))
    except:
    print(src)


    def main():
    base_url = 'http://nanrenvip.xyz/nvyouku/1-0-0-0-0-0-{}.html'
    pages = Queue(60)
    women_pages = Queue(1000)
    avs = Queue(100000)
    imgs = Queue(100000)
    lock = threading.Lock()
    for x in range(55):
    url = base_url.format(x)
    pages.put(url)
    for x in range(2):
    Producer(pages, women_pages).start()
    for x in range(10):
    Producer_2(pages, women_pages, avs).start()
    for x in range(10):
    Producer_3(pages, women_pages, avs, imgs).start()
    for x in range(10):
    Consumer(pages, women_pages, avs, imgs, lock).start()


    if __name__ == '__main__':
    main()
 友情链接:直通硅谷  点职佳  北美留学生论坛

本站QQ群:前端 618073944 | Java 606181507 | Python 626812652 | C/C++ 612253063 | 微信 634508462 | 苹果 692586424 | C#/.net 182808419 | PHP 305140648 | 运维 608723728

W3xue 的所有内容仅供测试,对任何法律问题及风险不承担任何责任。通过使用本站内容随之而来的风险与本站无关。
关于我们  |  意见建议  |  捐助我们  |  报错有奖  |  广告合作、友情链接(目前9元/月)请联系QQ:27243702 沸活量
皖ICP备17017327号-2 皖公网安备34020702000426号