- import requests
- from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return the response body as text.

    The request uses a 30-second timeout, and the body is decoded with the
    encoding that requests detects from the content (``apparent_encoding``)
    rather than the one declared in the HTTP headers.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or the sentinel string '产生异常' when the
        request fails or the server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only network/HTTP failures map to the sentinel; a bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return '产生异常'
-
if __name__ == '__main__':
    # Local imports: this script block is the only user of these modules.
    import os
    import re

    # Download the poster of every film listed on the ten Douban Top 250
    # pages and save each one as "<rank>_<title> <rating>分.jpg".
    save_dir = 'G:\\test\\'
    # Create the target folder up front so open() cannot fail on a missing
    # directory.
    os.makedirs(save_dir, exist_ok=True)

    i = 0  # running rank across all pages, used as the file-name prefix
    urls = ['https://movie.douban.com/top250?start='+str(n)+'&filter=' for n in range(0,250,25)]
    for url in urls:
        r = getHTMLText(url)
        soup = BeautifulSoup(r, 'html.parser')
        titles = soup.select('div.hd a')
        rates = soup.select('span.rating_num')
        pics = soup.select('img[width="100"]')
        for title, rate, pic in zip(titles, rates, pics):
            data = {'title': list(title.stripped_strings),
                    'rate': rate.get_text(),
                    'pic': pic.get('src')}
            # Count the entry even if its download fails below, so the
            # numbering of the remaining files still follows list order.
            i += 1
            fileName = str(i)+'_'+data['title'][0]+' '+data['rate']+'分.jpg'
            # Replace characters that Windows does not allow in file names;
            # the title is scraped text and may contain any of them.
            fileName = re.sub(r'[\\/:*?"<>|]', '_', fileName)
            try:
                # Bound the wait and reject HTTP error responses so that an
                # error page is never written to disk as a .jpg.
                pic1 = requests.get(data['pic'], timeout=30)
                pic1.raise_for_status()
            except requests.RequestException as err:
                # One unavailable image should not abort the whole crawl.
                print('skipped %s: %s' % (fileName, err))
                continue
            with open(save_dir + fileName, 'wb') as photo:
                photo.write(pic1.content)
            print(data)