import time
import traceback
import requests
from lxml import etree
import re
from bs4 import BeautifulSoup
from lxml.html.diff import end_tag
import json
import pymysql


# Connect to the database and return a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # create the connection
    conn = pymysql.connect(host="82.157.112.34",
                           user="root",
                           password="root",
                           db="MovieRankings",
                           charset="utf8")
    # create the cursor; result sets are returned as tuples by default
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("数据库连接成功!游标创建成功!")
    else:
        print("数据库连接失败!")
    return conn, cursor


# Close the database connection and the cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


def get_iqy():
    # get the total number of rows currently in the movieiqy table
    conn, cursor = get_conn()
    sql = "select count(*) from movieiqy"
    cursor.execute(sql)  # run the sql statement
    conn.commit()  # commit the transaction
    all_num = cursor.fetchall()[0][0]  # fetchall() returns nested tuples, e.g. ((count,),)
    pagenum = int(all_num / 48) + 1  # a possible starting page for the loop below (48 movies per page); not used further
    # print(pagenum)
    print("movieiqy数据库有", all_num, "条数据!")

    url = "https://pcw-api.iqiyi.com/search/recommend/list?channel_id=1&data_type=1&mode=11&page_id=1&ret_num=48&session=ee4d98ebb4e8e44c8d4b14fa90615fb7"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    }
    # response = requests.get(url=url, headers=headers)
    # response.encoding = "utf-8"
    # page_text = response.text
    # print(page_text)

    temp_list = []  # temporarily holds one movie's fields
    dataRes = []    # every movie's field list gets appended here
    for i in range(1, 137):  # pages 1-136; page 137 returns an empty JSON, i.e. everything has been crawled
        url_0 = "https://pcw-api.iqiyi.com/search/recommend/list?channel_id=1&data_type=1&mode=11&page_id="
        url_0 = url_0 + str(i) + "&ret_num=48&session=ad1d98bb953b7e5852ff097c088d66f2"
        print(url_0)  # print the assembled url
        response = requests.get(url=url_0, headers=headers)
        response.encoding = "utf-8"
        try:
            page_text = response.text
            # parse the JSON object; the try/except guards against the loop running past
            # the number of pages the site actually provides (an empty JSON payload)
            json_obj = json.loads(page_text)
            json_list = json_obj['data']['list']
        except:
            print("捕获异常!")
            return dataRes  # the JSON is empty, so the crawl is finished
        for j in json_list:  # walk the JSON list, one movie per element
            # print(json_list)
            name = j['name']  # movie title
            print(name)
            temp_list.append(name)
            # guard against movies that have no score
            try:
                score = j['score']  # movie score
                print(score)
                temp_list.append(score)
            except KeyError:
                print("评分---KeyError")
                temp_list.append("iqy暂无评分")  # placeholder string

            link = j['playUrl']  # movie link
            temp_list.append(link)
            # Parse the playback status. It is only encoded in the payMarkUrl image link,
            # so a regex pulls out the "vip"/"only" style marker. Sample links:
            #   独播 (exclusive): https://www.iqiyipic.com/common/fix/site-v4/video-mark/only.png
            #   VIP:             https://pic0.iqiyipic.com/common/20171106/ac/1b/vip_100000_v_601_0_21.png
            #   星钻 (diamond):   https://www.iqiyipic.com/common/fix/site-v4/video-mark/star-movie.png
            # (a standalone sketch of this mapping follows get_iqy() below)
            pay_text = j['payMarkUrl']
            print(pay_text)
            if len(pay_text) == 0:  # no image link means the movie is free to watch
                state = "免费"
            else:
                find_state = re.compile(r"(.*?)\.png")
                state = re.findall(find_state, pay_text)  # match everything before ".png"
                # print(state[0])
                if len(state) != 0:  # only continue if the regex matched
                    # print(state)
                    # parse the file name out of the url
                    part_state = str(state[0])
                    part_state = part_state.split('/')
                    print(part_state[-1])
                    state = part_state[-1][0:3]  # keep the first three characters
                    # only three characters survive, so the "only" badge shows up as "onl" and is mapped to 独播
                    if state == "onl":
                        state = "独播"
                    if state == "sta":
                        state = "星钻"
                    if state == "vip":
                        state = "VIP"
                    print(state)
            # append the playback status
            # print(state)
            temp_list.append(state)
            dataRes.append(temp_list)
            # print(temp_list)
            temp_list = []

        print('___________________________')
    return dataRes
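

def _demo_parse_pay_mark():
    # A standalone sketch of the payMarkUrl handling in get_iqy(), fed with the three
    # sample links quoted in the comment there plus the empty-string (free) case.
    # It is not called anywhere; run it manually if you want to check the mapping.
    samples = [
        "",  # free movie: no mark image at all
        "https://www.iqiyipic.com/common/fix/site-v4/video-mark/only.png",
        "https://pic0.iqiyipic.com/common/20171106/ac/1b/vip_100000_v_601_0_21.png",
        "https://www.iqiyipic.com/common/fix/site-v4/video-mark/star-movie.png",
    ]
    for pay_text in samples:
        if len(pay_text) == 0:
            state = "免费"
        else:
            matched = re.findall(r"(.*?)\.png", pay_text)
            state = matched[0].split('/')[-1][0:3]  # "onl" / "vip" / "sta"
            state = {"onl": "独播", "sta": "星钻", "vip": "VIP"}.get(state, state)
        print(pay_text or "(no payMarkUrl)", "->", state)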


def insert_iqy():
    cursor = None
    conn = None
    try:
        count = 0
        data = get_iqy()
        print(f"{time.asctime()}开始插入爱奇艺电影数据")
        conn, cursor = get_conn()
        sql = "insert into movieiqy (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in data:
            print(item)
            count = count + 1
            if count % 48 == 0:
                print('___________________________')
            # catch the duplicate-key error so one repeated movie does not abort the whole
            # insert (a guessed sketch of the table DDL follows this function)
            try:
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("重复!跳过!")

        conn.commit()  # commit the transaction (needed for update/delete/insert)
        print(f"{time.asctime()}插入爱奇艺电影数据完毕")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return
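

# The inserts above write 0 into the id column and silently skip pymysql.err.IntegrityError
# rows; that only works if id is AUTO_INCREMENT and the movie name carries a UNIQUE index.
# The original DDL is not part of this code, so the statement below is only a guessed,
# minimal sketch of what movieiqy (and the analogous movieten) could look like.
CREATE_MOVIEIQY = """
CREATE TABLE IF NOT EXISTS movieiqy (
    id    INT AUTO_INCREMENT PRIMARY KEY,
    name  VARCHAR(255) NOT NULL UNIQUE,
    score VARCHAR(32),
    path  VARCHAR(512),
    state VARCHAR(32)
) DEFAULT CHARSET = utf8;
"""


def create_movieiqy_table():
    # one-off helper that reuses get_conn()/close_conn() defined above
    conn, cursor = get_conn()
    try:
        cursor.execute(CREATE_MOVIEIQY)
        conn.commit()
    finally:
        close_conn(conn, cursor)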


if __name__ == '__main__':
    # get_iqy()
    insert_iqy()
import requests
import json
from bs4 import BeautifulSoup  # html parsing and data extraction
import sys
import re
import urllib.request, urllib.error  # build urls and fetch page data
import sqlite3
import xlwt  # excel handling
import time
import pymysql
import traceback


# Connect to the database and return a cursor
def get_conn():
    """
    :return: connection, cursor
    """
    # create the connection
    conn = pymysql.connect(host="82.157.112.34",
                           user="root",
                           password="root",
                           db="MovieRankings",
                           charset="utf8")
    # create the cursor; result sets are returned as tuples by default
    cursor = conn.cursor()
    if conn is not None and cursor is not None:
        print("数据库连接成功!游标创建成功!")
    else:
        print("数据库连接失败!")
    return conn, cursor


# Close the database connection and the cursor
def close_conn(conn, cursor):
    if cursor:
        cursor.close()
    if conn:
        conn.close()
    return 1


# Crawl movie data from Tencent Video
def get_ten():
    conn, cursor = get_conn()
    sql = "select count(*) from movieten"
    cursor.execute(sql)
    conn.commit()
    all_num = cursor.fetchall()[0][0]

    print("movieten数据库有", all_num, "条数据!")
    # https://v.qq.com/channel/movie?listpage=1&channel=movie&sort=18&_all=1&offset=0&pagesize=30
    url = "https://v.qq.com/channel/movie?listpage=1&channel=movie&sort=18&_all=1"  # base url
    param = {  # query-string parameters
        'offset': 0,
        'pagesize': 30
    }
    headers = {  # UA spoofing
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' +
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }
    # param['offset'] = all_num
    offset = 0  # paging offset that gets appended to the request
    dataRes = []
    findLink = re.compile(r'href="(.*?)"')  # link
    findName = re.compile(r'title="(.*?)"')  # movie title
    findScore = re.compile(r'<div class="figure_score">(.*?) </div>')  # score
    # 3 * 170
    for page in range(0, 300):
        # res = urllib.request.urlopen(url)  # plain urllib is not recommended here
        res = requests.get(url=url, params=param, headers=headers)  # build and send the request
        # print(url)
        res.encoding = 'utf-8'  # decode the response as utf-8
        html = BeautifulSoup(res.text, "html.parser")  # parse the page with BeautifulSoup
        part_html = html.find_all("a", class_="figure")  # every <a class="figure"> tag on the page, as a list
        # print(part_html)
        if len(part_html) == 0:
            print("页面返回空!")
            return dataRes
        offset = offset + 30  # advance the offset by one page of 30 movies
        print("下面从第" + str(offset) + "部电影开始:")
        param['offset'] = offset
        print(param['offset'])
        for card in part_html:  # iterate over every movie card (see the card-parsing sketch after get_ten())
            # print(card)
            words = str(card)
            name = re.findall(findName, words)  # movie title
            score = re.findall(findScore, words)  # score
            link = re.findall(findLink, words)  # link
            findState = BeautifulSoup(words, 'lxml')  # parse the playback status separately
            state = findState.select('a > img')  # the img children of the <a> tag
            if len(state) == 1:  # free movies have no badge image, so pad the list with an empty string
                state.append("")
            state_text = str(state[1])  # the second img holds the badge; pull its alt attribute with a regex
            # print(state_text)
            temp_state = re.findall('<img alt="(.*?)"', state_text)
            if len(temp_state) == 0:
                temp_state.insert(0, "免费")  # playback status: free
            # print(temp_state[0])
            list_ = []
            if len(score) == 0:
                score.insert(0, "暂无评分")
            for row in dataRes:
                if name[0] in row[0]:
                    name.insert(0, name[0] + "(其他版本)")
                    break  # mark it once; otherwise every further match would prepend the suffix again
            list_.append(name[0])
            list_.append(score[0])
            list_.append(link[0])
            list_.append(temp_state[0])
            # list_.append(statu)
            # print(list_)
            print(list_)
            dataRes.append(list_)
        # print(dataRes)  # print the accumulated result
        # list = html.select(".figure_score")
        # for item in list:
        #     print(item)

    # each movie's fields end up grouped in one list of their own
    return dataRes
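

def _demo_parse_card():
    # A standalone sketch of the card parsing done in get_ten(). The HTML snippet below is
    # a hand-written guess at the shape of one <a class="figure"> card, not markup copied
    # from v.qq.com, so treat it purely as an illustration of how the regexes behave.
    sample_card = (
        '<a class="figure" href="https://v.qq.com/x/cover/xxxx.html" title="示例电影">'
        '<img src="poster.jpg"/><img alt="VIP" src="mark.png"/>'
        '<div class="figure_score">8.5 </div></a>'
    )
    name = re.findall(r'title="(.*?)"', sample_card)                            # ['示例电影']
    score = re.findall(r'<div class="figure_score">(.*?) </div>', sample_card)  # ['8.5']
    link = re.findall(r'href="(.*?)"', sample_card)                             # ['https://v.qq.com/x/cover/xxxx.html']
    imgs = BeautifulSoup(sample_card, 'html.parser').select('a > img')
    badge = re.findall('<img alt="(.*?)"', str(imgs[1])) if len(imgs) > 1 else []
    state = badge[0] if badge else "免费"                                        # 'VIP'
    print(name[0], score[0], link[0], state)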


# Insert into the Tencent movie table
def insert_ten():
    """
    Insert the Tencent movie data.
    :return:
    """
    cursor = None
    conn = None
    try:
        data = get_ten()
        print(f"{time.asctime()}开始插入腾讯电影数据")
        conn, cursor = get_conn()
        sql = "insert into movieten (id,name,score,path,state) values(%s,%s,%s,%s,%s)"
        for item in data:
            try:
                cursor.execute(sql, [0, item[0], item[1], item[2], item[3]])
            except pymysql.err.IntegrityError:
                print("重复!跳过!")
        conn.commit()  # commit the transaction (needed for update/delete/insert)
        print(f"{time.asctime()}插入腾讯电影数据完毕")
    except:
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return


if __name__ == '__main__':
    # conn, cursor = get_conn()
    # list = []
    # res_list = get_ten()
    # print(res_list)
    insert_ten()