用python实现的抓取腾讯视频所有电影的爬虫
# -*- coding: utf-8 -*-
"""Crawler that scrapes every movie listed on Tencent Video (v.qq.com)
and stores each play link in a local MongoDB database.

Modernized from the original Python 2 blog code (urllib2 / print
statements / pymongo.Connection) to Python 3 + pymongo.MongoClient,
with the original scraping logic, regexes and URLs preserved.
"""
import re
import time
import urllib.request

from bs4 import BeautifulSoup
import pymongo

NUM = 0          # global: number of movies scraped so far
m_type = u''     # global: current movie category (e.g. "动作")
m_site = u'qq'   # global: site tag stored with every record


def gethtml(url):
    """Fetch *url* and return the page body as text."""
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        # errors='replace' keeps a stray badly-encoded byte from killing the run
        return response.read().decode('utf-8', errors='replace')


def gettags(html):
    """Parse the category list page and return {category_name: category_url}.

    Side effect: leaves the last parsed category name in the global m_type.
    """
    global m_type
    soup = BeautifulSoup(html, 'html.parser')
    # <ul class="clearfix _group" gname="mi_type" gtype="1"> holds the categories
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    # <a _hot="tag.sub" class="_gtag _hotkey" href="..." title="动作" tvalue="0">动作</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)

    # Initialized up front: the original only assigned tags_url inside the
    # `if tags:` branch and raised UnboundLocalError on the empty case.
    tags_url = {}
    tags = p.findall(str(tags_all[0]))
    if tags:
        for tag in tags:
            tag_url = tag[0]
            m_type = tag[1]
            tags_url[m_type] = tag_url
    else:
        print("Not Find")
    return tags_url


def get_pages(tag_url):
    """Return the number of result pages for one category, as an int.

    The original returned a str from one branch and an int from the other;
    both paths now return int so callers need no extra conversion.
    """
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html, 'html.parser')
    # <div class="mod_pagenav" id="pager"> wraps the pagination links
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    # <a class="c_txt6" href="..." title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        # The last <span> is the "next page" arrow; the one before it
        # carries the highest page number.
        return int(pages[-2])
    return 1


def getmovielist(html):
    """Find every movie list <ul> on a results page and scrape each one."""
    soup = BeautifulSoup(html, 'html.parser')
    # <ul class="mod_list_pic_130"> — one block of movie posters
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for div_html in divs:
        # Flatten so the DOTALL-free parts of the regex still match
        getmovie(str(div_html).replace('\n', ''))


def getmovie(html):
    """Extract (url, title) pairs from one list fragment and store them.

    Each record goes into the dianying.playlinks collection together with
    the current globals m_site / m_type.
    """
    global NUM

    # <li><a class="mod_poster_130" href="..." target="_blank" title="...">...
    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        # NOTE(review): one connection per call is wasteful but harmless for
        # a throttled script; MongoClient replaces the removed Connection API.
        conn = pymongo.MongoClient('localhost', 27017)
        playlinks = conn.dianying.playlinks
        for movie in movies:
            # Bug fix: the original incremented NUM a second time after the
            # insert, double-counting every movie.
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)
            playlinks.insert_one(values)
            print("_" * 70)


def getmovieinfo(url):
    """Fetch a movie detail page and return its play links.

    Returns a list of (play_url, title) tuples scraped from the
    "pack pack_album album_cover" block; empty list when nothing matches.
    """
    html = gethtml(url)
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})

    # <a href="..." target="new" title="《血滴子》独家纪录片" wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(divs[0]))
    if not m_info:
        print("Not find movie info")
    return m_info


def insertdb(movieinfo):
    """Insert one movie-info record into dianying_at.movies.

    The original read a global `conn` that was never initialized; this
    version opens its own client so the function is self-contained.
    """
    conn = pymongo.MongoClient('localhost', 27017)
    conn.dianying_at.movies.insert_one(movieinfo)


if __name__ == "__main__":
    # Category index page — lists every movie genre with its list URL.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    for tag_name, tag_url in tag_urls.items():
        print(tag_url)
        maxpage = get_pages(tag_url)
        print(maxpage)

        for x in range(maxpage):
            # Page URLs differ from the category URL only in the page-index
            # segment, e.g. ..._0_20_... -> ..._<x>_20_...
            m_url = tag_url.replace('0_20_0_-1_0.html', '')
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            movie_html = gethtml(movie_url)
            getmovielist(movie_html)
            time.sleep(0.1)   # be polite: throttle requests to the site
大功告成，以上代码大家都看明白了没？
版权声明:本站所有资料均为网友推荐收集整理而来,仅供学习和研究交流使用。
工作时间:8:00-18:00
客服电话
电子邮件
admin@qq.com
扫码二维码
获取最新动态