Python Crawler: Scraping All Movies from Tencent Video, with Source Code [Hands-On Tutorial]

2023-09-05

Abstract: a crawler written in Python that scrapes every movie listed on Tencent Video. Full source code below.

A crawler implemented in Python that scrapes all movies from Tencent Video.

# -*- coding: utf-8 -*-
# Python 2 script: urllib2 and the print statement do not exist in Python 3,
# and pymongo.Connection is the legacy pre-3.0 pymongo API.
import re
import urllib2
from bs4 import BeautifulSoup
import time
import pymongo

NUM = 0         # global: number of movies scraped so far
m_type = u''    # global: current movie genre
m_site = u'qq'  # global: movie site tag

# Fetch the raw HTML for a given URL
def gethtml(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    html = response.read()
    return html

# Extract the genre tags from the category list page
def gettags(html):
    global m_type
    soup = BeautifulSoup(html)  # parse the page so we can filter out the category markup
    # target markup: <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    # target markup: <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="动作" tvalue="0">动作</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)

    tags_url = {}  # initialized up front so the function cannot return an undefined name
    tags = p.findall(str(tags_all[0]))
    if tags:
        for tag in tags:
            tag_url = tag[0].decode('utf-8')
            m_type = tag[1].decode('utf-8')
            tags_url[m_type] = tag_url
    else:
        print "No tags found"
    return tags_url

# Get the number of list pages for one genre
def get_pages(tag_url):
    tag_html = gethtml(tag_url)
    soup = BeautifulSoup(tag_html)  # parse the page so we can filter out the pagination markup
    # target markup: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})

    # target markup: <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    p = re.compile(re_pages, re.DOTALL)
    pages = p.findall(str(div_page[0]))
    if len(pages) > 1:
        return pages[-2]  # the second-to-last <span> holds the last page number
    else:
        return 1

# Walk one list page and hand each movie block to getmovie()
def getmovielist(html):
    soup = BeautifulSoup(html)
    # target markup: <ul class="mod_list_pic_130">
    divs = soup.find_all('ul', {'class': 'mod_list_pic_130'})
    for div_html in divs:
        div_html = str(div_html).replace('\n', '')
        getmovie(div_html)

# Parse the movies out of one list block and store them in MongoDB
def getmovie(html):
    global NUM
    global m_type
    global m_site

    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    p = re.compile(re_movie, re.DOTALL)
    movies = p.findall(html)
    if movies:
        # legacy pymongo API; opening a connection per call is wasteful but kept simple here
        conn = pymongo.Connection('localhost', 27017)
        movie_db = conn.dianying
        playlinks = movie_db.playlinks
        for movie in movies:
            NUM += 1  # count each movie exactly once
            print "%s : %d" % ("=" * 70, NUM)
            values = dict(
                movie_title=movie[1],
                movie_url=movie[0],
                movie_site=m_site,
                movie_type=m_type
            )
            print values
            playlinks.insert(values)
            print "_" * 70

# Fetch one movie's detail page and pull out its play links (helper, not called in main)
def getmovieinfo(url):
    html = gethtml(url)
    soup = BeautifulSoup(html)
    # target markup: <div class="pack pack_album album_cover">
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})

    # target markup: <a href="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html" target="new" title="《血滴子》独家纪录片" wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)
    m_info = p_info.findall(str(divs[0]))
    if not m_info:
        print "No movie info found"
    return m_info

# Store one movie-info record (helper, not called in main)
def insertdb(movieinfo):
    conn = pymongo.Connection('localhost', 27017)
    movie_db = conn.dianying_at
    movies = movie_db.movies
    movies.insert(movieinfo)

if __name__ == "__main__":
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tags_html = gethtml(tags_url)
    tag_urls = gettags(tags_html)

    # rebinding the module-level m_type here keeps getmovie() tagging records
    # with the genre currently being crawled
    for m_type, tag_url in tag_urls.items():
        print tag_url.encode('utf-8')
        maxpage = int(get_pages(tag_url.encode('utf-8')))
        print maxpage

        for x in range(0, maxpage):
            # list URLs look like http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html;
            # the trailing segment encodes the page offset
            m_url = tag_url.replace('0_20_0_-1_0.html', '')
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print movie_url
            movie_html = gethtml(movie_url.encode('utf-8'))
            getmovielist(movie_html)
            time.sleep(0.1)  # throttle requests
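
The script above is Python 2 through and through: urllib2 and the print statement are gone in Python 3, and pymongo.Connection was removed in pymongo 3.0 in favor of MongoClient. For readers on Python 3, here is a minimal sketch of the same fetch, parse, and store pipeline using requests, BeautifulSoup, and MongoClient. Treat it as illustration only: the list URL and CSS classes are assumptions carried over from the 2013-era pages the original script targets, and today's v.qq.com markup no longer matches them.

# -*- coding: utf-8 -*-
# Python 3 sketch of the same pipeline. The URL and selectors below are
# assumptions taken from the original script; current v.qq.com pages differ.
import time

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

def get_html(url):
    # requests replaces urllib2; raise on HTTP errors instead of parsing error pages
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return resp.text

def iter_movies(html):
    # use the parser instead of hand-written regexes against HTML
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.select("ul.mod_list_pic_130 a.mod_poster_130"):  # assumed selector
        yield {
            "movie_title": a.get("title"),
            "movie_url": a.get("href"),
            "movie_site": "qq",
        }

if __name__ == "__main__":
    client = MongoClient("localhost", 27017)  # MongoClient replaces the old Connection class
    playlinks = client.dianying.playlinks
    list_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"  # 2013-era URL, illustrative
    for movie in iter_movies(get_html(list_url)):
        playlinks.insert_one(movie)  # insert_one replaces the deprecated insert
        print(movie)
    time.sleep(0.1)  # stay polite if you loop over more pages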

All done! Did everyone follow the code above?
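
If you do get the crawler running against a local MongoDB, a quick way to sanity-check what landed in the database is to query the dianying.playlinks collection the script writes to. A small pymongo 3.x sketch (the genre value is just an example):

# Sanity-check the scraped records; collection names come from the script above.
from pymongo import MongoClient

client = MongoClient("localhost", 27017)
playlinks = client.dianying.playlinks
print(playlinks.count_documents({}))  # total number of stored movies
for doc in playlinks.find({"movie_type": u"动作"}).limit(3):  # sample one genre
    print(doc["movie_title"], doc["movie_url"])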


Original link: https://808629.com/911.html
