爬虫小程序,python爬虫携程酒店_携程酒店爬虫

 2023-09-28 阅读 38 评论 0

摘要:本文给出一个用 Python 编写的携程酒店评论爬虫示例:通过 urllib 向携程移动端接口发送 JSON 请求,按页获取各酒店的评论,清洗后使用 pymysql 批量写入本地 MySQL 数据库(xiecheng 库的 comment 表)。

# 请求地址

import json

from urllib import request

import pymysql

class Database():
    """Thin wrapper around one PyMySQL connection to the ``xiecheng`` database.

    The connection is opened in ``__init__`` and exposed as ``self.conn``.
    The object can be used as a context manager::

        with Database() as db:
            cursor = db.conn.cursor()
            ...

    Leaving the ``with`` block commits when the body succeeded, rolls back
    when it raised, and always closes the connection.
    """

    def __init__(self):
        """Open the connection (localhost:3306, user ``root``, empty password)."""
        self.conn = pymysql.connect(host='localhost',
                                    port=3306,
                                    user='root',
                                    password='',
                                    database='xiecheng',
                                    charset='utf8mb4')

    def __enter__(self):
        """Return the wrapper itself so ``with Database() as db`` works."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Finish the transaction and close the connection.

        Commits when called without an exception (``exc_type is None``) and
        rolls back otherwise. The connection is closed even if the
        commit/rollback itself fails. Returns ``False`` so an exception raised
        inside the ``with`` body keeps propagating.
        """
        # The previous ``self.conn.cursor().close()`` opened a brand-new
        # cursor only to close it immediately, so it has been dropped.
        try:
            if exc_type is None:
                self.conn.commit()
            else:
                self.conn.rollback()
        finally:
            self.conn.close()
        return False

class Spider:
    """Download hotel comments page by page and store them in MySQL.

    ``url`` is the endpoint every request is POSTed to. ``hotel_map`` maps a
    hotel id (sent to the endpoint) to the hotel name that is written next to
    each stored comment.
    """

    # Seconds to wait for the endpoint before a request is abandoned.
    REQUEST_TIMEOUT = 10

    # Key of the response JSON that holds the comments of one page.
    COMMENT_LIST_KEY = 'othersCommentList'

    # Comment fields copied into a row, in column order after the hotel name.
    _ROW_FIELDS = ('checkInDate', 'postDate', 'content',
                   'ratingPoint', 'baseRoomName', 'userNickName')

    def __init__(self, url, hotel_map):
        """Remember the endpoint URL and the hotel id -> hotel name mapping."""
        self.url = url
        self.hotel_map = hotel_map

    def get_data(self, hotel_id, page):
        """Request one page of comments for ``hotel_id``.

        Returns the raw response body (bytes), or ``None`` after printing a
        message when no URL is configured. Network errors raised by
        ``urllib`` (including a timeout after ``REQUEST_TIMEOUT`` seconds)
        propagate to the caller.
        """
        if not self.url:
            print("没有找到请求网址")
            return None

        payload = {
            "hotelId": hotel_id,
            "pageIndex": page,
            "tagId": 0,
            "pageSize": 20,
            "groupTypeBitMap": 2,
            "needStatisticInfo": 0,
            "order": 0,
            "basicRoomName": "",
            "travelType": -1,
            "head": {
                "cid": "09031129410921166704",
                "ctok": "",
                "cver": "1.0",
                "lang": "01",
                "sid": "8888",
                "syscode": "09",
                "auth": "",
                "extension": []
            }
        }
        body = json.dumps(payload).encode(encoding='utf-8')
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
            'Content-Type': 'application/json'
        }
        url_request = request.Request(url=self.url, data=body, headers=headers)
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(url_request, timeout=self.REQUEST_TIMEOUT) as response:
            return response.read()

    def clean_data(self, items, hotel_name):
        """Turn comment dicts into row tuples for ``save_db``.

        Each row is ``(id, hotel_name, checkInDate, postDate, content,
        ratingPoint, baseRoomName, userNickName)``; a field missing from a
        comment becomes ``''``. The dicts in ``items`` are not modified.
        """
        return [
            (item.get('id', ''), hotel_name)
            + tuple(item.get(field, '') for field in self._ROW_FIELDS)
            for item in items
        ]

    def save_db(self, data):
        """Insert the rows in ``data`` into the ``comment`` table.

        Prints a message and does nothing when ``data`` is empty. A failure
        while inserting or committing is printed rather than raised
        (best-effort, as before); the cursor and the connection are closed in
        every case.
        """
        if not data:
            print('数据缺失请重试!!')
            return

        _sql = """ insert into comment(external_id, hotel_name, arr_date, create_date, content, grade, roomTypeName, author)values (%s,%s,%s,%s,%s,%s,%s,%s) """

        db = Database()
        try:
            cursor = db.conn.cursor()
            try:
                result = cursor.executemany(_sql, data)
                db.conn.commit()
                print(result)
            except Exception as error:
                # Nothing was committed; closing the connection below
                # discards the unfinished transaction.
                print(error)
            finally:
                cursor.close()
        finally:
            db.conn.close()

    def _collect_comments(self, hotel_id):
        """Fetch pages for ``hotel_id`` until one comes back without comments.

        Returns the list of all comment dicts collected, or ``None`` when
        ``get_data`` could not issue a request (no URL configured).
        """
        collected = []
        page = 1
        raw = self.get_data(hotel_id, page)
        while raw is not None:
            # Decode each page once; a missing or null comment list is
            # treated like an empty page and ends the pagination.
            page_comments = json.loads(raw).get(self.COMMENT_LIST_KEY) or []
            if not page_comments:
                return collected
            collected.extend(page_comments)
            page += 1
            print('正在爬取' + str(page) + '页')
            raw = self.get_data(hotel_id, page)
        return None

    def run(self):
        """Crawl every hotel in ``hotel_map`` and store its comments.

        Stops early when no request can be made (``get_data`` returned
        ``None``), since the same would happen for every remaining hotel.
        """
        for hotel_id, hotel_name in self.hotel_map.items():
            comments = self._collect_comments(hotel_id)
            if comments is None:
                return
            self.save_db(self.clean_data(comments, hotel_name))

# Script entry point: crawl the comments of the hotels listed below.
if __name__ == '__main__':
    # Endpoint that receives the comment requests.
    BASE_URL = 'http://m.ctrip.com/restapi/soa2/16765/gethotelcomment?&_fxpcqlniredt=09031129410921166704'

    # Hotel id -> hotel name stored next to each comment.
    HOTEL_MAP = {
        '690117': '广州珠江新城木莲庄',
        '430407': '广州花都木莲庄酒店',
        '21934677': '广州南沙凤凰湖木莲庄酒店',
        '6833415': '成都环球中心木莲庄酒店',
        '23674123': '苏州高铁北站木莲庄酒店'
    }

    spider = Spider(BASE_URL, HOTEL_MAP)
    spider.run()

版权声明:本站所有资料均为网友推荐收集整理而来,仅供学习和研究交流使用。

原文链接:https://808629.com/108119.html

发表评论:

本站为非营利网站,部分文章来源或改编自互联网及其他公众平台,主要目的在于分享信息,版权归原作者所有,内容仅供读者参考,如有侵权请联系我们删除!

Copyright © 2022 86后生记录生活 Inc. 保留所有权利。

底部版权信息