# 请求地址
import json
from urllib import request

import pymysql
class Database:
    """Thin wrapper around a pymysql connection to the local `xiecheng` database.

    Usable as a context manager: `with Database() as db: ...` commits and
    closes the connection on exit (the original defined `__exit__` but no
    `__enter__`, so the `with` protocol could never actually be used).
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; move to config/env
        # before any non-local use.
        self.conn = pymysql.connect(host='localhost',
                                    port=3306,
                                    user='root',
                                    password='',
                                    database='xiecheng',
                                    charset='utf8mb4')

    def __enter__(self):
        # Added so the existing __exit__ hook is reachable via `with`.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Commit pending work, then release the connection.  The original
        # also did `self.conn.cursor().close()`, which opened a brand-new
        # cursor just to close it — a no-op, removed.
        self.conn.commit()
        self.conn.close()
class Spider:
    """Scrapes hotel comments from Ctrip's mobile comment API into MySQL."""

    def __init__(self, url, hotel_map):
        # url: comment-API endpoint; hotel_map: {hotel_id: hotel_name}.
        self.url = url
        self.hotel_map = hotel_map

    def get_data(self, hotel_id, page):
        """POST one page of comments for `hotel_id`; return raw JSON bytes.

        Returns None when no request URL is configured.
        """
        if not self.url:
            print("没有找到请求网址")
            return None
        payload = {
            "hotelId": hotel_id,
            "pageIndex": page,
            "tagId": 0,
            "pageSize": 20,
            "groupTypeBitMap": 2,
            "needStatisticInfo": 0,
            "order": 0,
            "basicRoomName": "",
            "travelType": -1,
            "head": {
                "cid": "09031129410921166704",
                "ctok": "",
                "cver": "1.0",
                "lang": "01",
                "sid": "8888",
                "syscode": "09",
                "auth": "",
                "extension": []
            }
        }
        body = json.dumps(payload).encode(encoding='utf-8')
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
            'Content-Type': 'application/json'
        }
        req = request.Request(url=self.url, data=body, headers=headers)
        # Close the HTTP response promptly instead of leaking the socket.
        with request.urlopen(req) as resp:
            return resp.read()

    def clean_data(self, items, hotel_name):
        """Map raw comment dicts to 8-tuples matching the `comment` columns.

        Uses `dict.get` with a '' default instead of the original
        `setdefault` calls, so the caller's dicts are no longer mutated
        (the original also injected an unused 'img_list' key).
        """
        return [
            (item.get('id', ''), hotel_name,
             item.get('checkInDate', ''), item.get('postDate', ''),
             item.get('content', ''), item.get('ratingPoint', ''),
             item.get('baseRoomName', ''), item.get('userNickName', ''))
            for item in items
        ]

    def save_db(self, data):
        """Bulk-insert cleaned rows into `comment`; warns and returns on empty input."""
        if not data:
            print('数据缺失请重试!!')
            return
        _sql = """ insert into comment(external_id, hotel_name, arr_date, create_date, content, grade, roomTypeName, author)values (%s,%s,%s,%s,%s,%s,%s,%s) """
        db = Database()
        cursor = db.conn.cursor()
        try:
            result = cursor.executemany(_sql, data)
            db.conn.commit()
            print(result)
        except Exception as error:
            # Best-effort: report the failure and continue with the next hotel.
            print(error)
        finally:
            # The original leaked one connection per hotel.
            cursor.close()
            db.conn.close()

    def run(self):
        """Fetch every comment page for each configured hotel and persist them."""
        for hotel_id, hotel_name in self.hotel_map.items():
            rows = []
            page = 1
            raw = self.get_data(hotel_id, page)
            # `raw` is None when the URL is unset — the original would
            # crash with json.loads(None) in that case.
            while raw:
                # Parse once per page (the original parsed the same
                # payload twice per loop iteration).
                comments = json.loads(raw).get('othersCommentList', [])
                if not comments:
                    break
                rows.extend(comments)
                page = page + 1
                print('正在爬取' + str(page) + '页')
                raw = self.get_data(hotel_id, page)
            self.save_db(self.clean_data(rows, hotel_name))
if __name__ == '__main__':
    # Ctrip mobile comment-API endpoint (cid baked into the query string).
    api_url = 'http://m.ctrip.com/restapi/soa2/16765/gethotelcomment?&_fxpcqlniredt=09031129410921166704'
    # hotel id -> human-readable hotel name
    hotels = {
        '690117': '广州珠江新城木莲庄',
        '430407': '广州花都木莲庄酒店',
        '21934677': '广州南沙凤凰湖木莲庄酒店',
        '6833415': '成都环球中心木莲庄酒店',
        '23674123': '苏州高铁北站木莲庄酒店',
    }
    Spider(api_url, hotels).run()
# --- Website footer boilerplate accidentally captured with the source;
# --- commented out so the file remains valid Python. Not part of the program.
# 版权声明:本站所有资料均为网友推荐收集整理而来,仅供学习和研究交流使用。
# 工作时间:8:00-18:00
# 客服电话
# 电子邮件
# admin@qq.com
# 扫码二维码
# 获取最新动态