lxml提取-百度贴吧数据

下面的实例是使用python的lxml模块,利用xpath来提取数据

仅供爬虫学习练习使用,请勿用其它用途,由于网站经常改版,有可能不能正常提取数据,需要做适当的修改即可。

python参考代码如下:

#coding:utf-8
import requests
from lxml import etree


class Tieba(object):

    def __init__(self, name):
        self.name = name
        self.url = 'http://tieba.baidu.com/f?kw={}'.format(self.name)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
            # 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1) '
        }

    def get_data(self, url):
        response = requests.get(url,headers=self.headers)
        return response.content

    def parse_list_page(self, data):
        """
            解析贴吧帖子列表页面的响应
        :param data: 帖子列表页面的响应
        :return: 帖子标题和链接列表 与 下一页url
        """
        data = data.decode().replace("<!--","").replace("-->","")
        html = etree.HTML(data)

        el_list = html.xpath('//li[@class=" j_thread_list clearfix"]/div/div[2]/div[1]/div[1]/a')
        # print(len(el_list))
        data_list = []
        for el in el_list:
            temp = {}

            temp['title'] = el.xpath('./text()')[0]
            temp['link'] = 'http://tieba.baidu.com' + el.xpath('./@href')[0]
            data_list.append(temp)
        try:
            next_url = 'http:' + html.xpath('//*[contains(text(),"下一页>")]/@href')[0]
        except:
            next_url = None

        return data_list, next_url

    def parse_detail_page(self, data):

        html = etree.HTML(data)

        return html.xpath('//*[contains(@id,"post_content_")]/img/@src')

    def run(self):
        # url
        # headers
        next_url = self.url
        while True:
            # 发送列表请求,获取响应
            list_page_data = self.get_data(next_url)

            # 解析列表页面的响应,提取帖子列表数据和下一页url
            data_list, next_url = self.parse_list_page(list_page_data)

            # 遍历帖子列表,获取每一个详细url
            for data in data_list:
                # 发起请求,获取到详情页面的响应
                data = self.get_data(data['link'])
                # 从响应中提取图片地址
                image_list = self.parse_detail_page(data)
                # 保存
                print(image_list)
            # 翻页&循环终止条件
            if next_url == None:
                break


if __name__ == '__main__':
    tieba = Tieba("某贴吧的关键字")
    tieba.run()