Published on

在scrapy中间件中使用protobuf解析

Authors
  • avatar
    Name
    Shelton Ma
    Twitter
  1. 定义中间件

    class DemoScrapyDownloaderMiddleware:
        """Downloader middleware that sends entity-search requests as protobuf.

        Requests whose URL contains ``v2/entities:search`` get a serialized
        ``Body`` message as their payload plus the matching headers. Every
        other request passes through untouched.
        """

        # URL fragment identifying the requests that need a protobuf body.
        _SEARCH_PATH = 'v2/entities:search'

        # Headers sent together with the protobuf body. The response is
        # requested as JSON because a protobuf response may fail to parse.
        # HTTP header names are case-insensitive, so each one is listed once.
        _SEARCH_HEADERS = {
            'Accept': "application/json",
            'Pragma': "no-cache",
            'Cache-Control': "no-cache",
            "x-app-type": "groundup",
            "content-type": "application/x-protobuf",
        }

        def process_request(self, request, spider):
            """Attach a protobuf body and headers to entity-search requests.

            Args:
                request: The outgoing Scrapy request. ``request.meta['status']``
                    is copied into the message's ``filter.status`` when present.
                spider: The running spider; only its logger is used.

            Returns:
                Always ``None``, so Scrapy keeps processing this same request.
            """
            if self._SEARCH_PATH not in request.url:
                return None

            spider.logger.debug('添加protobuf格式的body, 并更新headers')
            bd = Body()
            bd.limit = 120

            # Use the public ``meta`` property: the private ``_meta`` backing
            # field can be None when the request was created without meta.
            status = request.meta.get('status')
            if status is None:
                # Assigning None to a protobuf field raises TypeError, so the
                # filter is left unset instead of crashing the download.
                spider.logger.warning(
                    "request.meta has no 'status'; sending search body "
                    "without a status filter"
                )
            else:
                bd.filter.status = status

            bd.field15 = 1
            spider.logger.debug(f'MessageToJson(bd): {MessageToJson(bd)}')

            # The request is mutated in place on purpose: returning a replaced
            # Request from process_request would make Scrapy reschedule it.
            request._set_body(bd.SerializeToString())

            # Scrapy's Headers object converts str names/values to bytes
            # itself, so no manual encoding is required here.
            request.headers.update(self._SEARCH_HEADERS)
            return None
    
  2. settings.py

    # settings.py
    # Register the project's downloader middlewares. Scrapy calls
    # process_request in increasing priority order, so the protobuf body
    # middleware (739) handles each request just before the proxy
    # middleware (740).
    DOWNLOADER_MIDDLEWARES = {
        "demo_scrapy.middlewares.DemoScrapyDownloaderMiddleware": 739,
        "demo_scrapy.middlewares.ProxyMiddleware": 740,
    }