Published on

Custom Scrapy Spider

Authors
  • avatar
    Name
    Shelton Ma
    Twitter

1. Custom MySpider

import scrapy
import time
import os
import logging
import datetime
from scrapy.crawler import Crawler


class MySpider(scrapy.Spider):
    """MySpider Custom Spider

    1. Custom metrics: `inc_stats()` and `set_stats()` allow direct manipulation of `scrapy.StatsCollector` for flexible metrics.
    2. `update_settings()`: Adds `spider_name` for convenient log file generation.
    3. Easy log customization with `install_customize_root_handler`.
    4. `_set_crawler()`: Automatically adds `crawler_batch_id` and `spider_id` to spider logs.
    """

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        MY_LOG_HELPER = self.settings.getbool('MY_LOG_HELPER')
        if MY_LOG_HELPER:
            self.install_customize_root_handler(self.settings)
        self.spider_id = '{}_{}'.format(self.name, int(time.time()*1000))

    @classmethod
    def update_settings(cls, settings):
        """
        Updates `settings` if `configure_logging` is used and log files need to be named by `spider_name`.
        Ensures that `settings` is accessible earlier, allowing full configuration in `__init__` instead of waiting for `Crawl._create_spider`.
        """
        settings.set('spider_name', cls.name)
        cls.settings = settings
        super(MySpider, cls).update_settings(settings)

    def _set_crawler(self, crawler: Crawler):
        """Executes after `MySpider.__init__` to add additional configurations to `stats`.

        Args:
            crawler (Crawler): The crawler instance.
        """
        super()._set_crawler(crawler)
        crawler.stats.set_value('spider_id', self.spider_id)
        if hasattr(self, "crawler_batch_id"):
            self.set_stats('crawler_batch_id', self.crawler_batch_id)
        else:
            self.set_stats('crawler_batch_id', datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d'))

    def inc_stats(self, name, value=1):
        """Custom metric for the spider added to `corestats`.
           Example: `self.inc_stats('success_cnt', 1)`

        Args:
            name (str): Metric name.
            value (int, optional): Metric value. Defaults to 1.
        """
        self.crawler.stats.inc_value(name, value)

    def set_stats(self, name, value=0):
        """Sets a specific metric value."""
        self.crawler.stats.set_value(name, value)

    def install_customize_root_handler(self, settings):
        """Custom `TimedRotatingFileHandler` for logs.

        Args (in `settings.py`):
            MY_LOG_HELPER: Set to `True` to enable log file writing.
            BASIC_LOG_DIR: Root directory for logs, defaults to `/logs`.
            Log file format: `{BASIC_LOG_DIR}/{bot_name}/{spider_name}_YYYY_mm_dd_HH_MM_SS_ffffff.log`
        """
        BASIC_LOG_DIR = settings.get('BASIC_LOG_DIR', '/logs')
        datetime_str = datetime.datetime.strftime(datetime.datetime.now(), '%Y_%m_%d_%H_%M_%S_%f')
        file_dir = os.path.join(BASIC_LOG_DIR, settings.get('BOT_NAME', 'bot_name'))
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        filename = os.path.join(file_dir, '{}_{}.log'.format(settings.get('spider_name', 'spider_name'), datetime_str))
        encoding = settings.get('LOG_ENCODING')
        file_handler = logging.handlers.TimedRotatingFileHandler(
                filename=filename,
                when='midnight',
                backupCount=20,
                encoding=encoding,
            )
        formatter = logging.Formatter(
            fmt="%(asctime)s %(name)s %(filename)s %(lineno)s %(levelname)s %(message)s",
            datefmt=settings.get('LOG_DATEFORMAT')
        )
        file_handler.setFormatter(formatter)
        file_handler.setLevel(settings.get('LOG_LEVEL'))
        logging.root.addHandler(file_handler)

2. Usage

from customs import MySpider
import scrapy

class ExampleSpider(MySpider):
    """Example Spider

    1. Test custom arguments: `-a crawler_batch_id='11' -a custom_metrics='custom_metrics01'`
    2. Test logging: `MY_LOG_HELPER=True` logs saved at `/logs/bot_name/spider_name...log`.
    """
    name = "example"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]
    custom_settings = {
        'MY_LOG_HELPER': True,
        'BASIC_LOG_DIR': './logs'
    }

    def parse(self, response):
        self.inc_stats('success_cnt', 1)
        pass