Source code for cronster.crawler

#!/usr/bin/env python
import os
import glob
import json
import time
import hashlib
from datetime import datetime

import yaml
import click
import redis
from tabulate import tabulate


class CronsterCrawler(object):
    """
    Cronster crawler class.

    Crawl the file system recursively for ``crontab`` files, read the
    contents and store a list of :class:`~cronster.scheduler.CronsterJob`
    in a Redis cache.
    """
    def __init__(self, root, cache_host, cache_port, interval):
        """
        Initialise a :class:`~cronster.crawler.CronsterCrawler`.

        :param root: File system root to crawl
        :type root: str
        :param cache_host: Host that serves the Redis cache
        :type cache_host: str
        :param cache_port: Port on the host that exposes the Redis service
        :type cache_port: int
        :param interval: Time between crawls in seconds
        :type interval: int
        """
        if not os.path.exists(root):
            raise OSError('Root does not exist')
        self.root = root
        os.chdir(self.root)
        # Store host and port so __str__ can report them.
        self.cache_host = cache_host
        self.cache_port = cache_port
        self.cache = redis.StrictRedis(
            host=cache_host, port=cache_port, db=0, decode_responses=True)
        self.interval = interval
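    # A minimal sketch of constructing and running the crawler directly,
    # bypassing the CLI defined at the bottom of this module; the root
    # path is a hypothetical example:
    #
    #     crawler = CronsterCrawler(
    #         root='/srv/projects', cache_host='localhost',
    #         cache_port=6379, interval=2)
    #     crawler.crawl()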
    def get_crontab_data(self, crontab):
        """
        Given a ``crontab`` file path, load and return the
        :class:`~cronster.scheduler.CronsterJob` contained in the file.

        :param crontab: Crontab file path
        :type crontab: str
        :return: Jobs
        :rtype: list

        Example output:

        .. code-block:: json

            [
                {
                    "name": "job_name",
                    "cmd": "echo $PATH",
                    "schedule": "* * * * *",
                    "path": "/path/to/crontab/file",
                    "hash": "dc8a776c99d9b8ab97550e87c857dc959a857c5b"
                }
            ]
        """
        with open(crontab, 'r') as fp:
            # safe_load avoids executing arbitrary tags in untrusted files.
            crontab_data = yaml.safe_load(fp)
        jobs = []
        for job_name, job_data in crontab_data.items():
            job = {}
            # Hash the path, the name and every job attribute, so any
            # change to a crontab entry yields a new job identity.
            job_hash = hashlib.sha1()
            job['path'] = crontab
            job_hash.update(crontab.encode('ascii'))
            job['name'] = job_name
            job_hash.update(job_name.encode('ascii'))
            for key, value in job_data.items():
                job[key] = value
                job_hash.update(value.encode('ascii'))
            job['hash'] = job_hash.hexdigest()
            jobs.append(job)
        return jobs
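    # A sketch of the ``crontab`` file layout the parser above assumes: a
    # YAML mapping of job names to job attributes. The job name and values
    # here are hypothetical, chosen to match the ``cmd`` and ``schedule``
    # keys shown in the example output:
    #
    #     backup_home:
    #         cmd: tar czf /tmp/home.tar.gz /home
    #         schedule: '0 3 * * *'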
    def crawl(self):
        """
        Recursively crawl the file system from ``root`` at the given
        ``interval``.

        Add :class:`~cronster.scheduler.CronsterJob` from ``crontab``
        files to the cache as a JSON string.
        """
        while True:
            # Rebuild the full job list on every pass, so jobs from
            # deleted crontab files drop out of the cache.
            self.crontabs = []
            for crontab in glob.glob('**/crontab', recursive=True):
                self.crontabs += self.get_crontab_data(crontab)
            self.cache.set('cronster_crawler', json.dumps(self.crontabs))
            self.display_crontabs()
            time.sleep(self.interval)
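    # A minimal sketch of how a consumer (e.g. the scheduler) might read
    # the jobs back out of the cache; the connection parameters are
    # assumptions mirroring the defaults used by the CLI below:
    #
    #     cache = redis.StrictRedis(
    #         host='localhost', port=6379, db=0, decode_responses=True)
    #     jobs = json.loads(cache.get('cronster_crawler'))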
    def display_crontabs(self):
        """
        Print the current cache content to the console in tabulated form.
        """
        crontab_data = json.loads(self.cache.get('cronster_crawler'))
        click.clear()
        metadata = [
            ['Cronster Crawler'],
            ['Crawler time: ',
             datetime.now().strftime('%Y-%m-%d %H:%M:%S')]
        ]
        print(tabulate(metadata))
        print('\n')
        headers = ['Job Name', 'Hash', 'Schedule']
        job_data = []
        for job in sorted(crontab_data, key=lambda x: x['name']):
            job_data.append([job['name'], job['hash'], job['schedule']])
        print(tabulate(job_data, headers=headers))
    def __repr__(self):
        return '<CronsterCrawler>'

    def __str__(self):
        return str({
            'root': self.root,
            'cache_host': self.cache_host,
            'cache_port': self.cache_port,
            'interval': self.interval})
@click.command()
@click.option(
    '-r', '--root', default=os.getcwd(),
    help='Crawling root, default: the current working directory')
@click.option(
    '-h', '--cache-host', default='localhost',
    help='Cache host, default: localhost')
@click.option(
    '-p', '--cache-port', type=int, default=6379,
    help='Cache port, default: 6379 (Redis default)')
@click.option(
    '-i', '--interval', type=int, default=2,
    help='Crawling interval, default: 2 seconds')
def cli(root, cache_host, cache_port, interval):
    crawler = CronsterCrawler(root, cache_host, cache_port, interval)
    crawler.crawl()


if __name__ == '__main__':
    cli()
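# Example invocation, crawling a hypothetical project tree every
# 5 seconds against a local Redis instance:
#
#     $ python -m cronster.crawler -r /srv/projects -p 6379 -i 5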