"""
parser_cache.py

Copyright 2006 Andres Riancho

This file is part of w3af, http://w3af.org/ .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

"""
from __future__ import with_statement, print_function

import os
import zlib
import signal
import atexit
import threading
import multiprocessing

from darts.lib.utils.lru import SynchronizedLRUDict
from tblib.decorators import Error

import w3af.core.controllers.output_manager as om

from w3af.core.controllers.profiling import start_profiling_no_core
from w3af.core.controllers.threads.process_pool import ProcessPool
from w3af.core.controllers.threads.is_main_process import is_main_process
from w3af.core.controllers.output_manager import log_sink_factory
from w3af.core.controllers.exceptions import BaseFrameworkException
from w3af.core.controllers.ci.detect import is_running_on_ci
from w3af.core.controllers.threads.decorators import apply_with_return_error
from w3af.core.controllers.profiling.core_stats import core_profiling_is_enabled
from w3af.core.data.parsers.document_parser import DocumentParser


class ParserCache(object):
    """
    This class is a document parser cache.

    Parsers are built in pool worker processes (see
    _parse_http_response_in_worker) and, when the response body is small
    enough (see should_cache), kept in an LRU keyed by the value returned by
    get_cache_key.

    :author: Andres Riancho (andres.riancho@gmail.com)
    """
    # Maximum number of parsers kept in the LRU
    LRU_LENGTH = 40
    # Responses with a body of this length (or more) are never cached
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    PARSER_TIMEOUT = 60  # in seconds
    DEBUG = core_profiling_is_enabled()
    # Floor division is spelled explicitly so the worker count is always an
    # integer; "or 1" covers the single-CPU case where the division yields 0
    MAX_WORKERS = (2 if is_running_on_ci()
                   else (multiprocessing.cpu_count() // 2) or 1)

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.LRU_LENGTH)
        # The pool and the shared pid dict are created lazily by
        # start_workers()
        self._pool = None
        self._processes = None
        # hash_string -> Event which is set when the parsing of that response
        # has finished (successfully or not)
        self._parser_finished_events = {}
        self._start_lock = threading.RLock()

        # These are here for debugging:
        self._from_LRU = 0.0
        self._total = 0.0

    def start_workers(self):
        """
        Start the pool and workers, only the first call creates them.

        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # Keep track of which pid is processing which http response
                # pylint: disable=E1101
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=25,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers and release all the cached parsers

        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None
            self._processes = None

        # Make sure the parsers clear all resources
        for parser in self._cache.itervalues():
            parser.clear()

        # We don't need the parsers anymore
        self._cache.clear()

    def get_hit_rate(self):
        """
        :note: Only returns useful information if debugging is enabled, the
               hit counter is not updated otherwise
        :return: Cache hits divided by the total number of cache queries, or
                 None if no query was made yet
        """
        try:
            return self._from_LRU / self._total
        except ZeroDivisionError:
            return None

    def get_max_lru_items(self):
        """
        :return: The maximum number of parsers the LRU can hold
        """
        return self.LRU_LENGTH

    def get_current_lru_items(self):
        """
        :return: The number of parsers currently stored in the LRU
        """
        return len(self._cache)

    def get_total_queries(self):
        """
        :return: The number of cache queries performed by
                 get_document_parser_for (responses which are not cacheable
                 are not counted)
        """
        return self._total

    def get_cache_key(self, http_response):
        """
        Before I used md5, but I realized that it was unnecessary. I
        experimented a little bit with python's hash functions and the builtin
        hash was the fastest.

        At first I thought that the built-in hash wasn't good enough, as it
        could create collisions... but... given that the LRU has only 40
        positions, the real probability of a collision is too low.

        :param http_response: The http response instance
        :return: The key to be used in the cache for storing this http_response
        """
        # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the
        #       body before hashing
        uri_str = http_response.get_uri().url_string.encode('utf-8')

        body_str = http_response.body
        if isinstance(body_str, unicode):
            body_str = body_str.encode('utf-8', 'replace')

        _to_hash = body_str + uri_str

        # Added adler32 after finding some hash() collisions in builds
        hash_string = str(hash(_to_hash))
        hash_string += str(zlib.adler32(_to_hash))
        return hash_string

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def _test_parse_http_response(self, http_response, *args):
        """
        Left here for testing! Parses in the current process instead of
        using a pool worker.
        """
        return DocumentParser(http_response)

    def _parse_http_response_in_worker(self, http_response, hash_string):
        """
        This parses the http_response in a pool worker. This has two features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :param hash_string: The cache key for http_response, used to track
                            the worker pid and the "finished" event
        :return: The DocumentParser instance
        :raises BaseFrameworkException: When the parser did not finish in
                                        PARSER_TIMEOUT seconds
        :raises: Any error returned by the worker, which is re-raised here
        """
        event = multiprocessing.Event()
        self._parser_finished_events[hash_string] = event

        # Everything after registering the event lives inside the try, this
        # way the event is always released, even if starting the pool or
        # queueing the task fails. Otherwise threads asking for the same
        # response would wait on an event nobody is going to set.
        try:
            # Start the worker processes if needed
            self.start_workers()

            apply_args = (ProcessDocumentParser,
                          http_response,
                          self._processes,
                          hash_string)

            # Push the task to the workers
            result = self._pool.apply_async(apply_with_return_error,
                                            (apply_args,))

            try:
                parser_output = result.get(timeout=self.PARSER_TIMEOUT)
            except multiprocessing.TimeoutError:
                # Near the timeout error, so we make sure that the pid is still
                # running our "buggy" input
                pid = self._processes.pop(hash_string, None)
                if pid is not None:
                    try:
                        os.kill(pid, signal.SIGTERM)
                    except OSError as ose:
                        msg = 'An error occurred while killing the parser' \
                              ' process: "%s"'
                        om.out.debug(msg % ose)

                msg = '[timeout] The parser took more than %s seconds'\
                      ' to complete parsing of "%s", killed it!'

                om.out.debug(msg % (self.PARSER_TIMEOUT,
                                    http_response.get_url()))

                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)
            else:
                if isinstance(parser_output, Error):
                    parser_output.reraise()

        finally:
            # Just remove it so it doesn't use memory. The dict might not
            # exist if the pool failed to start or was already stopped, in
            # that case there is nothing to remove and raising here would
            # hide the real error
            processes = self._processes
            if processes is not None:
                processes.pop(hash_string, None)

            # Let other threads know that we're done
            event = self._parser_finished_events.pop(hash_string, None)

            if event is not None:
                # There is a really rare race condition where more than one
                # thread calls _parse_http_response_in_worker and queues the
                # same hash_string for processing, since it's so rare I believe
                # the best way to fix it is to:
                #
                #   * Avoid adding a lock
                #   * Accept that in these rare edge case we'll waste some CPU
                #
                # https://circleci.com/gh/andresriancho/w3af/1354
                event.set()

        return parser_output

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response using the cache if required

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raises BaseFrameworkException: When there is no parser for the
                                        response
        """
        hash_string = self.get_cache_key(http_response)

        if not self.should_cache(http_response):
            # Just return the document parser, no need to cache
            return self._parse_http_response_in_worker(http_response,
                                                       hash_string)

        parser_finished = self._parser_finished_events.get(hash_string, None)
        if parser_finished is not None:
            # There is one subprocess already processing this http response
            # body, the best thing to do here is to make this thread wait
            # until that process has finished
            #
            # The return value of wait() is not inspected: when the wait ends
            # without the event being set we just continue with the cache
            # lookup below
            try:
                parser_finished.wait(timeout=self.PARSER_TIMEOUT)
            except Exception:
                # Only real errors are translated, KeyboardInterrupt and
                # SystemExit need to reach the caller untouched
                #
                # Act just like when there is no parser
                msg = 'There is no parser for "%s".' % http_response.get_url()
                raise BaseFrameworkException(msg)

        # metric increase
        self._total += 1

        parser = self._cache.get(hash_string, None)
        if parser is not None:
            self._debug_handle_cache_hit(hash_string)
            return parser
        else:
            self._debug_handle_cache_miss(hash_string)

            # Create a new instance of DocumentParser, add it to the cache
            parser = self._parse_http_response_in_worker(http_response,
                                                         hash_string)
            self._cache[hash_string] = parser
            return parser

    def _debug_handle_cache_hit(self, hash_string):
        """
        Log the cache hit and count it, only when DEBUG is enabled
        """
        if self.DEBUG:
            om.out.debug('[parser_cache] Hit for %s' % hash_string)
            self._from_LRU += 1

    def _debug_handle_cache_miss(self, hash_string):
        """
        Log the cache miss, only when DEBUG is enabled
        """
        if self.DEBUG:
            om.out.debug('[parser_cache] Miss for %s' % hash_string)


class ProcessDocumentParser(DocumentParser):
    """
    DocumentParser which first stores the id of the process it runs in
    inside a shared object, using the response hash as key, so the process
    can be killed if needed.
    """
    def __init__(self, http_resp, processes, hash_string):
        # Register the pid before calling the parent constructor, which
        # presumably is where the (potentially slow) parsing happens
        processes[hash_string] = multiprocessing.current_process().pid

        super(ProcessDocumentParser, self).__init__(http_resp)


@atexit.register
def cleanup_pool():
    """
    Exit hook which stops the workers of the module-level parser cache, if
    this process defined one.

    :return: None
    """
    try:
        cache = dpc
    except NameError:
        # The module-level cache was never created in this process
        return

    cache.stop_workers()
    

def init_worker(log_queue):
    """
    Initializer which is called right after each Process in the ProcessPool
    is created; it sets up the variables/handlers the worker requires to
    work as expected.

    :param log_queue: The queue which is handed to log_sink_factory
    :return: None
    """
    # SIGINT is ignored inside the workers
    sigint_handler = signal.SIG_IGN
    signal.signal(signal.SIGINT, sigint_handler)

    log_sink_factory(log_queue)
    start_profiling_no_core()


# Module-level singletons.
# NOTE(review): presumably guarded so that only the main process creates
# them -- confirm against is_main_process().
if is_main_process():
    # Used by ParserCache.start_workers() to create the shared dict which
    # maps each response hash to the pid of the worker parsing it
    manager = multiprocessing.Manager()
    # Shared parser cache instance; cleanup_pool() stops its workers at exit
    dpc = ParserCache()
