# -*- Mode: Python; coding: utf-8; indent-tabs-mode: nil; tab-width: 4 -*-
### BEGIN LICENSE
# Copyright (C) 2012 Pete Burgers <deltify81@gmail.com>
# This program is free software: you can redistribute it and/or modify it 
# under the terms of the GNU General Public License version 3, as published 
# by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranties of 
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR 
# PURPOSE.  See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along 
# with this program.  If not, see <http://www.gnu.org/licenses/>.
### END LICENSE

import os
import sys
import datetime
import calendar
import urllib2
import re
import traceback
import logging
logger = logging.getLogger(__name__)

import deltify.utils as utils
import deltify.html_diff as html_diff
from deltify.bookmarks import FIELDS, CHARSET_AUTO

class Scan:
    """Outcome of fetching one bookmark's page: decoded content or error info.

    Attributes (populated by run()):
      charset         -- charset used to decode the page; CHARSET_AUTO until
                         run() replaces it with the bookmark's setting or the
                         detected charset.
      time            -- UNIX timestamp (UTC) taken when run() started.
      data            -- page content as a unicode string; None until a scan
                         succeeds.
      error           -- class name of the exception that aborted the scan,
                         or None.
      error_traceback -- formatted traceback of that exception, or None.
      error_function  -- label of the code location that recorded the error,
                         or None.
    """

    def __init__(self):
        self.charset = CHARSET_AUTO
        self.time = None
        self.data = None
        self.error = None
        self.error_traceback = None
        self.error_function = None

    def run(self, bookmark):
        """Fetch and decode the page referenced by bookmark.

        bookmark is indexed with FIELDS.title, FIELDS.character_set and
        FIELDS.uri. If its character set is CHARSET_AUTO, the charset is
        taken from the HTTP Content-Type header instead.

        Returns True on success (self.data holds the decoded page). Any
        exception is recorded via _record_error(), logged, and reported by
        returning False; nothing is raised to the caller.
        """
        logger.debug("Scanning %s...", bookmark[FIELDS.title])
        self.charset = bookmark[FIELDS.character_set]
        self.time = _get_utc_timestamp()
        try:
            response = urllib2.urlopen(bookmark[FIELDS.uri])
            try:
                if self.charset == CHARSET_AUTO:
                    self.charset = _get_charset(response)
                logger.debug("Decoding using charset: %s", self.charset)
                raw = response.read()
            finally:
                # Always release the connection, including when charset
                # detection or the read fails part-way through.
                response.close()

            self.data = unicode(raw, self.charset)
            return True

        except Exception: # pylint: disable=W0703
            # Deliberately broad: a failed scan must never abort the caller.
            self._record_error("scan.run")
            logger.error("%s: %s", bookmark[FIELDS.title], self.error)
            return False

    def diff(self, other_filename, strip_tags=False):
        """
        Returns a tuple (diff, words_highlighted), where
        diff is self.data (HTML) with the differences from other_filename highlighted
        words_highlighted is the number of words that were highlighted
        If strip_tags is True, HTML tags are completely removed before diffing.

        If utils.load_scan() returns nothing for other_filename, the result
        is (self.data, 0), i.e. the page with no highlighting.
        """
        other_data = utils.load_scan(other_filename)
        if not other_data:
            return (self.data, 0)

        # Split into head and body
        old_head, old_body = _split_head(other_data)
        new_head, new_body = _split_head(self.data)

        # self._timed_diff(old_body, new_body)

        # Only the bodies are diffed; the new page's head is kept verbatim.
        (diff_body, words_highlighted) = html_diff.diff(old_body, new_body, strip_tags)
        diff = new_head + diff_body
        return (diff, words_highlighted)

    def _record_error(self, function=""):
        """Store details of the exception currently being handled.

        Must be called from inside an except block. function is a free-form
        label identifying where the error was caught.
        """
        # sys.exc_info() is per-thread; the old sys.exc_type global is
        # deprecated and not thread-safe.
        exc_type = sys.exc_info()[0]
        self.error = exc_type.__name__ if exc_type is not None else None
        self.error_traceback = traceback.format_exc().strip()
        self.error_function = function


def _get_charset(request, default="utf-8"):
    """Return the charset named in the response's Content-Type header.

    request.info().getparam() doesn't work with GAE, so the header is
    parsed by hand.

    request is an open response object providing info().getheader().
    default is returned (and a debug message logged) when the header is
    missing or carries no usable charset parameter.

    The parameter name is matched case-insensitively, and surrounding
    quotes or a trailing ';' are not part of the returned value.
    """
    # getheader() returns None when the server sent no Content-Type at all.
    content_type = request.info().getheader("content-type") or ""
    charset_match = re.search(r"""charset\s*=\s*["']?([^\s;"']+)""",
                              content_type, flags=re.IGNORECASE)
    if charset_match:
        return charset_match.group(1)

    logger.debug("HTTP charset not found, using default (%s).",
                 default)
    return default

def _get_utc_timestamp():
    """Return the current UTC time as an integer UNIX timestamp.

    The naive UTC datetime is turned into a time tuple and handed to
    calendar.timegm(), which interprets the tuple as UTC.
    Approach taken from
    http://ruslanspivak.com/2011/07/20/how-to-convert-python-utc-datetime-object-to-unix-timestamp/
    """
    now_utc = datetime.datetime.utcnow()
    utc_tuple = now_utc.utctimetuple()
    return calendar.timegm(utc_tuple)

def _split_head(html):
    """Split the html string into head and body.

    Returns a tuple (head, body): body starts at the first opening <body>
    tag (matched case-insensitively, tolerating whitespace after '<') and
    head is everything before it. If no <body> tag is found, head is ""
    and body is the whole string.
    """
    # The lookahead requires the tag name to end right after "body", so
    # tags such as <body-nav> or <bodytext> are not mistaken for <body>.
    match = re.search(r"<\s*body(?=[\s/>]|$)", html, flags=re.IGNORECASE)
    pos = match.start() if match else 0
    return html[:pos], html[pos:]

