# -*- Mode: Python; coding: utf-8; indent-tabs-mode: nil; tab-width: 4 -*-
### BEGIN LICENSE
# Copyright (C) 2012 Pete Burgers <deltify81@gmail.com>
# This program is free software: you can redistribute it and/or modify it 
# under the terms of the GNU General Public License version 3, as published 
# by the Free Software Foundation.
# 
# This program is distributed in the hope that it will be useful, but 
# WITHOUT ANY WARRANTY; without even the implied warranties of 
# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR 
# PURPOSE.  See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along 
# with this program.  If not, see <http://www.gnu.org/licenses/>.
### END LICENSE

import re
import HTMLParser
import difflib
import logging
# Module-level logger; its name is this module's name prefixed with "deltify."
logger = logging.getLogger("deltify." + __name__)

class MLStripper(HTMLParser.HTMLParser):
    """
    HTML parser that throws the markup away and keeps only the text.

    Feed it HTML with feed(), then call get_data() to obtain the
    collected text as a single string.
    """

    def __init__(self):
        # Initialise through the base-class constructor instead of calling
        # reset() directly, so that any parser state the constructor sets up
        # is present.  The explicit call (rather than super()) keeps this
        # working on Python 2.
        HTMLParser.HTMLParser.__init__(self)
        # Text fragments, in document order
        self.fed = []

    def handle_data(self, d):
        """ Collect a run of plain text. """
        self.fed.append(d)

    def handle_entityref(self, name):
        """
        Keep a named entity reference (e.g. "nbsp") as "&name;".
        Without this, a reference passed to this callback is dropped and
        the words on either side of it run together.
        """
        self.fed.append("&%s;" % name)

    def handle_charref(self, name):
        """ Keep a numeric character reference (e.g. "160") as "&#name;". """
        self.fed.append("&#%s;" % name)

    def get_data(self):
        """ Return all collected text joined into one string. """
        return ''.join(self.fed)

def diff(old, new, strip_tags):
    """
    Compare two HTML documents word by word and mark what is new.

    old and new are HTML strings.  If strip_tags is True, HTML tags are
    completely removed from both before they are compared.

    Returns a tuple (diff, words_highlighted):
    diff is the new document rebuilt from its words, with inserted or
    replaced runs passed through highlight_text();
    words_highlighted is the total length of those inserted/replaced runs
    (every token in the run is counted).

    Words that only exist in old are left out of the result.
    Raises ValueError if the sequence matcher reports an unknown opcode.
    """
    old_words = to_list(old, strip_tags)
    new_words = to_list(new, strip_tags)

    matcher = difflib.SequenceMatcher(None, old_words, new_words)

    pieces = []
    highlighted = 0
    for tag, _old_lo, _old_hi, lo, hi in matcher.get_opcodes():
        if tag == "delete":
            # Deleted words have no counterpart in the new document
            continue

        segment = new_words[lo:hi]
        if tag in ("replace", "insert"):
            highlighted += hi - lo
            pieces.append(highlight_text(segment))
        elif tag == "equal":
            pieces.append(' '.join(segment))
        else:
            raise ValueError("Unknown opcode: %s" % tag)

    return (from_list(pieces), highlighted)

# Matches an HTML comment, including one that spans several lines
COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
# Matches each whitespace character that is followed by a ">" with no "<" or
# ">" in between, i.e. whitespace inside a tag
TAG_SPACE_RE = re.compile(r"\s(?=[^<>]*>)")
# Placeholder that stands in for whitespace inside a tag while the HTML is
# split into words; from_list() turns it back into a space
TAG_SPACE = "__D-SpC__"

def to_list(html, strip_tags):
    """
    Split an HTML string into a list of whitespace-separated words.

    If strip_tags is True the markup is discarded with MLStripper and only
    the text is split.  Otherwise comments are removed and every tag is
    kept as a single word: whitespace inside a tag is replaced by
    TAG_SPACE, and each tag is padded with spaces so it separates from the
    neighbouring text.
    """
    if not strip_tags:
        # Remove comments, since TAG_SPACE_RE doesn't like them
        logger.debug("Removing comments...")
        without_comments = COMMENT_RE.sub("", html)

        # Protect whitespace inside tags from the split below
        logger.debug("Removing spaces inside tags...")
        protected = TAG_SPACE_RE.sub(TAG_SPACE, without_comments)

        # Make sure every tag is surrounded by spaces
        logger.debug("Padding tags...")
        padded = protected.replace("<", " <").replace(">", "> ")
        return padded.split()

    logger.debug("Stripping all HTML tags...")
    # NOTE(review): the parser is fed but never close()d - confirm that no
    # text can still be buffered inside it when get_data() is called
    text_only = MLStripper()
    text_only.feed(html)
    return text_only.get_data().split()

def from_list(data):
    """
    Join a list of strings with single spaces and turn every TAG_SPACE
    placeholder back into a space, restoring the spaces inside tags.
    """
    return " ".join(data).replace(TAG_SPACE, " ")

def highlight_text(words):
    """
    Wrap the non-tag words of a list in highlighting spans.

    words is a list of non-empty strings; a word starting with '<' is
    treated as a tag.  Each run of consecutive non-tag words is enclosed in
    <span class='insdiff'>...</span>, tags are left outside the spans, and
    every word is followed by a single space.

    Returns the resulting string ("" for an empty list).
    """
    pieces = []
    span_open = False
    for word in words:
        is_tag = word[0] == '<'
        if is_tag and span_open:
            # A tag ends the current highlighted run
            pieces.append("</span>")
            span_open = False
        elif not is_tag and not span_open:
            # First plain word after a tag (or at the start) opens a run
            pieces.append("<span class='insdiff'>")
            span_open = True

        pieces.append(word + " ")

    if span_open:
        # Close the run that reached the end of the list
        pieces.append("</span>")
    return "".join(pieces)

