Source code for web_monitoring_diff.basic_diffs

from bs4 import Comment
from fast_diff_match_patch import diff
import html5_parser
import re
import sys


# BeautifulSoup can sometimes exceed the default Python recursion limit (1000),
# so raise it. NOTE: this runs when the module is imported and changes the
# limit for the whole interpreter, not just for code in this module.
sys.setrecursionlimit(10000)

# Maps diff-match-patch operation tags to the numeric codes used in the diffs
# this module returns: 0 for unchanged text, -1 for removed, 1 for added.
diff_codes = {'=': 0, '-': -1, '+': 1}

# Matches a run of whitespace that contains at least two newlines (optionally
# preceded by other whitespace on the first line). Used to collapse repeated
# blank lines in extracted text down to a single blank line.
REPEATED_BLANK_LINES = re.compile(r'([^\S\n]*\n\s*){2,}')



[docs]
def compare_length(a_body, b_body):
    """
    Report how much longer (or shorter) ``b_body`` is than ``a_body``.

    Only the lengths are compared; the contents are never inspected.

    Returns
    -------
    dict
        ``{'diff': n}`` where ``n`` is ``len(b_body) - len(a_body)``. It is
        negative when ``b_body`` is the shorter of the two.
    """
    length_after = len(b_body)
    length_before = len(a_body)
    return dict(diff=length_after - length_before)




[docs]
def identical_bytes(a_body, b_body):
    """
    Check whether two response bodies are exactly equal.

    Returns
    -------
    dict
        ``{'diff': flag}`` where ``flag`` is the result of
        ``a_body == b_body``. Note that the value is ``True`` when the bodies
        are the *same*, despite the key being named ``diff``.
    """
    bodies_match = a_body == b_body
    return dict(diff=bodies_match)



def _get_text(html):
    """
    Extract the text nodes from an HTML document, leaving out comments.

    Parameters
    ----------
    html
        The HTML markup to parse. It is handed directly to
        ``html5_parser.parse``.

    Returns
    -------
    The text nodes remaining in the parsed tree once comments have been
    removed, as returned by ``find_all(string=True)``.
    """
    soup = html5_parser.parse(html, treebuilder='soup', return_root=False)

    # Detach comment nodes first so they are not returned among the text
    # nodes. `find_all` hands back a complete result set before we start
    # mutating the tree, so extracting inside the loop is safe.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()

    # `string=` is the current name for the legacy `text=` keyword; use it
    # here too so both queries in this function agree.
    return soup.find_all(string=True)


# Names of parent elements whose text is treated as not visible on the page.
# ('[document]' is presumably the parser's name for the document root --
# confirm against BeautifulSoup.)
INVISIBLE_TAGS = {'style', 'script', '[document]', 'head', 'title'}
# Pattern for text that starts with an HTML comment (`<!-- ... -->`).
_RE_HTML_COMMENT = re.compile(r'<!--.*-->')


def _is_visible(element):
    """
    Make a best-effort guess at whether a text node is visible on the page.

    A node is considered invisible when its parent's tag name is listed in
    ``INVISIBLE_TAGS``, or when the comment pattern matches the stringified,
    UTF-8 encoded node.

    Returns
    -------
    bool
        ``False`` for nodes judged invisible, ``True`` otherwise.
    """
    # adapted from https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages
    if element.parent.name in INVISIBLE_TAGS:
        return False

    # NOTE(review): assuming `element.encode` returns `bytes`, `str()` of it
    # is the repr form beginning with "b'" (or 'b"'), so a pattern anchored at
    # '<!--' likely never matches here. Behavior is kept as-is; confirm
    # whether this check is still wanted.
    encoded_repr = str(element.encode('utf-8'))
    return _RE_HTML_COMMENT.match(encoded_repr) is None


def _get_visible_text(html):
    """
    Return the visible text of an HTML document as a single string.

    The text nodes that pass ``_is_visible`` are joined with single spaces,
    runs matched by ``REPEATED_BLANK_LINES`` are collapsed to one blank line,
    and leading/trailing whitespace is stripped.
    """
    visible_nodes = (node for node in _get_text(html) if _is_visible(node))
    joined = ' '.join(visible_nodes)
    collapsed = REPEATED_BLANK_LINES.sub('\n\n', joined)
    return collapsed.strip()



[docs]
def side_by_side_text(a_text, b_text):
    """
    Extract the visible text from both response bodies.

    Returns
    -------
    dict
        ``{'diff': {'a_text': ..., 'b_text': ...}}`` where each value is the
        visible text extracted from the corresponding argument. No comparison
        between the two is performed.
    """
    visible_a = _get_visible_text(a_text)
    visible_b = _get_visible_text(b_text)
    return {'diff': {'a_text': visible_a, 'b_text': visible_b}}



def compute_dmp_diff(a_text, b_text, timelimit=4):
    """
    Compute a diff-match-patch diff between two texts.

    Parameters
    ----------
    a_text : str or bytes
        The "before" text.
    b_text : str or bytes
        The "after" text.
    timelimit : int or float, optional
        Passed through to ``diff`` as its ``timelimit`` argument. The callers
        in this module treat it as a number of seconds.

    Returns
    -------
    list of tuple
        One ``(code, text)`` pair per change, where ``code`` is the
        ``diff_codes`` value for the operation tag reported by ``diff``.

    Raises
    ------
    TypeError
        If either argument is neither ``str`` nor ``bytes``. Note that the
        check accepts a ``str`` paired with ``bytes``; how ``diff`` treats
        such mixed input is not determined here.
    """
    accepted_types = (str, bytes)
    if not (isinstance(a_text, accepted_types)
            and isinstance(b_text, accepted_types)):
        raise TypeError("Both the texts should be either of type 'str' or 'bytes'.")

    raw_changes = diff(a_text, b_text, checklines=False, timelimit=timelimit,
                       cleanup="Semantic", counts_only=False)
    return [(diff_codes[entry[0]], entry[1]) for entry in raw_changes]



[docs]
def html_text_diff(a_text, b_text):
    """
    Diff the visible textual content of two HTML documents.

    Returns
    -------
    dict
        ``diff`` holds the list of ``(code, text)`` pairs produced by
        ``compute_dmp_diff`` for the visible text of each document, and
        ``change_count`` is the number of those pairs whose code is non-zero.

    Examples
    --------
    >>> html_text_diff('<p>Deleted</p><p>Unchanged</p>',
    ...                '<p>Added</p><p>Unchanged</p>')
    {'change_count': 2, 'diff': [(-1, 'Delet'), (1, 'Add'), (0, 'ed Unchanged')]}
    """
    time_budget = 2  # seconds

    visible_a = _get_visible_text(a_text)
    visible_b = _get_visible_text(b_text)
    changes = compute_dmp_diff(visible_a, visible_b, timelimit=time_budget)

    # Entries with code 0 are unchanged text; everything else is a change.
    change_count = sum(1 for code, _ in changes if code)
    return {'change_count': change_count, 'diff': changes}




[docs]
def html_source_diff(a_text, b_text):
    """
    Diff the full source code of two HTML documents.

    Returns
    -------
    dict
        ``diff`` holds the list of ``(code, text)`` pairs produced by
        ``compute_dmp_diff`` for the raw inputs, and ``change_count`` is the
        number of those pairs whose code is non-zero.

    Examples
    --------
    >>> html_source_diff('<p>Deleted</p><p>Unchanged</p>',
    ...                  '<p>Added</p><p>Unchanged</p>')
    {'change_count': 2, 'diff': [(0, '<p>'), (-1, 'Delet'), (1, 'Add'), (0, 'ed</p><p>Unchanged</p>')]}
    """
    time_budget = 2  # seconds
    changes = compute_dmp_diff(a_text, b_text, timelimit=time_budget)

    # Entries with code 0 are unchanged text; everything else is a change.
    change_count = sum(1 for code, _ in changes if code)
    return {'change_count': change_count, 'diff': changes}