Source code for web_monitoring_diff.basic_diffs

from bs4 import Comment
from fast_diff_match_patch import diff
import html5_parser
import re
import sys


# BeautifulSoup can sometimes exceed the default Python recursion limit (1000),
# so raise the limit for the whole process when this module is imported.
sys.setrecursionlimit(10000)

# Maps diff-match-patch operation tags to the numeric codes we use:
# '=' (unchanged) -> 0, '-' (deletion) -> -1, '+' (insertion) -> 1.
diff_codes = {'=': 0, '-': -1, '+': 1}

# Matches two or more consecutive repetitions of "optional non-newline
# whitespace, a newline, then any whitespace". _get_visible_text() replaces
# each such run with a single blank line ('\n\n').
REPEATED_BLANK_LINES = re.compile(r'([^\S\n]*\n\s*){2,}')


def compare_length(a_body, b_body):
    """
    Compute difference in response body lengths. (Does not compare contents.)

    Parameters
    ----------
    a_body
        The first response body; anything that supports ``len()``.
    b_body
        The second response body; anything that supports ``len()``.

    Returns
    -------
    dict
        ``{'diff': n}`` where ``n`` is ``len(b_body) - len(a_body)``. It is
        positive when ``b_body`` is longer and negative when it is shorter.
    """
    length_delta = len(b_body) - len(a_body)
    return {'diff': length_delta}
def identical_bytes(a_body, b_body):
    """
    Compute whether response bodies are exactly identical.

    Parameters
    ----------
    a_body
        The first response body.
    b_body
        The second response body.

    Returns
    -------
    dict
        ``{'diff': flag}`` where ``flag`` is the result of
        ``a_body == b_body``. Note that the value is ``True`` when the two
        bodies are the *same*, despite the key being named ``'diff'``.
    """
    are_identical = a_body == b_body
    return {'diff': are_identical}
def _get_text(html):
    """
    Extract textual content from HTML.

    The markup is parsed with html5_parser into a BeautifulSoup tree, every
    comment node is detached from that tree, and all remaining string nodes
    are returned.

    Parameters
    ----------
    html
        The HTML markup to parse (passed straight to ``html5_parser.parse``).

    Returns
    -------
    The result of ``soup.find_all(string=True)``: the string nodes left in
    the tree, including those inside tags such as ``<script>`` or
    ``<style>`` (filtering those out is the job of ``_is_visible``).
    """
    soup = html5_parser.parse(html, treebuilder='soup', return_root=False)
    # Detach comments so they never show up among the returned strings. A
    # plain loop is used because only the side effect of extract() matters.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    # `string=` is the current name of the argument formerly called `text=`;
    # it is already used for the comment search above.
    return soup.find_all(string=True)


# Names of parent tags whose text content is not treated as visible.
INVISIBLE_TAGS = {'style', 'script', '[document]', 'head', 'title'}

# No longer used by _is_visible (see the note there); kept so that the
# module-level name remains available to any existing references.
_RE_HTML_COMMENT = re.compile('<!--.*-->')


def _is_visible(element):
    """
    A best-effort guess at whether an HTML element is visible on the page.

    Parameters
    ----------
    element
        A node from a BeautifulSoup tree; it must have a ``parent``.

    Returns
    -------
    bool
        ``False`` if the node's parent tag is listed in ``INVISIBLE_TAGS`` or
        if the node is a comment, ``True`` otherwise.
    """
    # adapted from https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages
    if element.parent.name in INVISIBLE_TAGS:
        return False
    # The previous check ran _RE_HTML_COMMENT.match() over
    # str(element.encode('utf-8')). On Python 3 that text always starts with
    # "b'", so the anchored pattern could never match and comments were
    # reported as visible. Testing the node type expresses the intent
    # directly and cannot hide ordinary text that merely looks like a comment.
    return not isinstance(element, Comment)


def _get_visible_text(html):
    """
    Return the visible text of an HTML document as a single string.

    The visible string nodes are joined with single spaces, every match of
    ``REPEATED_BLANK_LINES`` is replaced by one blank line, and leading and
    trailing whitespace is stripped.

    Parameters
    ----------
    html
        The HTML markup to extract text from.

    Returns
    -------
    str
        The normalized visible text.
    """
    visible_strings = filter(_is_visible, _get_text(html))
    text = ' '.join(visible_strings)
    return REPEATED_BLANK_LINES.sub('\n\n', text).strip()
def side_by_side_text(a_text, b_text):
    """
    Extract the visible text from both response bodies.

    Parameters
    ----------
    a_text
        HTML of the first response body.
    b_text
        HTML of the second response body.

    Returns
    -------
    dict
        ``{'diff': {'a_text': ..., 'b_text': ...}}`` where each inner value
        is the result of ``_get_visible_text`` for the matching input. No
        comparison between the two texts is performed here.
    """
    visible_texts = {
        'a_text': _get_visible_text(a_text),
        'b_text': _get_visible_text(b_text),
    }
    return {'diff': visible_texts}
def compute_dmp_diff(a_text, b_text, timelimit=4):
    """
    Diff two texts with diff-match-patch and translate the result codes.

    Parameters
    ----------
    a_text : str or bytes
        The "before" text.
    b_text : str or bytes
        The "after" text. Must be of the same kind as ``a_text`` (both
        ``str`` or both ``bytes``).
    timelimit : int or float, optional
        Time limit handed to the underlying ``diff`` call (callers in this
        module treat it as seconds). Defaults to 4.

    Returns
    -------
    list of tuple
        One ``(code, text)`` pair per change reported by ``diff``, where
        ``code`` comes from ``diff_codes``: ``-1`` for a deletion, ``1`` for
        an insertion and ``0`` for unchanged text.

    Raises
    ------
    TypeError
        If the two texts are not both ``str`` or both ``bytes``.
    """
    both_str = isinstance(a_text, str) and isinstance(b_text, str)
    both_bytes = isinstance(a_text, bytes) and isinstance(b_text, bytes)
    # Reject mixed str/bytes pairs here as well: the error message has always
    # required both texts to be of the same kind, but the old check let a
    # mixed pair through to the diff library.
    if not (both_str or both_bytes):
        raise TypeError("Both the texts should be either of type 'str' or 'bytes'.")

    changes = diff(a_text, b_text, checklines=False, timelimit=timelimit,
                   cleanup="Semantic", counts_only=False)
    return [(diff_codes[operation], text) for operation, text in changes]
def html_text_diff(a_text, b_text):
    """
    Diff the visible textual content of an HTML document.

    The visible text of each document is extracted with
    ``_get_visible_text`` and the two results are compared with
    ``compute_dmp_diff`` using a 2 second time limit.

    Parameters
    ----------
    a_text
        HTML of the "before" document.
    b_text
        HTML of the "after" document.

    Returns
    -------
    dict
        ``{'change_count': int, 'diff': list}``. ``diff`` is the list of
        ``(code, text)`` tuples produced by ``compute_dmp_diff`` (``-1``
        deletion, ``1`` insertion, ``0`` unchanged) and ``change_count`` is
        the number of entries whose code is non-zero.

    Examples
    --------
    >>> result = html_text_diff('<p>Deleted</p><p>Unchanged</p>',
    ...                         '<p>Added</p><p>Unchanged</p>')
    >>> sorted(result)
    ['change_count', 'diff']

    For these inputs ``result['diff']`` looks something like
    ``[(-1, 'Delet'), (1, 'Add'), (0, 'ed Unchanged')]``; the exact
    segmentation is decided by diff-match-patch.
    """
    timelimit = 2  # seconds
    a_visible = _get_visible_text(a_text)
    b_visible = _get_visible_text(b_text)
    changes = compute_dmp_diff(a_visible, b_visible, timelimit=timelimit)
    # Count insertions and deletions (non-zero codes) without building a
    # temporary list.
    change_count = sum(1 for code, _ in changes if code)
    return {'change_count': change_count, 'diff': changes}
def html_source_diff(a_text, b_text):
    """
    Diff the full source code of an HTML document.

    The two inputs are compared as-is (no text extraction) with
    ``compute_dmp_diff`` using a 2 second time limit.

    Parameters
    ----------
    a_text : str or bytes
        Source of the "before" document.
    b_text : str or bytes
        Source of the "after" document.

    Returns
    -------
    dict
        ``{'change_count': int, 'diff': list}``. ``diff`` is the list of
        ``(code, text)`` tuples produced by ``compute_dmp_diff`` (``-1``
        deletion, ``1`` insertion, ``0`` unchanged) and ``change_count`` is
        the number of entries whose code is non-zero.

    Raises
    ------
    TypeError
        Propagated from ``compute_dmp_diff`` when the inputs are not both
        ``str`` or both ``bytes``.

    Examples
    --------
    >>> result = html_source_diff('<p>Deleted</p><p>Unchanged</p>',
    ...                           '<p>Added</p><p>Unchanged</p>')
    >>> sorted(result)
    ['change_count', 'diff']

    For these inputs ``result['diff']`` looks something like
    ``[(0, '<p>'), (-1, 'Delet'), (1, 'Add'), (0, 'ed</p><p>Unchanged</p>')]``;
    the exact segmentation is decided by diff-match-patch.
    """
    timelimit = 2  # seconds
    changes = compute_dmp_diff(a_text, b_text, timelimit=timelimit)
    # Count insertions and deletions (non-zero codes) without building a
    # temporary list.
    change_count = sum(1 for code, _ in changes if code)
    return {'change_count': change_count, 'diff': changes}