Source code for web_monitoring_diff.html_links_diff

import html5_parser
from .content_type import raise_if_not_diffable_html
from .basic_diffs import compute_dmp_diff
from .utils import get_color_palette
from difflib import SequenceMatcher
from .html_render_diff import (get_title, _html_for_dmp_operation,
                               undiffable_content_tags)
import re



[docs]
def links_diff(a_text, b_text, a_headers=None, b_headers=None,
               content_type_options='normal'):
    """
    Extracts all the outgoing links from a page and produces a diff of an
    HTML document that is simply a list of the text and URL of those links.

    It ignores links that merely navigate within the page.

    NOTE: this diff currently suffers from the fact that our diff server does
    not know the original URL of the content, so it can identify:

    >>> <a href="#anchor-in-this-page">Text</a>

    as an internal link, but not:

    >>> <a href="http://this.domain.com/this/page#anchor-in-this-page">Text</a>
    """
    raise_if_not_diffable_html(
        a_text,
        b_text,
        a_headers,
        b_headers,
        content_type_options)

    a_soup = html5_parser.parse(a_text, treebuilder='soup', return_root=False)
    b_soup = html5_parser.parse(b_text, treebuilder='soup', return_root=False)

    a_links = sorted(
        set([Link.from_element(element) for element in _find_outgoing_links(a_soup)]),
        key=lambda link: link.text.lower() + f'({link.href})')
    b_links = sorted(
        set([Link.from_element(element) for element in _find_outgoing_links(b_soup)]),
        key=lambda link: link.text.lower() + f'({link.href})')

    matcher = SequenceMatcher(a=a_links, b=b_links)
    opcodes = matcher.get_opcodes()
    diff = list(_assemble_diff(a_links, b_links, opcodes))

    return {
        'change_count': _count_changes(diff),
        'diff': diff,
        'a_parsed': a_soup,
        'b_parsed': b_soup
    }




[docs]
def links_diff_json(a_text, b_text, a_headers=None, b_headers=None,
                    content_type_options='normal'):
    """
    Generate a diff of all outgoing links (see `links_diff()`) where the `diff`
    property is formatted as a list of change codes and values.
    """
    diff = links_diff(a_text, b_text, a_headers, b_headers,
                      content_type_options)
    return {
        'change_count': diff['change_count'],
        'diff': diff['diff']
    }




[docs]
def links_diff_html(a_text, b_text, a_headers=None, b_headers=None,
                    content_type_options='normal'):
    """
    Generate a diff of all outgoing links (see `links_diff()`) where the `diff`
    property is an HTML string. Note the actual return type is still JSON.
    """
    diff = links_diff(a_text, b_text, a_headers, b_headers,
                      content_type_options)
    soup = _render_html_diff(diff['diff'])

    # Add styling and metadata
    color_palette = get_color_palette()
    change_styles = soup.new_tag(
        'style',
        type='text/css',
        id='wm-diff-style')
    change_styles.string = f"""
        body {{
            font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
            margin: 0;
        }}
        .links-list {{
            border-collapse: collapse;
            table-layout: fixed;
            width: 100%;
        }}
        .links-list th {{
            background: #f6f6f6;
            border-bottom: 1px solid #ccc;
            padding: 0.25em;
            text-align: left;
        }}
        .links-list > tbody > tr:first-child > td {{
            padding-top: 0.5em;
        }}
        .links-list--item > td {{
            border-bottom: 1px solid #fff;
            opacity: 0.5;
            padding: 0.25em;
        }}
        .links-list--change-type-col {{
            width: 1.5em;
        }}
        .links-list--text-col,
        .links-list--href-col {{
            width: 50%;
        }}
        .links-list--href a {{
            line-break: loose;
            word-break: break-all;
        }}
        [wm-has-deletions] > td,
        [wm-has-insertions] > td {{
            background-color: #eee;
            opacity: 1;
        }}
        [wm-inserted] > td {{
            background-color: {color_palette['differ_insertion']};}}
        [wm-deleted] > td  {{
            background-color: {color_palette['differ_deletion']};}}
        ins.wm-diff {{ background-color: {color_palette['differ_insertion']}
        !important; all: unset;}}
        del.wm-diff {{ background-color: {color_palette['differ_deletion']}
        !important; all: unset;}}
        script {{display: none !important;}}"""

    soup.head.append(change_styles)
    soup.title.string = get_title(diff['b_parsed'])

    return {
        'change_count': diff['change_count'],
        'diff': soup.prettify(formatter=None)
    }



class Link:
    """
    Represents a link that was used on the page. Designed to be fed into
    SequenceMatcher for diffing.

    Note that Link objects have a very loose sense of equality. That is:

        link_a == link_b

    only indicates that link_a and link_b may be sorta kinda be representing
    the same thing, but that you should still compare them in a more nuanced
    way. To check for strict equality, use their hashes:

        hash(link_a) == hash(link_b)
    """

    @classmethod
    def from_element(cls, element):
        """
        Create a Link from a Beautiful Soup `<a>` element
        """
        return cls(element['href'], _get_link_text(element))

    def __init__(self, href, text):
        # TODO: add a `url` so we can differentiate the href and the actual
        # target of the link (in the case of paths rather than URLs)
        # This requires knowing the base origin and path for the link.
        self.href = self._clean_href(href)
        self.text = text.strip()

    def __hash__(self):
        return hash((self.href, self.text.lower()))

    def __eq__(self, other):
        # This is actually a "rough" equality check -- trying to get a sense
        # of whether two links are the same "thing" even if they have
        # internal differences in their text or href.
        return self.href == other.href or self.text.lower() == other.text.lower()

    def json(self):
        return {'text': self.text, 'href': self.href}

    def _clean_href(self, href):
        origin_match = re.match(r'^([\w+\-]+:)?//[^/]+', href)
        if origin_match:
            return origin_match.group(0).lower() + href[origin_match.end(0):]
        return href


def _find_outgoing_links(soup):
    """
    Yields each of the `<a>` elements in a Beautiful Soup document that point
    to other pages.
    """
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('#'):
            yield link


def _get_link_text(link):
    """
    Get the "text" to diff and display for an `<a>` element.
    """
    # The content of tags like <script> and <style> shows up in the `.text`
    # attribute, so just go ahead and remove them from the DOM
    for invisible_tag in link.find_all(undiffable_content_tags):
        invisible_tag.extract()

    for image in link.find_all('img'):
        alt = image.get('alt')
        if alt:
            image.replace_with(f'[image: {alt}]')
        else:
            image.replace_with('[image]')

    text = link.text.strip()
    if not text:
        if link.has_attr('title'):
            text = f'[tooltip: {link["title"]}]'
        else:
            text = '[no text]'

    return text


def _count_changes(opcodes):
    return len([operation for operation in opcodes if operation[0] != 0])


def _assemble_diff(a, b, opcodes):
    """
    Yield each link in the diff with a code for addition (1), removal (-1),
    unchanged (0), or nested diff (100).

    Parameters
    ----------
    a : list
        The list of links in the previous verson of a document.
    b : list
        The list of links in the new version of a document.
    opcodes : list
        List of opcodes from SequenceMatcher that defines what to do with each
        item in the lists of links.
    """
    # If we have the lists:
    #    A                             B
    #    --------------------------- | ------------------------------
    # 1. "Pony time!" ponytime.com/a | "Pony time!" ponytime.com/b
    # 2. "Pony time!" ponytime.com/b | "Donkey time." not-ponies.com/
    #
    # SequenceMatcher can wind up putting the first row together as equal, but
    # then sectioning off the second row as deleted from A. That's because the
    # links in the first row qualify as "roughly" equal, which is equal as far
    # as SequenceMatcher is allowed to know. Unfortunately, that prevents it
    # from correctly identifying that A2 + B1 are actually the ones that are
    # the same and A1 is the one that should be marked as gone. We need to do a
    # first pass to move any insertions or deletions that are exactly equal to
    # a link in the adjacent set SequenceMatcher marked as equal into that
    # equal set. (Note this also means our equal set handling below will have
    # to account for A having more items than B or vice-versa, which couldn't
    # happen before.)
    operation_count = len(opcodes)
    for index, operation in enumerate(opcodes):
        command = operation[0]
        if command == 'equal':
            continue

        last_equal = None
        next_equal = None
        if index > 0 and opcodes[index - 1][0] == 'equal':
            last_equal = a[opcodes[index - 1][2] - 1]
        if index + 1 < operation_count and opcodes[index + 1][0] == 'equal':
            next_equal = a[opcodes[index + 1][1]]

        if (command == 'insert' or command == 'replace'):
            for link_index, link in enumerate(b[operation[3]:operation[4]]):
                # FIXME: really we should stop looking for last_equal after
                # finding a non-equal one... this implementatin assumes (I
                # *think* correctly, but not with total confidence) that equal
                # items will always be lined up on one side or the other of the
                # added/removed content.
                if last_equal and last_equal == link:
                    # opcodes are tuples, so we can't just edit them.
                    last_op = opcodes[index - 1]
                    opcodes[index - 1] = (last_op[0], last_op[1], last_op[2], last_op[3], last_op[4] + 1)
                    opcodes[index] = (operation[0], operation[1], operation[2], operation[3] + 1, operation[4])
                elif next_equal and next_equal == link:
                    # opcodes are tuples, so we can't just edit them.
                    next_op = opcodes[index + 1]
                    opcodes[index] = (operation[0], operation[1], operation[2], operation[3], operation[4] - 1)
                    opcodes[index + 1] = (next_op[0], next_op[1], next_op[2], next_op[3] - 1, next_op[4])

        if (command == 'delete' or command == 'replace'):
            for link_index, link in enumerate(a[operation[1]:operation[2]]):
                # FIXME: really we should stop looking for last_equal after
                # finding a non-equal one... this implementatin assumes (I
                # *think* correctly, but not with total confidence) that equal
                # items will always be lined up on one side or the other of the
                # added/removed content.
                if last_equal and last_equal == link:
                    # opcodes are tuples, so we can't just edit them.
                    last_op = opcodes[index - 1]
                    opcodes[index - 1] = (last_op[0], last_op[1], last_op[2] + 1, last_op[3], last_op[4])
                    opcodes[index] = (operation[0], operation[1] + 1, operation[2], operation[3], operation[4])
                elif next_equal and next_equal == link:
                    # opcodes are tuples, so we can't just edit them.
                    next_op = opcodes[index + 1]
                    opcodes[index] = (operation[0], operation[1], operation[2] - 1, operation[3], operation[4])
                    opcodes[index + 1] = (next_op[0], next_op[1] - 1, next_op[2], next_op[3], next_op[4])

    for command, a_start, a_end, b_start, b_end in opcodes:
        # The equality comparator for links only tells us whether links were
        # "roughly" equal -- so two links that SequenceMatcher told us were the
        # same may internal differences we need to display (this is by design).
        if command == 'equal':
            # This is a bit tricky: there might be several roughly equal links
            # in a row, where some are exactly equal, but their order is
            # shuffled around. We want to match up all the exactly equal ones.
            a_remainders = []
            a_set = a[a_start:a_end]
            b_set = b[b_start:b_end]
            last_index = a_end - 1 - a_start
            for index, a_link in enumerate(a_set):
                # Look for a B version link that was exactly equal to the
                # current A version link.
                for b_index, b_link in enumerate(b_set):
                    if hash(a_link) == hash(b_link):
                        del b_set[b_index]
                        yield (0, b_link.json())
                        break
                # If we didn't find an exact match, set this A link aside.
                else:
                    a_remainders.append(a_link)

                # If we're at the end of the list or the next link is not
                # roughly equal, go back through the links that were set aside
                # and generate a sub-diff for each of them.
                if index == last_index or a_link != a_set[index + 1]:
                    for a_link in a_remainders:
                        # b_set may contain more items at the end that are not
                        # roughly equal (see how we disbalanced this group in
                        # the first pass over the opcodes above), so we need
                        # to check that the items match before diffing inside.
                        if b_set and b_set[0] == a_link:
                            b_link = b_set[0]
                            del b_set[0]
                            text_diff = compute_dmp_diff(a_link.text, b_link.text)
                            href_diff = compute_dmp_diff(a_link.href, b_link.href)
                            yield (100, {
                                'text': text_diff,
                                'href': href_diff,
                                'hrefs': (a_link.href, b_link.href)
                            })
                        else:
                            yield (-1, a_link.json())

                    while b_set and b_set[0] == a_link:
                        yield (1, b_set[0].json())
                        del b_set[0]

                    a_remainders.clear()

            # Handle any left over additions
            for b_link in b_set:
                yield (1, b_link.json())

        if (command == 'insert' or command == 'replace'):
            for link in b[b_start:b_end]:
                yield (1, link.json())

        if (command == 'delete' or command == 'replace'):
            for link in a[a_start:a_end]:
                yield (-1, link.json())


# HTML DIFF RENDERING -----------------------------------------------------

def _create_empty_soup(title=''):
    """
    Creates a Beautiful Soup document representing an empty HTML page.

    Parameters
    ----------
    title : string
        The new document's title.
    """
    return html5_parser.parse(f"""<!doctype html>
        <html>
            <head>
                <meta charset="utf-8">
                <title>{title}</title>
            </head>
            <body>
            </body>
        </html>
        """, treebuilder='soup', return_root=False)


def not_deleted(diff_item):
    return diff_item[0] >= 0


def not_inserted(diff_item):
    return diff_item[0] <= 0


def _html_for_text_diff(diff):
    return ''.join(map(_html_for_dmp_operation, diff))


def _tag(soup, name, attributes=None, *children):
    """
    Build tags in a quicker, more composable way. Also lets you use 'class'
    attributes without a lot of extra rigamarole.
    """
    tag = soup.new_tag(name)
    if attributes:
        for key, value in attributes.items():
            # Remove boolean attributes that are False
            if value is not None and value is not False:
                tag[key] = value

    for child in children:
        tag.append(child)

    return tag


def _tagger(soup):
    def tagger(*args, **kwargs):
        return _tag(soup, *args, **kwargs)
    return tagger


CHANGE_INFO = {
    -1:  {'symbol': '-', 'title': 'Deleted'},
    0:   {'symbol': '⚬', 'title': None},
    1:   {'symbol': '+', 'title': 'Added'},
    100: {'symbol': '±', 'title': 'Changed'},
}


def _table_row_for_link(soup, change_type, link):
    tag = _tagger(soup)
    row = tag('tr', {
        'class': 'links-list--item',
        'wm-has-insertions': change_type > 0,
        'wm-inserted': change_type == 1,
        'wm-has-deletions': change_type < 0 or change_type == 100,
        'wm-deleted': change_type == -1,
    })

    change = CHANGE_INFO[change_type] or CHANGE_INFO[0]
    row.append(tag('td', {
        'class': 'links-list--change-type',
        'title': change['title'],
    }, change['symbol']))

    text_cell = tag('td', {'class': 'links-list--text'})
    row.append(text_cell)
    if change_type == 100:
        text_insertions = filter(not_deleted, link['text'])
        text_cell.append(_html_for_text_diff(text_insertions))
        if len(link['text']) != 1:
            text_cell.append(tag('br'))
            text_deletions = filter(not_inserted, link['text'])
            text_cell.append(_html_for_text_diff(text_deletions))
    else:
        text_cell.append(link['text'])

    href_cell = tag('td', {'class': 'links-list--href'})
    row.append(href_cell)
    if change_type == 100:
        href_insertions = filter(not_deleted, link['href'])
        url_text = _html_for_text_diff(href_insertions)
        url = link['hrefs'][1]
        href_cell.append(tag('a', {'href': url}, f'({url_text})'))

        if link['hrefs'][0] != link['hrefs'][1]:
            href_cell.append(tag('br'))
            href_deletions = filter(not_inserted, link['href'])
            url_text = _html_for_text_diff(href_deletions)
            url = link['hrefs'][0]
            href_cell.append(tag('a', {'href': url}, f'({url_text})'))
    else:
        url = link['href']
        href_cell.append(tag('a', {'href': url}, f'({url})'))

    return row


def _render_html_diff(raw_diff):
    """
    Create a Beautiful Soup document representing a diff.

    Parameters
    ----------
    raw_diff : sequence
        The basic diff as a sequence of opcodes and links.
    """
    result = _create_empty_soup()
    tag = _tagger(result)
    result.body.append(
        tag('table', {'class': 'links-list'},
            tag('col', {'class': 'links-list--change-type-col'}),
            tag('col', {'class': 'links-list--text-col'}),
            tag('col', {'class': 'links-list--href-col'}),
            tag('thead', {},
                tag('tr', {},
                    tag('th'),
                    tag('th', {}, 'Link Text'),
                    tag('th', {}, 'URL'))),
            tag('tbody', {}, *(
                _table_row_for_link(result, code, link)
                for code, link in raw_diff))))

    return result