# Source code for omnipresence.web.html

# -*- test-case-name: omnipresence.test.test_html -*-
"""HTML parsing utility functions."""


from __future__ import unicode_literals

from bs4 import BeautifulSoup, NavigableString, Tag


#: The default parser to use for BeautifulSoup.
DEFAULT_BS4_PARSER = 'html.parser'


def parse(markup):
    """Build and return a `BeautifulSoup` tree for *markup*.

    The parser is always named explicitly (`DEFAULT_BS4_PARSER`) rather
    than left for Beautiful Soup to choose, which avoids warnings.
    """
    soup = BeautifulSoup(markup, features=DEFAULT_BS4_PARSER)
    return soup


#: mIRC-style format templates keyed by HTML tag name.  Tags that are not
#: listed here contribute their text without any added formatting.
_TEXTIFY_FORMATS = {
    # Bold.
    u'b': u'\x02{0}\x02',
    u'strong': u'\x02{0}\x02',
    # Reverse video, used for italic/underline-like elements.
    u'i': u'\x16{0}\x16',
    u'u': u'\x16{0}\x16',
    u'em': u'\x16{0}\x16',
    u'cite': u'\x16{0}\x16',
    u'var': u'\x16{0}\x16',
    # Plain-text approximations of superscript and subscript.
    u'sup': u'^{0}',
    u'sub': u'_{0}',
}


def textify(html, format_output=True):
    """Convert the contents of *html* to a Unicode string.

    *html* can be either a string containing HTML markup, or a Beautiful
    Soup tag object.  If *format_output* is true, mIRC-style formatting
    codes are added to simulate common element styles: ``\\x02`` around
    bold elements, ``\\x16`` around italic-like elements, a ``^`` prefix
    for ``<sup>`` and a ``_`` prefix for ``<sub>``.

    Returns the text with every run of whitespace collapsed to a single
    space and no leading or trailing whitespace.
    """
    # Already-parsed input is used as is; anything else is treated as
    # markup and parsed with the module's default parser.
    soup = html if isinstance(html, (BeautifulSoup, Tag)) else parse(html)

    def descend(node):
        """Return the formatted text of *node* and all its descendants."""
        # Strings are taken verbatim and child elements are rendered
        # recursively.  Joining produces a plain text object, so no
        # explicit ``unicode()`` conversion (a Python 2-only name) is
        # needed.
        # NOTE(review): the isinstance check also accepts NavigableString
        # subclasses (e.g. comments), while the unformatted path relies on
        # whatever ``.strings`` yields -- confirm the two are meant to
        # differ.
        text = u''.join(
            child if isinstance(child, NavigableString) else descend(child)
            for child in node.children)
        # Wrap the collected text in the formatting for this tag, if any.
        return _TEXTIFY_FORMATS.get(node.name, u'{0}').format(text)

    if format_output:
        text = descend(soup)
    else:
        text = u''.join(soup.strings)

    # Don't normalize whitespace until the very end, in order to avoid
    # misparsing constructs like <span>hello<b> world</b></span>.
    # Splitting and re-joining also drops leading/trailing whitespace.
    return u' '.join(text.split())