html

General functions for HTML manipulation.

View Source

  1"""
  2General functions for HTML manipulation.
  3"""
  4
  5import re as _re
  6from html.entities import html5 as _html5
  7
  8
  9__all__ = ['escape', 'unescape']
 10
 11
 12def escape(s, quote=True):
 13    """
 14    Replace special characters "&", "<" and ">" to HTML-safe sequences.
 15    If the optional flag quote is true (the default), the quotation mark
 16    characters, both double quote (") and single quote (') characters are also
 17    translated.
 18    """
 19    s = s.replace("&", "&amp;") # Must be done first!
 20    s = s.replace("<", "&lt;")
 21    s = s.replace(">", "&gt;")
 22    if quote:
 23        s = s.replace('"', "&quot;")
 24        s = s.replace('\'', "&#x27;")
 25    return s
 26
 27
 28# see https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
 29
 30_invalid_charrefs = {
 31    0x00: '\ufffd',  # REPLACEMENT CHARACTER
 32    0x0d: '\r',      # CARRIAGE RETURN
 33    0x80: '\u20ac',  # EURO SIGN
 34    0x81: '\x81',    # <control>
 35    0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
 36    0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
 37    0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
 38    0x85: '\u2026',  # HORIZONTAL ELLIPSIS
 39    0x86: '\u2020',  # DAGGER
 40    0x87: '\u2021',  # DOUBLE DAGGER
 41    0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
 42    0x89: '\u2030',  # PER MILLE SIGN
 43    0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
 44    0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 45    0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
 46    0x8d: '\x8d',    # <control>
 47    0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
 48    0x8f: '\x8f',    # <control>
 49    0x90: '\x90',    # <control>
 50    0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
 51    0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
 52    0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
 53    0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
 54    0x95: '\u2022',  # BULLET
 55    0x96: '\u2013',  # EN DASH
 56    0x97: '\u2014',  # EM DASH
 57    0x98: '\u02dc',  # SMALL TILDE
 58    0x99: '\u2122',  # TRADE MARK SIGN
 59    0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
 60    0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 61    0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
 62    0x9d: '\x9d',    # <control>
 63    0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
 64    0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
 65}
 66
 67_invalid_codepoints = {
 68    # 0x0001 to 0x0008
 69    0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
 70    # 0x000E to 0x001F
 71    0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
 72    0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
 73    # 0x007F to 0x009F
 74    0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
 75    0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
 76    0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
 77    # 0xFDD0 to 0xFDEF
 78    0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
 79    0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
 80    0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
 81    0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
 82    # others
 83    0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
 84    0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
 85    0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
 86    0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
 87    0x10fffe, 0x10ffff
 88}
 89
 90
 91def _replace_charref(s):
 92    s = s.group(1)
 93    if s[0] == '#':
 94        # numeric charref
 95        if s[1] in 'xX':
 96            num = int(s[2:].rstrip(';'), 16)
 97        else:
 98            num = int(s[1:].rstrip(';'))
 99        if num in _invalid_charrefs:
100            return _invalid_charrefs[num]
101        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
102            return '\uFFFD'
103        if num in _invalid_codepoints:
104            return ''
105        return chr(num)
106    else:
107        # named charref
108        if s in _html5:
109            return _html5[s]
110        # find the longest matching name (as defined by the standard)
111        for x in range(len(s)-1, 1, -1):
112            if s[:x] in _html5:
113                return _html5[s[:x]] + s[x:]
114        else:
115            return '&' + s
116
117
118_charref = _re.compile(r'&(#[0-9]+;?'
119                       r'|#[xX][0-9a-fA-F]+;?'
120                       r'|[^\t\n\f <&#;]{1,32};?)')
121
122def unescape(s):
123    """
124    Convert all named and numeric character references (e.g. &gt;, &#62;,
125    &x3e;) in the string s to the corresponding unicode characters.
126    This function uses the rules defined by the HTML 5 standard
127    for both valid and invalid character references, and the list of
128    HTML 5 named character references defined in html.entities.html5.
129    """
130    if '&' not in s:
131        return s
132    return _charref.sub(_replace_charref, s)

def escape(s, quote=True): View Source

def escape(s, quote=True):
    """
    Return s with the characters "&", "<" and ">" replaced by HTML-safe
    sequences.

    If the optional flag quote is true (the default), the double quote (")
    and single quote (') characters are replaced as well.
    """
    substitutions = [
        # "&" has to be handled first, otherwise the ampersands introduced by
        # the later replacements would themselves get escaped.
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
    ]
    if quote:
        substitutions += [('"', "&quot;"), ("'", "&#x27;")]
    for char, entity in substitutions:
        s = s.replace(char, entity)
    return s

Replace the special characters "&", "<" and ">" with HTML-safe sequences. If the optional flag quote is true (the default), the quotation mark characters, both double quote (") and single quote ('), are also translated.

def unescape(s): View Source

def unescape(s):
    """
    Return s with every named and numeric character reference (e.g. &gt;,
    &#62;, &#x3e;) replaced by the corresponding unicode character.

    Both valid and invalid references are handled according to the rules of
    the HTML 5 standard; named references are taken from
    html.entities.html5.
    """
    # Strings without "&" cannot contain a reference; hand them back as-is.
    return _charref.sub(_replace_charref, s) if '&' in s else s

Convert all named and numeric character references (e.g. `&gt;`, `&#62;`, `&#x3e;`) in the string s to the corresponding unicode characters. This function uses the rules defined by the HTML 5 standard for both valid and invalid character references, and the list of HTML 5 named character references defined in html.entities.html5.