Posted 30 Apr 2009 and tagged python
from htmlentitydefs import name2codepoint as n2cp
import re
Pop out
1 2 | from htmlentitydefs import name2codepoint as n2cp import re |
def substitute_entity(match):
    """Return the character for one matched HTML entity.

    Intended as the replacement callback for the pattern
    ``&(#?)(x?)(\\w+);``, whose groups are: an optional ``#`` (numeric
    reference marker), an optional lower-case ``x`` (hex marker) and the
    remaining word characters.

    Handles decimal references (``&#39;``), hexadecimal references
    (``&#x27;`` / ``&#X27;``) and named entities (``&amp;``).  Anything
    that cannot be decoded -- unknown name, malformed number, code point
    out of range -- is returned unchanged instead of raising.
    """
    hash_mark, hex_mark, body = match.group(1), match.group(2), match.group(3)

    if hash_mark == "#":
        digits, base = body, 10
        if hex_mark == 'x':
            base = 16
        elif body[:1] in ('x', 'X'):
            # The pattern only captures a lower-case 'x' in group 2, so an
            # upper-case hex marker (or a lone 'x') ends up inside group 3.
            digits, base = body[1:], 16
        try:
            return unichr(int(digits, base))
        except (ValueError, OverflowError):
            # Not a number in this base, or not a valid code point.
            return match.group()

    # For named entities the optional 'x' group may have swallowed the first
    # letter of the name (e.g. "&xi;"), so glue it back on before the lookup.
    cp = n2cp.get(hex_mark + body)
    if cp:
        return unichr(cp)
    return match.group()
Pop out
1 2 3 4 5 6 7 8 9 10 11 12 13 | def substitute_entity(match): ent = match.group(3) if match.group(1) == "#": if match.group(2) == '': return unichr(int(ent)) elif match.group(2) == 'x': return unichr(int('0x'+ent, 16)) else: cp = n2cp.get(ent) if cp: return unichr(cp) else: return match.group() |
def decode_htmlentities(string):
    """Return *string* with its HTML entities replaced by characters.

    Every ``&...;`` sequence matched by the pattern below is handed to
    ``substitute_entity``, and whatever that callback returns is spliced
    into the result.
    """
    # Groups: optional '#', optional 'x', then the entity name or digits.
    pattern = re.compile(r'&(#?)(x?)(\w+);')
    return pattern.sub(substitute_entity, string)
Pop out
1 2 3 | def decode_htmlentities(string): entity_re = re.compile(r'&(#?)(x?)(\w+);') return entity_re.subn(substitute_entity, string)[0] |
Example usage:
print decode_htmlentities("l'eau")
Pop out
1 | print decode_htmlentities("l'eau") |
Source: http://snippets.dzone.com/posts/show/4569
(NB: this problem – and many, many others – can be more easily solved by BeautifulSoup, but if this is all you need to do, you might not want to pay the computational price.)