From 6403bece675d170ace9a1715106f27d3bf8e8412 Mon Sep 17 00:00:00 2001
From: Jon Bergli Heier
Date: Mon, 25 Oct 2010 20:26:07 +0200
Subject: Added python3-feedparser.

---
 python3-feedparser/PKGBUILD      |   28 +
 python3-feedparser/python3.patch | 3359 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 3387 insertions(+)
 create mode 100644 python3-feedparser/PKGBUILD
 create mode 100644 python3-feedparser/python3.patch

diff --git a/python3-feedparser/PKGBUILD b/python3-feedparser/PKGBUILD
new file mode 100644
index 0000000..9b82e2e
--- /dev/null
+++ b/python3-feedparser/PKGBUILD
@@ -0,0 +1,28 @@
+# Based on the official PKGBUILD for python-feedparser.
+# Contributor: Jon Bergli Heier
+
+pkgname=python3-feedparser
+pkgver=4.2pre315
+_pkgver=4.1
+pkgrel=1
+pkgdesc="Universal Feed Parser for Python 3"
+arch=('any')
+url="http://bitbucket.org/puzzlet/feedparser-py3/"
+license=('custom')
+depends=('python' 'libxml2')
+source=(http://downloads.sourceforge.net/feedparser/feedparser-${_pkgver}.zip python3.patch)
+md5sums=('7ab1140c1e29d4cd52ab20fa7b1f8640'
+         '29fe3762cd3e2a97427001844ef2e772')
+
+build() {
+  cd ${srcdir}
+  patch -p0 -i python3.patch
+}
+
+package() {
+  cd ${srcdir}
+  python setup.py install --root=${pkgdir}
+  install -Dm644 LICENSE ${pkgdir}/usr/share/licenses/${pkgname}/license
+}
+
+# vim:set ts=2 sw=2 et:
diff --git a/python3-feedparser/python3.patch b/python3-feedparser/python3.patch
new file mode 100644
index 0000000..5af9afc
--- /dev/null
+++ b/python3-feedparser/python3.patch
@@ -0,0 +1,3359 @@
+--- feedparser.py 2010-10-25 20:07:40.000000000 +0200
++++ python3-feedparser.py 2010-10-25 20:07:02.000000000 +0200
+@@ -6,13 +6,12 @@
+ Visit http://feedparser.org/ for the latest version
+ Visit http://feedparser.org/docs/ for the latest documentation
+ 
+-Required: Python 2.1 or later
+-Recommended: Python 2.3 or later
+-Recommended: CJKCodecs and iconv_codec
++Required: Python 3.0 or later
++Recommended: Python 3.1 or later
+ """
+ 
+-__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
+-__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
++__version__ = "4.2-pre-" + "$Revision: 315 $"[11:14] + "-svn"
++__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+ 
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+@@ -39,7 +38,10 @@
+     "John Beimler ",
+     "Fazal Majid ",
+     "Aaron Swartz ",
+-    "Kevin Marks "]
++    "Kevin Marks ",
++    "Sam Ruby ",
++    "Ade Oshineye ",
++    "Puzzlet Chung "]
+ _debug = 0
+ 
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
+@@ -65,12 +67,18 @@
+ # if TIDY_MARKUP = 1
+ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+ 
++# If you want feedparser to automatically resolve all relative URIs, set this
++# to 1.
++RESOLVE_RELATIVE_URIS = 1
++
++# If you want feedparser to automatically sanitize all potentially unsafe
++# HTML content, set this to 1.
++SANITIZE_HTML = 1 ++ + # ---------- required modules (should come with any Python distribution) ---------- +-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 +-try: +- from cStringIO import StringIO as _StringIO +-except: +- from StringIO import StringIO as _StringIO ++import html.parser, re, sys, copy, time, email, types, cgi, urllib, urllib.request, urllib.error, urllib.parse ++from io import StringIO as _StringIO ++from io import BytesIO + + # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- + +@@ -95,28 +103,16 @@ + _XML_AVAILABLE = 1 + except: + _XML_AVAILABLE = 0 +- def _xmlescape(data): ++ def _xmlescape(data,entities={}): + data = data.replace('&', '&') + data = data.replace('>', '>') + data = data.replace('<', '<') ++ for char, entity in entities: ++ data = data.replace(char, entity) + return data + + # base64 support for Atom feeds that contain embedded binary data +-try: +- import base64, binascii +-except: +- base64 = binascii = None +- +-# cjkcodecs and iconv_codec provide support for more character encodings. +-# Both are available from http://cjkpython.i18n.org/ +-try: +- import cjkcodecs.aliases +-except: +- pass +-try: +- import iconv_codec +-except: +- pass ++import base64, binascii + + # chardet library auto-detects character encodings + # Download from http://chardet.feedparser.org/ +@@ -128,6 +124,18 @@ + except: + chardet = None + ++from html.entities import name2codepoint, codepoint2name ++ ++# BeautifulSoup parser used for parsing microformats from embedded HTML content ++# http://www.crummy.com/software/BeautifulSoup/ ++# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the ++# older 2.x series. If it doesn't, and you can figure out why, I'll accept a ++# patch and modify the compatibility statement accordingly. 
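# A minimal usage sketch (not part of the patch): how a caller might toggle
# the two module-level switches introduced above before parsing. It assumes
# the patched module is importable as 'feedparser' and that a file named
# 'feed.xml' exists; both names are illustrative assumptions.
import feedparser

feedparser.RESOLVE_RELATIVE_URIS = 0   # leave relative links untouched
feedparser.SANITIZE_HTML = 0           # keep potentially unsafe markup as-is

d = feedparser.parse('feed.xml')       # parse() also accepts URLs and raw strings
print(d.feed.get('title'), 'bozo =', d.get('bozo'))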
++try: ++ import BeautifulSoup ++except: ++ BeautifulSoup = None ++ + # ---------- don't touch these ---------- + class ThingsNobodyCaresAboutButMe(Exception): pass + class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass +@@ -135,9 +143,288 @@ + class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass + class UndeclaredNamespace(Exception): pass + +-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') +-sgmllib.special = re.compile(']*|' ++ '/([a-zA-Z][^<>]*)?|' ++ '![^<>]*)?') ++ ++entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') ++charref = re.compile('&#(\d+|x[0-9a-fA-F]+);') ++ ++starttagopen = re.compile('<[>a-zA-Z]') ++shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') ++shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') ++tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') ++attrfind = re.compile( ++ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' ++ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') ++ ++class EndBracketMatch: ++ endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') ++ def search(self,string,index=0): ++ self.match = self.endbracket.match(string,index) ++ if self.match: return self ++ def start(self,n): ++ return self.match.end(n) ++endbracket = EndBracketMatch() ++ ++class SGMLParser(html.parser.HTMLParser): ++ # Definition of entities -- derived classes may override ++ entity_or_charref = re.compile('&(?:' ++ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' ++ ')(;?)') ++ ++ def __init__(self, verbose=0): ++ """Initialize and reset this instance.""" ++ html.parser.HTMLParser.__init__(self) ++ self.verbose = verbose ++ self.reset() ++ ++ def reset(self): ++ """Reset this instance. Loses all unprocessed data.""" ++ self.__starttag_text = None ++ self.stack = [] ++ self.nomoretags = 0 ++ html.parser.HTMLParser.reset(self) ++ ++ # Internal -- handle data as far as reasonable. May leave state ++ # and data to be processed by a subsequent call. If 'end' is ++ # true, force handling all data as if followed by EOF marker. ++ def goahead(self, end): ++ rawdata = self.rawdata ++ i = 0 ++ n = len(rawdata) ++ while i < n: ++ if self.nomoretags: ++ self.handle_data(rawdata[i:n]) ++ i = n ++ break ++ match = self.interesting.search(rawdata, i) # < or & ++ if match: ++ j = match.start() ++ else: ++ j = n ++ if i < j: self.handle_data(rawdata[i:j]) ++ i = self.updatepos(i, j) ++ if i == n: break ++ startswith = rawdata.startswith ++ if startswith('<', i): ++ if starttagopen.match(rawdata, i): # < + letter ++ k = self.parse_starttag(i) ++ elif startswith("data ++ # XXX Can data contain &... (entity or char refs)? ++ # XXX Can data contain < or > (tag characters)? ++ # XXX Can there be whitespace before the first /? ++ match = shorttag.match(rawdata, i) ++ if not match: ++ return -1 ++ tag, data = match.group(1, 2) ++ self.__starttag_text = '<%s/' % tag ++ tag = tag.lower() ++ k = match.end(0) ++ self.finish_shorttag(tag, data) ++ self.__starttag_text = rawdata[start_pos:match.end(1) + 1] ++ return k ++ # XXX The following should skip matching quotes (' or ") ++ # As a shortcut way to exit, this isn't so bad, but shouldn't ++ # be used to locate the actual end of the start tag since the ++ # < or > characters may be embedded in an attribute value. 
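# A quick illustration (not part of the patch) of the caveat noted above:
# scanning for the first '>' is only a shortcut, because '>' may legally
# appear inside a quoted attribute value. The markup below is invented.
raw = '<a title="a > b" href="x">link</a>'
naive_end = raw.find('>')
print(raw[:naive_end + 1])   # '<a title="a >' -- cut off mid-attribute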
++ match = endbracket.search(rawdata, i+1) ++ if not match: ++ return -1 ++ j = match.start(0) ++ # Now parse the data between i+1 and j into a tag and attrs ++ attrs = [] ++ if rawdata[i:i+2] == '<>': ++ # SGML shorthand: <> == ++ k = j ++ tag = self.lasttag ++ else: ++ match = tagfind.match(rawdata, i+1) ++ if not match: ++ self.error('unexpected call to parse_starttag') ++ k = match.end(0) ++ tag = rawdata[i+1:k].lower() ++ self.lasttag = tag ++ while k < j: ++ match = attrfind.match(rawdata, k) ++ if not match: ++ break ++ attrname, rest, attrvalue = match.group(1, 2, 3) ++ if not rest: ++ attrvalue = attrname ++ elif attrvalue[:1] == "'" == attrvalue[-1:] or \ ++ attrvalue[:1] == '"' == attrvalue[-1:]: ++ attrvalue = attrvalue[1:-1] ++ attrvalue = self.entity_or_charref.sub(self._convert_ref, attrvalue) ++ attrs.append((attrname.lower(), attrvalue)) ++ k = match.end(0) ++ if rawdata[j] == '>': ++ j = j+1 ++ self.__starttag_text = rawdata[start_pos:j] ++ self.finish_starttag(tag, attrs) ++ return j ++ ++ # Internal -- convert entity or character reference ++ def _convert_ref(self, match): ++ if match.group(2): ++ return self.convert_charref(match.group(2)) or \ ++ '&#%s%s' % match.groups()[1:] ++ elif match.group(3): ++ return self.convert_entityref(match.group(1)) or \ ++ '&%s;' % match.group(1) ++ else: ++ return '&%s' % match.group(1) ++ ++ # Internal -- parse endtag ++ def parse_endtag(self, i): ++ rawdata = self.rawdata ++ match = endbracket.search(rawdata, i+1) ++ if not match: ++ return -1 ++ j = match.start(0) ++ tag = rawdata[i+2:j].strip().lower() ++ if rawdata[j] == '>': ++ j = j+1 ++ self.finish_endtag(tag) ++ return j ++ ++ # Internal -- finish parsing of data) ++ def finish_shorttag(self, tag, data): ++ self.finish_starttag(tag, []) ++ self.handle_data(data) ++ self.finish_endtag(tag) ++ ++ # Internal -- finish processing of start tag ++ # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag ++ def finish_starttag(self, tag, attrs): ++ method = getattr(self, 'start_' + tag, None) ++ if method: ++ self.stack.append(tag) ++ method(tag, attrs) ++ return 1 ++ method = getattr(self, 'do_' + tag, None) ++ if method: ++ method(tag, attrs) ++ return 0 ++ self.unknown_starttag(tag, attrs) ++ return -1 ++ ++ # Internal -- finish processing of end tag ++ def finish_endtag(self, tag): ++ if not tag: ++ found = len(self.stack) - 1 ++ if found < 0: ++ self.unknown_endtag(tag) ++ return ++ else: ++ if tag not in self.stack: ++ if getattr(self, 'end_' + tag, None): ++ self.report_unbalanced(tag) ++ else: ++ self.unknown_endtag(tag) ++ return ++ found = len(self.stack) ++ for i in range(found): ++ if self.stack[i] == tag: found = i ++ while len(self.stack) > found: ++ tag = self.stack[-1] ++ method = getattr(self, 'end_' + tag, self.unknown_endtag) ++ method(tag) ++ del self.stack[-1] ++ ++ # Example -- report an unbalanced tag. ++ def report_unbalanced(self, tag): ++ if self.verbose: ++ print('*** Unbalanced ') ++ print('*** Stack:', self.stack) ++ ++ def convert_charref(self, name): ++ """Convert character reference, may be overridden.""" ++ try: ++ n = int(name) ++ except ValueError: ++ return ++ if not 0 <= n <= 127: ++ return ++ return chr(n) ++ ++ # Definition of entities -- derived classes may override ++ entitydefs = \ ++ {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} ++ ++ def convert_entityref(self, name): ++ """Convert entity references. 
++ ++ As an alternative to overriding this method; one can tailor the ++ results by setting up the self.entitydefs mapping appropriately. ++ """ ++ return self.entitydefs.get(name, None) + + SUPPORTED_VERSIONS = {'': 'unknown', + 'rss090': 'RSS 0.90', +@@ -158,16 +445,7 @@ + 'hotrss': 'Hot RSS' + } + +-try: +- UserDict = dict +-except NameError: +- # Python 2.1 does not have dict +- from UserDict import UserDict +- def dict(aList): +- rc = {} +- for k, v in aList: +- rc[k] = v +- return rc ++UserDict = dict + + class FeedParserDict(UserDict): + keymap = {'channel': 'feed', +@@ -188,14 +466,21 @@ + def __getitem__(self, key): + if key == 'category': + return UserDict.__getitem__(self, 'tags')[0]['term'] ++ if key == 'enclosures': ++ norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) ++ return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] ++ if key == 'license': ++ for link in UserDict.__getitem__(self, 'links'): ++ if link['rel']=='license' and 'href' in link: ++ return link['href'] + if key == 'categories': + return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] + realkey = self.keymap.get(key, key) +- if type(realkey) == types.ListType: ++ if isinstance(realkey, list): + for k in realkey: +- if UserDict.has_key(self, k): ++ if super().__contains__(k): + return UserDict.__getitem__(self, k) +- if UserDict.has_key(self, key): ++ if super().__contains__(key): + return UserDict.__getitem__(self, key) + return UserDict.__getitem__(self, realkey) + +@@ -203,24 +488,24 @@ + for k in self.keymap.keys(): + if key == k: + key = self.keymap[k] +- if type(key) == types.ListType: ++ if isinstance(key, list): + key = key[0] + return UserDict.__setitem__(self, key, value) + + def get(self, key, default=None): +- if self.has_key(key): ++ if key in self: + return self[key] + else: + return default + + def setdefault(self, key, value): +- if not self.has_key(key): ++ if key not in self: + self[key] = value + return self[key] + +- def has_key(self, key): ++ def __contains__(self, key): + try: +- return hasattr(self, key) or UserDict.has_key(self, key) ++ return hasattr(self, key) or UserDict.__contains__(self, key) + except AttributeError: + return False + +@@ -233,7 +518,7 @@ + assert not key.startswith('_') + return self.__getitem__(key) + except: +- raise AttributeError, "object has no attribute '%s'" % key ++ raise AttributeError("object has no attribute '%s'" % key) + + def __setattr__(self, key, value): + if key.startswith('_') or key == 'data': +@@ -241,9 +526,6 @@ + else: + return self.__setitem__(key, value) + +- def __contains__(self, key): +- return self.has_key(key) +- + def zopeCompatibilityHack(): + global FeedParserDict + del FeedParserDict +@@ -275,15 +557,46 @@ + 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, + 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 + ) +- import string +- _ebcdic_to_ascii_map = string.maketrans( \ +- ''.join(map(chr, range(256))), ''.join(map(chr, emap))) ++ _ebcdic_to_ascii_map = bytes.maketrans(bytes(range(256)), bytes(emap)) + return s.translate(_ebcdic_to_ascii_map) ++ ++_cp1252 = { ++ chr(128): chr(8364), # euro sign ++ chr(130): chr(8218), # single low-9 quotation mark ++ chr(131): chr( 402), # latin small letter f with hook ++ chr(132): chr(8222), # double low-9 quotation mark ++ chr(133): chr(8230), # horizontal ellipsis ++ chr(134): chr(8224), # dagger ++ chr(135): chr(8225), # double dagger ++ chr(136): chr( 710), # 
modifier letter circumflex accent ++ chr(137): chr(8240), # per mille sign ++ chr(138): chr( 352), # latin capital letter s with caron ++ chr(139): chr(8249), # single left-pointing angle quotation mark ++ chr(140): chr( 338), # latin capital ligature oe ++ chr(142): chr( 381), # latin capital letter z with caron ++ chr(145): chr(8216), # left single quotation mark ++ chr(146): chr(8217), # right single quotation mark ++ chr(147): chr(8220), # left double quotation mark ++ chr(148): chr(8221), # right double quotation mark ++ chr(149): chr(8226), # bullet ++ chr(150): chr(8211), # en dash ++ chr(151): chr(8212), # em dash ++ chr(152): chr( 732), # small tilde ++ chr(153): chr(8482), # trade mark sign ++ chr(154): chr( 353), # latin small letter s with caron ++ chr(155): chr(8250), # single right-pointing angle quotation mark ++ chr(156): chr( 339), # latin small ligature oe ++ chr(158): chr( 382), # latin small letter z with caron ++ chr(159): chr( 376)} # latin capital letter y with diaeresis + + _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') + def _urljoin(base, uri): + uri = _urifixer.sub(r'\1\3', uri) +- return urlparse.urljoin(base, uri) ++ try: ++ return urllib.parse.urljoin(base, uri) ++ except: ++ uri = urllib.parse.urlunparse([urllib.quote(part) for part in urllib.parse.urlparse(uri)]) ++ return urllib.parse.urljoin(base, uri) + + class _FeedParserMixin: + namespaces = {'': '', +@@ -324,6 +637,8 @@ + 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', + 'http://purl.org/rss/1.0/modules/link/': 'l', + 'http://search.yahoo.com/mrss': 'media', ++ #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace ++ 'http://search.yahoo.com/mrss/': 'media', + 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', + 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', + 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', +@@ -337,6 +652,7 @@ + 'http://hacks.benhammersley.com/rss/streaming/': 'str', + 'http://purl.org/rss/1.0/modules/subscription/': 'sub', + 'http://purl.org/rss/1.0/modules/syndication/': 'sy', ++ 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf', + 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', + 'http://purl.org/rss/1.0/modules/threading/': 'thr', + 'http://purl.org/rss/1.0/modules/textinput/': 'ti', +@@ -344,12 +660,12 @@ + 'http://wellformedweb.org/commentAPI/': 'wfw', + 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', + 'http://www.w3.org/1999/xhtml': 'xhtml', +- 'http://www.w3.org/XML/1998/namespace': 'xml', +- 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf' ++ 'http://www.w3.org/1999/xlink': 'xlink', ++ 'http://www.w3.org/XML/1998/namespace': 'xml' + } + _matchnamespaces = {} + +- can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo'] ++ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'] + can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] + can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] + html_types = ['text/html', 'application/xhtml+xml'] +@@ -385,8 +701,10 @@ + self.langstack = [] + self.baseuri = baseuri or '' + self.lang = baselang or None ++ self.svgOK = 0 ++ self.hasTitle = 0 + if baselang: +- self.feeddata['language'] = baselang ++ self.feeddata['language'] = 
baselang.replace('_','-') + + def unknown_starttag(self, tag, attrs): + if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs)) +@@ -397,6 +715,11 @@ + # track xml:base and xml:lang + attrsD = dict(attrs) + baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri ++ if not isinstance(baseuri, str): ++ try: ++ baseuri = str(baseuri, self.encoding) ++ except: ++ baseuri = str(baseuri, 'iso-8859-1') + self.baseuri = _urljoin(self.baseuri, baseuri) + lang = attrsD.get('xml:lang', attrsD.get('lang')) + if lang == '': +@@ -407,7 +730,7 @@ + lang = self.lang + if lang: + if tag in ('feed', 'rss', 'rdf:RDF'): +- self.feeddata['language'] = lang ++ self.feeddata['language'] = lang.replace('_','-') + self.lang = lang + self.basestack.append(self.baseuri) + self.langstack.append(lang) +@@ -420,23 +743,23 @@ + self.trackNamespace(None, uri) + + # track inline content +- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): ++ if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'): ++ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 + # element declared itself as escaped markup, but it isn't really + self.contentparams['type'] = 'application/xhtml+xml' + if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': +- # Note: probably shouldn't simply recreate localname here, but +- # our namespace handling isn't actually 100% correct in cases where +- # the feed redefines the default namespace (which is actually +- # the usual case for inline content, thanks Sam), so here we +- # cheat and just reconstruct the element based on localname +- # because that compensates for the bugs in our namespace handling. +- # This will horribly munge inline content with non-empty qnames, +- # but nobody actually does that, so I'm not fixing it. 
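# A small demonstration (not part of the patch) of the xml:base tracking
# above: _urljoin() defers to urllib.parse.urljoin, so relative URIs get
# resolved against the current base. The URLs below are invented.
import urllib.parse

base = 'http://example.org/feed/'
print(urllib.parse.urljoin(base, 'entry/1'))    # http://example.org/feed/entry/1
print(urllib.parse.urljoin(base, '/logo.png'))  # http://example.org/logo.png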
+- tag = tag.split(':')[-1] +- return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0) ++ if tag.find(':') != -1: ++ prefix, tag = tag.split(':', 1) ++ namespace = self.namespacesInUse.get(prefix, '') ++ if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML': ++ attrs.append(('xmlns',namespace)) ++ if tag=='svg' and namespace=='http://www.w3.org/2000/svg': ++ attrs.append(('xmlns',namespace)) ++ if tag == 'svg': self.svgOK += 1 ++ return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) + + # match namespaces +- if tag.find(':') <> -1: ++ if tag.find(':') != -1: + prefix, suffix = tag.split(':', 1) + else: + prefix, suffix = '', tag +@@ -456,30 +779,41 @@ + method = getattr(self, methodname) + return method(attrsD) + except AttributeError: +- return self.push(prefix + suffix, 1) ++ # Since there's no handler or something has gone wrong we explicitly add the element and its attributes ++ unknown_tag = prefix + suffix ++ if len(attrsD) == 0: ++ # No attributes so merge it into the encosing dictionary ++ return self.push(unknown_tag, 1) ++ else: ++ # Has attributes so create it in its own dictionary ++ context = self._getContext() ++ context[unknown_tag] = attrsD + + def unknown_endtag(self, tag): + if _debug: sys.stderr.write('end %s\n' % tag) + # match namespaces +- if tag.find(':') <> -1: ++ if tag.find(':') != -1: + prefix, suffix = tag.split(':', 1) + else: + prefix, suffix = '', tag + prefix = self.namespacemap.get(prefix, prefix) + if prefix: + prefix = prefix + '_' ++ if suffix == 'svg' and self.svgOK: self.svgOK -= 1 + + # call special handler (if defined) or default handler + methodname = '_end_' + prefix + suffix + try: ++ if self.svgOK: raise AttributeError() + method = getattr(self, methodname) + method() + except AttributeError: + self.pop(prefix + suffix) + + # track inline content +- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): ++ if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'): + # element declared itself as escaped markup, but it isn't really ++ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 + self.contentparams['type'] = 'application/xhtml+xml' + if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': + tag = tag.split(':')[-1] +@@ -506,7 +840,7 @@ + c = int(ref[1:], 16) + else: + c = int(ref) +- text = unichr(c).encode('utf-8') ++ text = chr(c) + self.elementstack[-1][2].append(text) + + def handle_entityref(self, ref): +@@ -515,19 +849,14 @@ + if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) + if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): + text = '&%s;' % ref ++ elif ref in self.entities.keys(): ++ text = self.entities[ref] ++ if text.startswith('&#') and text.endswith(';'): ++ return self.handle_entityref(text) + else: +- # entity resolution graciously donated by Aaron Swartz +- def name2cp(k): +- import htmlentitydefs +- if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 +- return htmlentitydefs.name2codepoint[k] +- k = htmlentitydefs.entitydefs[k] +- if k.startswith('&#') and k.endswith(';'): +- return int(k[2:-1]) # not in latin-1 +- return ord(k) +- try: name2cp(ref) ++ try: name2codepoint[ref] + except KeyError: text = '&%s;' % ref +- else: text = unichr(name2cp(ref)).encode('utf-8') ++ else: text = chr(name2codepoint[ref]) + self.elementstack[-1][2].append(text) + + def 
handle_data(self, text, escape=1): +@@ -554,12 +883,19 @@ + if _debug: sys.stderr.write('entering parse_declaration\n') + if self.rawdata[i:i+9] == '', i) +- if k == -1: k = len(self.rawdata) ++ if k == -1: ++ # CDATA block began but didn't finish ++ k = len(self.rawdata) ++ return k + self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) + return k+3 + else: + k = self.rawdata.find('>', i) +- return k+1 ++ if k >= 0: ++ return k+1 ++ else: ++ # We have an incomplete CDATA block. ++ return k + + def mapContentType(self, contentType): + contentType = contentType.lower() +@@ -579,11 +915,11 @@ + self.version = 'rss10' + if loweruri == 'http://www.w3.org/2005/atom' and not self.version: + self.version = 'atom10' +- if loweruri.find('backend.userland.com/rss') <> -1: ++ if loweruri.find('backend.userland.com/rss') != -1: + # match any backend.userland.com namespace + uri = 'http://backend.userland.com/rss' + loweruri = uri +- if self._matchnamespaces.has_key(loweruri): ++ if loweruri in self._matchnamespaces: + self.namespacemap[prefix] = self._matchnamespaces[loweruri] + self.namespacesInUse[self._matchnamespaces[loweruri]] = uri + else: +@@ -595,6 +931,9 @@ + def decodeEntities(self, element, data): + return data + ++ def strattrs(self, attrs): ++ return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs]) ++ + def push(self, element, expectingText): + self.elementstack.append([element, expectingText, []]) + +@@ -603,6 +942,28 @@ + if self.elementstack[-1][0] != element: return + + element, expectingText, pieces = self.elementstack.pop() ++ ++ if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': ++ # remove enclosing child element, but only if it is a
<div> and
++            # only if all the remaining content is nested underneath it.
++            # This means that the divs would be retained in the following:
++            #    <div>foo</div><div>bar</div>
++            while pieces and len(pieces)>1 and not pieces[-1].strip():
++                del pieces[-1]
++            while pieces and len(pieces)>1 and not pieces[0].strip():
++                del pieces[0]
++            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>
': ++ depth = 0 ++ for piece in pieces[:-1]: ++ if piece.startswith(''): ++ depth += 1 ++ else: ++ pieces = pieces[1:-1] ++ ++ pieces = [s if isinstance(s, str) else s.encode(self.encoding) for s in pieces] + output = ''.join(pieces) + if stripWhitespace: + output = output.strip() +@@ -611,7 +972,7 @@ + # decode base64 content + if base64 and self.contentparams.get('base64', 0): + try: +- output = base64.decodestring(output) ++ output = base64.decodebytes(output.encode(self.encoding)).decode(self.encoding) + except binascii.Error: + pass + except binascii.Incomplete: +@@ -625,6 +986,9 @@ + if not self.contentparams.get('base64', 0): + output = self.decodeEntities(element, output) + ++ if self.lookslikehtml(output): ++ self.contentparams['type']='text/html' ++ + # remove temporary cruft from contentparams + try: + del self.contentparams['mode'] +@@ -635,25 +999,57 @@ + except KeyError: + pass + ++ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types + # resolve relative URIs within embedded markup +- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: ++ if is_htmlish and RESOLVE_RELATIVE_URIS: + if element in self.can_contain_relative_uris: +- output = _resolveRelativeURIs(output, self.baseuri, self.encoding) ++ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) ++ ++ # parse microformats ++ # (must do this before sanitizing because some microformats ++ # rely on elements that we sanitize) ++ if is_htmlish and element in ['content', 'description', 'summary']: ++ mfresults = _parseMicroformats(output, self.baseuri, self.encoding) ++ if mfresults: ++ for tag in mfresults.get('tags', []): ++ self._addTag(tag['term'], tag['scheme'], tag['label']) ++ for enclosure in mfresults.get('enclosures', []): ++ self._start_enclosure(enclosure) ++ for xfn in mfresults.get('xfn', []): ++ self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) ++ vcard = mfresults.get('vcard') ++ if vcard: ++ self._getContext()['vcard'] = vcard + + # sanitize embedded markup +- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: ++ if is_htmlish and SANITIZE_HTML: + if element in self.can_contain_dangerous_markup: +- output = _sanitizeHTML(output, self.encoding) ++ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) + +- if self.encoding and type(output) != type(u''): ++ if self.encoding and not isinstance(output, str): + try: +- output = unicode(output, self.encoding) ++ output = str(output, self.encoding) + except: + pass + ++ # address common error where people take data that is already ++ # utf-8, presume that it is iso-8859-1, and re-encode it. 
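# What the "common error" described above looks like in practice (example
# string invented, not from the patch): UTF-8 bytes mistakenly decoded as
# ISO-8859-1 produce mojibake, and the reverse round trip repairs it --
# the same str(output.encode('iso-8859-1'), 'utf-8') dance the code below
# performs.
mangled = 'café'.encode('utf-8').decode('iso-8859-1')    # 'cafÃ©'
repaired = mangled.encode('iso-8859-1').decode('utf-8')  # 'café'
assert repaired == 'café'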
++ if self.encoding=='utf-8' and isinstance(output, str): ++ try: ++ output = str(output.encode('iso-8859-1'), 'utf-8') ++ except: ++ pass ++ ++ # map win-1252 extensions to the proper code points ++ if isinstance(output, str): ++ output = ''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) ++ + # categories/tags/keywords/whatever are handled in _end_category + if element == 'category': + return output ++ ++ if element == 'title' and self.hasTitle: ++ return output + + # store output in appropriate place(s) + if self.inentry and not self.insource: +@@ -674,7 +1070,7 @@ + contentparams = copy.deepcopy(self.contentparams) + contentparams['value'] = output + self.entries[-1][element + '_detail'] = contentparams +- elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage): ++ elif (self.infeed or self.insource):# and (not self.intextinput) and (not self.inimage): + context = self._getContext() + if element == 'description': + element = 'subtitle' +@@ -689,6 +1085,7 @@ + + def pushContent(self, tag, attrsD, defaultContentType, expectingText): + self.incontent += 1 ++ if self.lang: self.lang=self.lang.replace('_','-') + self.contentparams = FeedParserDict({ + 'type': self.mapContentType(attrsD.get('type', defaultContentType)), + 'language': self.lang, +@@ -702,9 +1099,31 @@ + self.contentparams.clear() + return value + ++ # a number of elements in a number of RSS variants are nominally plain ++ # text, but this is routinely ignored. This is an attempt to detect ++ # the most common cases. As false positives often result in silent ++ # data loss, this function errs on the conservative side. ++ def lookslikehtml(self, str): ++ if self.version.startswith('atom'): return ++ if self.contentparams.get('type','text/html') != 'text/plain': return ++ ++ # must have a close tag or a entity reference to qualify ++ if not (re.search(r'',str) or re.search("&#?\w+;",str)): return ++ ++ # all tags must be in a restricted subset of valid HTML tags ++ if any(t.lower() not in _HTMLSanitizer.acceptable_elements for t in ++ re.findall(r' -1: ++ if colonpos != -1: + prefix = name[:colonpos] + suffix = name[colonpos+1:] + prefix = self.namespacemap.get(prefix, prefix) +@@ -748,7 +1167,10 @@ + '0.92': 'rss092', + '0.93': 'rss093', + '0.94': 'rss094'} +- if not self.version: ++ #If we're here then this is an RSS feed. ++ #If we don't have a version or have a version that starts with something ++ #other than RSS then there's been a mistake. Correct it. 
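# A standalone sketch of the lookslikehtml() heuristic defined above: text
# only qualifies as HTML if it contains a close tag or an entity reference.
# The close-tag pattern here is an assumption (simplified for illustration);
# the entity pattern is the one from the method, and the sample strings are
# invented.
import re
for s in ('plain text', 'a <b>bold</b> claim', 'fish &amp; chips'):
    print(s, '->', bool(re.search(r'</(\w+)>', s) or re.search(r'&#?\w+;', s)))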
++ if not self.version or not self.version.startswith('rss'): + attr_version = attrsD.get('version', '') + version = versionmap.get(attr_version) + if version: +@@ -767,11 +1189,11 @@ + _start_feedinfo = _start_channel + + def _cdf_common(self, attrsD): +- if attrsD.has_key('lastmod'): ++ if 'lastmod' in attrsD: + self._start_modified({}) + self.elementstack[-1][-1] = attrsD['lastmod'] + self._end_modified() +- if attrsD.has_key('href'): ++ if 'href' in attrsD: + self._start_link({}) + self.elementstack[-1][-1] = attrsD['href'] + self._end_link() +@@ -794,20 +1216,22 @@ + _end_feed = _end_channel + + def _start_image(self, attrsD): +- self.inimage = 1 +- self.push('image', 0) + context = self._getContext() + context.setdefault('image', FeedParserDict()) ++ self.inimage = 1 ++ self.hasTitle = 0 ++ self.push('image', 0) + + def _end_image(self): + self.pop('image') + self.inimage = 0 + + def _start_textinput(self, attrsD): +- self.intextinput = 1 +- self.push('textinput', 0) + context = self._getContext() + context.setdefault('textinput', FeedParserDict()) ++ self.intextinput = 1 ++ self.hasTitle = 0 ++ self.push('textinput', 0) + _start_textInput = _start_textinput + + def _end_textinput(self): +@@ -877,7 +1301,7 @@ + self._save_contributor('name', value) + elif self.intextinput: + context = self._getContext() +- context['textinput']['name'] = value ++ context['name'] = value + _end_itunes_name = _end_name + + def _start_width(self, attrsD): +@@ -891,7 +1315,7 @@ + value = 0 + if self.inimage: + context = self._getContext() +- context['image']['width'] = value ++ context['width'] = value + + def _start_height(self, attrsD): + self.push('height', 0) +@@ -904,7 +1328,7 @@ + value = 0 + if self.inimage: + context = self._getContext() +- context['image']['height'] = value ++ context['height'] = value + + def _start_url(self, attrsD): + self.push('href', 1) +@@ -917,12 +1341,6 @@ + self._save_author('href', value) + elif self.incontributor: + self._save_contributor('href', value) +- elif self.inimage: +- context = self._getContext() +- context['image']['href'] = value +- elif self.intextinput: +- context = self._getContext() +- context['textinput']['link'] = value + _end_homepage = _end_url + _end_uri = _end_url + +@@ -943,6 +1361,10 @@ + def _getContext(self): + if self.insource: + context = self.sourcedata ++ elif self.inimage and 'image' in self.feeddata: ++ context = self.feeddata['image'] ++ elif self.intextinput: ++ context = self.feeddata['textinput'] + elif self.inentry: + context = self.entries[-1] + else: +@@ -973,23 +1395,28 @@ + elif email: + context[key] = email + else: +- author = context.get(key) ++ author, email = context.get(key), None + if not author: return +- emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) +- if not emailmatch: return +- email = emailmatch.group(0) +- # probably a better way to do the following, but it passes all the tests +- author = author.replace(email, '') +- author = author.replace('()', '') +- author = author.strip() +- if author and (author[0] == '('): +- author = author[1:] +- if author and (author[-1] == ')'): +- author = author[:-1] +- author = author.strip() +- context.setdefault('%s_detail' % key, FeedParserDict()) +- context['%s_detail' % key]['name'] = author +- context['%s_detail' % key]['email'] = email ++ emailmatch = 
re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))(\?subject=\S+)?''', author) ++ if emailmatch: ++ email = emailmatch.group(0) ++ # probably a better way to do the following, but it passes all the tests ++ author = author.replace(email, '') ++ author = author.replace('()', '') ++ author = author.replace('<>', '') ++ author = author.replace('<>', '') ++ author = author.strip() ++ if author and (author[0] == '('): ++ author = author[1:] ++ if author and (author[-1] == ')'): ++ author = author[:-1] ++ author = author.strip() ++ if author or email: ++ context.setdefault('%s_detail' % key, FeedParserDict()) ++ if author: ++ context['%s_detail' % key]['name'] = author ++ if email: ++ context['%s_detail' % key]['email'] = email + + def _start_subtitle(self, attrsD): + self.pushContent('subtitle', attrsD, 'text/plain', 1) +@@ -1016,6 +1443,7 @@ + self.push('item', 0) + self.inentry = 1 + self.guidislink = 0 ++ self.hasTitle = 0 + id = self._getAttribute(attrsD, 'rdf:about') + if id: + context = self._getContext() +@@ -1089,25 +1517,41 @@ + self._save('expired_parsed', _parse_date(self.pop('expired'))) + + def _start_cc_license(self, attrsD): +- self.push('license', 1) ++ context = self._getContext() + value = self._getAttribute(attrsD, 'rdf:resource') +- if value: +- self.elementstack[-1][2].append(value) +- self.pop('license') ++ attrsD = FeedParserDict() ++ attrsD['rel']='license' ++ if value: attrsD['href']=value ++ context.setdefault('links', []).append(attrsD) + + def _start_creativecommons_license(self, attrsD): + self.push('license', 1) ++ _start_creativeCommons_license = _start_creativecommons_license + + def _end_creativecommons_license(self): +- self.pop('license') ++ value = self.pop('license') ++ context = self._getContext() ++ attrsD = FeedParserDict() ++ attrsD['rel']='license' ++ if value: attrsD['href']=value ++ context.setdefault('links', []).append(attrsD) ++ del context['license'] ++ _end_creativeCommons_license = _end_creativecommons_license + ++ def _addXFN(self, relationships, href, name): ++ context = self._getContext() ++ xfn = context.setdefault('xfn', []) ++ value = FeedParserDict({'relationships': relationships, 'href': href, 'name': name}) ++ if value not in xfn: ++ xfn.append(value) ++ + def _addTag(self, term, scheme, label): + context = self._getContext() + tags = context.setdefault('tags', []) + if (not term) and (not scheme) and (not label): return + value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) + if value not in tags: +- tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) ++ tags.append(value) + + def _start_category(self, attrsD): + if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD)) +@@ -1145,17 +1589,20 @@ + + def _start_link(self, attrsD): + attrsD.setdefault('rel', 'alternate') +- attrsD.setdefault('type', 'text/html') ++ if attrsD['rel'] == 'self': ++ attrsD.setdefault('type', 'application/atom+xml') ++ else: ++ attrsD.setdefault('type', 'text/html') ++ context = self._getContext() + attrsD = self._itsAnHrefDamnIt(attrsD) +- if attrsD.has_key('href'): ++ if 'href' in attrsD: + attrsD['href'] = self.resolveURI(attrsD['href']) ++ if attrsD.get('rel')=='enclosure' and not context.get('id'): ++ context['id'] = attrsD.get('href') + expectingText = self.infeed or self.inentry or self.insource +- context = self._getContext() + context.setdefault('links', []) + 
context['links'].append(FeedParserDict(attrsD)) +- if attrsD['rel'] == 'enclosure': +- self._start_enclosure(attrsD) +- if attrsD.has_key('href'): ++ if 'href' in attrsD: + expectingText = 0 + if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types): + context['link'] = attrsD['href'] +@@ -1166,10 +1613,6 @@ + def _end_link(self): + value = self.pop('link') + context = self._getContext() +- if self.intextinput: +- context['textinput']['link'] = value +- if self.inimage: +- context['image']['link'] = value + _end_producturl = _end_link + + def _start_guid(self, attrsD): +@@ -1178,34 +1621,39 @@ + + def _end_guid(self): + value = self.pop('id') +- self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) ++ self._save('guidislink', self.guidislink and 'link' not in self._getContext()) + if self.guidislink: + # guid acts as link, but only if 'ispermalink' is not present or is 'true', + # and only if the item doesn't already have a link element + self._save('link', value) + + def _start_title(self, attrsD): ++ if self.svgOK: return self.unknown_starttag('title', attrsD.items()) + self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) + _start_dc_title = _start_title + _start_media_title = _start_title + + def _end_title(self): ++ if self.svgOK: return + value = self.popContent('title') ++ if not value: return + context = self._getContext() +- if self.intextinput: +- context['textinput']['title'] = value +- elif self.inimage: +- context['image']['title'] = value ++ self.hasTitle = 1 + _end_dc_title = _end_title +- _end_media_title = _end_title ++ ++ def _end_media_title(self): ++ hasTitle = self.hasTitle ++ self._end_title() ++ self.hasTitle = hasTitle + + def _start_description(self, attrsD): + context = self._getContext() +- if context.has_key('summary'): ++ if 'summary' in context: + self._summaryKey = 'content' + self._start_content(attrsD) + else: + self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) ++ _start_dc_description = _start_description + + def _start_abstract(self, attrsD): + self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) +@@ -1215,13 +1663,9 @@ + self._end_content() + else: + value = self.popContent('description') +- context = self._getContext() +- if self.intextinput: +- context['textinput']['description'] = value +- elif self.inimage: +- context['image']['description'] = value + self._summaryKey = None + _end_abstract = _end_description ++ _end_dc_description = _end_description + + def _start_info(self, attrsD): + self.pushContent('info', attrsD, 'text/plain', 1) +@@ -1234,7 +1678,7 @@ + def _start_generator(self, attrsD): + if attrsD: + attrsD = self._itsAnHrefDamnIt(attrsD) +- if attrsD.has_key('href'): ++ if 'href' in attrsD: + attrsD['href'] = self.resolveURI(attrsD['href']) + self._getContext()['generator_detail'] = FeedParserDict(attrsD) + self.push('generator', 1) +@@ -1242,7 +1686,7 @@ + def _end_generator(self): + value = self.pop('generator') + context = self._getContext() +- if context.has_key('generator_detail'): ++ if 'generator_detail' in context: + context['generator_detail']['name'] = value + + def _start_admin_generatoragent(self, attrsD): +@@ -1262,7 +1706,7 @@ + + def _start_summary(self, attrsD): + context = self._getContext() +- if context.has_key('summary'): ++ if 'summary' in context: + self._summaryKey = 'content' + self._start_content(attrsD) + 
else: +@@ -1280,18 +1724,26 @@ + + def _start_enclosure(self, attrsD): + attrsD = self._itsAnHrefDamnIt(attrsD) +- self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD)) ++ context = self._getContext() ++ attrsD['rel']='enclosure' ++ context.setdefault('links', []).append(FeedParserDict(attrsD)) + href = attrsD.get('href') +- if href: +- context = self._getContext() +- if not context.get('id'): +- context['id'] = href ++ if href and not context.get('id'): ++ context['id'] = href + + def _start_source(self, attrsD): ++ if 'url' in attrsD: ++ # This means that we're processing a source element from an RSS 2.0 feed ++ self.sourcedata['href'] = attrsD['url'] ++ self.push('source', 1) + self.insource = 1 ++ self.hasTitle = 0 + + def _end_source(self): + self.insource = 0 ++ value = self.pop('source') ++ if value: ++ self.sourcedata['title'] = value + self._getContext()['source'] = copy.deepcopy(self.sourcedata) + self.sourcedata.clear() + +@@ -1318,6 +1770,7 @@ + value = self.popContent('content') + if copyToDescription: + self._save('description', value) ++ + _end_body = _end_content + _end_xhtml_body = _end_content + _end_content_encoded = _end_content +@@ -1337,6 +1790,33 @@ + value = self.pop('itunes_explicit', 0) + self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 + ++ def _start_media_content(self, attrsD): ++ context = self._getContext() ++ context.setdefault('media_content', []) ++ context['media_content'].append(attrsD) ++ ++ def _start_media_thumbnail(self, attrsD): ++ context = self._getContext() ++ context.setdefault('media_thumbnail', []) ++ self.push('url', 1) # new ++ context['media_thumbnail'].append(attrsD) ++ ++ def _end_media_thumbnail(self): ++ url = self.pop('url') ++ context = self._getContext() ++ if url != None and len(url.strip()) != 0: ++ if 'url' not in context['media_thumbnail'][-1]: ++ context['media_thumbnail'][-1]['url'] = url ++ ++ def _start_media_player(self, attrsD): ++ self.push('media_player', 0) ++ self._getContext()['media_player'] = FeedParserDict(attrsD) ++ ++ def _end_media_player(self): ++ value = self.pop('media_player') ++ context = self._getContext() ++ context['media_player']['content'] = value ++ + if _XML_AVAILABLE: + class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): + def __init__(self, baseuri, baselang, encoding): +@@ -1345,14 +1825,17 @@ + _FeedParserMixin.__init__(self, baseuri, baselang, encoding) + self.bozo = 0 + self.exc = None ++ self.decls = {} + + def startPrefixMapping(self, prefix, uri): + self.trackNamespace(prefix, uri) ++ if uri == 'http://www.w3.org/1999/xlink': ++ self.decls['xmlns:'+prefix] = uri + + def startElementNS(self, name, qname, attrs): + namespace, localname = name + lowernamespace = str(namespace or '').lower() +- if lowernamespace.find('backend.userland.com/rss') <> -1: ++ if lowernamespace.find('backend.userland.com/rss') != -1: + # match any backend.userland.com namespace + namespace = 'http://backend.userland.com/rss' + lowernamespace = namespace +@@ -1361,12 +1844,9 @@ + else: + givenprefix = None + prefix = self._matchnamespaces.get(lowernamespace, givenprefix) +- if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix): +- raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix +- if prefix: +- localname = prefix + ':' + localname ++ if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and givenprefix not in 
self.namespacesInUse: ++ raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix) + localname = str(localname).lower() +- if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) + + # qname implementation is horribly broken in Python 2.1 (it + # doesn't report any), and slightly broken in Python 2.2 (it +@@ -1375,7 +1855,21 @@ + # the qnames the SAX parser gives us (if indeed it gives us any + # at all). Thanks to MatejC for helping me test this and + # tirelessly telling me that it didn't work yet. +- attrsD = {} ++ attrsD, self.decls = self.decls, {} ++ if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': ++ attrsD['xmlns']=namespace ++ if localname=='svg' and namespace=='http://www.w3.org/2000/svg': ++ attrsD['xmlns']=namespace ++ ++ if prefix: ++ localname = prefix.lower() + ':' + localname ++ elif namespace and not qname: #Expat ++ for name,value in self.namespacesInUse.items(): ++ if name and value == namespace: ++ localname = name + ':' + localname ++ break ++ if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname)) ++ + for (namespace, attrlocalname), attrvalue in attrs._attrs.items(): + lowernamespace = (namespace or '').lower() + prefix = self._matchnamespaces.get(lowernamespace, '') +@@ -1384,7 +1878,7 @@ + attrsD[str(attrlocalname).lower()] = attrvalue + for qname in attrs.getQNames(): + attrsD[str(qname).lower()] = attrs.getValueByQName(qname) +- self.unknown_starttag(localname, attrsD.items()) ++ self.unknown_starttag(localname, list(attrsD.items())) + + def characters(self, text): + self.handle_data(text) +@@ -1399,29 +1893,40 @@ + prefix = self._matchnamespaces.get(lowernamespace, givenprefix) + if prefix: + localname = prefix + ':' + localname ++ elif namespace and not qname: #Expat ++ for name,value in self.namespacesInUse.items(): ++ if name and value == namespace: ++ localname = name + ':' + localname ++ break + localname = str(localname).lower() + self.unknown_endtag(localname) + + def error(self, exc): + self.bozo = 1 + self.exc = exc +- ++ + def fatalError(self, exc): + self.error(exc) + raise exc + +-class _BaseHTMLProcessor(sgmllib.SGMLParser): +- elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', +- 'img', 'input', 'isindex', 'link', 'meta', 'param'] +- +- def __init__(self, encoding): ++class _BaseHTMLProcessor(SGMLParser): ++ special = re.compile('''[<>'"]''') ++ bare_ampersand = re.compile("&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)") ++ elements_no_end_tag = [ ++ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', ++ 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', ++ 'source', 'track', 'wbr' ++ ] ++ ++ def __init__(self, encoding, type): + self.encoding = encoding ++ self.type = type + if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding) +- sgmllib.SGMLParser.__init__(self) +- ++ SGMLParser.__init__(self) ++ + def reset(self): + self.pieces = [] +- sgmllib.SGMLParser.reset(self) ++ SGMLParser.reset(self) + + def _shorttag_replace(self, match): + tag = match.group(1) +@@ -1429,21 +1934,29 @@ + return '<' + tag + ' />' + else: + return '<' + tag + '>' +- ++ ++ def parse_starttag(self,i): ++ j=SGMLParser.parse_starttag(self, i) ++ if 
self.type == 'application/xhtml+xml': ++ if j>2 and self.rawdata[j-2:j]=='/>': ++ self.unknown_endtag(self.lasttag) ++ return j ++ + def feed(self, data): + data = re.compile(r'', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace +- data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data) ++ data = re.sub(r'<([^<>\s]+?)\s*/>', self._shorttag_replace, data) + data = data.replace(''', "'") + data = data.replace('"', '"') +- if self.encoding and type(data) == type(u''): +- data = data.encode(self.encoding) +- sgmllib.SGMLParser.feed(self, data) ++ SGMLParser.feed(self, data) ++ SGMLParser.close(self) + + def normalize_attrs(self, attrs): ++ if not attrs: return attrs + # utility method to be called by descendants +- attrs = [(k.lower(), v) for k, v in attrs] +- attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs] ++ attrs = dict([(k.lower(), v) for k, v in attrs]).items() ++ attrs = [(k, v.lower() if k i