-rw-r--r--   python3-feedparser/PKGBUILD      |   28
-rw-r--r--   python3-feedparser/python3.patch | 3359
2 files changed, 3387 insertions, 0 deletions
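The recipe added below does two things at build time: apply python3.patch on top of the vanilla feedparser 4.1 sources, then install the patched module with distutils. A rough manual equivalent of its build() and package() steps, assuming, as `patch -p0` implies, that feedparser-4.1.zip unpacks feedparser.py and setup.py directly into the working directory (the unzip step and the $pkgdir value are stand-ins for what makepkg normally provides):

    # hypothetical manual replay of build() and package() from the PKGBUILD below
    unzip feedparser-4.1.zip                   # feedparser.py and setup.py land in .
    patch -p0 -i python3.patch                 # rewrite feedparser.py for Python 3
    python setup.py install --root="$pkgdir"   # stage the module under a package root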
diff --git a/python3-feedparser/PKGBUILD b/python3-feedparser/PKGBUILD
new file mode 100644
index 0000000..9b82e2e
--- /dev/null
+++ b/python3-feedparser/PKGBUILD
@@ -0,0 +1,28 @@
+# Based on the official PKGBUILD for python-feedparser.
+# Contributor: Jon Bergli Heier <snakebite@jvnv.net>
+
+pkgname=python3-feedparser
+pkgver=4.2pre315
+_pkgver=4.1
+pkgrel=1
+pkgdesc="Universal Feed Parser for Python 3"
+arch=('any')
+url="http://bitbucket.org/puzzlet/feedparser-py3/"
+license=('custom')
+depends=('python' 'libxml2' )
+source=(http://downloads.sourceforge.net/feedparser/feedparser-${_pkgver}.zip python3.patch)
+md5sums=('7ab1140c1e29d4cd52ab20fa7b1f8640'
+         '29fe3762cd3e2a97427001844ef2e772')
+
+build() {
+  cd ${srcdir}
+  patch -p0 -i python3.patch
+}
+
+package() {
+  cd ${srcdir}
+  python setup.py install --root=${pkgdir}
+  install -Dm644 LICENSE ${pkgdir}/usr/share/licenses/${pkgname}/license
+}
+
+# vim:set ts=2 sw=2 et:
diff --git a/python3-feedparser/python3.patch b/python3-feedparser/python3.patch
new file mode 100644
index 0000000..5af9afc
--- /dev/null
+++ b/python3-feedparser/python3.patch
@@ -0,0 +1,3359 @@
+--- feedparser.py	2010-10-25 20:07:40.000000000 +0200
++++ python3-feedparser.py	2010-10-25 20:07:02.000000000 +0200
+@@ -6,13 +6,12 @@
+ Visit http://feedparser.org/ for the latest version
+ Visit http://feedparser.org/docs/ for the latest documentation
+ 
+-Required: Python 2.1 or later
+-Recommended: Python 2.3 or later
+-Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
++Required: Python 3.0 or later
++Recommended: Python 3.1 or later
+ """
+ 
+-__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
+-__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
++__version__ = "4.2-pre-" + "$Revision: 315 $"[11:14] + "-svn"
++__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+ 
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+@@ -39,7 +38,10 @@
+                     "John Beimler <http://john.beimler.org/>",
+                     "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+                     "Aaron Swartz <http://aaronsw.com/>",
+-                    "Kevin Marks <http://epeus.blogspot.com/>"]
++                    "Kevin Marks <http://epeus.blogspot.com/>",
++                    "Sam Ruby <http://intertwingly.net/>",
++                    "Ade Oshineye <http://blog.oshineye.com/>",
++                    "Puzzlet Chung <http://puzzlet.org/>"]
+ _debug = 0
+ 
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
+@@ -65,12 +67,18 @@
+ # if TIDY_MARKUP = 1
+ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+ 
++# If you want feedparser to automatically resolve all relative URIs, set this
++# to 1.
++RESOLVE_RELATIVE_URIS = 1
++
++# If you want feedparser to automatically sanitize all potentially unsafe
++# HTML content, set this to 1.
++SANITIZE_HTML = 1
++
+ # ---------- required modules (should come with any Python distribution) ----------
+-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
+-try:
+-    from cStringIO import StringIO as _StringIO
+-except:
+-    from StringIO import StringIO as _StringIO
++import html.parser, re, sys, copy, time, email, types, cgi, urllib, urllib.request, urllib.error, urllib.parse
++from io import StringIO as _StringIO
++from io import BytesIO
+ 
+ # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+ 
+@@ -95,28 +103,16 @@
+     _XML_AVAILABLE = 1
+ except:
+     _XML_AVAILABLE = 0
+-    def _xmlescape(data):
++    def _xmlescape(data,entities={}):
+         data = data.replace('&', '&amp;')
+         data = data.replace('>', '&gt;')
+         data = data.replace('<', '&lt;')
++        for char, entity in entities:
++            data = data.replace(char, entity)
+         return data
+ 
+ # base64 support for Atom feeds that contain embedded binary data
+-try:
+-    import base64, binascii
+-except:
+-    base64 = binascii = None
+-
+-# cjkcodecs and iconv_codec provide support for more character encodings.
+-# Both are available from http://cjkpython.i18n.org/
+-try:
+-    import cjkcodecs.aliases
+-except:
+-    pass
+-try:
+-    import iconv_codec
+-except:
+-    pass
++import base64, binascii
+ 
+ # chardet library auto-detects character encodings
+ # Download from http://chardet.feedparser.org/
+@@ -128,6 +124,18 @@
+ except:
+     chardet = None
+ 
++from html.entities import name2codepoint, codepoint2name
++
++# BeautifulSoup parser used for parsing microformats from embedded HTML content
++# http://www.crummy.com/software/BeautifulSoup/
++# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
++# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a
++# patch and modify the compatibility statement accordingly.
++try:
++    import BeautifulSoup
++except:
++    BeautifulSoup = None
++
+ # ---------- don't touch these ----------
+ class ThingsNobodyCaresAboutButMe(Exception): pass
+ class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+@@ -135,9 +143,288 @@
+ class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+ class UndeclaredNamespace(Exception): pass
+ 
+-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+-sgmllib.special = re.compile('<!')
+-sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
++incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
++                        '<([a-zA-Z][^<>]*|'
++                        '/([a-zA-Z][^<>]*)?|'
++                        '![^<>]*)?')
++
++entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
++charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
++
++starttagopen = re.compile('<[>a-zA-Z]')
++shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
++shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
++tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
++attrfind = re.compile(
++    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
++    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
++
++class EndBracketMatch:
++    endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
++    def search(self,string,index=0):
++        self.match = self.endbracket.match(string,index)
++        if self.match: return self
++    def start(self,n):
++        return self.match.end(n)
++endbracket = EndBracketMatch()
++
++class SGMLParser(html.parser.HTMLParser):
++    # Definition of entities -- derived classes may override
++    entity_or_charref = re.compile('&(?:'
++        '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
++        ')(;?)')
++
++    def __init__(self, verbose=0):
++        """Initialize and reset this instance."""
++        html.parser.HTMLParser.__init__(self)
++        self.verbose = verbose
++        self.reset()
++
++    def reset(self):
++        """Reset this instance.  Loses all unprocessed data."""
++        self.__starttag_text = None
++        self.stack = []
++        self.nomoretags = 0
++        html.parser.HTMLParser.reset(self)
++
++    # Internal -- handle data as far as reasonable.  May leave state
++    # and data to be processed by a subsequent call.  If 'end' is
++    # true, force handling all data as if followed by EOF marker.
++    def goahead(self, end):
++        rawdata = self.rawdata
++        i = 0
++        n = len(rawdata)
++        while i < n:
++            if self.nomoretags:
++                self.handle_data(rawdata[i:n])
++                i = n
++                break
++            match = self.interesting.search(rawdata, i) # < or &
++            if match:
++                j = match.start()
++            else:
++                j = n
++            if i < j: self.handle_data(rawdata[i:j])
++            i = self.updatepos(i, j)
++            if i == n: break
++            startswith = rawdata.startswith
++            if startswith('<', i):
++                if starttagopen.match(rawdata, i): # < + letter
++                    k = self.parse_starttag(i)
++                elif startswith("</", i):
++                    k = self.parse_endtag(i)
++                elif startswith("<!--", i):
++                    k = self.parse_comment(i)
++                elif startswith("<?", i):
++                    k = self.parse_pi(i)
++                elif startswith("<!", i):
++                    k = self.parse_declaration(i)
++                elif (i + 1) < n:
++                    self.handle_data("<")
++                    k = i + 1
++                else:
++                    break
++                if k < 0:
++                    if end:
++                        pass #self.error("EOF in middle of construct")
++                    break
++                i = self.updatepos(i, k)
++                continue
++            elif startswith('&', i):
++                match = charref.match(rawdata, i)
++                if match:
++                    name = match.group(1)
++                    self.handle_charref(name)
++                    k = match.end()
++                    if not startswith(';', k-1):
++                        k = k - 1
++                    i = self.updatepos(i, k)
++                    continue
++                match = entityref.match(rawdata, i)
++                if match:
++                    name = match.group(1)
++                    self.handle_entityref(name)
++                    k = match.end()
++                    if not startswith(';', k-1):
++                        k = k - 1
++                    i = self.updatepos(i, k)
++                    continue
++            else:
++                self.error('neither < nor & ??')
++            # We get here only if incomplete matches but
++            # nothing else
++            match = incomplete.match(rawdata, i)
++            if not match:
++                self.handle_data(rawdata[i])
++                i = i+1
++                continue
++            j = match.end(0)
++            if j == n:
++                break # Really incomplete
++            self.handle_data(rawdata[i:j])
++            i = j
++        # end while
++        if end and i < n:
++            self.handle_data(rawdata[i:n])
++            i = self.updatepos(i, n)
++        self.rawdata = rawdata[i:]
++        # XXX if end: check for empty stack
++
++    # Internal -- handle starttag, return length or -1 if not terminated
++    def parse_starttag(self, i):
++        self.__starttag_text = None
++        start_pos = i
++        rawdata = self.rawdata
++        if shorttagopen.match(rawdata, i):
++            # SGML shorthand: <tag/data/ == <tag>data</tag>
++            # XXX Can data contain &... (entity or char refs)?
++            # XXX Can data contain < or > (tag characters)?
++            # XXX Can there be whitespace before the first /?
++            match = shorttag.match(rawdata, i)
++            if not match:
++                return -1
++            tag, data = match.group(1, 2)
++            self.__starttag_text = '<%s/' % tag
++            tag = tag.lower()
++            k = match.end(0)
++            self.finish_shorttag(tag, data)
++            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
++            return k
++        # XXX The following should skip matching quotes (' or ")
++        # As a shortcut way to exit, this isn't so bad, but shouldn't
++        # be used to locate the actual end of the start tag since the
++        # < or > characters may be embedded in an attribute value.
++        match = endbracket.search(rawdata, i+1)
++        if not match:
++            return -1
++        j = match.start(0)
++        # Now parse the data between i+1 and j into a tag and attrs
++        attrs = []
++        if rawdata[i:i+2] == '<>':
++            # SGML shorthand: <> == <last open tag seen>
++            k = j
++            tag = self.lasttag
++        else:
++            match = tagfind.match(rawdata, i+1)
++            if not match:
++                self.error('unexpected call to parse_starttag')
++            k = match.end(0)
++            tag = rawdata[i+1:k].lower()
++            self.lasttag = tag
++        while k < j:
++            match = attrfind.match(rawdata, k)
++            if not match:
++                break
++            attrname, rest, attrvalue = match.group(1, 2, 3)
++            if not rest:
++                attrvalue = attrname
++            elif attrvalue[:1] == "'" == attrvalue[-1:] or \
++                 attrvalue[:1] == '"' == attrvalue[-1:]:
++                attrvalue = attrvalue[1:-1]
++                attrvalue = self.entity_or_charref.sub(self._convert_ref, attrvalue)
++            attrs.append((attrname.lower(), attrvalue))
++            k = match.end(0)
++        if rawdata[j] == '>':
++            j = j+1
++        self.__starttag_text = rawdata[start_pos:j]
++        self.finish_starttag(tag, attrs)
++        return j
++
++    # Internal -- convert entity or character reference
++    def _convert_ref(self, match):
++        if match.group(2):
++            return self.convert_charref(match.group(2)) or \
++                '&#%s%s' % match.groups()[1:]
++        elif match.group(3):
++            return self.convert_entityref(match.group(1)) or \
++                '&%s;' % match.group(1)
++        else:
++            return '&%s' % match.group(1)
++
++    # Internal -- parse endtag
++    def parse_endtag(self, i):
++        rawdata = self.rawdata
++        match = endbracket.search(rawdata, i+1)
++        if not match:
++            return -1
++        j = match.start(0)
++        tag = rawdata[i+2:j].strip().lower()
++        if rawdata[j] == '>':
++            j = j+1
++        self.finish_endtag(tag)
++        return j
++
++    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
++    def finish_shorttag(self, tag, data):
++        self.finish_starttag(tag, [])
++        self.handle_data(data)
++        self.finish_endtag(tag)
++
++    # Internal -- finish processing of start tag
++    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
++    def finish_starttag(self, tag, attrs):
++        method = getattr(self, 'start_' + tag, None)
++        if method:
++            self.stack.append(tag)
++            method(tag, attrs)
++            return 1
++        method = getattr(self, 'do_' + tag, None)
++        if method:
++            method(tag, attrs)
++            return 0
++        self.unknown_starttag(tag, attrs)
++        return -1
++
++    # Internal -- finish processing of end tag
++    def finish_endtag(self, tag):
++        if not tag:
++            found = len(self.stack) - 1
++            if found < 0:
++                self.unknown_endtag(tag)
++                return
++        else:
++            if tag not in self.stack:
++                if getattr(self, 'end_' + tag, None):
++                    self.report_unbalanced(tag)
++                else:
++                    self.unknown_endtag(tag)
++                return
++            found = len(self.stack)
++            for i in range(found):
++                if self.stack[i] == tag: found = i
++        while len(self.stack) > found:
++            tag = self.stack[-1]
++            method = getattr(self, 'end_' + tag, self.unknown_endtag)
++            method(tag)
++            del self.stack[-1]
++
++    # Example -- report an unbalanced </...> tag.
++    def report_unbalanced(self, tag):
++        if self.verbose:
++            print('*** Unbalanced </' + tag + '>')
++            print('*** Stack:', self.stack)
++
++    def convert_charref(self, name):
++        """Convert character reference, may be overridden."""
++        try:
++            n = int(name)
++        except ValueError:
++            return
++        if not 0 <= n <= 127:
++            return
++        return chr(n)
++
++    # Definition of entities -- derived classes may override
++    entitydefs = \
++            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
++
++    def convert_entityref(self, name):
++        """Convert entity references.
++
++        As an alternative to overriding this method; one can tailor the
++        results by setting up the self.entitydefs mapping appropriately.
++        """
++        return self.entitydefs.get(name, None)
+ 
+ SUPPORTED_VERSIONS = {'': 'unknown',
+                       'rss090': 'RSS 0.90',
+@@ -158,16 +445,7 @@
+                       'hotrss': 'Hot RSS'
+                       }
+ 
+-try:
+-    UserDict = dict
+-except NameError:
+-    # Python 2.1 does not have dict
+-    from UserDict import UserDict
+-    def dict(aList):
+-        rc = {}
+-        for k, v in aList:
+-            rc[k] = v
+-        return rc
++UserDict = dict
+ 
+ class FeedParserDict(UserDict):
+     keymap = {'channel': 'feed',
+@@ -188,14 +466,21 @@
+     def __getitem__(self, key):
+         if key == 'category':
+             return UserDict.__getitem__(self, 'tags')[0]['term']
++        if key == 'enclosures':
++            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
++            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
++        if key == 'license':
++            for link in UserDict.__getitem__(self, 'links'):
++                if link['rel']=='license' and 'href' in link:
++                    return link['href']
+         if key == 'categories':
+             return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
+         realkey = self.keymap.get(key, key)
+-        if type(realkey) == types.ListType:
++        if isinstance(realkey, list):
+             for k in realkey:
+-                if UserDict.has_key(self, k):
++                if super().__contains__(k):
+                     return UserDict.__getitem__(self, k)
+-        if UserDict.has_key(self, key):
++        if super().__contains__(key):
+             return UserDict.__getitem__(self, key)
+         return UserDict.__getitem__(self, realkey)
+ 
+@@ -203,24 +488,24 @@
+         for k in self.keymap.keys():
+             if key == k:
+                 key = self.keymap[k]
+-                if type(key) == types.ListType:
++                if isinstance(key, list):
+                     key = key[0]
+         return UserDict.__setitem__(self, key, value)
+ 
+     def get(self, key, default=None):
+-        if self.has_key(key):
++        if key in self:
+             return self[key]
+         else:
+             return default
+ 
+     def setdefault(self, key, value):
+-        if not self.has_key(key):
++        if key not in self:
+             self[key] = value
+         return self[key]
+ 
+-    def has_key(self, key):
++    def __contains__(self, key):
+         try:
+-            return hasattr(self, key) or UserDict.has_key(self, key)
++            return hasattr(self, key) or UserDict.__contains__(self, key)
+         except AttributeError:
+             return False
+ 
+@@ -233,7 +518,7 @@
+             assert not key.startswith('_')
+             return self.__getitem__(key)
+         except:
+-            raise AttributeError, "object has no attribute '%s'" % key
++            raise AttributeError("object has no attribute '%s'" % key)
+ 
+     def __setattr__(self, key, value):
+         if key.startswith('_') or key == 'data':
+@@ -241,9 +526,6 @@
+         else:
+             return self.__setitem__(key, value)
+ 
+-    def __contains__(self, key):
+-        return self.has_key(key)
+-
+ def zopeCompatibilityHack():
+     global FeedParserDict
+     del FeedParserDict
+@@ -275,15 +557,46 @@
+         92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+         48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+         )
+-    import string
+-    _ebcdic_to_ascii_map = string.maketrans( \
+-        ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
++    _ebcdic_to_ascii_map = bytes.maketrans(bytes(range(256)), bytes(emap))
+     return s.translate(_ebcdic_to_ascii_map)
++
++_cp1252 = {
++  chr(128): chr(8364), # euro sign
++  chr(130): chr(8218), # single low-9 quotation mark
++  chr(131): chr( 402), # latin small letter f with hook
++  chr(132): chr(8222), # double low-9 quotation mark
++  chr(133): chr(8230), # horizontal ellipsis
++  chr(134): chr(8224), # dagger
++  chr(135): chr(8225), # double dagger
++  chr(136): chr( 710), # modifier letter circumflex accent
++  chr(137): chr(8240), # per mille sign
++  chr(138): chr( 352), # latin capital letter s with caron
++  chr(139): chr(8249), # single left-pointing angle quotation mark
++  chr(140): chr( 338), # latin capital ligature oe
++  chr(142): chr( 381), # latin capital letter z with caron
++  chr(145): chr(8216), # left single quotation mark
++  chr(146): chr(8217), # right single quotation mark
++  chr(147): chr(8220), # left double quotation mark
++  chr(148): chr(8221), # right double quotation mark
++  chr(149): chr(8226), # bullet
++  chr(150): chr(8211), # en dash
++  chr(151): chr(8212), # em dash
++  chr(152): chr( 732), # small tilde
++  chr(153): chr(8482), # trade mark sign
++  chr(154): chr( 353), # latin small letter s with caron
++  chr(155): chr(8250), # single right-pointing angle quotation mark
++  chr(156): chr( 339), # latin small ligature oe
++  chr(158): chr( 382), # latin small letter z with caron
++  chr(159): chr( 376)} # latin capital letter y with diaeresis
+ 
+ _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+ def _urljoin(base, uri):
+     uri = _urifixer.sub(r'\1\3', uri)
+-    return urlparse.urljoin(base, uri)
++    try:
++        return urllib.parse.urljoin(base, uri)
++    except:
++        uri = urllib.parse.urlunparse([urllib.quote(part) for part in urllib.parse.urlparse(uri)])
++        return urllib.parse.urljoin(base, uri)
+ 
+ class _FeedParserMixin:
+     namespaces = {'': '',
+@@ -324,6 +637,8 @@
+                   'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+                   'http://purl.org/rss/1.0/modules/link/': 'l',
+                   'http://search.yahoo.com/mrss': 'media',
++                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
++                  'http://search.yahoo.com/mrss/': 'media',
+                   'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+                   'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+@@ -337,6 +652,7 @@
+                   'http://hacks.benhammersley.com/rss/streaming/': 'str',
+                   'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+                   'http://purl.org/rss/1.0/modules/syndication/': 'sy',
++                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+                   'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+                   'http://purl.org/rss/1.0/modules/threading/': 'thr',
+                   'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+@@ -344,12 +660,12 @@
+                   'http://wellformedweb.org/commentAPI/': 'wfw',
+                   'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+                   'http://www.w3.org/1999/xhtml': 'xhtml',
+-                  'http://www.w3.org/XML/1998/namespace': 'xml',
+-                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
++                  'http://www.w3.org/1999/xlink': 'xlink',
++                  'http://www.w3.org/XML/1998/namespace': 'xml'
+                   }
+     _matchnamespaces = {}
+ 
+-    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
++    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+     can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+     can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+     html_types = ['text/html', 'application/xhtml+xml']
+@@ -385,8 +701,10 @@
+         self.langstack = []
+         self.baseuri = baseuri or ''
+         self.lang = baselang or None
++        self.svgOK = 0
++        self.hasTitle = 0
+         if baselang:
+-            self.feeddata['language'] = baselang
++            self.feeddata['language'] = baselang.replace('_','-')
+ 
+     def unknown_starttag(self, tag, attrs):
+         if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
+@@ -397,6 +715,11 @@
+         # track xml:base and xml:lang
+         attrsD = dict(attrs)
+         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
++        if not isinstance(baseuri, str):
++            try:
++                baseuri = str(baseuri, self.encoding)
++            except:
++                baseuri = str(baseuri, 'iso-8859-1')
+         self.baseuri = _urljoin(self.baseuri, baseuri)
+         lang = attrsD.get('xml:lang', attrsD.get('lang'))
+         if lang == '':
+@@ -407,7 +730,7 @@
+             lang = self.lang
+         if lang:
+             if tag in ('feed', 'rss', 'rdf:RDF'):
+-                self.feeddata['language'] = lang
++                self.feeddata['language'] = lang.replace('_','-')
+             self.lang = lang
+         self.basestack.append(self.baseuri)
+         self.langstack.append(lang)
+@@ -420,23 +743,23 @@
+             self.trackNamespace(None, uri)
+ 
+         # track inline content
+-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
++        if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
++            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+             # element declared itself as escaped markup, but it isn't really
+             self.contentparams['type'] = 'application/xhtml+xml'
+         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+-            # Note: probably shouldn't simply recreate localname here, but
+-            # our namespace handling isn't actually 100% correct in cases where
+-            # the feed redefines the default namespace (which is actually
+-            # the usual case for inline content, thanks Sam), so here we
+-            # cheat and just reconstruct the element based on localname
+-            # because that compensates for the bugs in our namespace handling.
+-            # This will horribly munge inline content with non-empty qnames,
+-            # but nobody actually does that, so I'm not fixing it.
+-            tag = tag.split(':')[-1]
+-            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
++            if tag.find(':') != -1:
++                prefix, tag = tag.split(':', 1)
++                namespace = self.namespacesInUse.get(prefix, '')
++                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
++                    attrs.append(('xmlns',namespace))
++                if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
++                    attrs.append(('xmlns',namespace))
++            if tag == 'svg': self.svgOK += 1
++            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+ 
+         # match namespaces
+-        if tag.find(':') <> -1:
++        if tag.find(':') != -1:
+             prefix, suffix = tag.split(':', 1)
+         else:
+             prefix, suffix = '', tag
+@@ -456,30 +779,41 @@
+             method = getattr(self, methodname)
+             return method(attrsD)
+         except AttributeError:
+-            return self.push(prefix + suffix, 1)
++            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes
++            unknown_tag = prefix + suffix
++            if len(attrsD) == 0:
++                # No attributes so merge it into the enclosing dictionary
++                return self.push(unknown_tag, 1)
++            else:
++                # Has attributes so create it in its own dictionary
++                context = self._getContext()
++                context[unknown_tag] = attrsD
+ 
+     def unknown_endtag(self, tag):
+         if _debug: sys.stderr.write('end %s\n' % tag)
+         # match namespaces
+-        if tag.find(':') <> -1:
++        if tag.find(':') != -1:
+             prefix, suffix = tag.split(':', 1)
+         else:
+             prefix, suffix = '', tag
+         prefix = self.namespacemap.get(prefix, prefix)
+         if prefix:
+             prefix = prefix + '_'
++        if suffix == 'svg' and self.svgOK: self.svgOK -= 1
+ 
+         # call special handler (if defined) or default handler
+         methodname = '_end_' + prefix + suffix
+         try:
++            if self.svgOK: raise AttributeError()
+             method = getattr(self, methodname)
+             method()
+         except AttributeError:
+             self.pop(prefix + suffix)
+ 
+         # track inline content
+-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
++        if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
+             # element declared itself as escaped markup, but it isn't really
++            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+             self.contentparams['type'] = 'application/xhtml+xml'
+         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+             tag = tag.split(':')[-1]
+@@ -506,7 +840,7 @@
+             c = int(ref[1:], 16)
+         else:
+             c = int(ref)
+-        text = unichr(c).encode('utf-8')
++        text = chr(c)
+         self.elementstack[-1][2].append(text)
+ 
+     def handle_entityref(self, ref):
+@@ -515,19 +849,14 @@
+         if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
+         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+             text = '&%s;' % ref
++        elif ref in self.entities.keys():
++            text = self.entities[ref]
++            if text.startswith('&#') and text.endswith(';'):
++                return self.handle_entityref(text)
+         else:
+-            # entity resolution graciously donated by Aaron Swartz
+-            def name2cp(k):
+-                import htmlentitydefs
+-                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
+-                    return htmlentitydefs.name2codepoint[k]
+-                k = htmlentitydefs.entitydefs[k]
+-                if k.startswith('&#') and k.endswith(';'):
+-                    return int(k[2:-1]) # not in latin-1
+-                return ord(k)
+-            try: name2cp(ref)
++            try: name2codepoint[ref]
+             except KeyError: text = '&%s;' % ref
+-            else: text = unichr(name2cp(ref)).encode('utf-8')
++            else: text = chr(name2codepoint[ref])
+         self.elementstack[-1][2].append(text)
+ 
+     def handle_data(self, text, escape=1):
+@@ -554,12 +883,19 @@
+         if _debug: sys.stderr.write('entering parse_declaration\n')
+         if self.rawdata[i:i+9] == '<![CDATA[':
+             k = self.rawdata.find(']]>', i)
+-            if k == -1: k = len(self.rawdata)
++            if k == -1:
++                # CDATA block began but didn't finish
++                k = len(self.rawdata)
++                return k
+             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+             return k+3
+         else:
+             k = self.rawdata.find('>', i)
+-            return k+1
++            if k >= 0:
++                return k+1
++            else:
++                # We have an incomplete CDATA block.
++                return k
+ 
+     def mapContentType(self, contentType):
+         contentType = contentType.lower()
+@@ -579,11 +915,11 @@
+             self.version = 'rss10'
+         if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+             self.version = 'atom10'
+-        if loweruri.find('backend.userland.com/rss') <> -1:
++        if loweruri.find('backend.userland.com/rss') != -1:
+             # match any backend.userland.com namespace
+             uri = 'http://backend.userland.com/rss'
+             loweruri = uri
+-        if self._matchnamespaces.has_key(loweruri):
++        if loweruri in self._matchnamespaces:
+             self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+         else:
+@@ -595,6 +931,9 @@
+     def decodeEntities(self, element, data):
+         return data
+ 
++    def strattrs(self, attrs):
++        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
++
+     def push(self, element, expectingText):
+         self.elementstack.append([element, expectingText, []])
+ 
+@@ -603,6 +942,28 @@
+         if self.elementstack[-1][0] != element: return
+ 
+         element, expectingText, pieces = self.elementstack.pop()
++
++        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
++            # remove enclosing child element, but only if it is a <div> and
++            # only if all the remaining content is nested underneath it.
++            # This means that the divs would be retained in the following:
++            #    <div>foo</div><div>bar</div>
++            while pieces and len(pieces)>1 and not pieces[-1].strip():
++                del pieces[-1]
++            while pieces and len(pieces)>1 and not pieces[0].strip():
++                del pieces[0]
++            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
++                depth = 0
++                for piece in pieces[:-1]:
++                    if piece.startswith('</'):
++                        depth -= 1
++                        if depth == 0: break
++                    elif piece.startswith('<') and not piece.endswith('/>'):
++                        depth += 1
++                else:
++                    pieces = pieces[1:-1]
++
++        pieces = [s if isinstance(s, str) else s.encode(self.encoding) for s in pieces]
+         output = ''.join(pieces)
+         if stripWhitespace:
+             output = output.strip()
+@@ -611,7 +972,7 @@
+         # decode base64 content
+         if base64 and self.contentparams.get('base64', 0):
+             try:
+-                output = base64.decodestring(output)
++                output = base64.decodebytes(output.encode(self.encoding)).decode(self.encoding)
+             except binascii.Error:
+                 pass
+             except binascii.Incomplete:
+@@ -625,6 +986,9 @@
+         if not self.contentparams.get('base64', 0):
+             output = self.decodeEntities(element, output)
+ 
++        if self.lookslikehtml(output):
++            self.contentparams['type']='text/html'
++
+         # remove temporary cruft from contentparams
+         try:
+             del self.contentparams['mode']
+@@ -635,25 +999,57 @@
+         except KeyError:
+             pass
+ 
++        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
+         # resolve relative URIs within embedded markup
+-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
++        if is_htmlish and RESOLVE_RELATIVE_URIS:
+             if element in self.can_contain_relative_uris:
+-                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
++                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
++
++        # parse microformats
++        # (must do this before sanitizing because some microformats
++        # rely on elements that we sanitize)
++        if is_htmlish and element in ['content', 'description', 'summary']:
++            mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
++            if mfresults:
++                for tag in mfresults.get('tags', []):
++                    self._addTag(tag['term'], tag['scheme'], tag['label'])
++                for enclosure in mfresults.get('enclosures', []):
++                    self._start_enclosure(enclosure)
++                for xfn in mfresults.get('xfn', []):
++                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
++                vcard = mfresults.get('vcard')
++                if vcard:
++                    self._getContext()['vcard'] = vcard
+ 
+         # sanitize embedded markup
+-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
++        if is_htmlish and SANITIZE_HTML:
+             if element in self.can_contain_dangerous_markup:
+-                output = _sanitizeHTML(output, self.encoding)
++                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
+ 
+-        if self.encoding and type(output) != type(u''):
++        if self.encoding and not isinstance(output, str):
+             try:
+-                output = unicode(output, self.encoding)
++                output = str(output, self.encoding)
+             except:
+                 pass
+ 
++        # address common error where people take data that is already
++        # utf-8, presume that it is iso-8859-1, and re-encode it.
++        if self.encoding=='utf-8' and isinstance(output, str):
++            try:
++                output = str(output.encode('iso-8859-1'), 'utf-8')
++            except:
++                pass
++
++        # map win-1252 extensions to the proper code points
++        if isinstance(output, str):
++            output = ''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
++
+         # categories/tags/keywords/whatever are handled in _end_category
+         if element == 'category':
+             return output
++
++        if element == 'title' and self.hasTitle:
++            return output
+ 
+         # store output in appropriate place(s)
+         if self.inentry and not self.insource:
+@@ -674,7 +1070,7 @@
+                 contentparams = copy.deepcopy(self.contentparams)
+                 contentparams['value'] = output
+                 self.entries[-1][element + '_detail'] = contentparams
+-            elif (self.infeed or self.insourc