author    Jon Bergli Heier <snakebite@jvnv.net>   2010-10-25 20:26:07 +0200
committer Jon Bergli Heier <snakebite@jvnv.net>   2010-10-25 20:26:07 +0200
commit    6403bece675d170ace9a1715106f27d3bf8e8412 (patch)
tree      eeba36e03aeea99a61fbde667620cfa1d1dfcf84
parent    8be4ecef3d7f083ec34c2bb6344d9bafb06cc61c (diff)
Added python3-feedparser.
-rw-r--r--  python3-feedparser/PKGBUILD        28
-rw-r--r--  python3-feedparser/python3.patch 3359
2 files changed, 3387 insertions, 0 deletions
diff --git a/python3-feedparser/PKGBUILD b/python3-feedparser/PKGBUILD
new file mode 100644
index 0000000..9b82e2e
--- /dev/null
+++ b/python3-feedparser/PKGBUILD
@@ -0,0 +1,28 @@
+# Based on the official PKGBUILD for python-feedparser.
+# Contributor: Jon Bergli Heier <snakebite@jvnv.net>
+
+pkgname=python3-feedparser
+pkgver=4.2pre315
+_pkgver=4.1
+pkgrel=1
+pkgdesc="Universal Feed Parser for Python 3"
+arch=('any')
+url="http://bitbucket.org/puzzlet/feedparser-py3/"
+license=('custom')
+depends=('python' 'libxml2')
+source=(http://downloads.sourceforge.net/feedparser/feedparser-${_pkgver}.zip python3.patch)
+md5sums=('7ab1140c1e29d4cd52ab20fa7b1f8640'
+ '29fe3762cd3e2a97427001844ef2e772')
+
+build() {
+ cd ${srcdir}
+ patch -p0 -i python3.patch
+}
+
+package() {
+ cd ${srcdir}
+ python setup.py install --root=${pkgdir}
+ install -Dm644 LICENSE ${pkgdir}/usr/share/licenses/${pkgname}/license
+}
+
+# vim:set ts=2 sw=2 et:
diff --git a/python3-feedparser/python3.patch b/python3-feedparser/python3.patch
new file mode 100644
index 0000000..5af9afc
--- /dev/null
+++ b/python3-feedparser/python3.patch
@@ -0,0 +1,3359 @@
+--- feedparser.py 2010-10-25 20:07:40.000000000 +0200
++++ python3-feedparser.py 2010-10-25 20:07:02.000000000 +0200
+@@ -6,13 +6,12 @@
+ Visit http://feedparser.org/ for the latest version
+ Visit http://feedparser.org/docs/ for the latest documentation
+
+-Required: Python 2.1 or later
+-Recommended: Python 2.3 or later
+-Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
++Required: Python 3.0 or later
++Recommended: Python 3.1 or later
+ """
+
+-__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
+-__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
++__version__ = "4.2-pre-" + "$Revision: 315 $"[11:14] + "-svn"
++__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+@@ -39,7 +38,10 @@
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+ "Aaron Swartz <http://aaronsw.com/>",
+- "Kevin Marks <http://epeus.blogspot.com/>"]
++ "Kevin Marks <http://epeus.blogspot.com/>",
++ "Sam Ruby <http://intertwingly.net/>",
++ "Ade Oshineye <http://blog.oshineye.com/>",
++ "Puzzlet Chung <http://puzzlet.org/>"]
+ _debug = 0
+
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
+@@ -65,12 +67,18 @@
+ # if TIDY_MARKUP = 1
+ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
++# If you want feedparser to automatically resolve all relative URIs, set this
++# to 1.
++RESOLVE_RELATIVE_URIS = 1
++
++# If you want feedparser to automatically sanitize all potentially unsafe
++# HTML content, set this to 1.
++SANITIZE_HTML = 1
++
+ # ---------- required modules (should come with any Python distribution) ----------
+-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
+-try:
+- from cStringIO import StringIO as _StringIO
+-except:
+- from StringIO import StringIO as _StringIO
++import html.parser, re, sys, copy, time, email, types, cgi, urllib, urllib.request, urllib.error, urllib.parse
++from io import StringIO as _StringIO
++from io import BytesIO
+
+ # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
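Not part of the patch: the rewritten import block above maps the Python 2 modules onto their Python 3 homes (urlparse and urllib2 into urllib.parse/urllib.request, cStringIO into io, rfc822 into email). Python 3's io module splits text from bytes, which is why the patch pulls in both StringIO and BytesIO; a minimal sketch of the distinction:

    from io import StringIO, BytesIO

    StringIO('text buffer').read()   # str in, str out
    BytesIO(b'byte buffer').read()   # bytes in, bytes out
    # StringIO(b'bytes') raises TypeError: in Python 3, text and bytes
    # are distinct types, unlike the old cStringIO.StringIO.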
+@@ -95,28 +103,16 @@
+ _XML_AVAILABLE = 1
+ except:
+ _XML_AVAILABLE = 0
+- def _xmlescape(data):
++ def _xmlescape(data,entities={}):
+ data = data.replace('&', '&amp;')
+ data = data.replace('>', '&gt;')
+ data = data.replace('<', '&lt;')
++ for char, entity in entities.items():
++ data = data.replace(char, entity)
+ return data
+
+ # base64 support for Atom feeds that contain embedded binary data
+-try:
+- import base64, binascii
+-except:
+- base64 = binascii = None
+-
+-# cjkcodecs and iconv_codec provide support for more character encodings.
+-# Both are available from http://cjkpython.i18n.org/
+-try:
+- import cjkcodecs.aliases
+-except:
+- pass
+-try:
+- import iconv_codec
+-except:
+- pass
++import base64, binascii
+
+ # chardet library auto-detects character encodings
+ # Download from http://chardet.feedparser.org/
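The fallback _xmlescape above is only defined when xml.sax is unavailable; its signature now mirrors xml.sax.saxutils.escape, taking an optional map of extra entities. A quick sketch of the behaviour, assuming the function as patched:

    _xmlescape('a < b & "c"', {'"': '&quot;'})
    # -> 'a &lt; b &amp; &quot;c&quot;'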
+@@ -128,6 +124,18 @@
+ except:
+ chardet = None
+
++from html.entities import name2codepoint, codepoint2name
++
++# BeautifulSoup parser used for parsing microformats from embedded HTML content
++# http://www.crummy.com/software/BeautifulSoup/
++# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
++# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
++# patch and modify the compatibility statement accordingly.
++try:
++ import BeautifulSoup
++except:
++ BeautifulSoup = None
++
+ # ---------- don't touch these ----------
+ class ThingsNobodyCaresAboutButMe(Exception): pass
+ class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+@@ -135,9 +143,288 @@
+ class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+ class UndeclaredNamespace(Exception): pass
+
+-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+-sgmllib.special = re.compile('<!')
+-sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
++incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
++ '<([a-zA-Z][^<>]*|'
++ '/([a-zA-Z][^<>]*)?|'
++ '![^<>]*)?')
++
++entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
++charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
++
++starttagopen = re.compile('<[>a-zA-Z]')
++shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
++shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
++tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
++attrfind = re.compile(
++ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
++ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
++
++class EndBracketMatch:
++ endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
++ def search(self,string,index=0):
++ self.match = self.endbracket.match(string,index)
++ if self.match: return self
++ def start(self,n):
++ return self.match.end(n)
++endbracket = EndBracketMatch()
++
++class SGMLParser(html.parser.HTMLParser):
++ # Definition of entities -- derived classes may override
++ entity_or_charref = re.compile('&(?:'
++ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
++ ')(;?)')
++
++ def __init__(self, verbose=0):
++ """Initialize and reset this instance."""
++ html.parser.HTMLParser.__init__(self)
++ self.verbose = verbose
++ self.reset()
++
++ def reset(self):
++ """Reset this instance. Loses all unprocessed data."""
++ self.__starttag_text = None
++ self.stack = []
++ self.nomoretags = 0
++ html.parser.HTMLParser.reset(self)
++
++ # Internal -- handle data as far as reasonable. May leave state
++ # and data to be processed by a subsequent call. If 'end' is
++ # true, force handling all data as if followed by EOF marker.
++ def goahead(self, end):
++ rawdata = self.rawdata
++ i = 0
++ n = len(rawdata)
++ while i < n:
++ if self.nomoretags:
++ self.handle_data(rawdata[i:n])
++ i = n
++ break
++ match = self.interesting.search(rawdata, i) # < or &
++ if match:
++ j = match.start()
++ else:
++ j = n
++ if i < j: self.handle_data(rawdata[i:j])
++ i = self.updatepos(i, j)
++ if i == n: break
++ startswith = rawdata.startswith
++ if startswith('<', i):
++ if starttagopen.match(rawdata, i): # < + letter
++ k = self.parse_starttag(i)
++ elif startswith("</", i):
++ k = self.parse_endtag(i)
++ elif startswith("<!--", i):
++ k = self.parse_comment(i)
++ elif startswith("<?", i):
++ k = self.parse_pi(i)
++ elif startswith("<!", i):
++ k = self.parse_declaration(i)
++ elif (i + 1) < n:
++ self.handle_data("<")
++ k = i + 1
++ else:
++ break
++ if k < 0:
++ if end:
++ pass #self.error("EOF in middle of construct")
++ break
++ i = self.updatepos(i, k)
++ continue
++ elif startswith('&', i):
++ match = charref.match(rawdata, i)
++ if match:
++ name = match.group(1)
++ self.handle_charref(name)
++ k = match.end()
++ if not startswith(';', k-1):
++ k = k - 1
++ i = self.updatepos(i, k)
++ continue
++ match = entityref.match(rawdata, i)
++ if match:
++ name = match.group(1)
++ self.handle_entityref(name)
++ k = match.end()
++ if not startswith(';', k-1):
++ k = k - 1
++ i = self.updatepos(i, k)
++ continue
++ else:
++ self.error('neither < nor & ??')
++ # We get here only if incomplete matches but
++ # nothing else
++ match = incomplete.match(rawdata, i)
++ if not match:
++ self.handle_data(rawdata[i])
++ i = i+1
++ continue
++ j = match.end(0)
++ if j == n:
++ break # Really incomplete
++ self.handle_data(rawdata[i:j])
++ i = j
++ # end while
++ if end and i < n:
++ self.handle_data(rawdata[i:n])
++ i = self.updatepos(i, n)
++ self.rawdata = rawdata[i:]
++ # XXX if end: check for empty stack
++
++ # Internal -- handle starttag, return length or -1 if not terminated
++ def parse_starttag(self, i):
++ self.__starttag_text = None
++ start_pos = i
++ rawdata = self.rawdata
++ if shorttagopen.match(rawdata, i):
++ # SGML shorthand: <tag/data/ == <tag>data</tag>
++ # XXX Can data contain &... (entity or char refs)?
++ # XXX Can data contain < or > (tag characters)?
++ # XXX Can there be whitespace before the first /?
++ match = shorttag.match(rawdata, i)
++ if not match:
++ return -1
++ tag, data = match.group(1, 2)
++ self.__starttag_text = '<%s/' % tag
++ tag = tag.lower()
++ k = match.end(0)
++ self.finish_shorttag(tag, data)
++ self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
++ return k
++ # XXX The following should skip matching quotes (' or ")
++ # As a shortcut way to exit, this isn't so bad, but shouldn't
++ # be used to locate the actual end of the start tag since the
++ # < or > characters may be embedded in an attribute value.
++ match = endbracket.search(rawdata, i+1)
++ if not match:
++ return -1
++ j = match.start(0)
++ # Now parse the data between i+1 and j into a tag and attrs
++ attrs = []
++ if rawdata[i:i+2] == '<>':
++ # SGML shorthand: <> == <last open tag seen>
++ k = j
++ tag = self.lasttag
++ else:
++ match = tagfind.match(rawdata, i+1)
++ if not match:
++ self.error('unexpected call to parse_starttag')
++ k = match.end(0)
++ tag = rawdata[i+1:k].lower()
++ self.lasttag = tag
++ while k < j:
++ match = attrfind.match(rawdata, k)
++ if not match:
++ break
++ attrname, rest, attrvalue = match.group(1, 2, 3)
++ if not rest:
++ attrvalue = attrname
++ elif attrvalue[:1] == "'" == attrvalue[-1:] or \
++ attrvalue[:1] == '"' == attrvalue[-1:]:
++ attrvalue = attrvalue[1:-1]
++ attrvalue = self.entity_or_charref.sub(self._convert_ref, attrvalue)
++ attrs.append((attrname.lower(), attrvalue))
++ k = match.end(0)
++ if rawdata[j] == '>':
++ j = j+1
++ self.__starttag_text = rawdata[start_pos:j]
++ self.finish_starttag(tag, attrs)
++ return j
++
++ # Internal -- convert entity or character reference
++ def _convert_ref(self, match):
++ if match.group(2):
++ return self.convert_charref(match.group(2)) or \
++ '&#%s%s' % match.groups()[1:]
++ elif match.group(3):
++ return self.convert_entityref(match.group(1)) or \
++ '&%s;' % match.group(1)
++ else:
++ return '&%s' % match.group(1)
++
++ # Internal -- parse endtag
++ def parse_endtag(self, i):
++ rawdata = self.rawdata
++ match = endbracket.search(rawdata, i+1)
++ if not match:
++ return -1
++ j = match.start(0)
++ tag = rawdata[i+2:j].strip().lower()
++ if rawdata[j] == '>':
++ j = j+1
++ self.finish_endtag(tag)
++ return j
++
++ # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
++ def finish_shorttag(self, tag, data):
++ self.finish_starttag(tag, [])
++ self.handle_data(data)
++ self.finish_endtag(tag)
++
++ # Internal -- finish processing of start tag
++ # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
++ def finish_starttag(self, tag, attrs):
++ method = getattr(self, 'start_' + tag, None)
++ if method:
++ self.stack.append(tag)
++ method(tag, attrs)
++ return 1
++ method = getattr(self, 'do_' + tag, None)
++ if method:
++ method(tag, attrs)
++ return 0
++ self.unknown_starttag(tag, attrs)
++ return -1
++
++ # Internal -- finish processing of end tag
++ def finish_endtag(self, tag):
++ if not tag:
++ found = len(self.stack) - 1
++ if found < 0:
++ self.unknown_endtag(tag)
++ return
++ else:
++ if tag not in self.stack:
++ if getattr(self, 'end_' + tag, None):
++ self.report_unbalanced(tag)
++ else:
++ self.unknown_endtag(tag)
++ return
++ found = len(self.stack)
++ for i in range(found):
++ if self.stack[i] == tag: found = i
++ while len(self.stack) > found:
++ tag = self.stack[-1]
++ method = getattr(self, 'end_' + tag, self.unknown_endtag)
++ method(tag)
++ del self.stack[-1]
++
++ # Example -- report an unbalanced </...> tag.
++ def report_unbalanced(self, tag):
++ if self.verbose:
++ print('*** Unbalanced </' + tag + '>')
++ print('*** Stack:', self.stack)
++
++ def convert_charref(self, name):
++ """Convert character reference, may be overridden."""
++ try:
++ n = int(name)
++ except ValueError:
++ return
++ if not 0 <= n <= 127:
++ return
++ return chr(n)
++
++ # Definition of entities -- derived classes may override
++ entitydefs = \
++ {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
++
++ def convert_entityref(self, name):
++ """Convert entity references.
++
++ As an alternative to overriding this method, one can tailor the
++ results by setting up the self.entitydefs mapping appropriately.
++ """
++ return self.entitydefs.get(name, None)
+
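The SGMLParser above re-creates the Python 2 sgmllib dispatch model (start_*/do_*/end_* methods with unknown_* fallbacks) on top of html.parser.HTMLParser, since sgmllib is gone in Python 3. A minimal usage sketch, assuming the patched module is importable; note that in this reimplementation finish_starttag calls handlers as method(tag, attrs), so start_* methods take both arguments, unlike classic sgmllib:

    from feedparser import SGMLParser  # hypothetical import of the class above

    class LinkCollector(SGMLParser):
        def reset(self):
            SGMLParser.reset(self)
            self.links = []
        def start_a(self, tag, attrs):
            # attrs arrives as a list of (name, value) pairs, quotes stripped
            href = dict(attrs).get('href')
            if href:
                self.links.append(href)
        def unknown_starttag(self, tag, attrs):
            pass
        def unknown_endtag(self, tag):
            pass

    p = LinkCollector()
    p.feed('Docs: <a href="http://feedparser.org/docs/">here</a>')
    p.close()
    print(p.links)  # ['http://feedparser.org/docs/']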
+ SUPPORTED_VERSIONS = {'': 'unknown',
+ 'rss090': 'RSS 0.90',
+@@ -158,16 +445,7 @@
+ 'hotrss': 'Hot RSS'
+ }
+
+-try:
+- UserDict = dict
+-except NameError:
+- # Python 2.1 does not have dict
+- from UserDict import UserDict
+- def dict(aList):
+- rc = {}
+- for k, v in aList:
+- rc[k] = v
+- return rc
++UserDict = dict
+
+ class FeedParserDict(UserDict):
+ keymap = {'channel': 'feed',
+@@ -188,14 +466,21 @@
+ def __getitem__(self, key):
+ if key == 'category':
+ return UserDict.__getitem__(self, 'tags')[0]['term']
++ if key == 'enclosures':
++ norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
++ return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
++ if key == 'license':
++ for link in UserDict.__getitem__(self, 'links'):
++ if link['rel']=='license' and 'href' in link:
++ return link['href']
+ if key == 'categories':
+ return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
+ realkey = self.keymap.get(key, key)
+- if type(realkey) == types.ListType:
++ if isinstance(realkey, list):
+ for k in realkey:
+- if UserDict.has_key(self, k):
++ if super().__contains__(k):
+ return UserDict.__getitem__(self, k)
+- if UserDict.has_key(self, key):
++ if super().__contains__(key):
+ return UserDict.__getitem__(self, key)
+ return UserDict.__getitem__(self, realkey)
+
+@@ -203,24 +488,24 @@
+ for k in self.keymap.keys():
+ if key == k:
+ key = self.keymap[k]
+- if type(key) == types.ListType:
++ if isinstance(key, list):
+ key = key[0]
+ return UserDict.__setitem__(self, key, value)
+
+ def get(self, key, default=None):
+- if self.has_key(key):
++ if key in self:
+ return self[key]
+ else:
+ return default
+
+ def setdefault(self, key, value):
+- if not self.has_key(key):
++ if key not in self:
+ self[key] = value
+ return self[key]
+
+- def has_key(self, key):
++ def __contains__(self, key):
+ try:
+- return hasattr(self, key) or UserDict.has_key(self, key)
++ return hasattr(self, key) or UserDict.__contains__(self, key)
+ except AttributeError:
+ return False
+
+@@ -233,7 +518,7 @@
+ assert not key.startswith('_')
+ return self.__getitem__(key)
+ except:
+- raise AttributeError, "object has no attribute '%s'" % key
++ raise AttributeError("object has no attribute '%s'" % key)
+
+ def __setattr__(self, key, value):
+ if key.startswith('_') or key == 'data':
+@@ -241,9 +526,6 @@
+ else:
+ return self.__setitem__(key, value)
+
+- def __contains__(self, key):
+- return self.has_key(key)
+-
+ def zopeCompatibilityHack():
+ global FeedParserDict
+ del FeedParserDict
+@@ -275,15 +557,46 @@
+ 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+ 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+ )
+- import string
+- _ebcdic_to_ascii_map = string.maketrans( \
+- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
++ _ebcdic_to_ascii_map = bytes.maketrans(bytes(range(256)), bytes(emap))
+ return s.translate(_ebcdic_to_ascii_map)
++
++_cp1252 = {
++ chr(128): chr(8364), # euro sign
++ chr(130): chr(8218), # single low-9 quotation mark
++ chr(131): chr( 402), # latin small letter f with hook
++ chr(132): chr(8222), # double low-9 quotation mark
++ chr(133): chr(8230), # horizontal ellipsis
++ chr(134): chr(8224), # dagger
++ chr(135): chr(8225), # double dagger
++ chr(136): chr( 710), # modifier letter circumflex accent
++ chr(137): chr(8240), # per mille sign
++ chr(138): chr( 352), # latin capital letter s with caron
++ chr(139): chr(8249), # single left-pointing angle quotation mark
++ chr(140): chr( 338), # latin capital ligature oe
++ chr(142): chr( 381), # latin capital letter z with caron
++ chr(145): chr(8216), # left single quotation mark
++ chr(146): chr(8217), # right single quotation mark
++ chr(147): chr(8220), # left double quotation mark
++ chr(148): chr(8221), # right double quotation mark
++ chr(149): chr(8226), # bullet
++ chr(150): chr(8211), # en dash
++ chr(151): chr(8212), # em dash
++ chr(152): chr( 732), # small tilde
++ chr(153): chr(8482), # trade mark sign
++ chr(154): chr( 353), # latin small letter s with caron
++ chr(155): chr(8250), # single right-pointing angle quotation mark
++ chr(156): chr( 339), # latin small ligature oe
++ chr(158): chr( 382), # latin small letter z with caron
++ chr(159): chr( 376)} # latin capital letter y with diaeresis
+
+ _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+ def _urljoin(base, uri):
+ uri = _urifixer.sub(r'\1\3', uri)
+- return urlparse.urljoin(base, uri)
++ try:
++ return urllib.parse.urljoin(base, uri)
++ except:
++ uri = urllib.parse.urlunparse([urllib.parse.quote(part) for part in urllib.parse.urlparse(uri)])
++ return urllib.parse.urljoin(base, uri)
+
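_urljoin now delegates to urllib.parse.urljoin, percent-quoting each component and retrying if the join raises; the _urifixer pass first collapses spurious slashes after the scheme. A standalone sketch of both, not part of the patch:

    import re
    from urllib.parse import urljoin

    _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
    _urifixer.sub(r'\1\3', 'http:///feedparser.org/docs/')
    # -> 'http://feedparser.org/docs/'
    urljoin('http://feedparser.org/docs/', 'reference.html')
    # -> 'http://feedparser.org/docs/reference.html'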
+ class _FeedParserMixin:
+ namespaces = {'': '',
+@@ -324,6 +637,8 @@
+ 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+ 'http://purl.org/rss/1.0/modules/link/': 'l',
+ 'http://search.yahoo.com/mrss': 'media',
++ # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
++ 'http://search.yahoo.com/mrss/': 'media',
+ 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+ 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+ 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+@@ -337,6 +652,7 @@
+ 'http://hacks.benhammersley.com/rss/streaming/': 'str',
+ 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+ 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
++ 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+ 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+ 'http://purl.org/rss/1.0/modules/threading/': 'thr',
+ 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+@@ -344,12 +660,12 @@
+ 'http://wellformedweb.org/commentAPI/': 'wfw',
+ 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+ 'http://www.w3.org/1999/xhtml': 'xhtml',
+- 'http://www.w3.org/XML/1998/namespace': 'xml',
+- 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
++ 'http://www.w3.org/1999/xlink': 'xlink',
++ 'http://www.w3.org/XML/1998/namespace': 'xml'
+ }
+ _matchnamespaces = {}
+
+- can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
++ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+ can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ html_types = ['text/html', 'application/xhtml+xml']
+@@ -385,8 +701,10 @@
+ self.langstack = []
+ self.baseuri = baseuri or ''
+ self.lang = baselang or None
++ self.svgOK = 0
++ self.hasTitle = 0
+ if baselang:
+- self.feeddata['language'] = baselang
++ self.feeddata['language'] = baselang.replace('_','-')
+
+ def unknown_starttag(self, tag, attrs):
+ if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
+@@ -397,6 +715,11 @@
+ # track xml:base and xml:lang
+ attrsD = dict(attrs)
+ baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
++ if not isinstance(baseuri, str):
++ try:
++ baseuri = str(baseuri, self.encoding)
++ except:
++ baseuri = str(baseuri, 'iso-8859-1')
+ self.baseuri = _urljoin(self.baseuri, baseuri)
+ lang = attrsD.get('xml:lang', attrsD.get('lang'))
+ if lang == '':
+@@ -407,7 +730,7 @@
+ lang = self.lang
+ if lang:
+ if tag in ('feed', 'rss', 'rdf:RDF'):
+- self.feeddata['language'] = lang
++ self.feeddata['language'] = lang.replace('_','-')
+ self.lang = lang
+ self.basestack.append(self.baseuri)
+ self.langstack.append(lang)
+@@ -420,23 +743,23 @@
+ self.trackNamespace(None, uri)
+
+ # track inline content
+- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
++ if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
++ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+ # element declared itself as escaped markup, but it isn't really
+ self.contentparams['type'] = 'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+- # Note: probably shouldn't simply recreate localname here, but
+- # our namespace handling isn't actually 100% correct in cases where
+- # the feed redefines the default namespace (which is actually
+- # the usual case for inline content, thanks Sam), so here we
+- # cheat and just reconstruct the element based on localname
+- # because that compensates for the bugs in our namespace handling.
+- # This will horribly munge inline content with non-empty qnames,
+- # but nobody actually does that, so I'm not fixing it.
+- tag = tag.split(':')[-1]
+- return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
++ if tag.find(':') != -1:
++ prefix, tag = tag.split(':', 1)
++ namespace = self.namespacesInUse.get(prefix, '')
++ if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
++ attrs.append(('xmlns',namespace))
++ if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
++ attrs.append(('xmlns',namespace))
++ if tag == 'svg': self.svgOK += 1
++ return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+ # match namespaces
+- if tag.find(':') <> -1:
++ if tag.find(':') != -1:
+ prefix, suffix = tag.split(':', 1)
+ else:
+ prefix, suffix = '', tag
+@@ -456,30 +779,41 @@
+ method = getattr(self, methodname)
+ return method(attrsD)
+ except AttributeError:
+- return self.push(prefix + suffix, 1)
++ # Since there's no handler, or something has gone wrong, explicitly add the element and its attributes
++ unknown_tag = prefix + suffix
++ if len(attrsD) == 0:
++ # No attributes, so merge it into the enclosing dictionary
++ return self.push(unknown_tag, 1)
++ else:
++ # Has attributes so create it in its own dictionary
++ context = self._getContext()
++ context[unknown_tag] = attrsD
+
+ def unknown_endtag(self, tag):
+ if _debug: sys.stderr.write('end %s\n' % tag)
+ # match namespaces
+- if tag.find(':') <> -1:
++ if tag.find(':') != -1:
+ prefix, suffix = tag.split(':', 1)
+ else:
+ prefix, suffix = '', tag
+ prefix = self.namespacemap.get(prefix, prefix)
+ if prefix:
+ prefix = prefix + '_'
++ if suffix == 'svg' and self.svgOK: self.svgOK -= 1
+
+ # call special handler (if defined) or default handler
+ methodname = '_end_' + prefix + suffix
+ try:
++ if self.svgOK: raise AttributeError()
+ method = getattr(self, methodname)
+ method()
+ except AttributeError:
+ self.pop(prefix + suffix)
+
+ # track inline content
+- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
++ if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
+ # element declared itself as escaped markup, but it isn't really
++ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+ self.contentparams['type'] = 'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+ tag = tag.split(':')[-1]
+@@ -506,7 +840,7 @@
+ c = int(ref[1:], 16)
+ else:
+ c = int(ref)
+- text = unichr(c).encode('utf-8')
++ text = chr(c)
+ self.elementstack[-1][2].append(text)
+
+ def handle_entityref(self, ref):
+@@ -515,19 +849,14 @@
+ if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
+ if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+ text = '&%s;' % ref
++ elif ref in self.entities.keys():
++ text = self.entities[ref]
++ if text.startswith('&#') and text.endswith(';'):
++ return self.handle_entityref(text)
+ else:
+- # entity resolution graciously donated by Aaron Swartz
+- def name2cp(k):
+- import htmlentitydefs
+- if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
+- return htmlentitydefs.name2codepoint[k]
+- k = htmlentitydefs.entitydefs[k]
+- if k.startswith('&#') and k.endswith(';'):
+- return int(k[2:-1]) # not in latin-1
+- return ord(k)
+- try: name2cp(ref)
++ try: name2codepoint[ref]
+ except KeyError: text = '&%s;' % ref
+- else: text = unichr(name2cp(ref)).encode('utf-8')
++ else: text = chr(name2codepoint[ref])
+ self.elementstack[-1][2].append(text)
+
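With html.entities.name2codepoint imported up front, entity resolution shrinks to a dictionary lookup plus chr(), which already yields a text character in Python 3 (no more unichr/UTF-8 round-trip). Roughly what the lookup does, as a standalone sketch:

    from html.entities import name2codepoint

    chr(name2codepoint['rsquo'])   # '\u2019', right single quotation mark
    chr(name2codepoint['eacute'])  # 'é'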
+ def handle_data(self, text, escape=1):
+@@ -554,12 +883,19 @@
+ if _debug: sys.stderr.write('entering parse_declaration\n')
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+- if k == -1: k = len(self.rawdata)
++ if k == -1:
++ # CDATA block began but didn't finish
++ k = len(self.rawdata)
++ return k
+ self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+ return k+3
+ else:
+ k = self.rawdata.find('>', i)
+- return k+1
++ if k >= 0:
++ return k+1
++ else:
++ # We have an incomplete declaration (no closing '>').
++ return k
+
+ def mapContentType(self, contentType):
+ contentType = contentType.lower()
+@@ -579,11 +915,11 @@
+ self.version = 'rss10'
+ if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+ self.version = 'atom10'
+- if loweruri.find('backend.userland.com/rss') <> -1:
++ if loweruri.find('backend.userland.com/rss') != -1:
+ # match any backend.userland.com namespace
+ uri = 'http://backend.userland.com/rss'
+ loweruri = uri
+- if self._matchnamespaces.has_key(loweruri):
++ if loweruri in self._matchnamespaces:
+ self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+ self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+ else:
+@@ -595,6 +931,9 @@
+ def decodeEntities(self, element, data):
+ return data
+
++ def strattrs(self, attrs):
++ return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
++
+ def push(self, element, expectingText):
+ self.elementstack.append([element, expectingText, []])
+
+@@ -603,6 +942,28 @@
+ if self.elementstack[-1][0] != element: return
+
+ element, expectingText, pieces = self.elementstack.pop()
++
++ if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
++ # remove enclosing child element, but only if it is a <div> and
++ # only if all the remaining content is nested underneath it.
++ # This means that the divs would be retained in the following:
++ # <div>foo</div><div>bar</div>
++ while pieces and len(pieces)>1 and not pieces[-1].strip():
++ del pieces[-1]
++ while pieces and len(pieces)>1 and not pieces[0].strip():
++ del pieces[0]
++ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
++ depth = 0
++ for piece in pieces[:-1]:
++ if piece.startswith('</'):
++ depth -= 1
++ if depth == 0: break
++ elif piece.startswith('<') and not piece.endswith('/>'):
++ depth += 1
++ else:
++ pieces = pieces[1:-1]
++
++ pieces = [s if isinstance(s, str) else s.decode(self.encoding) for s in pieces]
+ output = ''.join(pieces)
+ if stripWhitespace:
+ output = output.strip()
+@@ -611,7 +972,7 @@
+ # decode base64 content
+ if base64 and self.contentparams.get('base64', 0):
+ try:
+- output = base64.decodestring(output)
++ output = base64.decodebytes(output.encode(self.encoding)).decode(self.encoding)
+ except binascii.Error:
+ pass
+ except binascii.Incomplete:
+@@ -625,6 +986,9 @@
+ if not self.contentparams.get('base64', 0):
+ output = self.decodeEntities(element, output)
+
++ if self.lookslikehtml(output):
++ self.contentparams['type']='text/html'
++
+ # remove temporary cruft from contentparams
+ try:
+ del self.contentparams['mode']
+@@ -635,25 +999,57 @@
+ except KeyError:
+ pass
+
++ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
+ # resolve relative URIs within embedded markup
+- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
++ if is_htmlish and RESOLVE_RELATIVE_URIS:
+ if element in self.can_contain_relative_uris:
+- output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
++ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
++
++ # parse microformats
++ # (must do this before sanitizing because some microformats
++ # rely on elements that we sanitize)
++ if is_htmlish and element in ['content', 'description', 'summary']:
++ mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
++ if mfresults:
++ for tag in mfresults.get('tags', []):
++ self._addTag(tag['term'], tag['scheme'], tag['label'])
++ for enclosure in mfresults.get('enclosures', []):
++ self._start_enclosure(enclosure)
++ for xfn in mfresults.get('xfn', []):
++ self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
++ vcard = mfresults.get('vcard')
++ if vcard:
++ self._getContext()['vcard'] = vcard
+
+ # sanitize embedded markup
+- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
++ if is_htmlish and SANITIZE_HTML:
+ if element in self.can_contain_dangerous_markup:
+- output = _sanitizeHTML(output, self.encoding)
++ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
+
+- if self.encoding and type(output) != type(u''):
++ if self.encoding and not isinstance(output, str):
+ try:
+- output = unicode(output, self.encoding)
++ output = str(output, self.encoding)
+ except:
+ pass
+
++ # address common error where people take data that is already
++ # utf-8, presume that it is iso-8859-1, and re-encode it.
++ if self.encoding=='utf-8' and isinstance(output, str):
++ try:
++ output = str(output.encode('iso-8859-1'), 'utf-8')
++ except:
++ pass
++
++ # map win-1252 extensions to the proper code points
++ if isinstance(output, str):
++ output = ''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
++
+ # categories/tags/keywords/whatever are handled in _end_category
+ if element == 'category':
+ return output
++
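The two clean-up passes above target common publisher mistakes: UTF-8 text that was wrongly decoded as ISO-8859-1 and re-encoded, and stray Windows-1252 code points in nominally Latin-1 content. A standalone sketch of both repairs, assuming the _cp1252 map defined earlier in the patch:

    # mojibake repair: UTF-8 bytes mistakenly decoded as ISO-8859-1
    garbled = 'café'.encode('utf-8').decode('iso-8859-1')  # 'cafÃ©'
    garbled.encode('iso-8859-1').decode('utf-8')           # 'café'

    # cp1252 repair: chr(146) is a Windows-1252 right single quote
    ''.join(_cp1252.get(c, c) for c in 'it\x92s')          # 'it\u2019s'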