author    Jon Bergli Heier <snakebite@jvnv.net>   2010-10-25 20:26:07 +0200
committer Jon Bergli Heier <snakebite@jvnv.net>   2010-10-25 20:26:07 +0200
commit    6403bece675d170ace9a1715106f27d3bf8e8412 (patch)
tree      eeba36e03aeea99a61fbde667620cfa1d1dfcf84
parent    8be4ecef3d7f083ec34c2bb6344d9bafb06cc61c (diff)
Added python3-feedparser.
-rw-r--r--  python3-feedparser/PKGBUILD        28
-rw-r--r--  python3-feedparser/python3.patch 3359
2 files changed, 3387 insertions, 0 deletions
diff --git a/python3-feedparser/PKGBUILD b/python3-feedparser/PKGBUILD
new file mode 100644
index 0000000..9b82e2e
--- /dev/null
+++ b/python3-feedparser/PKGBUILD
@@ -0,0 +1,28 @@
+# Based on the official PKGBUILD for python-feedparser.
+# Contributor: Jon Bergli Heier <snakebite@jvnv.net>
+
+pkgname=python3-feedparser
+pkgver=4.2pre315
+_pkgver=4.1
+pkgrel=1
+pkgdesc="Universal Feed Parser for Python 3"
+arch=('any')
+url="http://bitbucket.org/puzzlet/feedparser-py3/"
+license=('custom')
+depends=('python' 'libxml2')
+source=(http://downloads.sourceforge.net/feedparser/feedparser-${_pkgver}.zip python3.patch)
+md5sums=('7ab1140c1e29d4cd52ab20fa7b1f8640'
+ '29fe3762cd3e2a97427001844ef2e772')
+
+build() {
+ cd ${srcdir}
+ patch -p0 -i python3.patch
+}
+
+package() {
+ cd ${srcdir}
+ python setup.py install --root=${pkgdir}
+ install -Dm644 LICENSE ${pkgdir}/usr/share/licenses/${pkgname}/license
+}
+
+# vim:set ts=2 sw=2 et:
diff --git a/python3-feedparser/python3.patch b/python3-feedparser/python3.patch
new file mode 100644
index 0000000..5af9afc
--- /dev/null
+++ b/python3-feedparser/python3.patch
@@ -0,0 +1,3359 @@
+--- feedparser.py 2010-10-25 20:07:40.000000000 +0200
++++ python3-feedparser.py 2010-10-25 20:07:02.000000000 +0200
+@@ -6,13 +6,12 @@
+ Visit http://feedparser.org/ for the latest version
+ Visit http://feedparser.org/docs/ for the latest documentation
+
+-Required: Python 2.1 or later
+-Recommended: Python 2.3 or later
+-Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
++Required: Python 3.0 or later
++Recommended: Python 3.1 or later
+ """
+
+-__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
+-__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
++__version__ = "4.2-pre-" + "$Revision: 315 $"[11:14] + "-svn"
++__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+@@ -39,7 +38,10 @@
+ "John Beimler <http://john.beimler.org/>",
+ "Fazal Majid <http://www.majid.info/mylos/weblog/>",
+ "Aaron Swartz <http://aaronsw.com/>",
+- "Kevin Marks <http://epeus.blogspot.com/>"]
++ "Kevin Marks <http://epeus.blogspot.com/>",
++ "Sam Ruby <http://intertwingly.net/>",
++ "Ade Oshineye <http://blog.oshineye.com/>",
++ "Puzzlet Chung <http://puzzlet.org/>"]
+ _debug = 0
+
+ # HTTP "User-Agent" header to send to servers when downloading feeds.
+@@ -65,12 +67,18 @@
+ # if TIDY_MARKUP = 1
+ PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
+
++# If you want feedparser to automatically resolve all relative URIs, set this
++# to 1.
++RESOLVE_RELATIVE_URIS = 1
++
++# If you want feedparser to automatically sanitize all potentially unsafe
++# HTML content, set this to 1.
++SANITIZE_HTML = 1
++
+ # ---------- required modules (should come with any Python distribution) ----------
+-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
+-try:
+- from cStringIO import StringIO as _StringIO
+-except:
+- from StringIO import StringIO as _StringIO
++import html.parser, re, sys, copy, time, email, types, cgi, urllib, urllib.request, urllib.error, urllib.parse
++from io import StringIO as _StringIO
++from io import BytesIO
+
+ # ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
+
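Not part of the patch: the rewritten import block above maps the Python 2 modules onto their Python 3 homes (urlparse and urllib2 into urllib.parse/urllib.request, cStringIO into io, rfc822 into email). Python 3's io module splits text from bytes, which is why the patch pulls in both StringIO and BytesIO; a minimal sketch of the distinction:

    from io import StringIO, BytesIO

    StringIO('text buffer').read()   # str in, str out
    BytesIO(b'byte buffer').read()   # bytes in, bytes out
    # StringIO(b'bytes') raises TypeError: in Python 3, text and bytes
    # are distinct types, unlike the old cStringIO.StringIO.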
+@@ -95,28 +103,16 @@
+ _XML_AVAILABLE = 1
+ except:
+ _XML_AVAILABLE = 0
+- def _xmlescape(data):
++ def _xmlescape(data,entities={}):
+ data = data.replace('&', '&amp;')
+ data = data.replace('>', '&gt;')
+ data = data.replace('<', '&lt;')
++ for char, entity in entities.items():
++ data = data.replace(char, entity)
+ return data
+
+ # base64 support for Atom feeds that contain embedded binary data
+-try:
+- import base64, binascii
+-except:
+- base64 = binascii = None
+-
+-# cjkcodecs and iconv_codec provide support for more character encodings.
+-# Both are available from http://cjkpython.i18n.org/
+-try:
+- import cjkcodecs.aliases
+-except:
+- pass
+-try:
+- import iconv_codec
+-except:
+- pass
++import base64, binascii
+
+ # chardet library auto-detects character encodings
+ # Download from http://chardet.feedparser.org/
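The fallback _xmlescape above is only defined when xml.sax is unavailable; its signature now mirrors xml.sax.saxutils.escape, taking an optional map of extra entities. A quick sketch of the behaviour, assuming the function as patched:

    _xmlescape('a < b & "c"', {'"': '&quot;'})
    # -> 'a &lt; b &amp; &quot;c&quot;'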
+@@ -128,6 +124,18 @@
+ except:
+ chardet = None
+
++from html.entities import name2codepoint, codepoint2name
++
++# BeautifulSoup parser used for parsing microformats from embedded HTML content
++# http://www.crummy.com/software/BeautifulSoup/
++# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the
++# older 2.x series. If it doesn't, and you can figure out why, I'll accept a
++# patch and modify the compatibility statement accordingly.
++try:
++ import BeautifulSoup
++except:
++ BeautifulSoup = None
++
+ # ---------- don't touch these ----------
+ class ThingsNobodyCaresAboutButMe(Exception): pass
+ class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
+@@ -135,9 +143,288 @@
+ class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
+ class UndeclaredNamespace(Exception): pass
+
+-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
+-sgmllib.special = re.compile('<!')
+-sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
++incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
++ '<([a-zA-Z][^<>]*|'
++ '/([a-zA-Z][^<>]*)?|'
++ '![^<>]*)?')
++
++entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
++charref = re.compile('&#(\d+|x[0-9a-fA-F]+);')
++
++starttagopen = re.compile('<[>a-zA-Z]')
++shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
++shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
++tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
++attrfind = re.compile(
++ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
++ r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
++
++class EndBracketMatch:
++ endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''')
++ def search(self,string,index=0):
++ self.match = self.endbracket.match(string,index)
++ if self.match: return self
++ def start(self,n):
++ return self.match.end(n)
++endbracket = EndBracketMatch()
++
++class SGMLParser(html.parser.HTMLParser):
++ # Definition of entities -- derived classes may override
++ entity_or_charref = re.compile('&(?:'
++ '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
++ ')(;?)')
++
++ def __init__(self, verbose=0):
++ """Initialize and reset this instance."""
++ html.parser.HTMLParser.__init__(self)
++ self.verbose = verbose
++ self.reset()
++
++ def reset(self):
++ """Reset this instance. Loses all unprocessed data."""
++ self.__starttag_text = None
++ self.stack = []
++ self.nomoretags = 0
++ html.parser.HTMLParser.reset(self)
++
++ # Internal -- handle data as far as reasonable. May leave state
++ # and data to be processed by a subsequent call. If 'end' is
++ # true, force handling all data as if followed by EOF marker.
++ def goahead(self, end):
++ rawdata = self.rawdata
++ i = 0
++ n = len(rawdata)
++ while i < n:
++ if self.nomoretags:
++ self.handle_data(rawdata[i:n])
++ i = n
++ break
++ match = self.interesting.search(rawdata, i) # < or &
++ if match:
++ j = match.start()
++ else:
++ j = n
++ if i < j: self.handle_data(rawdata[i:j])
++ i = self.updatepos(i, j)
++ if i == n: break
++ startswith = rawdata.startswith
++ if startswith('<', i):
++ if starttagopen.match(rawdata, i): # < + letter
++ k = self.parse_starttag(i)
++ elif startswith("</", i):
++ k = self.parse_endtag(i)
++ elif startswith("<!--", i):
++ k = self.parse_comment(i)
++ elif startswith("<?", i):
++ k = self.parse_pi(i)
++ elif startswith("<!", i):
++ k = self.parse_declaration(i)
++ elif (i + 1) < n:
++ self.handle_data("<")
++ k = i + 1
++ else:
++ break
++ if k < 0:
++ if end:
++ pass #self.error("EOF in middle of construct")
++ break
++ i = self.updatepos(i, k)
++ continue
++ elif startswith('&', i):
++ match = charref.match(rawdata, i)
++ if match:
++ name = match.group(1)
++ self.handle_charref(name)
++ k = match.end()
++ if not startswith(';', k-1):
++ k = k - 1
++ i = self.updatepos(i, k)
++ continue
++ match = entityref.match(rawdata, i)
++ if match:
++ name = match.group(1)
++ self.handle_entityref(name)
++ k = match.end()
++ if not startswith(';', k-1):
++ k = k - 1
++ i = self.updatepos(i, k)
++ continue
++ else:
++ self.error('neither < nor & ??')
++ # We get here only if incomplete matches but
++ # nothing else
++ match = incomplete.match(rawdata, i)
++ if not match:
++ self.handle_data(rawdata[i])
++ i = i+1
++ continue
++ j = match.end(0)
++ if j == n:
++ break # Really incomplete
++ self.handle_data(rawdata[i:j])
++ i = j
++ # end while
++ if end and i < n:
++ self.handle_data(rawdata[i:n])
++ i = self.updatepos(i, n)
++ self.rawdata = rawdata[i:]
++ # XXX if end: check for empty stack
++
++ # Internal -- handle starttag, return length or -1 if not terminated
++ def parse_starttag(self, i):
++ self.__starttag_text = None
++ start_pos = i
++ rawdata = self.rawdata
++ if shorttagopen.match(rawdata, i):
++ # SGML shorthand: <tag/data/ == <tag>data</tag>
++ # XXX Can data contain &... (entity or char refs)?
++ # XXX Can data contain < or > (tag characters)?
++ # XXX Can there be whitespace before the first /?
++ match = shorttag.match(rawdata, i)
++ if not match:
++ return -1
++ tag, data = match.group(1, 2)
++ self.__starttag_text = '<%s/' % tag
++ tag = tag.lower()
++ k = match.end(0)
++ self.finish_shorttag(tag, data)
++ self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
++ return k
++ # XXX The following should skip matching quotes (' or ")
++ # As a shortcut way to exit, this isn't so bad, but shouldn't
++ # be used to locate the actual end of the start tag since the
++ # < or > characters may be embedded in an attribute value.
++ match = endbracket.search(rawdata, i+1)
++ if not match:
++ return -1
++ j = match.start(0)
++ # Now parse the data between i+1 and j into a tag and attrs
++ attrs = []
++ if rawdata[i:i+2] == '<>':
++ # SGML shorthand: <> == <last open tag seen>
++ k = j
++ tag = self.lasttag
++ else:
++ match = tagfind.match(rawdata, i+1)
++ if not match:
++ self.error('unexpected call to parse_starttag')
++ k = match.end(0)
++ tag = rawdata[i+1:k].lower()
++ self.lasttag = tag
++ while k < j:
++ match = attrfind.match(rawdata, k)
++ if not match:
++ break
++ attrname, rest, attrvalue = match.group(1, 2, 3)
++ if not rest:
++ attrvalue = attrname
++ elif attrvalue[:1] == "'" == attrvalue[-1:] or \
++ attrvalue[:1] == '"' == attrvalue[-1:]:
++ attrvalue = attrvalue[1:-1]
++ attrvalue = self.entity_or_charref.sub(self._convert_ref, attrvalue)
++ attrs.append((attrname.lower(), attrvalue))
++ k = match.end(0)
++ if rawdata[j] == '>':
++ j = j+1
++ self.__starttag_text = rawdata[start_pos:j]
++ self.finish_starttag(tag, attrs)
++ return j
++
++ # Internal -- convert entity or character reference
++ def _convert_ref(self, match):
++ if match.group(2):
++ return self.convert_charref(match.group(2)) or \
++ '&#%s%s' % match.groups()[1:]
++ elif match.group(3):
++ return self.convert_entityref(match.group(1)) or \
++ '&%s;' % match.group(1)
++ else:
++ return '&%s' % match.group(1)
++
++ # Internal -- parse endtag
++ def parse_endtag(self, i):
++ rawdata = self.rawdata
++ match = endbracket.search(rawdata, i+1)
++ if not match:
++ return -1
++ j = match.start(0)
++ tag = rawdata[i+2:j].strip().lower()
++ if rawdata[j] == '>':
++ j = j+1
++ self.finish_endtag(tag)
++ return j
++
++ # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
++ def finish_shorttag(self, tag, data):
++ self.finish_starttag(tag, [])
++ self.handle_data(data)
++ self.finish_endtag(tag)
++
++ # Internal -- finish processing of start tag
++ # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
++ def finish_starttag(self, tag, attrs):
++ method = getattr(self, 'start_' + tag, None)
++ if method:
++ self.stack.append(tag)
++ method(tag, attrs)
++ return 1
++ method = getattr(self, 'do_' + tag, None)
++ if method:
++ method(tag, attrs)
++ return 0
++ self.unknown_starttag(tag, attrs)
++ return -1
++
++ # Internal -- finish processing of end tag
++ def finish_endtag(self, tag):
++ if not tag:
++ found = len(self.stack) - 1
++ if found < 0:
++ self.unknown_endtag(tag)
++ return
++ else:
++ if tag not in self.stack:
++ if getattr(self, 'end_' + tag, None):
++ self.report_unbalanced(tag)
++ else:
++ self.unknown_endtag(tag)
++ return
++ found = len(self.stack)
++ for i in range(found):
++ if self.stack[i] == tag: found = i
++ while len(self.stack) > found:
++ tag = self.stack[-1]
++ method = getattr(self, 'end_' + tag, self.unknown_endtag)
++ method(tag)
++ del self.stack[-1]
++
++ # Example -- report an unbalanced </...> tag.
++ def report_unbalanced(self, tag):
++ if self.verbose:
++ print('*** Unbalanced </' + tag + '>')
++ print('*** Stack:', self.stack)
++
++ def convert_charref(self, name):
++ """Convert character reference, may be overridden."""
++ try:
++ n = int(name)
++ except ValueError:
++ return
++ if not 0 <= n <= 127:
++ return
++ return chr(n)
++
++ # Definition of entities -- derived classes may override
++ entitydefs = \
++ {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
++
++ def convert_entityref(self, name):
++ """Convert entity references.
++
++ As an alternative to overriding this method, one can tailor the
++ results by setting up the self.entitydefs mapping appropriately.
++ """
++ return self.entitydefs.get(name, None)
+
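The SGMLParser above re-creates the Python 2 sgmllib dispatch model (start_*/do_*/end_* methods with unknown_* fallbacks) on top of html.parser.HTMLParser, since sgmllib is gone in Python 3. A minimal usage sketch, assuming the patched module is importable; note that in this reimplementation finish_starttag calls handlers as method(tag, attrs), so start_* methods take both arguments, unlike classic sgmllib:

    from feedparser import SGMLParser  # hypothetical import of the class above

    class LinkCollector(SGMLParser):
        def reset(self):
            SGMLParser.reset(self)
            self.links = []
        def start_a(self, tag, attrs):
            # attrs arrives as a list of (name, value) pairs, quotes stripped
            href = dict(attrs).get('href')
            if href:
                self.links.append(href)
        def unknown_starttag(self, tag, attrs):
            pass
        def unknown_endtag(self, tag):
            pass

    p = LinkCollector()
    p.feed('Docs: <a href="http://feedparser.org/docs/">here</a>')
    p.close()
    print(p.links)  # ['http://feedparser.org/docs/']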
+ SUPPORTED_VERSIONS = {'': 'unknown',
+ 'rss090': 'RSS 0.90',
+@@ -158,16 +445,7 @@
+ 'hotrss': 'Hot RSS'
+ }
+
+-try:
+- UserDict = dict
+-except NameError:
+- # Python 2.1 does not have dict
+- from UserDict import UserDict
+- def dict(aList):
+- rc = {}
+- for k, v in aList:
+- rc[k] = v
+- return rc
++UserDict = dict
+
+ class FeedParserDict(UserDict):
+ keymap = {'channel': 'feed',
+@@ -188,14 +466,21 @@
+ def __getitem__(self, key):
+ if key == 'category':
+ return UserDict.__getitem__(self, 'tags')[0]['term']
++ if key == 'enclosures':
++ norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel'])
++ return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure']
++ if key == 'license':
++ for link in UserDict.__getitem__(self, 'links'):
++ if link['rel']=='license' and 'href' in link:
++ return link['href']
+ if key == 'categories':
+ return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
+ realkey = self.keymap.get(key, key)
+- if type(realkey) == types.ListType:
++ if isinstance(realkey, list):
+ for k in realkey:
+- if UserDict.has_key(self, k):
++ if super().__contains__(k):
+ return UserDict.__getitem__(self, k)
+- if UserDict.has_key(self, key):
++ if super().__contains__(key):
+ return UserDict.__getitem__(self, key)
+ return UserDict.__getitem__(self, realkey)
+
+@@ -203,24 +488,24 @@
+ for k in self.keymap.keys():
+ if key == k:
+ key = self.keymap[k]
+- if type(key) == types.ListType:
++ if isinstance(key, list):
+ key = key[0]
+ return UserDict.__setitem__(self, key, value)
+
+ def get(self, key, default=None):
+- if self.has_key(key):
++ if key in self:
+ return self[key]
+ else:
+ return default
+
+ def setdefault(self, key, value):
+- if not self.has_key(key):
++ if key not in self:
+ self[key] = value
+ return self[key]
+
+- def has_key(self, key):
++ def __contains__(self, key):
+ try:
+- return hasattr(self, key) or UserDict.has_key(self, key)
++ return hasattr(self, key) or UserDict.__contains__(self, key)
+ except AttributeError:
+ return False
+
+@@ -233,7 +518,7 @@
+ assert not key.startswith('_')
+ return self.__getitem__(key)
+ except:
+- raise AttributeError, "object has no attribute '%s'" % key
++ raise AttributeError("object has no attribute '%s'" % key)
+
+ def __setattr__(self, key, value):
+ if key.startswith('_') or key == 'data':
+@@ -241,9 +526,6 @@
+ else:
+ return self.__setitem__(key, value)
+
+- def __contains__(self, key):
+- return self.has_key(key)
+-
+ def zopeCompatibilityHack():
+ global FeedParserDict
+ del FeedParserDict
+@@ -275,15 +557,46 @@
+ 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
+ 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
+ )
+- import string
+- _ebcdic_to_ascii_map = string.maketrans( \
+- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
++ _ebcdic_to_ascii_map = bytes.maketrans(bytes(range(256)), bytes(emap))
+ return s.translate(_ebcdic_to_ascii_map)
++
++_cp1252 = {
++ chr(128): chr(8364), # euro sign
++ chr(130): chr(8218), # single low-9 quotation mark
++ chr(131): chr( 402), # latin small letter f with hook
++ chr(132): chr(8222), # double low-9 quotation mark
++ chr(133): chr(8230), # horizontal ellipsis
++ chr(134): chr(8224), # dagger
++ chr(135): chr(8225), # double dagger
++ chr(136): chr( 710), # modifier letter circumflex accent
++ chr(137): chr(8240), # per mille sign
++ chr(138): chr( 352), # latin capital letter s with caron
++ chr(139): chr(8249), # single left-pointing angle quotation mark
++ chr(140): chr( 338), # latin capital ligature oe
++ chr(142): chr( 381), # latin capital letter z with caron
++ chr(145): chr(8216), # left single quotation mark
++ chr(146): chr(8217), # right single quotation mark
++ chr(147): chr(8220), # left double quotation mark
++ chr(148): chr(8221), # right double quotation mark
++ chr(149): chr(8226), # bullet
++ chr(150): chr(8211), # en dash
++ chr(151): chr(8212), # em dash
++ chr(152): chr( 732), # small tilde
++ chr(153): chr(8482), # trade mark sign
++ chr(154): chr( 353), # latin small letter s with caron
++ chr(155): chr(8250), # single right-pointing angle quotation mark
++ chr(156): chr( 339), # latin small ligature oe
++ chr(158): chr( 382), # latin small letter z with caron
++ chr(159): chr( 376)} # latin capital letter y with diaeresis
+
+ _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
+ def _urljoin(base, uri):
+ uri = _urifixer.sub(r'\1\3', uri)
+- return urlparse.urljoin(base, uri)
++ try:
++ return urllib.parse.urljoin(base, uri)
++ except:
++ uri = urllib.parse.urlunparse([urllib.parse.quote(part) for part in urllib.parse.urlparse(uri)])
++ return urllib.parse.urljoin(base, uri)
+
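_urljoin now delegates to urllib.parse.urljoin, percent-quoting each component and retrying if the join raises; the _urifixer pass first collapses spurious slashes after the scheme. A standalone sketch of both, not part of the patch:

    import re
    from urllib.parse import urljoin

    _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
    _urifixer.sub(r'\1\3', 'http:///feedparser.org/docs/')
    # -> 'http://feedparser.org/docs/'
    urljoin('http://feedparser.org/docs/', 'reference.html')
    # -> 'http://feedparser.org/docs/reference.html'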
+ class _FeedParserMixin:
+ namespaces = {'': '',
+@@ -324,6 +637,8 @@
+ 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
+ 'http://purl.org/rss/1.0/modules/link/': 'l',
+ 'http://search.yahoo.com/mrss': 'media',
++ # Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace
++ 'http://search.yahoo.com/mrss/': 'media',
+ 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
+ 'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
+ 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
+@@ -337,6 +652,7 @@
+ 'http://hacks.benhammersley.com/rss/streaming/': 'str',
+ 'http://purl.org/rss/1.0/modules/subscription/': 'sub',
+ 'http://purl.org/rss/1.0/modules/syndication/': 'sy',
++ 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
+ 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
+ 'http://purl.org/rss/1.0/modules/threading/': 'thr',
+ 'http://purl.org/rss/1.0/modules/textinput/': 'ti',
+@@ -344,12 +660,12 @@
+ 'http://wellformedweb.org/commentAPI/': 'wfw',
+ 'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
+ 'http://www.w3.org/1999/xhtml': 'xhtml',
+- 'http://www.w3.org/XML/1998/namespace': 'xml',
+- 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
++ 'http://www.w3.org/1999/xlink': 'xlink',
++ 'http://www.w3.org/XML/1998/namespace': 'xml'
+ }
+ _matchnamespaces = {}
+
+- can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
++ can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo']
+ can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
+ html_types = ['text/html', 'application/xhtml+xml']
+@@ -385,8 +701,10 @@
+ self.langstack = []
+ self.baseuri = baseuri or ''
+ self.lang = baselang or None
++ self.svgOK = 0
++ self.hasTitle = 0
+ if baselang:
+- self.feeddata['language'] = baselang
++ self.feeddata['language'] = baselang.replace('_','-')
+
+ def unknown_starttag(self, tag, attrs):
+ if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
+@@ -397,6 +715,11 @@
+ # track xml:base and xml:lang
+ attrsD = dict(attrs)
+ baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
++ if not isinstance(baseuri, str):
++ try:
++ baseuri = str(baseuri, self.encoding)
++ except:
++ baseuri = str(baseuri, 'iso-8859-1')
+ self.baseuri = _urljoin(self.baseuri, baseuri)
+ lang = attrsD.get('xml:lang', attrsD.get('lang'))
+ if lang == '':
+@@ -407,7 +730,7 @@
+ lang = self.lang
+ if lang:
+ if tag in ('feed', 'rss', 'rdf:RDF'):
+- self.feeddata['language'] = lang
++ self.feeddata['language'] = lang.replace('_','-')
+ self.lang = lang
+ self.basestack.append(self.baseuri)
+ self.langstack.append(lang)
+@@ -420,23 +743,23 @@
+ self.trackNamespace(None, uri)
+
+ # track inline content
+- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
++ if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
++ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+ # element declared itself as escaped markup, but it isn't really
+ self.contentparams['type'] = 'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+- # Note: probably shouldn't simply recreate localname here, but
+- # our namespace handling isn't actually 100% correct in cases where
+- # the feed redefines the default namespace (which is actually
+- # the usual case for inline content, thanks Sam), so here we
+- # cheat and just reconstruct the element based on localname
+- # because that compensates for the bugs in our namespace handling.
+- # This will horribly munge inline content with non-empty qnames,
+- # but nobody actually does that, so I'm not fixing it.
+- tag = tag.split(':')[-1]
+- return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
++ if tag.find(':') != -1:
++ prefix, tag = tag.split(':', 1)
++ namespace = self.namespacesInUse.get(prefix, '')
++ if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
++ attrs.append(('xmlns',namespace))
++ if tag=='svg' and namespace=='http://www.w3.org/2000/svg':
++ attrs.append(('xmlns',namespace))
++ if tag == 'svg': self.svgOK += 1
++ return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0)
+
+ # match namespaces
+- if tag.find(':') <> -1:
++ if tag.find(':') != -1:
+ prefix, suffix = tag.split(':', 1)
+ else:
+ prefix, suffix = '', tag
+@@ -456,30 +779,41 @@
+ method = getattr(self, methodname)
+ return method(attrsD)
+ except AttributeError:
+- return self.push(prefix + suffix, 1)
++ # Since there's no handler, or something has gone wrong, explicitly add the element and its attributes
++ unknown_tag = prefix + suffix
++ if len(attrsD) == 0:
++ # No attributes, so merge it into the enclosing dictionary
++ return self.push(unknown_tag, 1)
++ else:
++ # Has attributes so create it in its own dictionary
++ context = self._getContext()
++ context[unknown_tag] = attrsD
+
+ def unknown_endtag(self, tag):
+ if _debug: sys.stderr.write('end %s\n' % tag)
+ # match namespaces
+- if tag.find(':') <> -1:
++ if tag.find(':') != -1:
+ prefix, suffix = tag.split(':', 1)
+ else:
+ prefix, suffix = '', tag
+ prefix = self.namespacemap.get(prefix, prefix)
+ if prefix:
+ prefix = prefix + '_'
++ if suffix == 'svg' and self.svgOK: self.svgOK -= 1
+
+ # call special handler (if defined) or default handler
+ methodname = '_end_' + prefix + suffix
+ try:
++ if self.svgOK: raise AttributeError()
+ method = getattr(self, methodname)
+ method()
+ except AttributeError:
+ self.pop(prefix + suffix)
+
+ # track inline content
+- if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
++ if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'):
+ # element declared itself as escaped markup, but it isn't really
++ if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007
+ self.contentparams['type'] = 'application/xhtml+xml'
+ if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
+ tag = tag.split(':')[-1]
+@@ -506,7 +840,7 @@
+ c = int(ref[1:], 16)
+ else:
+ c = int(ref)
+- text = unichr(c).encode('utf-8')
++ text = chr(c)
+ self.elementstack[-1][2].append(text)
+
+ def handle_entityref(self, ref):
+@@ -515,19 +849,14 @@
+ if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
+ if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
+ text = '&%s;' % ref
++ elif ref in self.entities.keys():
++ text = self.entities[ref]
++ if text.startswith('&#') and text.endswith(';'):
++ return self.handle_entityref(text)
+ else:
+- # entity resolution graciously donated by Aaron Swartz
+- def name2cp(k):
+- import htmlentitydefs
+- if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
+- return htmlentitydefs.name2codepoint[k]
+- k = htmlentitydefs.entitydefs[k]
+- if k.startswith('&#') and k.endswith(';'):
+- return int(k[2:-1]) # not in latin-1
+- return ord(k)
+- try: name2cp(ref)
++ try: name2codepoint[ref]
+ except KeyError: text = '&%s;' % ref
+- else: text = unichr(name2cp(ref)).encode('utf-8')
++ else: text = chr(name2codepoint[ref])
+ self.elementstack[-1][2].append(text)
+
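With html.entities.name2codepoint imported up front, entity resolution shrinks to a dictionary lookup plus chr(), which already yields a text character in Python 3 (no more unichr/UTF-8 round-trip). Roughly what the lookup does, as a standalone sketch:

    from html.entities import name2codepoint

    chr(name2codepoint['rsquo'])   # '\u2019', right single quotation mark
    chr(name2codepoint['eacute'])  # 'é'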
+ def handle_data(self, text, escape=1):
+@@ -554,12 +883,19 @@
+ if _debug: sys.stderr.write('entering parse_declaration\n')
+ if self.rawdata[i:i+9] == '<![CDATA[':
+ k = self.rawdata.find(']]>', i)
+- if k == -1: k = len(self.rawdata)
++ if k == -1:
++ # CDATA block began but didn't finish
++ k = len(self.rawdata)
++ return k
+ self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
+ return k+3
+ else:
+ k = self.rawdata.find('>', i)
+- return k+1
++ if k >= 0:
++ return k+1
++ else:
++ # We have an incomplete declaration (no closing '>').
++ return k
+
+ def mapContentType(self, contentType):
+ contentType = contentType.lower()
+@@ -579,11 +915,11 @@
+ self.version = 'rss10'
+ if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
+ self.version = 'atom10'
+- if loweruri.find('backend.userland.com/rss') <> -1:
++ if loweruri.find('backend.userland.com/rss') != -1:
+ # match any backend.userland.com namespace
+ uri = 'http://backend.userland.com/rss'
+ loweruri = uri
+- if self._matchnamespaces.has_key(loweruri):
++ if loweruri in self._matchnamespaces:
+ self.namespacemap[prefix] = self._matchnamespaces[loweruri]
+ self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
+ else:
+@@ -595,6 +931,9 @@
+ def decodeEntities(self, element, data):
+ return data
+
++ def strattrs(self, attrs):
++ return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'&quot;'})) for t in attrs])
++
+ def push(self, element, expectingText):
+ self.elementstack.append([element, expectingText, []])
+
+@@ -603,6 +942,28 @@
+ if self.elementstack[-1][0] != element: return
+
+ element, expectingText, pieces = self.elementstack.pop()
++
++ if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml':
++ # remove enclosing child element, but only if it is a <div> and
++ # only if all the remaining content is nested underneath it.
++ # This means that the divs would be retained in the following:
++ # <div>foo</div><div>bar</div>
++ while pieces and len(pieces)>1 and not pieces[-1].strip():
++ del pieces[-1]
++ while pieces and len(pieces)>1 and not pieces[0].strip():
++ del pieces[0]
++ if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>':
++ depth = 0
++ for piece in pieces[:-1]:
++ if piece.startswith('</'):
++ depth -= 1
++ if depth == 0: break
++ elif piece.startswith('<') and not piece.endswith('/>'):
++ depth += 1
++ else:
++ pieces = pieces[1:-1]
++
++ pieces = [s if isinstance(s, str) else s.decode(self.encoding) for s in pieces]
+ output = ''.join(pieces)
+ if stripWhitespace:
+ output = output.strip()
+@@ -611,7 +972,7 @@
+ # decode base64 content
+ if base64 and self.contentparams.get('base64', 0):
+ try:
+- output = base64.decodestring(output)
++ output = base64.decodebytes(output.encode(self.encoding)).decode(self.encoding)
+ except binascii.Error:
+ pass
+ except binascii.Incomplete:
+@@ -625,6 +986,9 @@
+ if not self.contentparams.get('base64', 0):
+ output = self.decodeEntities(element, output)
+
++ if self.lookslikehtml(output):
++ self.contentparams['type']='text/html'
++
+ # remove temporary cruft from contentparams
+ try:
+ del self.contentparams['mode']
+@@ -635,25 +999,57 @@
+ except KeyError:
+ pass
+
++ is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types
+ # resolve relative URIs within embedded markup
+- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
++ if is_htmlish and RESOLVE_RELATIVE_URIS:
+ if element in self.can_contain_relative_uris:
+- output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
++ output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html'))
++
++ # parse microformats
++ # (must do this before sanitizing because some microformats
++ # rely on elements that we sanitize)
++ if is_htmlish and element in ['content', 'description', 'summary']:
++ mfresults = _parseMicroformats(output, self.baseuri, self.encoding)
++ if mfresults:
++ for tag in mfresults.get('tags', []):
++ self._addTag(tag['term'], tag['scheme'], tag['label'])
++ for enclosure in mfresults.get('enclosures', []):
++ self._start_enclosure(enclosure)
++ for xfn in mfresults.get('xfn', []):
++ self._addXFN(xfn['relationships'], xfn['href'], xfn['name'])
++ vcard = mfresults.get('vcard')
++ if vcard:
++ self._getContext()['vcard'] = vcard
+
+ # sanitize embedded markup
+- if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
++ if is_htmlish and SANITIZE_HTML:
+ if element in self.can_contain_dangerous_markup:
+- output = _sanitizeHTML(output, self.encoding)
++ output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html'))
+
+- if self.encoding and type(output) != type(u''):
++ if self.encoding and not isinstance(output, str):
+ try:
+- output = unicode(output, self.encoding)
++ output = str(output, self.encoding)
+ except:
+ pass
+
++ # address common error where people take data that is already
++ # utf-8, presume that it is iso-8859-1, and re-encode it.
++ if self.encoding=='utf-8' and isinstance(output, str):
++ try:
++ output = str(output.encode('iso-8859-1'), 'utf-8')
++ except:
++ pass
++
++ # map win-1252 extensions to the proper code points
++ if isinstance(output, str):
++ output = ''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output])
++
+ # categories/tags/keywords/whatever are handled in _end_category
+ if element == 'category':
+ return output
++
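The two clean-up passes above target common publisher mistakes: UTF-8 text that was wrongly decoded as ISO-8859-1 and re-encoded, and stray Windows-1252 code points in nominally Latin-1 content. A standalone sketch of both repairs, assuming the _cp1252 map defined earlier in the patch:

    # mojibake repair: UTF-8 bytes mistakenly decoded as ISO-8859-1
    garbled = 'café'.encode('utf-8').decode('iso-8859-1')  # 'cafÃ©'
    garbled.encode('iso-8859-1').decode('utf-8')           # 'café'

    # cp1252 repair: chr(146) is a Windows-1252 right single quote
    ''.join(_cp1252.get(c, c) for c in 'it\x92s')          # 'it\u2019s'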