diff options
| author | Jon Bergli Heier <snakebite@jvnv.net> | 2010-10-25 20:26:07 +0200 | 
|---|---|---|
| committer | Jon Bergli Heier <snakebite@jvnv.net> | 2010-10-25 20:26:07 +0200 | 
| commit | 6403bece675d170ace9a1715106f27d3bf8e8412 (patch) | |
| tree | eeba36e03aeea99a61fbde667620cfa1d1dfcf84 | |
| parent | 8be4ecef3d7f083ec34c2bb6344d9bafb06cc61c (diff) | |
Added python3-feedparser.
| -rw-r--r-- | python3-feedparser/PKGBUILD | 28 | ||||
| -rw-r--r-- | python3-feedparser/python3.patch | 3359 | 
2 files changed, 3387 insertions, 0 deletions
| diff --git a/python3-feedparser/PKGBUILD b/python3-feedparser/PKGBUILD new file mode 100644 index 0000000..9b82e2e --- /dev/null +++ b/python3-feedparser/PKGBUILD @@ -0,0 +1,28 @@ +# Based on the offical PKGBUILD for python-feedparser. +# Contributor: Jon Bergli Heier <snakebite@jvnv.net> + +pkgname=python3-feedparser +pkgver=4.2pre315 +_pkgver=4.1 +pkgrel=1 +pkgdesc="Universal Feed Parser for Python 3" +arch=('any') +url="http://bitbucket.org/puzzlet/feedparser-py3/" +license=('custom') +depends=('python' 'libxml2' ) +source=(http://downloads.sourceforge.net/feedparser/feedparser-${_pkgver}.zip python3.patch) +md5sums=('7ab1140c1e29d4cd52ab20fa7b1f8640' +         '29fe3762cd3e2a97427001844ef2e772') + +build() { +  cd ${srcdir} +  patch -p0 -i python3.patch +} + +package() { +  cd ${srcdir} +  python setup.py install --root=${pkgdir} +  install -Dm644 LICENSE ${pkgdir}/usr/share/licenses/${pkgname}/license +} + +# vim:set ts=2 sw=2 et: diff --git a/python3-feedparser/python3.patch b/python3-feedparser/python3.patch new file mode 100644 index 0000000..5af9afc --- /dev/null +++ b/python3-feedparser/python3.patch @@ -0,0 +1,3359 @@ +--- feedparser.py	2010-10-25 20:07:40.000000000 +0200 ++++ python3-feedparser.py	2010-10-25 20:07:02.000000000 +0200 +@@ -6,13 +6,12 @@ + Visit http://feedparser.org/ for the latest version + Visit http://feedparser.org/docs/ for the latest documentation +  +-Required: Python 2.1 or later +-Recommended: Python 2.3 or later +-Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> ++Required: Python 3.0 or later ++Recommended: Python 3.1 or later + """ +  +-__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" +-__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. ++__version__ = "4.2-pre-" + "$Revision: 315 $"[11:14] + "-svn" ++__license__ = """Copyright (c) 2002-2008, Mark Pilgrim, All rights reserved. +  + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: +@@ -39,7 +38,10 @@ +                     "John Beimler <http://john.beimler.org/>", +                     "Fazal Majid <http://www.majid.info/mylos/weblog/>", +                     "Aaron Swartz <http://aaronsw.com/>", +-                    "Kevin Marks <http://epeus.blogspot.com/>"] ++                    "Kevin Marks <http://epeus.blogspot.com/>", ++                    "Sam Ruby <http://intertwingly.net/>", ++                    "Ade Oshineye <http://blog.oshineye.com/>", ++                    "Puzzlet Chung <http://puzzlet.org/>"] + _debug = 0 +  + # HTTP "User-Agent" header to send to servers when downloading feeds. +@@ -65,12 +67,18 @@ + # if TIDY_MARKUP = 1 + PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"] +  ++# If you want feedparser to automatically resolve all relative URIs, set this ++# to 1. ++RESOLVE_RELATIVE_URIS = 1 ++ ++# If you want feedparser to automatically sanitize all potentially unsafe ++# HTML content, set this to 1. ++SANITIZE_HTML = 1 ++ + # ---------- required modules (should come with any Python distribution) ---------- +-import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 +-try: +-    from cStringIO import StringIO as _StringIO +-except: +-    from StringIO import StringIO as _StringIO ++import html.parser, re, sys, copy, time, email, types, cgi, urllib, urllib.request, urllib.error, urllib.parse ++from io import StringIO as _StringIO ++from io import BytesIO +  + # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- +  +@@ -95,28 +103,16 @@ +     _XML_AVAILABLE = 1 + except: +     _XML_AVAILABLE = 0 +-    def _xmlescape(data): ++    def _xmlescape(data,entities={}): +         data = data.replace('&', '&') +         data = data.replace('>', '>') +         data = data.replace('<', '<') ++        for char, entity in entities: ++            data = data.replace(char, entity) +         return data +  + # base64 support for Atom feeds that contain embedded binary data +-try: +-    import base64, binascii +-except: +-    base64 = binascii = None +- +-# cjkcodecs and iconv_codec provide support for more character encodings. +-# Both are available from http://cjkpython.i18n.org/ +-try: +-    import cjkcodecs.aliases +-except: +-    pass +-try: +-    import iconv_codec +-except: +-    pass ++import base64, binascii +  + # chardet library auto-detects character encodings + # Download from http://chardet.feedparser.org/ +@@ -128,6 +124,18 @@ + except: +     chardet = None +  ++from html.entities import name2codepoint, codepoint2name ++ ++# BeautifulSoup parser used for parsing microformats from embedded HTML content ++# http://www.crummy.com/software/BeautifulSoup/ ++# feedparser is tested with BeautifulSoup 3.0.x, but it might work with the ++# older 2.x series.  If it doesn't, and you can figure out why, I'll accept a ++# patch and modify the compatibility statement accordingly. ++try: ++    import BeautifulSoup ++except: ++    BeautifulSoup = None ++ + # ---------- don't touch these ---------- + class ThingsNobodyCaresAboutButMe(Exception): pass + class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass +@@ -135,9 +143,288 @@ + class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass + class UndeclaredNamespace(Exception): pass +  +-sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') +-sgmllib.special = re.compile('<!') +-sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]') ++incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' ++                           '<([a-zA-Z][^<>]*|' ++                              '/([a-zA-Z][^<>]*)?|' ++                              '![^<>]*)?') ++ ++entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') ++charref = re.compile('&#(\d+|x[0-9a-fA-F]+);') ++ ++starttagopen = re.compile('<[>a-zA-Z]') ++shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') ++shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') ++tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') ++attrfind = re.compile( ++    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' ++    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') ++ ++class EndBracketMatch: ++    endbracket = re.compile('''([^'"<>]|"[^"]*"(?=>|/|\s|\w+=)|'[^']*'(?=>|/|\s|\w+=))*(?=[<>])|.*?(?=[<>])''') ++    def search(self,string,index=0): ++        self.match = self.endbracket.match(string,index) ++        if self.match: return self ++    def start(self,n): ++        return self.match.end(n) ++endbracket = EndBracketMatch() ++ ++class SGMLParser(html.parser.HTMLParser): ++    # Definition of entities -- derived classes may override ++    entity_or_charref = re.compile('&(?:' ++      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)' ++      ')(;?)') ++ ++    def __init__(self, verbose=0): ++        """Initialize and reset this instance.""" ++        html.parser.HTMLParser.__init__(self) ++        self.verbose = verbose ++        self.reset() ++ ++    def reset(self): ++        """Reset this instance. Loses all unprocessed data.""" ++        self.__starttag_text = None ++        self.stack = [] ++        self.nomoretags = 0 ++        html.parser.HTMLParser.reset(self) ++ ++    # Internal -- handle data as far as reasonable.  May leave state ++    # and data to be processed by a subsequent call.  If 'end' is ++    # true, force handling all data as if followed by EOF marker. ++    def goahead(self, end): ++        rawdata = self.rawdata ++        i = 0 ++        n = len(rawdata) ++        while i < n: ++            if self.nomoretags: ++                self.handle_data(rawdata[i:n]) ++                i = n ++                break ++            match = self.interesting.search(rawdata, i) # < or & ++            if match: ++                j = match.start() ++            else: ++                j = n ++            if i < j: self.handle_data(rawdata[i:j]) ++            i = self.updatepos(i, j) ++            if i == n: break ++            startswith = rawdata.startswith ++            if startswith('<', i): ++                if starttagopen.match(rawdata, i): # < + letter ++                    k = self.parse_starttag(i) ++                elif startswith("</", i): ++                    k = self.parse_endtag(i) ++                elif startswith("<!--", i): ++                    k = self.parse_comment(i) ++                elif startswith("<?", i): ++                    k = self.parse_pi(i) ++                elif startswith("<!", i): ++                    k = self.parse_declaration(i) ++                elif (i + 1) < n: ++                    self.handle_data("<") ++                    k = i + 1 ++                else: ++                    break ++                if k < 0: ++                    if end: ++                        pass #self.error("EOF in middle of construct") ++                    break ++                i = self.updatepos(i, k) ++                continue ++            elif startswith('&', i): ++                match = charref.match(rawdata, i) ++                if match: ++                    name = match.group(1) ++                    self.handle_charref(name) ++                    k = match.end() ++                    if not startswith(';', k-1): ++                        k = k - 1 ++                    i = self.updatepos(i, k) ++                    continue ++                match = entityref.match(rawdata, i) ++                if match: ++                    name = match.group(1) ++                    self.handle_entityref(name) ++                    k = match.end() ++                    if not startswith(';', k-1): ++                        k = k - 1 ++                    i = self.updatepos(i, k) ++                    continue ++            else: ++                self.error('neither < nor & ??') ++            # We get here only if incomplete matches but ++            # nothing else ++            match = incomplete.match(rawdata, i) ++            if not match: ++                self.handle_data(rawdata[i]) ++                i = i+1 ++                continue ++            j = match.end(0) ++            if j == n: ++                break # Really incomplete ++            self.handle_data(rawdata[i:j]) ++            i = j ++        # end while ++        if end and i < n: ++            self.handle_data(rawdata[i:n]) ++            i = self.updatepos(i, n) ++        self.rawdata = rawdata[i:] ++        # XXX if end: check for empty stack ++ ++    # Internal -- handle starttag, return length or -1 if not terminated ++    def parse_starttag(self, i): ++        self.__starttag_text = None ++        start_pos = i ++        rawdata = self.rawdata ++        if shorttagopen.match(rawdata, i): ++            # SGML shorthand: <tag/data/ == <tag>data</tag> ++            # XXX Can data contain &... (entity or char refs)? ++            # XXX Can data contain < or > (tag characters)? ++            # XXX Can there be whitespace before the first /? ++            match = shorttag.match(rawdata, i) ++            if not match: ++                return -1 ++            tag, data = match.group(1, 2) ++            self.__starttag_text = '<%s/' % tag ++            tag = tag.lower() ++            k = match.end(0) ++            self.finish_shorttag(tag, data) ++            self.__starttag_text = rawdata[start_pos:match.end(1) + 1] ++            return k ++        # XXX The following should skip matching quotes (' or ") ++        # As a shortcut way to exit, this isn't so bad, but shouldn't ++        # be used to locate the actual end of the start tag since the ++        # < or > characters may be embedded in an attribute value. ++        match = endbracket.search(rawdata, i+1) ++        if not match: ++            return -1 ++        j = match.start(0) ++        # Now parse the data between i+1 and j into a tag and attrs ++        attrs = [] ++        if rawdata[i:i+2] == '<>': ++            # SGML shorthand: <> == <last open tag seen> ++            k = j ++            tag = self.lasttag ++        else: ++            match = tagfind.match(rawdata, i+1) ++            if not match: ++                self.error('unexpected call to parse_starttag') ++            k = match.end(0) ++            tag = rawdata[i+1:k].lower() ++            self.lasttag = tag ++        while k < j: ++            match = attrfind.match(rawdata, k) ++            if not match: ++                break ++            attrname, rest, attrvalue = match.group(1, 2, 3) ++            if not rest: ++                attrvalue = attrname ++            elif attrvalue[:1] == "'" == attrvalue[-1:] or \ ++                 attrvalue[:1] == '"' == attrvalue[-1:]: ++                attrvalue = attrvalue[1:-1] ++                attrvalue = self.entity_or_charref.sub(self._convert_ref, attrvalue) ++            attrs.append((attrname.lower(), attrvalue)) ++            k = match.end(0) ++        if rawdata[j] == '>': ++            j = j+1 ++        self.__starttag_text = rawdata[start_pos:j] ++        self.finish_starttag(tag, attrs) ++        return j ++ ++    # Internal -- convert entity or character reference ++    def _convert_ref(self, match): ++        if match.group(2): ++            return self.convert_charref(match.group(2)) or \ ++                '&#%s%s' % match.groups()[1:] ++        elif match.group(3): ++            return self.convert_entityref(match.group(1)) or \ ++                '&%s;' % match.group(1) ++        else: ++            return '&%s' % match.group(1) ++ ++    # Internal -- parse endtag ++    def parse_endtag(self, i): ++        rawdata = self.rawdata ++        match = endbracket.search(rawdata, i+1) ++        if not match: ++            return -1 ++        j = match.start(0) ++        tag = rawdata[i+2:j].strip().lower() ++        if rawdata[j] == '>': ++            j = j+1 ++        self.finish_endtag(tag) ++        return j ++ ++    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>) ++    def finish_shorttag(self, tag, data): ++        self.finish_starttag(tag, []) ++        self.handle_data(data) ++        self.finish_endtag(tag) ++ ++    # Internal -- finish processing of start tag ++    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag ++    def finish_starttag(self, tag, attrs): ++        method = getattr(self, 'start_' + tag, None) ++        if method: ++            self.stack.append(tag) ++            method(tag, attrs) ++            return 1 ++        method = getattr(self, 'do_' + tag, None) ++        if method: ++            method(tag, attrs) ++            return 0 ++        self.unknown_starttag(tag, attrs) ++        return -1 ++ ++    # Internal -- finish processing of end tag ++    def finish_endtag(self, tag): ++        if not tag: ++            found = len(self.stack) - 1 ++            if found < 0: ++                self.unknown_endtag(tag) ++                return ++        else: ++            if tag not in self.stack: ++                if getattr(self, 'end_' + tag, None): ++                    self.report_unbalanced(tag) ++                else: ++                    self.unknown_endtag(tag) ++                return ++            found = len(self.stack) ++            for i in range(found): ++                if self.stack[i] == tag: found = i ++        while len(self.stack) > found: ++            tag = self.stack[-1] ++            method = getattr(self, 'end_' + tag, self.unknown_endtag) ++            method(tag) ++            del self.stack[-1] ++ ++    # Example -- report an unbalanced </...> tag. ++    def report_unbalanced(self, tag): ++        if self.verbose: ++            print('*** Unbalanced </' + tag + '>') ++            print('*** Stack:', self.stack) ++ ++    def convert_charref(self, name): ++        """Convert character reference, may be overridden.""" ++        try: ++            n = int(name) ++        except ValueError: ++            return ++        if not 0 <= n <= 127: ++            return ++        return chr(n) ++ ++    # Definition of entities -- derived classes may override ++    entitydefs = \ ++            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} ++ ++    def convert_entityref(self, name): ++        """Convert entity references. ++ ++        As an alternative to overriding this method; one can tailor the ++        results by setting up the self.entitydefs mapping appropriately. ++        """ ++        return self.entitydefs.get(name, None) +  + SUPPORTED_VERSIONS = {'': 'unknown', +                       'rss090': 'RSS 0.90', +@@ -158,16 +445,7 @@ +                       'hotrss': 'Hot RSS' +                       } +  +-try: +-    UserDict = dict +-except NameError: +-    # Python 2.1 does not have dict +-    from UserDict import UserDict +-    def dict(aList): +-        rc = {} +-        for k, v in aList: +-            rc[k] = v +-        return rc ++UserDict = dict +  + class FeedParserDict(UserDict): +     keymap = {'channel': 'feed', +@@ -188,14 +466,21 @@ +     def __getitem__(self, key): +         if key == 'category': +             return UserDict.__getitem__(self, 'tags')[0]['term'] ++        if key == 'enclosures': ++            norel = lambda link: FeedParserDict([(name,value) for (name,value) in link.items() if name!='rel']) ++            return [norel(link) for link in UserDict.__getitem__(self, 'links') if link['rel']=='enclosure'] ++        if key == 'license': ++            for link in UserDict.__getitem__(self, 'links'): ++                if link['rel']=='license' and 'href' in link: ++                    return link['href'] +         if key == 'categories': +             return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')] +         realkey = self.keymap.get(key, key) +-        if type(realkey) == types.ListType: ++        if isinstance(realkey, list): +             for k in realkey: +-                if UserDict.has_key(self, k): ++                if super().__contains__(k): +                     return UserDict.__getitem__(self, k) +-        if UserDict.has_key(self, key): ++        if super().__contains__(key): +             return UserDict.__getitem__(self, key) +         return UserDict.__getitem__(self, realkey) +  +@@ -203,24 +488,24 @@ +         for k in self.keymap.keys(): +             if key == k: +                 key = self.keymap[k] +-                if type(key) == types.ListType: ++                if isinstance(key, list): +                     key = key[0] +         return UserDict.__setitem__(self, key, value) +  +     def get(self, key, default=None): +-        if self.has_key(key): ++        if key in self: +             return self[key] +         else: +             return default +  +     def setdefault(self, key, value): +-        if not self.has_key(key): ++        if key not in self: +             self[key] = value +         return self[key] +          +-    def has_key(self, key): ++    def __contains__(self, key): +         try: +-            return hasattr(self, key) or UserDict.has_key(self, key) ++            return hasattr(self, key) or UserDict.__contains__(self, key) +         except AttributeError: +             return False +          +@@ -233,7 +518,7 @@ +             assert not key.startswith('_') +             return self.__getitem__(key) +         except: +-            raise AttributeError, "object has no attribute '%s'" % key ++            raise AttributeError("object has no attribute '%s'" % key) +  +     def __setattr__(self, key, value): +         if key.startswith('_') or key == 'data': +@@ -241,9 +526,6 @@ +         else: +             return self.__setitem__(key, value) +  +-    def __contains__(self, key): +-        return self.has_key(key) +- + def zopeCompatibilityHack(): +     global FeedParserDict +     del FeedParserDict +@@ -275,15 +557,46 @@ +             92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, +             48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 +             ) +-        import string +-        _ebcdic_to_ascii_map = string.maketrans( \ +-            ''.join(map(chr, range(256))), ''.join(map(chr, emap))) ++        _ebcdic_to_ascii_map = bytes.maketrans(bytes(range(256)), bytes(emap)) +     return s.translate(_ebcdic_to_ascii_map) ++  ++_cp1252 = { ++  chr(128): chr(8364), # euro sign ++  chr(130): chr(8218), # single low-9 quotation mark ++  chr(131): chr( 402), # latin small letter f with hook ++  chr(132): chr(8222), # double low-9 quotation mark ++  chr(133): chr(8230), # horizontal ellipsis ++  chr(134): chr(8224), # dagger ++  chr(135): chr(8225), # double dagger ++  chr(136): chr( 710), # modifier letter circumflex accent ++  chr(137): chr(8240), # per mille sign ++  chr(138): chr( 352), # latin capital letter s with caron ++  chr(139): chr(8249), # single left-pointing angle quotation mark ++  chr(140): chr( 338), # latin capital ligature oe ++  chr(142): chr( 381), # latin capital letter z with caron ++  chr(145): chr(8216), # left single quotation mark ++  chr(146): chr(8217), # right single quotation mark ++  chr(147): chr(8220), # left double quotation mark ++  chr(148): chr(8221), # right double quotation mark ++  chr(149): chr(8226), # bullet ++  chr(150): chr(8211), # en dash ++  chr(151): chr(8212), # em dash ++  chr(152): chr( 732), # small tilde ++  chr(153): chr(8482), # trade mark sign ++  chr(154): chr( 353), # latin small letter s with caron ++  chr(155): chr(8250), # single right-pointing angle quotation mark ++  chr(156): chr( 339), # latin small ligature oe ++  chr(158): chr( 382), # latin small letter z with caron ++  chr(159): chr( 376)} # latin capital letter y with diaeresis +  + _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') + def _urljoin(base, uri): +     uri = _urifixer.sub(r'\1\3', uri) +-    return urlparse.urljoin(base, uri) ++    try: ++        return urllib.parse.urljoin(base, uri) ++    except: ++        uri = urllib.parse.urlunparse([urllib.quote(part) for part in urllib.parse.urlparse(uri)]) ++        return urllib.parse.urljoin(base, uri) +  + class _FeedParserMixin: +     namespaces = {'': '', +@@ -324,6 +637,8 @@ +                   'http://example.com/DTDs/PodCast-1.0.dtd':              'itunes', +                   'http://purl.org/rss/1.0/modules/link/':                'l', +                   'http://search.yahoo.com/mrss':                         'media', ++                  #Version 1.1.2 of the Media RSS spec added the trailing slash on the namespace ++                  'http://search.yahoo.com/mrss/':                         'media', +                   'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', +                   'http://prismstandard.org/namespaces/1.2/basic/':       'prism', +                   'http://www.w3.org/1999/02/22-rdf-syntax-ns#':          'rdf', +@@ -337,6 +652,7 @@ +                   'http://hacks.benhammersley.com/rss/streaming/':        'str', +                   'http://purl.org/rss/1.0/modules/subscription/':        'sub', +                   'http://purl.org/rss/1.0/modules/syndication/':         'sy', ++                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf', +                   'http://purl.org/rss/1.0/modules/taxonomy/':            'taxo', +                   'http://purl.org/rss/1.0/modules/threading/':           'thr', +                   'http://purl.org/rss/1.0/modules/textinput/':           'ti', +@@ -344,12 +660,12 @@ +                   'http://wellformedweb.org/commentAPI/':                 'wfw', +                   'http://purl.org/rss/1.0/modules/wiki/':                'wiki', +                   'http://www.w3.org/1999/xhtml':                         'xhtml', +-                  'http://www.w3.org/XML/1998/namespace':                 'xml', +-                  'http://schemas.pocketsoap.com/rss/myDescModule/':      'szf' ++                  'http://www.w3.org/1999/xlink':                         'xlink', ++                  'http://www.w3.org/XML/1998/namespace':                 'xml' + } +     _matchnamespaces = {} +  +-    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo'] ++    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'icon', 'logo'] +     can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] +     can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] +     html_types = ['text/html', 'application/xhtml+xml'] +@@ -385,8 +701,10 @@ +         self.langstack = [] +         self.baseuri = baseuri or '' +         self.lang = baselang or None ++        self.svgOK = 0 ++        self.hasTitle = 0 +         if baselang: +-            self.feeddata['language'] = baselang ++            self.feeddata['language'] = baselang.replace('_','-') +  +     def unknown_starttag(self, tag, attrs): +         if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs)) +@@ -397,6 +715,11 @@ +         # track xml:base and xml:lang +         attrsD = dict(attrs) +         baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri ++        if not isinstance(baseuri, str): ++            try: ++                baseuri = str(baseuri, self.encoding) ++            except: ++                baseuri = str(baseuri, 'iso-8859-1') +         self.baseuri = _urljoin(self.baseuri, baseuri) +         lang = attrsD.get('xml:lang', attrsD.get('lang')) +         if lang == '': +@@ -407,7 +730,7 @@ +             lang = self.lang +         if lang: +             if tag in ('feed', 'rss', 'rdf:RDF'): +-                self.feeddata['language'] = lang ++                self.feeddata['language'] = lang.replace('_','-') +         self.lang = lang +         self.basestack.append(self.baseuri) +         self.langstack.append(lang) +@@ -420,23 +743,23 @@ +                 self.trackNamespace(None, uri) +  +         # track inline content +-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): ++        if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'): ++            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 +             # element declared itself as escaped markup, but it isn't really +             self.contentparams['type'] = 'application/xhtml+xml' +         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': +-            # Note: probably shouldn't simply recreate localname here, but +-            # our namespace handling isn't actually 100% correct in cases where +-            # the feed redefines the default namespace (which is actually +-            # the usual case for inline content, thanks Sam), so here we +-            # cheat and just reconstruct the element based on localname +-            # because that compensates for the bugs in our namespace handling. +-            # This will horribly munge inline content with non-empty qnames, +-            # but nobody actually does that, so I'm not fixing it. +-            tag = tag.split(':')[-1] +-            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0) ++            if tag.find(':') != -1: ++                prefix, tag = tag.split(':', 1) ++                namespace = self.namespacesInUse.get(prefix, '') ++                if tag=='math' and namespace=='http://www.w3.org/1998/Math/MathML': ++                    attrs.append(('xmlns',namespace)) ++                if tag=='svg' and namespace=='http://www.w3.org/2000/svg': ++                    attrs.append(('xmlns',namespace)) ++            if tag == 'svg': self.svgOK += 1 ++            return self.handle_data('<%s%s>' % (tag, self.strattrs(attrs)), escape=0) +  +         # match namespaces +-        if tag.find(':') <> -1: ++        if tag.find(':') != -1: +             prefix, suffix = tag.split(':', 1) +         else: +             prefix, suffix = '', tag +@@ -456,30 +779,41 @@ +             method = getattr(self, methodname) +             return method(attrsD) +         except AttributeError: +-            return self.push(prefix + suffix, 1) ++            # Since there's no handler or something has gone wrong we explicitly add the element and its attributes ++            unknown_tag = prefix + suffix ++            if len(attrsD) == 0: ++                # No attributes so merge it into the encosing dictionary ++                return self.push(unknown_tag, 1) ++            else: ++                # Has attributes so create it in its own dictionary ++                context = self._getContext() ++                context[unknown_tag] = attrsD +  +     def unknown_endtag(self, tag): +         if _debug: sys.stderr.write('end %s\n' % tag) +         # match namespaces +-        if tag.find(':') <> -1: ++        if tag.find(':') != -1: +             prefix, suffix = tag.split(':', 1) +         else: +             prefix, suffix = '', tag +         prefix = self.namespacemap.get(prefix, prefix) +         if prefix: +             prefix = prefix + '_' ++        if suffix == 'svg' and self.svgOK: self.svgOK -= 1 +  +         # call special handler (if defined) or default handler +         methodname = '_end_' + prefix + suffix +         try: ++            if self.svgOK: raise AttributeError() +             method = getattr(self, methodname) +             method() +         except AttributeError: +             self.pop(prefix + suffix) +  +         # track inline content +-        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'): ++        if self.incontent and 'type' in self.contentparams and not self.contentparams.get('type', 'xml').endswith('xml'): +             # element declared itself as escaped markup, but it isn't really ++            if tag in ['xhtml:div', 'div']: return # typepad does this 10/2007 +             self.contentparams['type'] = 'application/xhtml+xml' +         if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml': +             tag = tag.split(':')[-1] +@@ -506,7 +840,7 @@ +                 c = int(ref[1:], 16) +             else: +                 c = int(ref) +-            text = unichr(c).encode('utf-8') ++            text = chr(c) +         self.elementstack[-1][2].append(text) +  +     def handle_entityref(self, ref): +@@ -515,19 +849,14 @@ +         if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref) +         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'): +             text = '&%s;' % ref ++        elif ref in self.entities.keys(): ++            text = self.entities[ref] ++            if text.startswith('&#') and text.endswith(';'): ++                return self.handle_entityref(text) +         else: +-            # entity resolution graciously donated by Aaron Swartz +-            def name2cp(k): +-                import htmlentitydefs +-                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3 +-                    return htmlentitydefs.name2codepoint[k] +-                k = htmlentitydefs.entitydefs[k] +-                if k.startswith('&#') and k.endswith(';'): +-                    return int(k[2:-1]) # not in latin-1 +-                return ord(k) +-            try: name2cp(ref) ++            try: name2codepoint[ref] +             except KeyError: text = '&%s;' % ref +-            else: text = unichr(name2cp(ref)).encode('utf-8') ++            else: text = chr(name2codepoint[ref]) +         self.elementstack[-1][2].append(text) +  +     def handle_data(self, text, escape=1): +@@ -554,12 +883,19 @@ +         if _debug: sys.stderr.write('entering parse_declaration\n') +         if self.rawdata[i:i+9] == '<![CDATA[': +             k = self.rawdata.find(']]>', i) +-            if k == -1: k = len(self.rawdata) ++            if k == -1: ++                # CDATA block began but didn't finish ++                k = len(self.rawdata) ++                return k +             self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0) +             return k+3 +         else: +             k = self.rawdata.find('>', i) +-            return k+1 ++            if k >= 0: ++                return k+1 ++            else: ++                # We have an incomplete CDATA block. ++                return k +  +     def mapContentType(self, contentType): +         contentType = contentType.lower() +@@ -579,11 +915,11 @@ +             self.version = 'rss10' +         if loweruri == 'http://www.w3.org/2005/atom' and not self.version: +             self.version = 'atom10' +-        if loweruri.find('backend.userland.com/rss') <> -1: ++        if loweruri.find('backend.userland.com/rss') != -1: +             # match any backend.userland.com namespace +             uri = 'http://backend.userland.com/rss' +             loweruri = uri +-        if self._matchnamespaces.has_key(loweruri): ++        if loweruri in self._matchnamespaces: +             self.namespacemap[prefix] = self._matchnamespaces[loweruri] +             self.namespacesInUse[self._matchnamespaces[loweruri]] = uri +         else: +@@ -595,6 +931,9 @@ +     def decodeEntities(self, element, data): +         return data +  ++    def strattrs(self, attrs): ++        return ''.join([' %s="%s"' % (t[0],_xmlescape(t[1],{'"':'"'})) for t in attrs]) ++ +     def push(self, element, expectingText): +         self.elementstack.append([element, expectingText, []]) +  +@@ -603,6 +942,28 @@ +         if self.elementstack[-1][0] != element: return +          +         element, expectingText, pieces = self.elementstack.pop() ++ ++        if self.version == 'atom10' and self.contentparams.get('type','text') == 'application/xhtml+xml': ++            # remove enclosing child element, but only if it is a <div> and ++            # only if all the remaining content is nested underneath it. ++            # This means that the divs would be retained in the following: ++            #    <div>foo</div><div>bar</div> ++            while pieces and len(pieces)>1 and not pieces[-1].strip(): ++                del pieces[-1] ++            while pieces and len(pieces)>1 and not pieces[0].strip(): ++                del pieces[0] ++            if pieces and (pieces[0] == '<div>' or pieces[0].startswith('<div ')) and pieces[-1]=='</div>': ++                depth = 0 ++                for piece in pieces[:-1]: ++                    if piece.startswith('</'): ++                        depth -= 1 ++                        if depth == 0: break ++                    elif piece.startswith('<') and not piece.endswith('/>'): ++                        depth += 1 ++                else: ++                    pieces = pieces[1:-1] ++ ++        pieces = [s if isinstance(s, str) else s.encode(self.encoding) for s in pieces] +         output = ''.join(pieces) +         if stripWhitespace: +             output = output.strip() +@@ -611,7 +972,7 @@ +         # decode base64 content +         if base64 and self.contentparams.get('base64', 0): +             try: +-                output = base64.decodestring(output) ++                output = base64.decodebytes(output.encode(self.encoding)).decode(self.encoding) +             except binascii.Error: +                 pass +             except binascii.Incomplete: +@@ -625,6 +986,9 @@ +         if not self.contentparams.get('base64', 0): +             output = self.decodeEntities(element, output) +  ++        if self.lookslikehtml(output): ++            self.contentparams['type']='text/html' ++ +         # remove temporary cruft from contentparams +         try: +             del self.contentparams['mode'] +@@ -635,25 +999,57 @@ +         except KeyError: +             pass +  ++        is_htmlish = self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types +         # resolve relative URIs within embedded markup +-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: ++        if is_htmlish and RESOLVE_RELATIVE_URIS: +             if element in self.can_contain_relative_uris: +-                output = _resolveRelativeURIs(output, self.baseuri, self.encoding) ++                output = _resolveRelativeURIs(output, self.baseuri, self.encoding, self.contentparams.get('type', 'text/html')) ++                 ++        # parse microformats ++        # (must do this before sanitizing because some microformats ++        # rely on elements that we sanitize) ++        if is_htmlish and element in ['content', 'description', 'summary']: ++            mfresults = _parseMicroformats(output, self.baseuri, self.encoding) ++            if mfresults: ++                for tag in mfresults.get('tags', []): ++                    self._addTag(tag['term'], tag['scheme'], tag['label']) ++                for enclosure in mfresults.get('enclosures', []): ++                    self._start_enclosure(enclosure) ++                for xfn in mfresults.get('xfn', []): ++                    self._addXFN(xfn['relationships'], xfn['href'], xfn['name']) ++                vcard = mfresults.get('vcard') ++                if vcard: ++                    self._getContext()['vcard'] = vcard +          +         # sanitize embedded markup +-        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types: ++        if is_htmlish and SANITIZE_HTML: +             if element in self.can_contain_dangerous_markup: +-                output = _sanitizeHTML(output, self.encoding) ++                output = _sanitizeHTML(output, self.encoding, self.contentparams.get('type', 'text/html')) +  +-        if self.encoding and type(output) != type(u''): ++        if self.encoding and not isinstance(output, str): +             try: +-                output = unicode(output, self.encoding) ++                output = str(output, self.encoding) +             except: +                 pass +  ++        # address common error where people take data that is already ++        # utf-8, presume that it is iso-8859-1, and re-encode it. ++        if self.encoding=='utf-8' and isinstance(output, str): ++            try: ++                output = str(output.encode('iso-8859-1'), 'utf-8') ++            except: ++                pass ++ ++        # map win-1252 extensions to the proper code points ++        if isinstance(output, str): ++            output = ''.join([c in _cp1252.keys() and _cp1252[c] or c for c in output]) ++ +         # categories/tags/keywords/whatever are handled in _end_category +         if element == 'category': +             return output ++ | 
