From e957f116c862bf910e44d7f669d8b9d631860880 Mon Sep 17 00:00:00 2001 From: Jon Bergli Heier Date: Tue, 9 Nov 2010 21:15:41 +0100 Subject: url_titles: Find encoding (charset) from HTTP headers or meta-tag and recode to UTF-8 if necessary. --- modules/url_titles.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/modules/url_titles.py b/modules/url_titles.py index fe6282b..b7d5e25 100644 --- a/modules/url_titles.py +++ b/modules/url_titles.py @@ -10,6 +10,7 @@ from PIL import ImageFile class Module: re_http = re.compile(r'(https?://[^\ #]+)') re_title = re.compile(r']*?>(.*?)', re.S | re.I) + re_meta = re.compile(r'' in s): break buf = u.read(1024) + if enc: + enc = enc.groups()[0] + else: + ct = u.headers['content-type'] + enc = ct.lower().split('charset=') + if len(enc) == 2: + enc = enc[1] + else: + enc = None + if m: - titles.append(m.groups()[0]) + s = m.groups()[0] + if enc: + s = s.decode(enc).encode('utf8') + titles.append(s) elif u.headers['content-type'] in ('image/gif', 'image/png', 'image/jpeg'): def pretty_size(size): suffixes = (('B', 2**10), ('KiB', 2**20), ('MiB', 2**30), ('GiB', 2**40), ('TiB', 2**50)) -- cgit v1.2.3