diff options
author | Jon Bergli Heier <snakebite@jvnv.net> | 2010-11-09 21:15:41 +0100 |
---|---|---|
committer | Jon Bergli Heier <snakebite@jvnv.net> | 2010-11-09 21:15:41 +0100 |
commit | e957f116c862bf910e44d7f669d8b9d631860880 (patch) | |
tree | c3dd8d3331bed2826b85985367045d4ce20f6065 /modules | |
parent | 00ae549266241289e730586e7d0266bd6fc63493 (diff) |
url_titles: Find encoding (charset) from HTTP headers or meta-tag and recode to UTF-8 if necessary.
Diffstat (limited to 'modules')
-rw-r--r-- | modules/url_titles.py | 25 |
1 files changed, 18 insertions, 7 deletions
```diff
diff --git a/modules/url_titles.py b/modules/url_titles.py
index fe6282b..b7d5e25 100644
--- a/modules/url_titles.py
+++ b/modules/url_titles.py
@@ -10,6 +10,7 @@ from PIL import ImageFile
 class Module:
     re_http = re.compile(r'(https?://[^\ #]+)')
     re_title = re.compile(r'<title[^>]*?>(.*?)</title>', re.S | re.I)
+    re_meta = re.compile(r'<meta\s+http-equiv="content-type"\s+content="[^"]+charset=([^"]+)"', re.I)
 
     def __init__(self, bot):
         self.irc = bot
@@ -62,11 +63,6 @@ class Module:
             u = urllib2.urlopen(url)
         except:
             return
-        #enc = ct.split('encoding=')
-        #if len(enc) == 2:
-        #enc = enc[1]
-        #else:
-        #enc = None
         if u.headers['content-type'].startswith('text/html'):
             #s = u.read()
             if 'content-encoding' in u.headers and u.headers['content-encoding'] == 'gzip':
@@ -74,6 +70,7 @@ class Module:
                 s.seek(0)
                 s = gzip.GzipFile(fileobj = s).read()
                 m = self.re_title.search(s)
+                enc = self.re_meta.search(s)
             else:
                 s = ''
                 m = None
@@ -81,11 +78,25 @@ class Module:
                 while buf and time.time() - t < 5.0:
                     s += buf
                     m = self.re_title.search(s)
-                    if m:
+                    enc = self.re_meta.search(s)
+                    if m and (enc or '</head>' in s):
                         break
                     buf = u.read(1024)
+            if enc:
+                enc = enc.groups()[0]
+            else:
+                ct = u.headers['content-type']
+                enc = ct.lower().split('charset=')
+                if len(enc) == 2:
+                    enc = enc[1]
+                else:
+                    enc = None
+
             if m:
-                titles.append(m.groups()[0])
+                s = m.groups()[0]
+                if enc:
+                    s = s.decode(enc).encode('utf8')
+                titles.append(s)
         elif u.headers['content-type'] in ('image/gif', 'image/png', 'image/jpeg'):
             def pretty_size(size):
                 suffixes = (('B', 2**10), ('KiB', 2**20), ('MiB', 2**30), ('GiB', 2**40), ('TiB', 2**50))
```