summary | refs | log | tree | commit | diff
path: root/modules
diff options
context:
space:
mode:
author: Jon Bergli Heier <snakebite@jvnv.net> 2010-11-09 21:15:41 +0100
committer: Jon Bergli Heier <snakebite@jvnv.net> 2010-11-09 21:15:41 +0100
commit: e957f116c862bf910e44d7f669d8b9d631860880 (patch)
tree: c3dd8d3331bed2826b85985367045d4ce20f6065 /modules
parent: 00ae549266241289e730586e7d0266bd6fc63493 (diff)
url_titles: Find encoding (charset) from HTTP headers or meta-tag and recode to UTF-8 if necessary.
Diffstat (limited to 'modules')
-rw-r--r-- modules/url_titles.py 25
1 files changed, 18 insertions, 7 deletions
diff --git a/modules/url_titles.py b/modules/url_titles.py
index fe6282b..b7d5e25 100644
--- a/modules/url_titles.py
+++ b/modules/url_titles.py
@@ -10,6 +10,7 @@ from PIL import ImageFile
class Module:
re_http = re.compile(r'(https?://[^\ #]+)')
re_title = re.compile(r'<title[^>]*?>(.*?)</title>', re.S | re.I)
+ re_meta = re.compile(r'<meta\s+http-equiv="content-type"\s+content="[^"]+charset=([^"]+)"', re.I)
def __init__(self, bot):
self.irc = bot
@@ -62,11 +63,6 @@ class Module:
u = urllib2.urlopen(url)
except:
return
- #enc = ct.split('encoding=')
- #if len(enc) == 2:
- #enc = enc[1]
- #else:
- #enc = None
if u.headers['content-type'].startswith('text/html'):
#s = u.read()
if 'content-encoding' in u.headers and u.headers['content-encoding'] == 'gzip':
@@ -74,6 +70,7 @@ class Module:
s.seek(0)
s = gzip.GzipFile(fileobj = s).read()
m = self.re_title.search(s)
+ enc = self.re_meta.search(s)
else:
s = ''
m = None
@@ -81,11 +78,25 @@ class Module:
while buf and time.time() - t < 5.0:
s += buf
m = self.re_title.search(s)
- if m:
+ enc = self.re_meta.search(s)
+ if m and (enc or '</head>' in s):
break
buf = u.read(1024)
+ if enc:
+ enc = enc.groups()[0]
+ else:
+ ct = u.headers['content-type']
+ enc = ct.lower().split('charset=')
+ if len(enc) == 2:
+ enc = enc[1]
+ else:
+ enc = None
+
if m:
- titles.append(m.groups()[0])
+ s = m.groups()[0]
+ if enc:
+ s = s.decode(enc).encode('utf8')
+ titles.append(s)
elif u.headers['content-type'] in ('image/gif', 'image/png', 'image/jpeg'):
def pretty_size(size):
suffixes = (('B', 2**10), ('KiB', 2**20), ('MiB', 2**30), ('GiB', 2**40), ('TiB', 2**50))