From f30f76acd13b45143adde1a78f185f7d81058628 Mon Sep 17 00:00:00 2001 From: Jon Bergli Heier Date: Sun, 27 Sep 2015 11:04:27 +0200 Subject: url_titles: Prefer Content-Type from HTTP headers. --- modules/url_titles.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'modules') diff --git a/modules/url_titles.py b/modules/url_titles.py index e4bbae6..fc750f8 100644 --- a/modules/url_titles.py +++ b/modules/url_titles.py @@ -83,28 +83,27 @@ class Module: s.seek(0) s = gzip.GzipFile(fileobj = s).read() m = self.re_title.search(s) - enc = self.re_meta.search(s) + meta_enc = self.re_meta.search(s) else: s = '' m = None - enc = None + meta_enc = None buf = u.read(1024) while buf and time.time() - t < 5.0: s += buf m = self.re_title.search(s) - enc = self.re_meta.search(s) - if m and (enc or '' in s): + meta_enc = self.re_meta.search(s) + if m and (meta_enc or '' in s): break buf = u.read(1024) - if enc: - enc = enc.groups()[0] + ct = u.headers['content-type'] + enc = ct.lower().split('charset=') + if len(enc) == 2: + enc = enc[1] + elif meta_enc: + enc = meta_enc.groups()[0] else: - ct = u.headers['content-type'] - enc = ct.lower().split('charset=') - if len(enc) == 2: - enc = enc[1] - else: - enc = None + enc = None if m: s = m.groups()[0] -- cgit v1.2.3