summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jon Bergli Heier <snakebite@jvnv.net> 2015-09-27 11:04:27 +0200
committer: Jon Bergli Heier <snakebite@jvnv.net> 2015-09-27 11:04:27 +0200
commit: f30f76acd13b45143adde1a78f185f7d81058628 (patch)
tree: d3713f1b3cfb9e0eac4cd18f40c49b1d400a7d91
parent: 09e5bd6053df3c848b1ea853d23aebb145398dac (diff)
url_titles: Prefer Content-Type from HTTP headers.
-rw-r--r-- modules/url_titles.py 23
1 files changed, 11 insertions, 12 deletions
diff --git a/modules/url_titles.py b/modules/url_titles.py
index e4bbae6..fc750f8 100644
--- a/modules/url_titles.py
+++ b/modules/url_titles.py
@@ -83,28 +83,27 @@ class Module:
s.seek(0)
s = gzip.GzipFile(fileobj = s).read()
m = self.re_title.search(s)
- enc = self.re_meta.search(s)
+ meta_enc = self.re_meta.search(s)
else:
s = ''
m = None
- enc = None
+ meta_enc = None
buf = u.read(1024)
while buf and time.time() - t < 5.0:
s += buf
m = self.re_title.search(s)
- enc = self.re_meta.search(s)
- if m and (enc or '</head>' in s):
+ meta_enc = self.re_meta.search(s)
+ if m and (meta_enc or '</head>' in s):
break
buf = u.read(1024)
- if enc:
- enc = enc.groups()[0]
+ ct = u.headers['content-type']
+ enc = ct.lower().split('charset=')
+ if len(enc) == 2:
+ enc = enc[1]
+ elif meta_enc:
+ enc = meta_enc.groups()[0]
else:
- ct = u.headers['content-type']
- enc = ct.lower().split('charset=')
- if len(enc) == 2:
- enc = enc[1]
- else:
- enc = None
+ enc = None
if m:
s = m.groups()[0]