diff options
author | Jon Bergli Heier <snakebite@jvnv.net> | 2009-11-14 01:28:23 +0100 |
---|---|---|
committer | Jon Bergli Heier <snakebite@jvnv.net> | 2009-11-14 01:28:23 +0100 |
commit | dcebcafcc52ae847077890b551b8319d80d36d91 (patch) | |
tree | 1496e0436e56a8076891ed6be551004c59d32c13 /modules/url_titles.py |
A much-needed initial import.
Diffstat (limited to 'modules/url_titles.py')
-rw-r--r-- | modules/url_titles.py | 115 |
1 files changed, 115 insertions, 0 deletions
"""URL Titles bot module (modules/url_titles.py).

Scans a chat message for URLs and answers with the contents of each page's
<title> tag.  Spotify links are resolved through the optional third-party
``spotimeta`` package instead of being fetched as HTML.

The code keeps working on Python 2 and additionally runs on Python 3: every
title is handled as the interpreter's native ``str`` type (a UTF-8 byte
string on Python 2, text on Python 3).
"""

import gzip
import re
import sys

try:
    import urllib2
except ImportError:  # Python 3: same urlopen() API under a new name
    import urllib.request as urllib2

try:
    import htmlentitydefs
except ImportError:  # Python 3
    import html.entities as htmlentitydefs

try:
    import cStringIO
    _BytesIO = cStringIO.StringIO
except ImportError:  # Python 3
    import io
    _BytesIO = io.BytesIO

try:
    import spotimeta
except ImportError:
    # Optional dependency: without it only Spotify lookups are unavailable.
    spotimeta = None

try:
    _unichr = unichr
except NameError:  # Python 3: chr() already returns text
    _unichr = chr

_PY2 = sys.version_info[0] == 2

info = {
    'author': 'Jon Bergli Heier',
    'title': 'URL Titles',
    'description': 'Fetches the title tags off of URLs.',
}


class Module:
    """Bot module that announces the titles of URLs seen in messages."""

    # http(s) URLs and spotify: URIs, each running up to the next space.
    # The spotify: alternatives are needed for the URI checks in
    # get_titles() to ever match.
    re_http = re.compile(
        r'((?:https?://|spotify:(?:track|artist|album):)[^\ ]+)')
    # Compiled from a byte string because pages are searched undecoded
    # (on Python 2 this is the plain str pattern).
    re_title = re.compile(
        r'<title>(.*?)</title>'.encode('ascii'), re.S | re.I)
    metadata = spotimeta.Metadata(cache={}) if spotimeta is not None else None

    _spotify_markers = ('open.spotify.com', 'spotify:track:',
                        'spotify:artist:', 'spotify:album:')
    _spotify_failure = 'Failed to fetch metadata from spotify.'

    def __init__(self, bot):
        """Remember *bot*, the object whose ``msg`` method sends replies."""
        self.irc = bot

    def spotify(self, s):
        """Return a one-line description of the Spotify link *s*.

        The lookup is delegated to ``self.metadata.lookup``.  A fixed failure
        message is returned when no metadata client is available or the
        lookup raises.
        """
        if self.metadata is None:
            return self._spotify_failure
        try:
            data = self.metadata.lookup(s)
        except Exception:
            # Best effort: a failed lookup must not take the bot down.
            return self._spotify_failure
        result = data['result']
        if data['type'] == 'artist':
            return 'Spotify: %s' % result['name']
        return 'Spotify: %s - %s' % (result['artist']['name'], result['name'])

    @staticmethod
    def _native(value, charset=None):
        """Return *value* converted to the interpreter's native ``str``.

        Python 2 unicode objects are encoded as UTF-8.  Python 3 bytes are
        decoded with *charset* (UTF-8 when missing or unknown), replacing
        undecodable bytes.  Native strings are returned unchanged.
        """
        if isinstance(value, str):
            return value
        if _PY2:
            return value.encode('utf-8')
        try:
            return value.decode(charset or 'utf-8', 'replace')
        except LookupError:  # charset name not known to Python
            return value.decode('utf-8', 'replace')

    @staticmethod
    def _unescape(text):
        """Replace HTML character references and named entities in *text*.

        References that cannot be resolved (unknown name, malformed or
        out-of-range number) are left exactly as written.
        """
        def fixup(match):
            entity = match.group(0)
            try:
                if entity[:3].lower() == '&#x':
                    codepoint = int(entity[3:-1], 16)
                elif entity[:2] == '&#':
                    codepoint = int(entity[2:-1])
                else:
                    codepoint = htmlentitydefs.name2codepoint[entity[1:-1]]
                return Module._native(_unichr(codepoint))
            except (KeyError, ValueError, OverflowError):
                return entity  # leave as is
        return re.sub(r'&#?\w+;', fixup, text)

    @staticmethod
    def _format_text(text):
        """Collapse whitespace runs in *text*, then resolve HTML entities.

        Entities are resolved after collapsing so that whitespace produced
        by an entity is kept.
        """
        collapsed = re.sub(r'[ \t\r\n]+', ' ', text)
        return Module._unescape(collapsed).strip()

    def _fetch_title(self, url):
        """Return the raw <title> contents of the HTML page at *url*.

        Returns None when the page cannot be fetched or read, is not served
        as ``text/html``, or contains no <title> element.
        """
        # NOTE(review): there is no timeout and no size cap on the download;
        # consider adding both before exposing this to untrusted channels.
        try:
            response = urllib2.urlopen(url)
        except Exception:
            # Best effort: skip this URL instead of discarding all titles.
            return None
        try:
            headers = response.headers
            content_type = headers.get('content-type') or ''
            if not content_type.startswith('text/html'):
                return None
            if headers.get('content-encoding') == 'gzip':
                # A gzip stream has to be read completely before decoding.
                compressed = _BytesIO(response.read())
                page = gzip.GzipFile(fileobj=compressed).read()
                match = self.re_title.search(page)
            else:
                # Read in small chunks and stop as soon as the title is seen.
                match = None
                page = response.read(1024)
                chunk = page
                while chunk:
                    match = self.re_title.search(page)
                    if match:
                        break
                    chunk = response.read(1024)
                    page += chunk
            if not match:
                return None
            # Only Python 3 header objects offer get_content_charset().
            get_charset = getattr(headers, 'get_content_charset', None)
            charset = get_charset() if get_charset else None
            return self._native(match.group(1), charset)
        except Exception:
            return None
        finally:
            response.close()

    def get_titles(self, s):
        """Return the titles of all links found in the message *s*.

        Returns None when *s* contains no link.  Otherwise returns a string:
        the single title, or several titles each prefixed with a bold
        ``[n]`` counter.  The string is empty when no title could be
        determined.
        """
        urls = self.re_http.findall(s)
        if not urls:
            return None
        titles = []
        for url in urls:
            if any(marker in url for marker in self._spotify_markers):
                titles.append(self._native(self.spotify(url)))
                continue
            title = self._fetch_title(url)
            if title is not None:
                titles.append(title)
        formatted = [text for text in map(self._format_text, titles) if text]
        if len(formatted) == 1:
            return formatted[0]
        return ' '.join('\002[%d]\002 %s' % (index + 1, text)
                        for index, text in enumerate(formatted)).strip()

    def __call__(self, nick, channel, msg):
        """Handle one incoming message and reply with any titles found.

        The reply goes to *channel*, or to the sender's nick (the part of
        *nick* before ``!``) when *channel* equals the bot's own nickname.
        """
        titles = self.get_titles(msg)
        if titles:
            if channel == self.irc.nickname:
                target = nick.split('!')[0]
            else:
                target = channel
            self.irc.msg(target, titles)


if __name__ == '__main__':
    m = Module(None)
    print(m.get_titles(' '.join(sys.argv[1:])))