"""Bot module that announces what the links in a chat message point to.

For every ``http://`` link in a message it reports the HTML ``<title>`` of
the page, a short summary (format, dimensions, size) of a linked image, or
Spotify metadata.

Written for Python 2 (``urllib2``, ``htmlentitydefs``, ``cStringIO``).
"""

import cStringIO
import gzip
import htmlentitydefs
import re
import time
import urllib2

import spotimeta
from PIL import ImageFile

info = {
    'author': 'Jon Bergli Heier',
    'title': 'URL Titles',
    'description': 'Fetches the title tags off of URLs.',
}

# An HTML character reference (&#65; / &#x41;) or named entity (&amp;).
_RE_ENTITY = re.compile(r'&#?\w+;')
# A run of the whitespace characters that get folded into a single space.
_RE_WHITESPACE = re.compile(r'[ \t\r\n]+')


def _unescape(text):
    """Replace HTML character references and named entities in *text*.

    Every reference that can be resolved is replaced by its UTF-8 encoded
    character; anything that cannot be resolved is left exactly as written.
    """
    def fixup(match):
        ref = match.group(0)
        try:
            if ref.startswith('&#x'):
                codepoint = int(ref[3:-1], 16)
            elif ref.startswith('&#'):
                codepoint = int(ref[2:-1])
            else:
                codepoint = htmlentitydefs.name2codepoint[ref[1:-1]]
            return unichr(codepoint).encode('utf-8')
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name, malformed number, or a code point that
            # unichr() cannot represent: leave the reference as is.
            return ref

    return _RE_ENTITY.sub(fixup, text)


def _format_text(text):
    """Fold whitespace runs in *text* to one space, then unescape entities."""
    return _unescape(_RE_WHITESPACE.sub(' ', text))


def _pretty_size(size):
    """Return *size* (a byte count) as a string such as ``'12.5 KiB'``.

    The unit is the first one whose upper limit is not exceeded; sizes above
    the largest limit are still reported in the largest unit.
    """
    units = (
        ('B', 2**10),
        ('KiB', 2**20),
        ('MiB', 2**30),
        ('GiB', 2**40),
        ('TiB', 2**50),
    )
    for suffix, limit in units:
        if size <= limit:
            break
    # limit // 2**10 is the number of bytes in one `suffix` unit.
    return '%s %s' % (str(round(size / float(limit // 2**10), 2)), suffix)


class Module:
    """Callable plugin: looks up the links in a message and posts the result.

    The object passed to the constructor is stored as ``self.irc``; this
    class uses its ``nickname`` attribute and its ``msg(target, text)``
    method.
    """

    # A plain-HTTP link: everything from "http://" up to the next space.
    re_http = re.compile(r'(http://[^\ ]+)')
    # Text between <title ...> and </title>; case-insensitive, and '.' also
    # matches newlines so multi-line titles are captured.
    re_title = re.compile(r'<title[^>]*?>(.*?)</title>', re.S | re.I)
    # Class attribute, so the lookup object (and its cache dict) is shared
    # by every instance.
    metadata = spotimeta.Metadata(cache = {})

    # Content types that are summarised as images.
    image_types = ('image/gif', 'image/png', 'image/jpeg')
    # Seconds allowed per URL, measured from just before the request is made.
    fetch_timeout = 5.0
    # Number of bytes requested per read while streaming a response.
    chunk_size = 1024

    def __init__(self, bot):
        self.irc = bot

    def spotify(self, s):
        """Return a one-line description of the Spotify link or URI *s*.

        The result is ``'Spotify: <name>'`` for artists and
        ``'Spotify: <artist> - <name>'`` for everything else. If the lookup
        raises, a fixed failure message is returned instead.
        """
        try:
            data = self.metadata.lookup(s)
        except Exception:
            return 'Failed to fetch metadata from spotify.'
        result = data['result']
        if data['type'] == 'artist':
            return 'Spotify: %s' % result['name']
        return 'Spotify: %s - %s' % (result['artist']['name'], result['name'])

    def _html_title(self, u, started):
        """Return the raw ``<title>`` text of the open response *u*, or None.

        Gzip-encoded bodies are read and decompressed in full. Other bodies
        are streamed in chunks until a complete title element has been seen,
        the body ends, or the time budget (counted from *started*) runs out.
        """
        if u.headers.get('content-encoding') == 'gzip':
            compressed = cStringIO.StringIO(u.read())
            body = gzip.GzipFile(fileobj=compressed).read()
            match = self.re_title.search(body)
        else:
            body = ''
            match = None
            buf = u.read(self.chunk_size)
            while buf and time.time() - started < self.fetch_timeout:
                body += buf
                # Search the whole accumulated body: the title element may
                # straddle a chunk boundary.
                match = self.re_title.search(body)
                if match:
                    break
                buf = u.read(self.chunk_size)
        if match is None:
            return None
        return match.group(1)

    def _image_summary(self, u, started):
        """Return ``'<FORMAT> image: <W>x<H> (<size>)'`` for *u*, or None.

        The reported size is the number of bytes actually read, which is
        less than the full file when the time budget runs out first. None is
        returned when the data read cannot be decoded as an image.
        """
        parser = ImageFile.Parser()
        size = 0
        while time.time() - started < self.fetch_timeout:
            chunk = u.read(self.chunk_size)
            size += len(chunk)
            if not chunk:
                break
            try:
                parser.feed(chunk)
            except Exception:
                # Best effort: data the decoder rejects is not an image we
                # can describe.
                return None
        try:
            im = parser.close()
        except Exception:
            # Best effort: incomplete or undecodable image data is skipped.
            return None
        return '%s image: %dx%d (%s)' % (
            (im.format,) + tuple(im.size) + (_pretty_size(size),))

    def get_titles(self, s):
        """Describe every ``http://`` link found in the message *s*.

        Returns None when *s* contains no link. Otherwise returns a string:
        the single description when exactly one was produced, or all of
        them numbered as ``\\002[n]\\002 text`` and joined by spaces
        (\\002 is presumably the IRC bold control character). The string is
        empty when no link yielded a description.

        Links that cannot be opened are skipped. Errors raised while
        reading an opened response propagate to the caller; the response is
        closed either way.
        """
        urls = self.re_http.findall(s)
        if not urls:
            return None

        spotify_markers = ('open.spotify.com', 'spotify:track:',
                           'spotify:artist:', 'spotify:album:')
        titles = []
        for url in urls:
            if any(marker in url for marker in spotify_markers):
                titles.append(self.spotify(url).encode('utf8'))
                continue

            started = time.time()
            try:
                u = urllib2.urlopen(url)
            except Exception:
                # One unreachable link must not discard the descriptions
                # already collected for the other links in the message.
                continue

            try:
                content_type = u.headers.get('content-type', '')
                if content_type.startswith('text/html'):
                    title = self._html_title(u, started)
                elif content_type in self.image_types:
                    title = self._image_summary(u, started)
                else:
                    title = None
            finally:
                u.close()

            # An empty title element still counts as a (blank) result.
            if title is not None:
                titles.append(title)

        if len(titles) == 1:
            return _format_text(titles[0]).strip()
        numbered = ['\002[%d]\002 %s' % (index + 1, _format_text(title))
                    for index, title in enumerate(titles)]
        return ' '.join(numbered).strip()

    def __call__(self, nick, channel, msg):
        """Handle one incoming message by posting descriptions of its links.

        When *channel* equals the bot's own nickname, the reply goes to the
        part of *nick* before the first ``'!'``; otherwise it goes to
        *channel*. Nothing is sent when there is nothing to report.
        """
        titles = self.get_titles(msg)
        if not titles:
            return
        if channel == self.irc.nickname:
            target = nick.split('!')[0]
        else:
            target = channel
        self.irc.msg(target, titles)


if __name__ == '__main__':
    import sys
    m = Module(None)
    print(m.get_titles(' '.join(sys.argv[1:])))