# Descriptive metadata for this module: author, title and a short description.
info = dict(
    author='Jon Bergli Heier',
    title='URL Titles',
    description='Fetches the title tags off of URLs.',
)
import re, urllib2, htmlentitydefs, gzip, cStringIO, spotimeta, time
from PIL import ImageFile
class Module:
    """Bot module that answers messages containing URLs with their titles.

    For each ``http://`` URL in a message the reply contains one of:

    * Spotify metadata, when the URL contains a Spotify marker,
    * the text of the HTML title tag, for ``text/html`` responses,
    * the format, pixel dimensions and downloaded size, for GIF/PNG/JPEG.

    The bot object passed to the constructor must provide ``nickname`` and
    ``msg(target, text)``; both are used by ``__call__``.
    """

    # Matches http:// URLs up to the next space. https:// is not matched.
    re_http = re.compile(r'(http://[^\ ]+)')
    # Captures the text between the opening and closing title tags.
    # NOTE(review): this literal was garbled in the reviewed copy of the file
    # and has been reconstructed -- confirm against the upstream source.
    re_title = re.compile(r'<title[^>]*?>(.*?)</title>', re.S | re.I)
    # Created once at class level, so every instance shares it and its cache.
    metadata = spotimeta.Metadata(cache = {})

    # Numeric (decimal or hex) and named HTML entities.
    # NOTE(review): reconstructed from a garbled literal as well -- confirm.
    _re_entity = re.compile(r'&#?\w+;')
    # Runs of spaces, newlines, carriage returns and tabs.
    _re_whitespace = re.compile(r'[ \n\r\t]+')

    # Substrings that route a URL to the Spotify lookup instead of HTTP.
    # Only the first can occur in practice, because re_http yields http:// URLs.
    _SPOTIFY_MARKERS = ('open.spotify.com', 'spotify:track:', 'spotify:artist:', 'spotify:album:')
    _IMAGE_TYPES = ('image/gif', 'image/png', 'image/jpeg')
    # Seconds allowed per URL, measured from just before the request is opened.
    _TIME_LIMIT = 5.0
    # Bytes requested per read() while streaming a response.
    _CHUNK_SIZE = 1024

    def __init__(self, bot):
        """Remember *bot*, the object used to send replies."""
        self.irc = bot

    def spotify(self, s):
        """Return a one-line description of the Spotify link *s*.

        Artists are reported as ``Spotify: <name>``; every other result type
        as ``Spotify: <artist> - <name>``. A fixed failure message is returned
        when the metadata lookup raises.
        """
        try:
            data = self.metadata.lookup(s)
        except Exception:
            return 'Failed to fetch metadata from spotify.'
        result = data['result']
        if data['type'] == 'artist':
            return 'Spotify: %s' % result['name']
        return 'Spotify: %s - %s' % (result['artist']['name'], result['name'])

    def _unescape(self, text):
        """Replace HTML entities in *text* with UTF-8 encoded characters.

        Entities that cannot be decoded (unknown name, malformed or
        out-of-range number) are left exactly as they appear.
        """
        def fixup(match):
            entity = match.group(0)
            try:
                if entity[:3].lower() == '&#x':
                    # hexadecimal character reference
                    return unichr(int(entity[3:-1], 16)).encode('utf-8')
                if entity[:2] == '&#':
                    # decimal character reference
                    return unichr(int(entity[2:-1])).encode('utf-8')
                # named entity
                return unichr(htmlentitydefs.name2codepoint[entity[1:-1]]).encode('utf-8')
            except (ValueError, OverflowError, KeyError):
                return entity  # leave as is
        return self._re_entity.sub(fixup, text)

    def _format_text(self, text):
        """Collapse whitespace runs to single spaces and decode entities."""
        return self._unescape(self._re_whitespace.sub(' ', text))

    def _pretty_size(self, size):
        """Format a byte count as e.g. ``'12.5 KiB'`` with two decimals.

        Sizes beyond the largest listed unit are expressed in that unit.
        """
        units = (('B', 2**10), ('KiB', 2**20), ('MiB', 2**30), ('GiB', 2**40), ('TiB', 2**50))
        for suffix, limit in units:
            if size <= limit:
                break
        # limit is the upper bound of the unit; one 1024th of it is its scale.
        return '%s %s' % (str(round(size / float(limit // 2**10), 2)), suffix)

    def _read_title(self, response, started):
        """Return the raw title text of an HTML *response*, or None.

        *started* is the time.time() value taken before the request was
        opened; streaming stops once _TIME_LIMIT seconds have passed since.
        """
        if response.headers.get('content-encoding') == 'gzip':
            # A gzip body is read in full and decompressed before searching.
            compressed = cStringIO.StringIO(response.read())
            body = gzip.GzipFile(fileobj = compressed).read()
            found = self.re_title.search(body)
        else:
            # Stream the page and stop as soon as a complete title is seen.
            body = ''
            found = None
            chunk = response.read(self._CHUNK_SIZE)
            while chunk and time.time() - started < self._TIME_LIMIT:
                body += chunk
                found = self.re_title.search(body)
                if found:
                    break
                chunk = response.read(self._CHUNK_SIZE)
        if found:
            return found.group(1)
        return None

    def _describe_image(self, response, started):
        """Return ``'<format> image: <w>x<h> (<size>)'`` for an image, or None.

        The reported size is the number of bytes downloaded before the data
        ran out or the time limit was reached. None is returned when the
        downloaded data cannot be turned into an image.
        """
        parser = ImageFile.Parser()
        size = 0
        while time.time() - started < self._TIME_LIMIT:
            chunk = response.read(self._CHUNK_SIZE)
            if not chunk:
                break
            size += len(chunk)
            parser.feed(chunk)
        try:
            im = parser.close()
            return '%s image: %dx%d (%s)' % ((im.format,) + tuple(im.size) + (self._pretty_size(size),))
        except Exception:
            # Best effort: an incomplete or unrecognised image yields no title.
            return None

    def _title_for(self, response, started):
        """Dispatch on the response's Content-Type; return a title or None."""
        content_type = response.headers.get('content-type', '')
        if content_type.startswith('text/html'):
            return self._read_title(response, started)
        # Drop any parameters (e.g. "; charset=...") before comparing.
        if content_type.split(';', 1)[0].strip().lower() in self._IMAGE_TYPES:
            return self._describe_image(response, started)
        return None

    def get_titles(self, s):
        """Return the titles of all URLs found in the message *s*.

        Returns None when *s* contains no URL. Otherwise returns a string:
        the single title when exactly one was found, or every title prefixed
        with a bold ``[n]`` counter when there are several. The string is
        empty when no URL produced a title. A URL that cannot be opened is
        skipped and does not affect the other URLs.
        """
        urls = self.re_http.findall(s)
        if not urls:
            return None
        titles = []
        for url in urls:
            if any(marker in url for marker in self._SPOTIFY_MARKERS):
                titles.append(self.spotify(url).encode('utf8'))
                continue
            started = time.time()
            try:
                response = urllib2.urlopen(url)
            except Exception:
                # Unreachable or invalid URL: skip it, keep the other titles.
                continue
            try:
                title = self._title_for(response, started)
            finally:
                # Always release the connection, even if reading failed.
                response.close()
            if title is not None:
                titles.append(title)
        if len(titles) == 1:
            return self._format_text(titles[0]).strip()
        # \002 is sent around each counter; titles are numbered from 1.
        numbered = ['\002[%d]\002 %s ' % (index + 1, self._format_text(title))
                    for index, title in enumerate(titles)]
        return ''.join(numbered).strip()

    def __call__(self, nick, channel, msg):
        """Handle an incoming message and send any URL titles found in it.

        The reply goes to *channel*, unless *channel* equals the bot's own
        nickname (presumably a private message -- confirm against the bot
        framework), in which case it goes to the part of *nick* before '!'.
        Nothing is sent when no titles were found.
        """
        titles = self.get_titles(msg)
        if not titles:
            return
        if channel == self.irc.nickname:
            target = nick.split('!')[0]
        else:
            target = channel
        self.irc.msg(target, titles)
if __name__ == '__main__':
    # Command-line use: join the arguments into one message, look up the
    # titles with a module that has no bot attached, and print the result.
    import sys
    message = ' '.join(sys.argv[1:])
    print(Module(None).get_titles(message))