summary refs log tree commit diff
path: root/modules/url_titles.py
diff options
context:
space:
mode:
author: Jon Bergli Heier <snakebite@jvnv.net> 2009-11-14 01:28:23 +0100
committer: Jon Bergli Heier <snakebite@jvnv.net> 2009-11-14 01:28:23 +0100
commit: dcebcafcc52ae847077890b551b8319d80d36d91 (patch)
tree: 1496e0436e56a8076891ed6be551004c59d32c13 /modules/url_titles.py
A much needed inital import.
Diffstat (limited to 'modules/url_titles.py')
-rw-r--r-- modules/url_titles.py 115
1 files changed, 115 insertions, 0 deletions
diff --git a/modules/url_titles.py b/modules/url_titles.py
new file mode 100644
index 0000000..fe5181c
--- /dev/null
+++ b/modules/url_titles.py
@@ -0,0 +1,115 @@
+# Descriptive metadata for this module: author, display title and a short
+# description.  (Presumably read by the bot's module loader -- confirm.)
+info = dict(
+    author = 'Jon Bergli Heier',
+    title = 'URL Titles',
+    description = 'Fetches the title tags off of URLs.',
+)
+
+import re, urllib2, htmlentitydefs, gzip, cStringIO, spotimeta
+
+class Module:
+    """Bot module that answers messages containing http:// URLs with page titles.
+
+    Spotify links are described through the spotimeta metadata client
+    instead of being fetched as HTML.
+    """
+
+    # Matches each http:// URL, up to (not including) the next space.
+    re_http = re.compile(r'(http://[^\ ]+)')
+    # Captures the text of the first <title> element; case-insensitive and
+    # allowed to span several lines.
+    re_title = re.compile(r'<title>(.*?)</title>', re.S | re.I)
+    # Spotify metadata client shared by all instances, constructed with an
+    # empty dict as its cache.
+    metadata = spotimeta.Metadata(cache = {})
+
+    def __init__(self, bot):
+        """Remember the bot object; its msg() method and nickname are used for replies."""
+        self.irc = bot
+
+    def spotify(self, s):
+        """Return a one-line 'Spotify: ...' description of the Spotify link s.
+
+        Artists are shown by name only; any other result type is shown as
+        'artist - name'.  A fixed failure message is returned when the
+        metadata lookup raises.
+        """
+        try:
+            data = self.metadata.lookup(s)
+        except Exception:
+            # Best effort: report the failure as text instead of raising
+            # into the message handler.
+            return 'Failed to fetch metadata from spotify.'
+        if data['type'] == 'artist':
+            return 'Spotify: %s' % data['result']['name']
+        return 'Spotify: %s - %s' % (data['result']['artist']['name'], data['result']['name'])
+
+    def get_titles(self, s):
+        """Return the titles of the http:// URLs found in the message s.
+
+        Returns None when s contains no URL.  When exactly one title was
+        collected it is returned on its own; otherwise every title is
+        prefixed with a bold index such as [1] and the entries are joined
+        with spaces.  The result is an empty string when URLs were found
+        but no title could be collected.  URLs that cannot be opened are
+        skipped.
+        """
+        def unescape(text):
+            """Replace HTML character references and named entities in text with UTF-8 bytes."""
+            def fixup(m):
+                entity = m.group(0)
+                if entity[:2] == "&#":
+                    # Numeric character reference, hexadecimal or decimal.
+                    try:
+                        if entity[:3] == "&#x":
+                            return unichr(int(entity[3:-1], 16)).encode('utf-8')
+                        return unichr(int(entity[2:-1])).encode('utf-8')
+                    except ValueError:
+                        pass
+                else:
+                    # Named entity.
+                    try:
+                        return unichr(htmlentitydefs.name2codepoint[entity[1:-1]]).encode('utf-8')
+                    except KeyError:
+                        pass
+                # Unknown or malformed entity: leave it as is.
+                return entity
+            return re.sub("&#?\w+;", fixup, text)
+
+        def format_text(text):
+            """Flatten line breaks and tabs to spaces, collapse runs of spaces, decode entities."""
+            text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
+            # Collapse every run of two or more spaces into one.  Done with
+            # a single regex pass so that it always terminates.
+            text = re.sub(' {2,}', ' ', text)
+            return unescape(text)
+
+        urls = self.re_http.findall(s)
+        if not urls:
+            return
+        titles = []
+        for url in urls:
+            if any(x in url for x in ('open.spotify.com', 'spotify:track:', 'spotify:artist:', 'spotify:album:')):
+                titles.append(self.spotify(url).encode('utf8'))
+                continue
+            try:
+                u = urllib2.urlopen(url)
+            except Exception:
+                # Skip this URL only; titles already collected for other
+                # URLs in the same message are still reported.
+                continue
+            try:
+                match = None
+                # Only HTML responses are searched; a missing content-type
+                # header is treated as "not HTML".
+                if u.headers.get('content-type', '').startswith('text/html'):
+                    if u.headers.get('content-encoding') == 'gzip':
+                        # A gzip body has to be read completely before it
+                        # can be decompressed and searched.
+                        body = gzip.GzipFile(fileobj = cStringIO.StringIO(u.read())).read()
+                        match = self.re_title.search(body)
+                    else:
+                        # Read in 1 KiB chunks and stop as soon as the
+                        # title has been seen.
+                        body = ''
+                        buf = u.read(1024)
+                        while buf:
+                            body += buf
+                            match = self.re_title.search(body)
+                            if match:
+                                break
+                            buf = u.read(1024)
+                if match:
+                    titles.append(match.group(1))
+            finally:
+                # Always release the connection, even if reading failed.
+                u.close()
+        if len(titles) == 1:
+            return format_text(titles[0]).strip()
+        numbered = ['\002[%d]\002 %s ' % (i + 1, format_text(title)) for i, title in enumerate(titles)]
+        return ''.join(numbered).strip()
+
+    def __call__(self, nick, channel, msg):
+        """Send the titles found in msg back through the bot, if there are any.
+
+        The reply goes to channel, unless channel equals the bot's own
+        nickname (presumably a private message -- confirm), in which case
+        it goes to the part of nick before the first '!'.
+        """
+        titles = self.get_titles(msg)
+        if titles:
+            target = nick.split('!')[0] if channel == self.irc.nickname else channel
+            self.irc.msg(target, titles)
+
+if __name__ == '__main__':
+    # Manual check from the command line: treat the arguments as one
+    # message and print whatever titles are found.  No bot is attached.
+    import sys
+    message = ' '.join(sys.argv[1:])
+    m = Module(None)
+    print m.get_titles(message)