1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
import threading, time, os, hashlib, binascii
try:
import xattr
except ImportError:
xattr = None
class Ed2k:
    """Incremental ed2k hash (eMule/AniDB style).

    Input is split into 9,728,000-byte chunks; each chunk is MD4-hashed,
    and for input larger than one chunk the final hash is the MD4 of the
    concatenated per-chunk digests.

    NOTE(review): relies on hashlib providing 'md4'; modern OpenSSL builds
    may ship it only in the legacy provider -- confirm availability.
    """
    def __init__(self):
        # MD4 of the current, possibly incomplete, chunk.
        self.md4_partial = hashlib.new('md4')
        # MD4 over the digests of all completed chunks.
        self.md4_final = hashlib.new('md4')
        # Total number of bytes fed in so far.
        self.size_total = 0
    def update(self, data):
        """Feed a buffer into the hash, splitting it at chunk boundaries."""
        pos = 0
        while pos < len(data):
            # Exactly at a chunk boundary (and not at the very start):
            # fold the finished chunk's digest into the outer hash and
            # start a fresh per-chunk hasher.
            if (not (self.size_total % 9728000)) and self.size_total:
                self.md4_final.update(self.md4_partial.digest())
                self.md4_partial = hashlib.new('md4')
            # Consume up to the end of this buffer or the end of the
            # current chunk, whichever comes first.
            size = min(len(data) - pos, 9728000 - (self.size_total % 9728000))
            self.md4_partial.update(data[pos:pos + size])
            pos += size
            self.size_total += size
    def hexdigest(self):
        """Return the ed2k hash as a hex string.

        NOTE(review): for multi-chunk input this folds the pending partial
        digest into md4_final, so a second call would fold it again --
        call it only once per hashed stream.
        """
        if self.size_total > 9728000:
            # Multi-chunk input: finish the last chunk, hash the digests.
            self.md4_final.update(self.md4_partial.digest())
            return self.md4_final.hexdigest()
        # Input fits in a single chunk: its MD4 is the ed2k hash.
        return self.md4_partial.hexdigest()
class Crc32:
    """CRC-32 checksum exposing the same update()/hexdigest() interface
    as the hashlib-style hashers used elsewhere in this module."""

    def __init__(self):
        # Running CRC value; binascii.crc32 threads it through each update.
        self.s = 0

    def update(self, data):
        """Fold another buffer into the running checksum."""
        self.s = binascii.crc32(data, self.s)

    def hexdigest(self):
        """Return the checksum as eight lowercase hex digits."""
        # Mask to 32 bits: older Pythons may return a signed value.
        return format(self.s & 0xffffffff, '08x')
# Algorithm name -> zero-argument factory producing a hasher with an
# update()/hexdigest() interface.
hasher_obj = {
    'ed2k': Ed2k,
    'md5': hashlib.md5,
    'sha1': hashlib.sha1,
    'crc32': Crc32,
}
class Hash:
    """Hash a file with several algorithms in a single pass over the data.

    For each name in *algorithms*, a hasher is built from hasher_obj and its
    bound hexdigest callable is exposed as an attribute of the same name
    (e.g. h.md5()), so digests are read lazily after construction.
    """
    def __init__(self, filename, algorithms):
        update_list = []
        for a in algorithms:
            h = hasher_obj[a]()
            update_list.append(h.update)
            # Expose the bound hexdigest method under the algorithm's name.
            setattr(self, a, h.hexdigest)
        # Open in binary mode -- hashes are defined over raw bytes, and text
        # mode would corrupt or re-decode the data -- and use a context
        # manager so the handle is closed even if a hasher raises.
        with open(filename, 'rb') as f:
            data = f.read(131072)
            while data:
                for u in update_list:
                    u(data)
                data = f.read(131072)
class File:
    """A file on disk together with its hash digests.

    Digests are optionally cached in extended attributes (user.pyanidb.*),
    keyed by the file's mtime, so unchanged files are not re-hashed.
    Digest values are exposed as attributes named after the algorithm.
    """
    def __init__(self, name, algorithms, cache):
        self.name = name
        self.size = os.path.getsize(name)
        self.mtime = os.path.getmtime(name)
        self.cached = False
        if cache:
            self.read_cache()
        # Recompute if the cache was cold or lacks any requested digest.
        if not all(hasattr(self, a) for a in algorithms):
            self.cached = False
            h = Hash(name, algorithms)
            for a in algorithms:
                setattr(self, a, getattr(h, a)())
            self.write_cache()
    def read_cache(self):
        """Load digests from user.pyanidb.* xattrs if the mtime matches."""
        if not xattr:
            return
        # Strip the 'user.pyanidb.' prefix (13 characters) from each name.
        cache = dict((n[13:], xattr.getxattr(self.name, n)) for n in xattr.listxattr(self.name) if n.startswith('user.pyanidb.'))
        # A cache without a matching mtime is stale; ignore it entirely.
        if 'mtime' not in cache or str(int(self.mtime)) != cache.pop('mtime'):
            return
        for n, v in cache.iteritems():
            setattr(self, n, v)
        self.cached = True
    def write_cache(self):
        """Persist mtime and any computed digests to xattrs (best effort)."""
        if not xattr:
            return
        try:
            self.clear_cache()
            xattr.setxattr(self.name, 'user.pyanidb.mtime', str(int(self.mtime)))
            for n in ('ed2k', 'md5', 'sha1', 'crc32'):
                if hasattr(self, n):
                    xattr.setxattr(self.name, 'user.pyanidb.' + n, getattr(self, n))
        except IOError:
            # Filesystem without xattr support (or read-only): caching is
            # purely an optimization, so fail silently.
            pass
    def clear_cache(self):
        """Remove every user.pyanidb.* xattr from the file."""
        # Guard like the other cache methods so a direct call is a no-op
        # when the xattr module is unavailable.
        if not xattr:
            return
        for name in xattr.listxattr(self.name):
            if name.startswith('user.pyanidb.'):
                xattr.removexattr(self.name, name)
class Hashthread(threading.Thread):
    """Worker thread: drains a shared list of filenames, appending a hashed
    File object for each onto a shared result list."""

    def __init__(self, filelist, hashlist, algorithms, cache, *args, **kwargs):
        threading.Thread.__init__(self, *args, **kwargs)
        self.filelist = filelist
        self.hashlist = hashlist
        self.algorithms = algorithms
        self.cache = cache

    def run(self):
        # list.pop(0) is atomic under the GIL, so concurrent workers never
        # grab the same filename; popping an empty list raises IndexError,
        # which signals that the work list is drained.
        try:
            while True:
                name = self.filelist.pop(0)
                self.hashlist.append(File(name, self.algorithms, self.cache))
        except IndexError:
            return
def hash_files(files, cache = False, algorithms = ('ed2k',), num_threads = 1):
    """Yield File objects for *files* as worker threads finish hashing them.

    *files* is consumed in place (workers pop filenames from it), and
    results are yielded in completion order, not input order.
    """
    hashlist = []
    threads = []
    for x in xrange(num_threads):
        thread = Hashthread(files, hashlist, algorithms, cache)
        thread.start()
        threads.append(thread)
    # Drain until every worker has exited and no results remain queued;
    # any() short-circuits instead of summing all liveness flags.
    while hashlist or any(thread.isAlive() for thread in threads):
        try:
            yield hashlist.pop(0)
        except IndexError:
            # Workers still running but nothing ready yet; back off briefly.
            time.sleep(0.1)
    # No explicit 'raise StopIteration': falling off the end of a generator
    # already stops it, and under PEP 479 (Python 3.7+) an explicit raise
    # would surface to callers as a RuntimeError.
|