Context Navigation

source: trunk/lib/bletchley/blobtools.py @ 42

Last change on this file since 42 was 42, checked in by tmorgan, 11 years ago
fixed bug in 2->3 transition
File size: 15.3 KB

Line
1	'''
2	A collection of tools to assist in analyzing encrypted blobs of data
3
4	Copyright (C) 2011-2012 Virtual Security Research, LLC
5	Author: Timothy D. Morgan, Jason A. Donenfeld
6
7	This program is free software: you can redistribute it and/or modify
8	it under the terms of the GNU Lesser General Public License, version 3,
9	as published by the Free Software Foundation.
10
11	This program is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with this program. If not, see <http://www.gnu.org/licenses/>.
18	'''
19
20	import sys
21	import base64
22	import binascii
23	import fractions
24	import operator
25	import functools
26	import itertools
27	from . import buffertools
28
29
30	# urllib.parse's functions are not well suited for encoding/decoding
31	# bytes or managing encoded case
32	def _percentEncode(binary, plus=False, upper=True):
33	fmt = "%%%.2X"
34	if upper:
35	fmt = "%%%.2x"
36
37	ret_val = b''
38	for c in binary:
39	if c not in b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789':
40	ret_val += (fmt % c).encode('ascii')
41	elif plus and (c == 20):
42	ret_val += b'+'
43	else:
44	ret_val += c
45
46	return ret_val
47
48
49	def _percentDecode(binary, plus=False):
50	ret_val = b''
51	if plus:
52	binary = binary.replace(b'+', b' ')
53	if binary == b'':
54	return b''
55	chunks = binary.split(b'%')
56	if binary[0] == 0x25:
57	chunks = chunks[1:]
58
59	for chunk in chunks:
60	if len(chunk) < 2:
61	return None
62	try:
63	ret_val += bytes([int(chunk[0:2], 16)]) + chunk[2:]
64	except:
65	print(repr(chunk))
66	return None
67
68	return ret_val
69
70
71	# abstract class
72	class DataEncoding(object):
73	charset = frozenset(b'')
74	extraneous_chars = b''
75	dialect = None
76	name = None
77	priority = None
78
79	def __init__(self, dialect=''):
80	self.dialect = dialect
81
82	def isExample(self, blob):
83	sblob = frozenset(blob)
84	if self.charset != None and not sblob.issubset(self.charset):
85	return False
86	return self.extraTests(blob)
87
88	def extraTests(self, blob):
89	"""May return True, False, or None, for is an example, isn't an
90	example, or unknown, respectively.
91
92	"""
93	return True
94
95	def decode(self, blob):
96	return None
97
98	def encode(self, blob):
99	return None
100
101
102	class base64Encoding(DataEncoding):
103	name = 'base64'
104	def __init__(self, dialect='rfc3548'):
105	super(base64Encoding, self).__init__(dialect)
106	if dialect.startswith('rfc3548'):
107	self.c62 = b'+'
108	self.c63 = b'/'
109	self.pad = b'='
110	elif dialect.startswith('filename'):
111	self.c62 = b'+'
112	self.c63 = b'-'
113	self.pad = b'='
114	elif dialect.startswith('url1'):
115	self.c62 = b'-'
116	self.c63 = b'_'
117	self.pad = b'='
118	elif dialect.startswith('url2'):
119	self.c62 = b'-'
120	self.c63 = b'_'
121	self.pad = b'.'
122	elif dialect.startswith('url3'):
123	self.c62 = b'_'
124	self.c63 = b'-'
125	self.pad = b'.'
126	elif dialect.startswith('url4'):
127	self.c62 = b'-'
128	self.c63 = b'_'
129	self.pad = b'!'
130	elif dialect.startswith('url5'):
131	self.c62 = b'+'
132	self.c63 = b'/'
133	self.pad = b'$'
134	elif dialect.startswith('otkurl'):
135	self.c62 = b'-'
136	self.c63 = b'_'
137	self.pad = b'*'
138	elif dialect.startswith('xmlnmtoken'):
139	self.c62 = b'.'
140	self.c63 = b'-'
141	self.pad = b'='
142	elif dialect.startswith('xmlname'):
143	self.c62 = b'_'
144	self.c63 = b':'
145	self.pad = b'='
146
147	if 'newline' in dialect:
148	self.extraneous_chars = b'\r\n'
149
150	self.charset = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
151	+b'abcdefghijklmnopqrstuvwxyz0123456789'
152	+self.c62+self.c63+self.pad+self.extraneous_chars)
153
154	def _guessPadLength(self, nopad_len):
155	length = ((4 - nopad_len % 4) % 4)
156	if length != 3:
157	return length
158	return None
159
160	def extraTests(self, blob):
161	for c in self.extraneous_chars:
162	blob = blob.replace(bytes([c]), b'')
163
164	nopad = blob.rstrip(self.pad)
165	padlen_guess = self._guessPadLength(len(nopad))
166	if padlen_guess == None:
167	return False
168
169	# we don't accept bad pads, only missing pads
170	if self.dialect.endswith('nopad'):
171	return self.pad not in blob
172
173	# pad must not appear in the middle of the
174	# string and must be the correct length at the end
175	return (self.pad not in nopad) and (len(blob) == len(nopad)+padlen_guess)
176
177	def decode(self, blob):
178	for c in self.extraneous_chars:
179	blob = blob.replace(bytes(c), b'')
180
181	if self.dialect.endswith('nopad'):
182	if self.pad in blob:
183	raise Exception("Unpadded base64 string contains pad character")
184
185	padlen = self._guessPadLength(len(blob))
186	if padlen == None:
187	raise Exception("Invalid length for unpadded base64 string.")
188
189	blob = blob+(self.pad*padlen)
190
191	if not self.dialect.startswith('rfc3548'):
192	table = bytes.maketrans(self.c62+self.c63+self.pad, b'+/=')
193	blob = blob.translate(table)
194
195	return base64.standard_b64decode(blob)
196
197
198	def encode(self, blob):
199	ret_val = base64.standard_b64encode(blob)
200
201	if not self.dialect.startswith('rfc3548'):
202	table = bytes.maketrans(b'+/=', self.c62+self.c63+self.pad)
203	ret_val = ret_val.translate(table)
204
205	if ret_val != None and self.dialect.endswith('nopad'):
206	ret_val = ret_val.rstrip(self.pad)
207
208	return ret_val
209
210
211	class base32Encoding(DataEncoding):
212	name = 'base32'
213	def __init__(self, dialect='rfc3548upper'):
214	super(base32Encoding, self).__init__(dialect)
215	if dialect.startswith('rfc3548upper'):
216	self.pad = b'='
217	self.charset = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'+self.pad)
218
219	elif dialect.startswith('rfc3548lower'):
220	self.pad = b'='
221	self.charset = frozenset(b'abcdefghijklmnopqrstuvwxyz234567'+self.pad)
222
223	def _guessPadLength(self, nopad_len):
224	pad_lengths = {0:0, 7:1, 5:3, 4:4, 2:6}
225	return pad_lengths.get(nopad_len%8, None)
226
227	def extraTests(self, blob):
228	nopad = blob.rstrip(self.pad)
229	padlen_guess = self._guessPadLength(len(nopad))
230	if padlen_guess == None:
231	return False
232
233	# we don't accept bad pads, only missing pads
234	if self.dialect.endswith('nopad'):
235	return self.pad not in blob
236
237	# pad must not appear in the middle of the
238	# string and must be the correct length at the end
239	return (self.pad not in nopad) and (len(blob) == len(nopad)+padlen_guess)
240
241
242	def decode(self, blob):
243	if self.dialect.endswith('nopad'):
244	if self.pad in blob:
245	raise Exception("Unpadded base32 string contains pad character")
246
247	padlen = self._guessPadLength(len(blob))
248	if padlen == None:
249	raise Exception("Invalid length for unpadded base64 string.")
250
251	blob = blob+(self.pad*padlen)
252
253	return base64.b32decode(blob.upper())
254
255
256	def encode(self, blob):
257	ret_val = base64.b32encode(blob)
258
259	if ret_val != None and self.dialect.endswith('nopad'):
260	ret_val = ret_val.rstrip(self.pad)
261
262	if 'lower' in self.dialect:
263	ret_val = ret_val.lower()
264	else:
265	ret_val = ret_val.upper()
266
267	return ret_val
268
269
270	class hexEncoding(DataEncoding):
271	name = 'hex'
272	def __init__(self, dialect='mixed'):
273	super(hexEncoding, self).__init__(dialect)
274	if 'mixed' in dialect:
275	self.charset = frozenset(b'ABCDEFabcdef0123456789')
276	elif 'upper' in dialect:
277	self.charset = frozenset(b'ABCDEF0123456789')
278	elif 'lower' in dialect:
279	self.charset = frozenset(b'abcdef0123456789')
280
281
282	def extraTests(self, blob):
283	return (len(blob) % 2 == 0)
284
285	def decode(self, blob):
286	return binascii.a2b_hex(blob)
287
288	def encode(self, blob):
289	if 'upper' in self.dialect:
290	return binascii.b2a_hex(blob).upper()
291	if 'lower' in self.dialect:
292	return binascii.b2a_hex(blob).lower()
293	else:
294	return binascii.b2a_hex(blob)
295
296
297	class percentEncoding(DataEncoding):
298	name = 'percent'
299	def __init__(self, dialect='mixed'):
300	super(percentEncoding, self).__init__(dialect)
301	self.charset = None
302	if 'mixed' in dialect:
303	self.hexchars = frozenset(b'ABCDEFabcdef0123456789')
304	elif 'upper' in dialect:
305	self.hexchars = frozenset(b'ABCDEF0123456789')
306	elif 'lower' in dialect:
307	self.hexchars = frozenset(b'abcdef0123456789')
308
309	def extraTests(self, blob):
310	chunks = blob.split(b'%')
311	if len(chunks) < 2:
312	return None
313	for c in chunks[1:]:
314	if len(c) < 2:
315	return False
316	if (c[0] not in self.hexchars) or (c[1] not in self.hexchars):
317	return False
318	return True
319
320	def decode(self, blob):
321	plus = False
322	if 'plus' in self.dialect:
323	plus = True
324	return _percentDecode(blob, plus=plus)
325
326	def encode(self, blob):
327	upper = True
328	plus = False
329	if 'plus' in self.dialect:
330	plus = True
331	if 'lower' in self.dialect:
332	upper = False
333
334	return _percentEncode(blob, plus=plus, upper=upper)
335
336
337	priorities = [
338	(hexEncoding, 'upper', 100),
339	(hexEncoding, 'lower', 101),
340	(hexEncoding, 'mixed', 102),
341	(base32Encoding, 'rfc3548upper', 150),
342	(base32Encoding, 'rfc3548lower', 151),
343	(base32Encoding, 'rfc3548upper-nopad', 160),
344	(base32Encoding, 'rfc3548lower-nopad', 161),
345	(base64Encoding, 'rfc3548', 200),
346	(base64Encoding, 'rfc3548-nopad', 201),
347	(base64Encoding, 'rfc3548-newline', 202),
348	(base64Encoding, 'filename', 210),
349	(base64Encoding, 'filename-nopad', 211),
350	(base64Encoding, 'url1', 230),
351	(base64Encoding, 'url1-nopad', 231),
352	(base64Encoding, 'otkurl', 235),
353	(base64Encoding, 'otkurl-nopad', 236),
354	(base64Encoding, 'url2', 240),
355	(base64Encoding, 'url2-nopad', 241),
356	(base64Encoding, 'url3', 250),
357	(base64Encoding, 'url3-nopad', 251),
358	(base64Encoding, 'url4', 260),
359	(base64Encoding, 'url4-nopad', 261),
360	(base64Encoding, 'url5', 265),
361	(base64Encoding, 'url5-nopad', 266),
362	(base64Encoding, 'xmlnmtoken', 270),
363	(base64Encoding, 'xmlnmtoken-nopad', 271),
364	(base64Encoding, 'xmlname', 280),
365	(base64Encoding, 'xmlname-nopad', 281),
366	(percentEncoding, 'upper-plus', 400),
367	(percentEncoding, 'upper', 401),
368	(percentEncoding, 'lower-plus', 410),
369	(percentEncoding, 'lower', 411),
370	(percentEncoding, 'mixed-plus', 420),
371	(percentEncoding, 'mixed', 421),
372	]
373
374	encodings = {}
375	for enc,d,p in priorities:
376	e = enc(d)
377	e.priority = p
378	encodings["%s/%s" % (enc.name, d)] = e
379
380	def supportedEncodings():
381	e = list(encodings.keys())
382	e.sort()
383	return e
384
385
386	def possibleEncodings(blob):
387	likely = set()
388	possible = set()
389	for name,encoding in encodings.items():
390	result = encoding.isExample(blob)
391	if result == True:
392	likely.add(name)
393	elif result == None:
394	possible.add(name)
395	return likely,possible
396
397
398	def encodingIntersection(blobs):
399	ret_val = set(encodings.keys())
400	p = set(encodings.keys())
401	for b in blobs:
402	likely,possible = possibleEncodings(b)
403	ret_val &= likely \| possible
404	p &= possible
405	return ret_val - p
406
407
408	def bestEncoding(encs):
409	priority = 999999999
410	best = None
411	for e in encs:
412	if encodings[e].priority < priority:
413	best = e
414	priority = encodings[e].priority
415	return best
416
417
418	def decode(encoding, blob):
419	return encodings[encoding].decode(blob)
420
421	def encode(encoding, blob):
422	return encodings[encoding].encode(blob)
423
424	def decodeAll(encoding, blobs):
425	return [encodings[encoding].decode(b) for b in blobs]
426
427	def encodeAll(encoding, blobs):
428	return [encodings[encoding].encode(b) for b in blobs]
429
430	def decodeChain(decoding_chain, blob):
431	for decoding in decoding_chain:
432	blob = decode(decoding, blob)
433	return blob
434
435	def encodeChain(encoding_chain, blob):
436	for encoding in encoding_chain:
437	blob = encode(encoding, blob)
438	return blob
439
440	def getLengths(s):
441	lengths = set()
442	for bin in s:
443	lengths.add(len(bin))
444	lengths = list(lengths)
445	lengths.sort()
446	return lengths
447
448
449	def maxBlockSize(blob_lengths):
450	divisor = 0
451	for bl in blob_lengths:
452	divisor = fractions.gcd(divisor, bl)
453
454	return divisor
455
456
457	allTrue = functools.partial(functools.reduce, (lambda x,y: x and y))
458
459	def checkCommonBlocksizes(lengths):
460	common_block_sizes = (8,16,20)
461	ret_val = []
462	for cbs in common_block_sizes:
463	gcdIsCBS = (lambda x: fractions.gcd(x,cbs)==cbs)
464	if allTrue(map(gcdIsCBS, lengths)):
465	ret_val.append(cbs)
466	return ret_val
467
468
469	def int2binary(x, bits=8):
470	"""
471	Integer to binary
472	Count is number of bits
473	"""
474	return "".join(map(lambda y:str((x>>y)&1), range(bits-1, -1, -1)))
475
476
477	#XXX: move this to buffertools
478	def smartPermutateBlobs(blobs, block_size=8):
479	"""
480	Intelligently permutates through blocks in blobs.
481	If the same blob shows up in the same place for
482	every blob, the resultant permutations will have
483	this property as well.
484	blobs should be an array containing blobs
485	block_size should be an integer block_size or an
486	array of block sizes.
487	"""
488
489	if len(blobs) == 0:
490	return
491
492	if not isinstance(block_size, (int, long)):
493	for size in block_size:
494	for blob in smartPermutateBlobs(blobs, size):
495	yield blob
496	return
497
498	# First we find the indexes of the chunks that are different
499	different = set()
500	for combo in itertools.combinations(blobs, 2):
501	different \|= set(buffertools.blockWiseDiff(block_size, combo[0], combo[1]))
502
503	# Next we form a set containing the chunks that are different
504	different_chunks = []
505	for blob in blobs:
506	different_chunks.extend([blob[i * block_size:(i + 1) * block_size] for i in different])
507	# Remove duplicates
508	different_chunks = set(different_chunks)
509
510	# We want to know which chunks are the same, too
511	chunk_len = len(blobs[0]) / block_size
512	same = set(range(0, chunk_len)) - different
513
514	# Now let's mix and match the differnet blocks, for all possible lengths
515	for i in range(1, chunk_len + 1):
516	for mix in itertools.permutations(different_chunks, i):
517	# We add back in the part that stays the same
518	for j in same:
519	mix.insert(j, blobs[0][j * block_size:(j + 1) * block_size])
520	mix = "".join(mix)
521	if mix in blobs:
522	continue
523	yield mix

Note: See TracBrowser for help on using the repository browser.

Download in other formats: