Context Navigation

source: trunk/lib/bletchley/blobtools.py @ 67

Last change on this file since 67 was 67, checked in by tmorgan, 11 years ago

added documentation to a few functions

allowing strings as argument to decode

File size: 18.0 KB

Line
1	'''
2	A collection of tools to assist in analyzing encrypted blobs of data
3
4	Copyright (C) 2011-2013 Virtual Security Research, LLC
5	Author: Timothy D. Morgan, Jason A. Donenfeld
6
7	This program is free software: you can redistribute it and/or modify
8	it under the terms of the GNU Lesser General Public License, version 3,
9	as published by the Free Software Foundation.
10
11	This program is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	GNU General Public License for more details.
15
16	You should have received a copy of the GNU General Public License
17	along with this program. If not, see <http://www.gnu.org/licenses/>.
18	'''
19
20	import sys
21	import base64
22	import binascii
23	import traceback
24	import fractions
25	import operator
26	import functools
27	from . import buffertools
28
29
30	# urllib.parse's functions are not well suited for encoding/decoding
31	# bytes or managing encoded case
32	def _percentEncode(binary, plus=False, upper=True):
33	fmt = "%%%.2X"
34	if upper:
35	fmt = "%%%.2x"
36
37	ret_val = bytearray(b'')
38	for c in binary:
39	if c not in b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789':
40	ret_val.extend((fmt % c).encode('ascii'))
41	elif plus and (c == 20):
42	ret_val.extend(b'+')
43	else:
44	ret_val.append(c)
45
46	return ret_val
47
48
49	def _percentDecode(binary, plus=False):
50	if plus:
51	binary = binary.replace(b'+', b' ')
52	if binary == b'':
53	return b''
54	chunks = binary.split(b'%')
55
56	ret_val = chunks[0]
57	for chunk in chunks[1:]:
58	if len(chunk) < 2:
59	return None
60	try:
61	ret_val += bytes([int(chunk[0:2], 16)]) + chunk[2:]
62	except:
63	#traceback.print_exc()
64	#print(repr(chunk), repr(binary))
65	return None
66
67	return ret_val
68
69
70	# abstract class
class DataEncoding(object):
    """Abstract base class for the encodings known to this module.

    Subclasses are expected to set `name`, populate `charset` and
    override extraTests(), decode() and encode().
    """
    # Set of byte values allowed in an encoded blob; None disables the check.
    charset = frozenset(b'')
    # Bytes that may appear in a blob but carry no data.
    extraneous_chars = b''
    dialect = None
    name = None
    priority = None

    def __init__(self, dialect=''):
        self.dialect = dialect

    def isExample(self, blob):
        """Return whether `blob` looks like an instance of this encoding.

        The blob is rejected (False) when it contains a byte outside of
        `charset`; otherwise the verdict of extraTests() is returned.
        """
        observed = frozenset(blob)
        allowed = self.charset
        if allowed is None or observed.issubset(allowed):
            return self.extraTests(blob)
        return False

    def extraTests(self, blob):
        """May return True, False, or None, for is an example, isn't an
        example, or unknown, respectively.

        """
        return True

    def decode(self, blob):
        """Placeholder; the base class decodes nothing and returns None."""
        return None

    def encode(self, blob):
        """Placeholder; the base class encodes nothing and returns None."""
        return None
99
100
class base64Encoding(DataEncoding):
    """Base64 codec supporting several alphabets and padding conventions.

    The start of the dialect string selects the characters used for
    values 62 and 63 and for padding (see _ALPHABETS).  In addition:
      * a dialect ending in 'nopad' omits the trailing pad characters;
      * a dialect ending in 'intpad' replaces the pad characters with a
        single trailing digit ('0'-'2') giving their count;
      * a dialect containing 'newline' tolerates CR and LF in the blob.
    """
    name = 'base64'

    # (dialect prefix, character 62, character 63, pad character)
    _ALPHABETS = (
        ('rfc3548', b'+', b'/', b'='),
        ('filename', b'+', b'-', b'='),
        ('url1', b'-', b'_', b'='),
        ('url2', b'-', b'_', b'.'),
        ('url3', b'_', b'-', b'.'),
        ('url4', b'-', b'_', b'!'),
        ('url5', b'+', b'/', b'$'),
        ('url6', b'*', b'/', b'='),
        ('otkurl', b'-', b'_', b'*'),
        ('xmlnmtoken', b'.', b'-', b'='),
        ('xmlname', b'_', b':', b'='),
    )

    def __init__(self, dialect='rfc3548'):
        """Configure the alphabet, pad and charset for `dialect`.

        Raises ValueError if the dialect does not start with one of the
        known alphabet names.
        """
        super(base64Encoding, self).__init__(dialect)
        for prefix, c62, c63, pad in self._ALPHABETS:
            if dialect.startswith(prefix):
                self.c62, self.c63, self.pad = c62, c63, pad
                break
        else:
            raise ValueError("Unknown base64 dialect: %r" % (dialect,))

        if 'newline' in dialect:
            self.extraneous_chars = b'\r\n'

        self.charset = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                                 + b'abcdefghijklmnopqrstuvwxyz0123456789'
                                 + self.c62 + self.c63 + self.pad
                                 + self.extraneous_chars)

    def _stripExtraneous(self, blob):
        """Return `blob` with every byte of extraneous_chars removed."""
        return blob.translate(None, self.extraneous_chars)

    def _guessPadLength(self, nopad_len):
        """Return the pad length implied by an unpadded length.

        Returns 0, 1 or 2, or None when no valid base64 string can have
        `nopad_len` data characters (a pad of 3 would be required).
        """
        length = ((4 - nopad_len % 4) % 4)
        if length != 3:
            return length
        return None

    def extraTests(self, blob):
        """Check that the length and padding of `blob` fit this dialect.

        Returns True or False.
        """
        blob = self._stripExtraneous(blob)

        if self.dialect.endswith('intpad'):
            # An int-padded blob always carries at least the pad digit.
            if len(blob) == 0 or blob[-1] not in b'012':
                return False
            nopad = blob[:-1]
            padlen = blob[-1] - 48  # see the ascii table
        else:
            nopad = blob.rstrip(self.pad)
            padlen = len(blob) - len(nopad)

        # what the pad length ought to be
        padlen_guess = self._guessPadLength(len(nopad))
        if padlen_guess is None:
            return False

        # we don't accept bad pads, only missing pads
        if self.dialect.endswith('nopad'):
            return self.pad not in blob

        # pad must not appear in the middle of the
        # string and must be the correct length at the end
        return (self.pad not in nopad) and (padlen == padlen_guess)

    def decode(self, blob):
        """Decode `blob` (bytes or bytearray) and return the raw bytes.

        Raises Exception when the padding information of an 'intpad' or
        'nopad' blob is inconsistent with its length; errors from the
        underlying base64 decoder are propagated unchanged.
        """
        blob = self._stripExtraneous(blob)

        if self.dialect.endswith('intpad'):
            if len(blob) == 0:
                raise Exception("Int-padded base64 string is missing its pad digit.")
            padlen = blob[-1] - 48  # see the ascii table
            padlen_guess = self._guessPadLength(len(blob) - 1)
            if padlen_guess is None:
                raise Exception("Invalid length for int-padded base64 string.")
            if padlen != padlen_guess:
                raise Exception("Invalid length for int-padded base64 string. (%d != %d)"
                                % (padlen, padlen_guess))

            blob = blob[:-1] + (self.pad * padlen)

        if self.dialect.endswith('nopad'):
            if self.pad in blob:
                raise Exception("Unpadded base64 string contains pad character")

            padlen = self._guessPadLength(len(blob))
            if padlen is None:
                raise Exception("Invalid length for unpadded base64 string.")

            blob = blob + (self.pad * padlen)

        # Map the dialect's alphabet onto the standard one before decoding.
        if not self.dialect.startswith('rfc3548'):
            table = bytes.maketrans(self.c62 + self.c63 + self.pad, b'+/=')
            blob = blob.translate(table)

        return base64.standard_b64decode(blob)

    def encode(self, blob):
        """Encode `blob` and return the result as bytes in this dialect."""
        ret_val = base64.standard_b64encode(blob)

        if not self.dialect.startswith('rfc3548'):
            table = bytes.maketrans(b'+/=', self.c62 + self.c63 + self.pad)
            ret_val = ret_val.translate(table)

        if self.dialect.endswith('nopad'):
            ret_val = ret_val.rstrip(self.pad)
        elif self.dialect.endswith('intpad'):
            # Replace the pad characters by a single digit giving their count.
            stripped = ret_val.rstrip(self.pad)
            ret_val = stripped + ("%d" % (len(ret_val) - len(stripped))).encode('utf-8')

        return ret_val
234
235
class base32Encoding(DataEncoding):
    """Base32 codec for the upper and lower case RFC 3548 alphabets.

    A dialect ending in 'nopad' omits the trailing pad characters.
    """
    name = 'base32'
    # Both supported alphabets pad with '='; defined on the class so it
    # is available even if the dialect prefix is not recognised.
    pad = b'='

    def __init__(self, dialect='rfc3548upper'):
        super(base32Encoding, self).__init__(dialect)
        if dialect.startswith('rfc3548upper'):
            self.charset = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567' + self.pad)
        elif dialect.startswith('rfc3548lower'):
            self.charset = frozenset(b'abcdefghijklmnopqrstuvwxyz234567' + self.pad)

    def _guessPadLength(self, nopad_len):
        """Return the pad length implied by an unpadded length.

        Returns None when `nopad_len` modulo 8 is not one of the
        remainders listed in the table below.
        """
        pad_lengths = {0: 0, 7: 1, 5: 3, 4: 4, 2: 6}
        return pad_lengths.get(nopad_len % 8, None)

    def extraTests(self, blob):
        """Check that the length and padding of `blob` fit this dialect.

        Returns True or False.
        """
        nopad = blob.rstrip(self.pad)
        padlen_guess = self._guessPadLength(len(nopad))
        if padlen_guess is None:
            return False

        # we don't accept bad pads, only missing pads
        if self.dialect.endswith('nopad'):
            return self.pad not in blob

        # pad must not appear in the middle of the
        # string and must be the correct length at the end
        return (self.pad not in nopad) and (len(blob) == len(nopad) + padlen_guess)

    def decode(self, blob):
        """Decode `blob` (bytes or bytearray) and return the raw bytes.

        Raises Exception when a 'nopad' blob contains a pad character
        or has an impossible length; errors from the underlying base32
        decoder are propagated unchanged.
        """
        if self.dialect.endswith('nopad'):
            if self.pad in blob:
                raise Exception("Unpadded base32 string contains pad character")

            padlen = self._guessPadLength(len(blob))
            if padlen is None:
                raise Exception("Invalid length for unpadded base32 string.")

            blob = blob + (self.pad * padlen)

        # The data is upper-cased first so lower case dialects decode too.
        return base64.b32decode(blob.upper())

    def encode(self, blob):
        """Encode `blob` and return the result as bytes in this dialect."""
        ret_val = base64.b32encode(blob)

        if self.dialect.endswith('nopad'):
            ret_val = ret_val.rstrip(self.pad)

        if 'lower' in self.dialect:
            ret_val = ret_val.lower()
        else:
            ret_val = ret_val.upper()

        return ret_val
293
294
class hexEncoding(DataEncoding):
    """Hexadecimal codec; the dialect selects which digit case is accepted
    ('mixed', 'upper' or 'lower')."""
    name = 'hex'

    def __init__(self, dialect='mixed'):
        super(hexEncoding, self).__init__(dialect)
        # First matching keyword wins, in this order.
        alphabets = (
            ('mixed', b'ABCDEFabcdef0123456789'),
            ('upper', b'ABCDEF0123456789'),
            ('lower', b'abcdef0123456789'),
        )
        for keyword, digits in alphabets:
            if keyword in dialect:
                self.charset = frozenset(digits)
                break

    def extraTests(self, blob):
        """A hex string must contain an even number of digits."""
        return len(blob) % 2 == 0

    def decode(self, blob):
        """Return the bytes represented by the hex digits in `blob`."""
        return binascii.unhexlify(blob)

    def encode(self, blob):
        """Return `blob` as hex digits, cased according to the dialect."""
        digits = binascii.hexlify(blob)
        if 'upper' in self.dialect:
            return digits.upper()
        if 'lower' in self.dialect:
            return digits.lower()
        return digits
320
321
class percentEncoding(DataEncoding):
    """Percent ('%XX') codec.

    The dialect selects the accepted case of the hex digits ('mixed',
    'upper' or 'lower'); a dialect containing 'plus' treats '+' as an
    encoded space.
    """
    name = 'percent'

    def __init__(self, dialect='mixed'):
        super(percentEncoding, self).__init__(dialect)
        # Any byte may appear in percent-encoded data, so no charset check.
        self.charset = None
        if 'mixed' in dialect:
            digits = b'ABCDEFabcdef0123456789'
        elif 'upper' in dialect:
            digits = b'ABCDEF0123456789'
        elif 'lower' in dialect:
            digits = b'abcdef0123456789'
        else:
            return
        self.hexchars = frozenset(digits)

    def extraTests(self, blob):
        """Inspect the escapes in `blob`.

        Returns None when the blob contains no '%' at all, False when a
        '%' is not followed by two digits from this dialect's hex
        alphabet, and True otherwise.
        """
        _, *escapes = blob.split(b'%')
        if not escapes:
            return None
        for escape in escapes:
            if len(escape) < 2:
                return False
            if not (escape[0] in self.hexchars and escape[1] in self.hexchars):
                return False
        return True

    def decode(self, blob):
        """Decode `blob` using the module's _percentDecode helper."""
        return _percentDecode(blob, plus=('plus' in self.dialect))

    def encode(self, blob):
        """Encode `blob` using the module's _percentEncode helper."""
        use_plus = 'plus' in self.dialect
        use_upper = 'lower' not in self.dialect
        return _percentEncode(blob, plus=use_plus, upper=use_upper)
360
# XXX: need a better way to organize these with the possible combinations of dialects, padding, etc
#      for instance, can we have rfc3548-newline-nopad ?
#
# Each entry is (encoding class, dialect, priority).  bestEncoding()
# prefers the lowest priority value, so every value must be unique;
# otherwise the winner of a tie depends on iteration order.
priorities = [
    (hexEncoding, 'upper', 100),
    (hexEncoding, 'lower', 101),
    (hexEncoding, 'mixed', 102),
    (base32Encoding, 'rfc3548upper', 150),
    (base32Encoding, 'rfc3548lower', 151),
    (base32Encoding, 'rfc3548upper-nopad', 160),
    (base32Encoding, 'rfc3548lower-nopad', 161),
    (base64Encoding, 'rfc3548', 200),
    (base64Encoding, 'rfc3548-nopad', 201),
    (base64Encoding, 'rfc3548-newline', 202),
    (base64Encoding, 'rfc3548-intpad', 203),
    (base64Encoding, 'filename', 210),
    (base64Encoding, 'filename-nopad', 211),
    (base64Encoding, 'filename-intpad', 212),
    (base64Encoding, 'url1', 230),
    (base64Encoding, 'url1-nopad', 231),
    (base64Encoding, 'url1-intpad', 232),
    (base64Encoding, 'otkurl', 235),
    (base64Encoding, 'otkurl-nopad', 236),
    (base64Encoding, 'otkurl-intpad', 237),
    (base64Encoding, 'url2', 240),
    (base64Encoding, 'url2-nopad', 241),
    (base64Encoding, 'url2-intpad', 242),
    (base64Encoding, 'url3', 250),
    (base64Encoding, 'url3-nopad', 251),
    (base64Encoding, 'url3-intpad', 252),
    (base64Encoding, 'url4', 260),
    (base64Encoding, 'url4-nopad', 261),
    (base64Encoding, 'url4-intpad', 262),
    # url5 sits at 264-266 so that it no longer collides with url6 at 267.
    (base64Encoding, 'url5', 264),
    (base64Encoding, 'url5-nopad', 265),
    (base64Encoding, 'url5-intpad', 266),
    (base64Encoding, 'url6', 267),
    (base64Encoding, 'url6-nopad', 268),
    (base64Encoding, 'url6-intpad', 269),
    (base64Encoding, 'xmlnmtoken', 270),
    (base64Encoding, 'xmlnmtoken-nopad', 271),
    (base64Encoding, 'xmlnmtoken-intpad', 272),
    (base64Encoding, 'xmlname', 280),
    (base64Encoding, 'xmlname-nopad', 281),
    (base64Encoding, 'xmlname-intpad', 282),
    (percentEncoding, 'upper-plus', 400),
    (percentEncoding, 'upper', 401),
    (percentEncoding, 'lower-plus', 410),
    (percentEncoding, 'lower', 411),
    (percentEncoding, 'mixed-plus', 420),
    (percentEncoding, 'mixed', 421),
]
412
# Registry of ready-to-use encoder instances, keyed by "<name>/<dialect>"
# (for example "base64/rfc3548"), built from the priorities table.
encodings = {}
for enc, d, p in priorities:
    e = enc(d)
    e.priority = p
    encodings["{}/{}".format(enc.name, d)] = e
418
419
def supportedEncodings():
    """Return the names of all registered encodings as a sorted list."""
    return sorted(encodings.keys())
424
425
def possibleEncodings(blob):
    """Classify every registered encoding against `blob`.

    Returns a tuple (likely, possible) of sets of encoding names:
    `likely` holds those whose isExample() returned True and `possible`
    those whose isExample() returned None (unknown).
    """
    likely, possible = set(), set()
    for label, codec in encodings.items():
        verdict = codec.isExample(blob)
        if verdict is None:
            possible.add(label)
        elif verdict is True:
            likely.add(label)
    return likely, possible
436
437
def encodingIntersection(blobs):
    """Return the set of encoding names that fit every blob in `blobs`.

    An encoding is kept when it was likely or possible for every blob,
    unless it was merely "possible" (unknown) for all of them.
    """
    candidates = set(encodings.keys())
    never_confirmed = set(encodings.keys())
    for blob in blobs:
        likely, possible = possibleEncodings(blob)
        candidates.intersection_update(likely | possible)
        never_confirmed.intersection_update(possible)
    return candidates.difference(never_confirmed)
446
447
def bestEncoding(encs):
    """Return the name in `encs` whose encoding has the lowest priority
    value, or None if `encs` is empty."""
    winner = None
    lowest = 999999999
    for candidate in encs:
        rank = encodings[candidate].priority
        if rank < lowest:
            winner, lowest = candidate, rank
    return winner
456
457
def decode(encoding, blob):
    """Decode `blob` with the named encoding and return the result.

    encoding -- A string representation of the encoding and dialect.
                For a list of valid encoding names, run:
                  bletchley-analyze -e ?

    blob     -- A bytes or bytearray object to be decoded.  A str is
                also accepted and is first converted to bytes using
                'utf-8'.

    Returns the decoded representation of blob.  Exceptions raised by
    the underlying decoder (or by an unknown encoding name) propagate
    to the caller.
    """
    raw = blob.encode('utf-8') if isinstance(blob, str) else blob
    codec = encodings[encoding]
    return codec.decode(raw)
476
def encode(encoding, blob):
    """Encode `blob` with the named encoding and return the result.

    encoding -- A string representation of the encoding and dialect.
                For a list of valid encoding names, run:
                  bletchley-analyze -e ?

    blob     -- A bytes or bytearray object to be encoded.

    Returns the encoded representation of blob.  Exceptions raised by
    the underlying encoder (or by an unknown encoding name) propagate
    to the caller.
    """
    codec = encodings[encoding]
    return codec.encode(blob)
490
491
def decodeAll(encoding, blobs):
    """Decode every blob in `blobs` with the named encoding; returns a list."""
    decoded = []
    for blob in blobs:
        # The registry lookup stays inside the loop so that an empty
        # `blobs` never touches the registry.
        decoded.append(encodings[encoding].decode(blob))
    return decoded
494
495
def encodeAll(encoding, blobs):
    """Encode every blob in `blobs` with the named encoding; returns a list."""
    encoded = []
    for blob in blobs:
        # The registry lookup stays inside the loop so that an empty
        # `blobs` never touches the registry.
        encoded.append(encodings[encoding].encode(blob))
    return encoded
498
499
def decodeChain(decoding_chain, blob):
    """Apply a series of decodings to `blob`, in order.

    For instance, if the decoding_chain were
      ['percent/lower', 'base64/rfc3548']
    then blob would first be decoded as 'percent/lower', and the result
    of that would then be decoded as 'base64/rfc3548'.

    decoding_chain -- A sequence (list,tuple,...) of string
                      representations of the encoding and dialect.  For a
                      list of valid encoding names, run:
                        bletchley-analyze -e ?

    blob -- A bytes or bytearray object to be decoded.  If a string
            is provided instead, it will be converted to a bytes
            object using 'utf-8'.

    Returns the result of the final decoding step (or blob itself when
    the chain is empty).  Exceptions raised by any step propagate to
    the caller.
    """
    return functools.reduce(
        lambda data, step: decode(step, data), decoding_chain, blob)
524
525
def encodeChain(encoding_chain, blob):
    """Apply a series of encodings to `blob`, in order.

    For instance, if the encoding_chain were
      ['base64/rfc3548', 'percent/lower',]
    then blob would first be encoded as 'base64/rfc3548', and the result
    of that would then be encoded as 'percent/lower'.

    encoding_chain -- A sequence (list,tuple,...) of string
                      representations of the encoding and dialect.  For a
                      list of valid encoding names, run:
                        bletchley-analyze -e ?

    blob -- A bytes or bytearray object to be encoded.

    Returns the result of the final encoding step (or blob itself when
    the chain is empty).  Exceptions raised by any step propagate to
    the caller.
    """
    return functools.reduce(
        lambda data, step: encode(step, data), encoding_chain, blob)
548
549
def getLengths(s):
    """Return the distinct lengths of the items in `s`, sorted ascending."""
    return sorted({len(item) for item in s})
557
558
def maxBlockSize(blob_lengths):
    """Return the greatest common divisor of all values in `blob_lengths`.

    blob_lengths -- An iterable of integer lengths.

    Returns 0 when `blob_lengths` is empty.
    """
    # fractions.gcd was removed in Python 3.9; math.gcd is the replacement.
    import math

    divisor = 0
    for bl in blob_lengths:
        divisor = math.gcd(divisor, bl)

    return divisor
565
566
def _logicalAnd(left, right):
    """Return `left and right` (Python's short-circuit conjunction)."""
    return left and right


# Folds an iterable with `and`; like functools.reduce without an initial
# value, it raises TypeError when given an empty iterable.
allTrue = functools.partial(functools.reduce, _logicalAnd)
568
def checkCommonBlocksizes(lengths):
    """Return the common block sizes that evenly divide every length.

    lengths -- An iterable of integer lengths.

    Returns a list containing those of 8, 16 and 20 that divide all of
    the given lengths, in that order.  An empty `lengths` yields an
    empty list, since there is nothing to support any block size.
    """
    common_block_sizes = (8, 16, 20)
    # Materialize once so that one-shot iterators can be re-scanned for
    # every candidate block size.
    lengths = list(lengths)
    if not lengths:
        return []

    # gcd(x, cbs) == cbs is the same as "cbs divides x"; using the modulo
    # form avoids fractions.gcd, which was removed in Python 3.9.
    return [cbs for cbs in common_block_sizes
            if all(x % cbs == 0 for x in lengths)]
577
578
def int2binary(x, bits=8):
    """
    Integer to binary

    Returns a string holding the lowest `bits` bits of x as '0' and '1'
    characters, most significant bit first.
    """
    return "".join(str((x >> shift) & 1) for shift in reversed(range(bits)))

Note: See TracBrowser for help on using the repository browser.

Download in other formats: