source: trunk/lib/bletchley/blobtools.py @ 67

Last change on this file since 67 was 67, checked in by tmorgan, 11 years ago

added documentation to a few functions

allowing strings as argument to decode

File size: 18.0 KB
Line 
1'''
2A collection of tools to assist in analyzing encrypted blobs of data
3
4Copyright (C) 2011-2013 Virtual Security Research, LLC
5Author: Timothy D. Morgan, Jason A. Donenfeld
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU Lesser General Public License, version 3,
9 as published by the Free Software Foundation.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program.  If not, see <http://www.gnu.org/licenses/>.
18'''
19
import base64
import binascii
import fractions
import functools
import math
import operator
import sys
import traceback

from . import buffertools
28
29
30# urllib.parse's functions are not well suited for encoding/decoding
31# bytes or managing encoded case
32def _percentEncode(binary, plus=False, upper=True):
33    fmt = "%%%.2X"
34    if upper:
35        fmt = "%%%.2x"
36
37    ret_val = bytearray(b'')
38    for c in binary:
39        if c not in b'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789':
40            ret_val.extend((fmt % c).encode('ascii'))
41        elif plus and (c == 20):
42            ret_val.extend(b'+')
43        else:
44            ret_val.append(c)
45   
46    return ret_val
47
48
49def _percentDecode(binary, plus=False):
50    if plus:
51        binary = binary.replace(b'+', b' ')
52    if binary == b'':
53        return b''
54    chunks = binary.split(b'%')
55
56    ret_val = chunks[0]
57    for chunk in chunks[1:]:
58        if len(chunk) < 2:
59            return None
60        try:
61            ret_val += bytes([int(chunk[0:2], 16)]) + chunk[2:]
62        except:
63            #traceback.print_exc()
64            #print(repr(chunk), repr(binary))
65            return None
66           
67    return ret_val
68
69
70# abstract class
class DataEncoding(object):
    """Common interface for the encodings in this module.

    Subclasses fill in the class attributes below and override
    extraTests(), decode() and encode().
    """
    # Byte values allowed in an encoded blob; None disables the check.
    charset = frozenset(b'')
    # Bytes a dialect tolerates in its input in addition to its alphabet.
    extraneous_chars = b''
    dialect = None
    name = None
    priority = None

    def __init__(self, dialect=''):
        """Remember which dialect of the encoding this instance handles."""
        self.dialect = dialect

    def isExample(self, blob):
        """Tell whether blob could have been produced by this encoding.

        The character set is checked first; blobs that pass are handed
        to extraTests(), whose result is returned unchanged.
        """
        seen = frozenset(blob)
        allowed = self.charset
        if allowed is not None:
            if not seen.issubset(allowed):
                return False
        return self.extraTests(blob)

    def extraTests(self, blob):
        """Encoding-specific checks run after the character set test.

        May return True, False, or None, for is an example, isn't an
        example, or unknown, respectively.  The base class accepts
        everything.
        """
        return True

    def decode(self, blob):
        """Decode blob; the base class has nothing to decode and returns None."""
        return None

    def encode(self, blob):
        """Encode blob; the base class has nothing to encode and returns None."""
        return None
99
100
class base64Encoding(DataEncoding):
    """Base64 in a number of alphabets and padding conventions.

    A dialect name starts with one of the alphabet names listed in
    _ALPHABETS and may be followed by modifiers:

      ...newline  (anywhere in the name) CR and LF are allowed in the
                  input and ignored
      ...nopad    (suffix) trailing pad characters are omitted
      ...intpad   (suffix) the pad characters are replaced by a single
                  digit giving the pad length
    """
    name = 'base64'

    # alphabet name -> (character for value 62, character for value 63, pad)
    _ALPHABETS = (
        ('rfc3548',    b'+', b'/', b'='),
        ('filename',   b'+', b'-', b'='),
        ('url1',       b'-', b'_', b'='),
        ('url2',       b'-', b'_', b'.'),
        ('url3',       b'_', b'-', b'.'),
        ('url4',       b'-', b'_', b'!'),
        ('url5',       b'+', b'/', b'$'),
        ('url6',       b'*', b'/', b'='),
        ('otkurl',     b'-', b'_', b'*'),
        ('xmlnmtoken', b'.', b'-', b'='),
        ('xmlname',    b'_', b':', b'='),
    )

    def __init__(self, dialect='rfc3548'):
        """Set up the alphabet and character set for dialect.

        Raises ValueError if dialect does not start with a known
        alphabet name.
        """
        super(base64Encoding, self).__init__(dialect)
        for prefix, c62, c63, pad in self._ALPHABETS:
            if dialect.startswith(prefix):
                self.c62, self.c63, self.pad = c62, c63, pad
                break
        else:
            raise ValueError("Unknown base64 dialect: %r" % (dialect,))

        if 'newline' in dialect:
            self.extraneous_chars = b'\r\n'

        self.charset = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                                 + b'abcdefghijklmnopqrstuvwxyz0123456789'
                                 + self.c62 + self.c63 + self.pad
                                 + self.extraneous_chars)

    def _stripExtraneous(self, blob):
        """Return blob with every byte of extraneous_chars removed."""
        for c in self.extraneous_chars:
            # c is an int here; bytes([c]) is the one-byte string, whereas
            # bytes(c) would be c NUL bytes.
            blob = blob.replace(bytes([c]), b'')
        return blob

    def _guessPadLength(self, nopad_len):
        """Return the pad length (0-2) that belongs to nopad_len encoded
        characters, or None if no valid base64 string has that length."""
        length = ((4 - nopad_len % 4) % 4)
        if length != 3:
            return length
        return None

    def extraTests(self, blob):
        """Check that the length and padding of blob fit this dialect."""
        blob = self._stripExtraneous(blob)

        if self.dialect.endswith('intpad'):
            # An int-padded string always carries its pad digit.
            if len(blob) == 0 or blob[-1] not in b'012':
                return False
            nopad = blob[:-1]
            padlen = blob[-1] - 48 # see the ascii table
        else:
            nopad = blob.rstrip(self.pad)
            padlen = len(blob) - len(nopad)

        # what the pad length ought to be
        padlen_guess = self._guessPadLength(len(nopad))
        if padlen_guess is None:
            return False

        # we don't accept bad pads, only missing pads
        if self.dialect.endswith('nopad'):
            return self.pad not in blob

        # pad must not appear in the middle of the
        # string and must be the correct length at the end
        return (self.pad not in nopad) and (padlen == padlen_guess)

    def decode(self, blob):
        """Decode blob and return the resulting bytes.

        Raises Exception if the padding does not fit the dialect; errors
        raised by base64.standard_b64decode are passed through.
        """
        blob = self._stripExtraneous(blob)

        if self.dialect.endswith('intpad'):
            if len(blob) == 0:
                raise Exception("Int-padded base64 string is missing its pad digit.")
            padlen = blob[-1] - 48 # see the ascii table
            padlen_guess = self._guessPadLength(len(blob) - 1)
            if padlen_guess is None:
                raise Exception("Invalid length for int-padded base64 string.")
            if padlen != padlen_guess:
                raise Exception("Invalid length for int-padded base64 string. (%d != %d)"
                                % (padlen, padlen_guess))

            blob = blob[:-1] + (self.pad*padlen)

        if self.dialect.endswith('nopad'):
            if self.pad in blob:
                raise Exception("Unpadded base64 string contains pad character")

            padlen = self._guessPadLength(len(blob))
            if padlen is None:
                raise Exception("Invalid length for unpadded base64 string.")

            blob = blob+(self.pad*padlen)

        # Map the dialect's alphabet back onto the standard one.
        if not self.dialect.startswith('rfc3548'):
            table = bytes.maketrans(self.c62+self.c63+self.pad, b'+/=')
            blob = blob.translate(table)

        return base64.standard_b64decode(blob)

    def encode(self, blob):
        """Encode blob in this dialect and return the resulting bytes."""
        ret_val = base64.standard_b64encode(blob)

        if not self.dialect.startswith('rfc3548'):
            table = bytes.maketrans(b'+/=', self.c62+self.c63+self.pad)
            ret_val = ret_val.translate(table)

        if self.dialect.endswith('nopad'):
            ret_val = ret_val.rstrip(self.pad)

        if self.dialect.endswith('intpad'):
            stripped = ret_val.rstrip(self.pad)
            ret_val = stripped + ("%d" % (len(ret_val) - len(stripped))).encode('utf-8')

        return ret_val
234
235
class base32Encoding(DataEncoding):
    """RFC 3548 base32 in upper or lower case, optionally without padding.

    Recognised dialects start with 'rfc3548upper' or 'rfc3548lower'; a
    'nopad' suffix means the trailing pad characters are omitted.
    """
    name = 'base32'
    # Both dialects pad with '='.  Kept at class level so the attribute
    # exists even when the dialect name is not recognised.
    pad = b'='

    def __init__(self, dialect='rfc3548upper'):
        """Select the character set that belongs to dialect."""
        super(base32Encoding, self).__init__(dialect)
        if dialect.startswith('rfc3548upper'):
            self.charset = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'+self.pad)

        elif dialect.startswith('rfc3548lower'):
            self.charset = frozenset(b'abcdefghijklmnopqrstuvwxyz234567'+self.pad)

    def _guessPadLength(self, nopad_len):
        """Return the pad length that belongs to nopad_len encoded
        characters, or None if no valid base32 string has that length."""
        pad_lengths = {0:0, 7:1, 5:3, 4:4, 2:6}
        return pad_lengths.get(nopad_len%8, None)

    def extraTests(self, blob):
        """Check that the length and padding of blob fit this dialect."""
        nopad = blob.rstrip(self.pad)
        padlen_guess = self._guessPadLength(len(nopad))
        if padlen_guess is None:
            return False

        # we don't accept bad pads, only missing pads
        if self.dialect.endswith('nopad'):
            return self.pad not in blob

        # pad must not appear in the middle of the
        # string and must be the correct length at the end
        return (self.pad not in nopad) and (len(blob) == len(nopad)+padlen_guess)

    def decode(self, blob):
        """Decode blob and return the resulting bytes.

        Raises Exception if an unpadded blob contains a pad character or
        has an impossible length; errors raised by base64.b32decode are
        passed through.
        """
        if self.dialect.endswith('nopad'):
            if self.pad in blob:
                raise Exception("Unpadded base32 string contains pad character")

            padlen = self._guessPadLength(len(blob))
            if padlen is None:
                raise Exception("Invalid length for unpadded base32 string.")

            blob = blob+(self.pad*padlen)

        # b32decode expects the upper-case alphabet.
        return base64.b32decode(blob.upper())

    def encode(self, blob):
        """Encode blob in this dialect and return the resulting bytes."""
        ret_val = base64.b32encode(blob)

        if self.dialect.endswith('nopad'):
            ret_val = ret_val.rstrip(self.pad)

        if 'lower' in self.dialect:
            ret_val = ret_val.lower()
        else:
            ret_val = ret_val.upper()

        return ret_val
293
294
class hexEncoding(DataEncoding):
    """Hexadecimal encoding in 'mixed', 'upper' or 'lower' case dialects."""
    name = 'hex'

    def __init__(self, dialect='mixed'):
        """Pick the digit set for dialect; the first matching keyword wins."""
        super().__init__(dialect)
        alphabets = (
            ('mixed', b'ABCDEFabcdef0123456789'),
            ('upper', b'ABCDEF0123456789'),
            ('lower', b'abcdef0123456789'),
        )
        for keyword, digits in alphabets:
            if keyword in dialect:
                self.charset = frozenset(digits)
                break

    def extraTests(self, blob):
        """Hex strings consist of whole byte pairs, so the length must be even."""
        return not len(blob) % 2

    def decode(self, blob):
        """Return the bytes represented by the hex string blob."""
        return binascii.a2b_hex(blob)

    def encode(self, blob):
        """Return blob as hex digits, in the case the dialect asks for."""
        hexed = binascii.b2a_hex(blob)
        if 'upper' in self.dialect:
            return hexed.upper()
        if 'lower' in self.dialect:
            return hexed.lower()
        return hexed
320
321
class percentEncoding(DataEncoding):
    """Percent (URL) encoding.

    The dialect names the case of the hex digits ('mixed', 'upper' or
    'lower'); a dialect containing 'plus' additionally uses '+' for a
    space.
    """
    name = 'percent'

    def __init__(self, dialect='mixed'):
        """Pick the accepted hex digits; the first matching keyword wins."""
        super().__init__(dialect)
        # Any byte may appear unescaped, so the character set test is off.
        self.charset = None
        alphabets = (
            ('mixed', b'ABCDEFabcdef0123456789'),
            ('upper', b'ABCDEF0123456789'),
            ('lower', b'abcdef0123456789'),
        )
        for keyword, digits in alphabets:
            if keyword in dialect:
                self.hexchars = frozenset(digits)
                break

    def extraTests(self, blob):
        """Check every escape in blob.

        Returns None (unknown) when blob has no '%' at all, False when a
        '%' is not followed by two accepted hex digits, True otherwise.
        """
        escapes = blob.split(b'%')[1:]
        if not escapes:
            return None
        for tail in escapes:
            if len(tail) < 2:
                return False
            if not (tail[0] in self.hexchars and tail[1] in self.hexchars):
                return False
        return True

    def decode(self, blob):
        """Decode blob with _percentDecode, honouring a 'plus' dialect."""
        return _percentDecode(blob, plus=('plus' in self.dialect))

    def encode(self, blob):
        """Encode blob with _percentEncode, passing on the dialect's
        'plus' and case settings."""
        use_plus = 'plus' in self.dialect
        use_upper = 'lower' not in self.dialect
        return _percentEncode(blob, plus=use_plus, upper=use_upper)
360
# XXX: need a better way to organize these with the possible combinations of dialects, padding, etc
#      for instance, can we have rfc3548-newline-nopad ?
#
# Each entry is (encoding class, dialect, priority).  bestEncoding() picks
# the candidate with the lowest priority and keeps the first one it meets on
# a tie, so every priority in this table must be unique.
priorities = [
    (hexEncoding, 'upper', 100),
    (hexEncoding, 'lower', 101),
    (hexEncoding, 'mixed', 102),
    (base32Encoding, 'rfc3548upper', 150),
    (base32Encoding, 'rfc3548lower', 151),
    (base32Encoding, 'rfc3548upper-nopad', 160),
    (base32Encoding, 'rfc3548lower-nopad', 161),
    (base64Encoding, 'rfc3548', 200),
    (base64Encoding, 'rfc3548-nopad', 201),
    (base64Encoding, 'rfc3548-newline', 202),
    (base64Encoding, 'rfc3548-intpad', 203),
    (base64Encoding, 'filename', 210),
    (base64Encoding, 'filename-nopad', 211),
    (base64Encoding, 'filename-intpad', 212),
    (base64Encoding, 'url1', 230),
    (base64Encoding, 'url1-nopad', 231),
    (base64Encoding, 'url1-intpad', 232),
    (base64Encoding, 'otkurl', 235),
    (base64Encoding, 'otkurl-nopad', 236),
    (base64Encoding, 'otkurl-intpad', 237),
    (base64Encoding, 'url2', 240),
    (base64Encoding, 'url2-nopad', 241),
    (base64Encoding, 'url2-intpad', 242),
    (base64Encoding, 'url3', 250),
    (base64Encoding, 'url3-nopad', 251),
    (base64Encoding, 'url3-intpad', 252),
    (base64Encoding, 'url4', 260),
    (base64Encoding, 'url4-nopad', 261),
    (base64Encoding, 'url4-intpad', 262),
    (base64Encoding, 'url5', 264),
    (base64Encoding, 'url5-nopad', 265),
    (base64Encoding, 'url5-intpad', 266),
    (base64Encoding, 'url6', 267),
    (base64Encoding, 'url6-nopad', 268),
    (base64Encoding, 'url6-intpad', 269),
    (base64Encoding, 'xmlnmtoken', 270),
    (base64Encoding, 'xmlnmtoken-nopad', 271),
    (base64Encoding, 'xmlnmtoken-intpad', 272),
    (base64Encoding, 'xmlname', 280),
    (base64Encoding, 'xmlname-nopad', 281),
    (base64Encoding, 'xmlname-intpad', 282),
    (percentEncoding, 'upper-plus', 400),
    (percentEncoding, 'upper', 401),
    (percentEncoding, 'lower-plus', 410),
    (percentEncoding, 'lower', 411),
    (percentEncoding, 'mixed-plus', 420),
    (percentEncoding, 'mixed', 421),
    ]

# Registry of ready-to-use encoder instances, keyed by "<name>/<dialect>"
# (for example 'base64/rfc3548').
encodings = {}
for enc,d,p in priorities:
    e = enc(d)
    e.priority = p
    encodings["%s/%s" % (enc.name, d)] = e
418
419
def supportedEncodings():
    """Return the names of all registered encodings as a sorted list."""
    return sorted(encodings)
424
425
def possibleEncodings(blob):
    """Classify every registered encoding against blob.

    Returns a tuple (likely, possible) of two sets of encoding names:
    those whose isExample() returned True, and those for which it
    returned None (unknown).
    """
    likely, possible = set(), set()
    for name, encoding in encodings.items():
        verdict = encoding.isExample(blob)
        if verdict is None:
            possible.add(name)
        elif verdict == True:
            likely.add(name)
    return likely, possible
436
437
def encodingIntersection(blobs):
    """Return the set of encoding names that fit every blob in blobs.

    An encoding stays in the result if it was likely or possible for
    each blob, unless it was merely 'possible' for all of them.
    """
    candidates = set(encodings)
    never_confirmed = set(encodings)
    for blob in blobs:
        likely, possible = possibleEncodings(blob)
        candidates.intersection_update(likely | possible)
        never_confirmed.intersection_update(possible)
    return candidates.difference(never_confirmed)
446
447
def bestEncoding(encs):
    """Return the name in encs with the lowest priority value.

    The first name seen wins a tie.  Returns None if encs is empty.
    """
    best, best_priority = None, 999999999
    for candidate in encs:
        candidate_priority = encodings[candidate].priority
        if candidate_priority < best_priority:
            best, best_priority = candidate, candidate_priority
    return best
456
457
def decode(encoding, blob):
    """Decode blob with the named encoding and return the result.

    encoding -- The encoding and dialect as a string, e.g.
                'base64/rfc3548'.  To list the valid names, run:
                  bletchley-analyze -e ?

    blob     -- The bytes or bytearray object to decode.  A str is
                accepted as well and is converted to bytes using
                'utf-8' first.

    Returns a bytes object holding the decoded data.  Various types of
    exceptions may be raised if something goes wrong.
    """
    raw = blob.encode('utf-8') if isinstance(blob, str) else blob
    codec = encodings[encoding]
    return codec.decode(raw)
476
def encode(encoding, blob):
    """Encode blob with the named encoding and return the result.

    encoding -- The encoding and dialect as a string, e.g.
                'base64/rfc3548'.  To list the valid names, run:
                  bletchley-analyze -e ?

    blob     -- The bytes or bytearray object to encode.

    Returns a bytes object holding the encoded data.  Various types of
    exceptions may be raised if something goes wrong.
    """
    codec = encodings[encoding]
    return codec.encode(blob)
490
491
def decodeAll(encoding, blobs):
    """Decode every blob in blobs with the named encoding.

    Returns a list of the decoded values, in the order of blobs.
    """
    decoded = []
    for blob in blobs:
        decoded.append(encodings[encoding].decode(blob))
    return decoded
494
495
def encodeAll(encoding, blobs):
    """Encode every blob in blobs with the named encoding.

    Returns a list of the encoded values, in the order of blobs.
    """
    encoded = []
    for blob in blobs:
        encoded.append(encodings[encoding].encode(blob))
    return encoded
498
499
def decodeChain(decoding_chain, blob):
    """Apply a series of decodings to blob, one after the other.

    With a decoding_chain of
      ['percent/lower', 'base64/rfc3548']
    blob is decoded as 'percent/lower' first and the result of that is
    then decoded as 'base64/rfc3548'.

    decoding_chain -- A sequence (list,tuple,...) of encoding and
                      dialect names.  To list the valid names, run:
                         bletchley-analyze -e ?

    blob     -- The bytes or bytearray object to decode.  A str is
                accepted as well and is converted to bytes using
                'utf-8'.

    Returns a bytes object holding the fully decoded data (blob itself
    if the chain is empty).  Various types of exceptions may be raised
    if something goes wrong.
    """
    return functools.reduce(
        lambda data, name: decode(name, data), decoding_chain, blob)
524
525
def encodeChain(encoding_chain, blob):
    """Apply a series of encodings to blob, one after the other.

    With an encoding_chain of
      ['base64/rfc3548', 'percent/lower',]
    blob is encoded as 'base64/rfc3548' first and the result of that is
    then encoded as 'percent/lower'.

    encoding_chain -- A sequence (list,tuple,...) of encoding and
                      dialect names.  To list the valid names, run:
                         bletchley-analyze -e ?

    blob     -- The bytes or bytearray object to encode.

    Returns a bytes object holding the fully encoded data (blob itself
    if the chain is empty).  Various types of exceptions may be raised
    if something goes wrong.
    """
    return functools.reduce(
        lambda data, name: encode(name, data), encoding_chain, blob)
548
549
def getLengths(s):
    """Return the distinct lengths of the items in s, in ascending order."""
    return sorted({len(item) for item in s})
557
558
def maxBlockSize(blob_lengths):
    """Return the largest block size that divides every given length.

    blob_lengths -- An iterable of integer lengths.

    Returns the greatest common divisor of all the lengths, or 0 if
    blob_lengths is empty.
    """
    # fractions.gcd no longer exists in Python 3.9+; math.gcd replaces it.
    # gcd(0, x) == x, so 0 is the neutral starting value.
    divisor = 0
    for bl in blob_lengths:
        divisor = math.gcd(divisor, bl)

    return divisor
565
566
def _bothTrue(x, y):
    """Return x if it is false, otherwise y (that is, `x and y`)."""
    return x and y


# allTrue(iterable) folds the items together with `and`: the result is the
# first false item, or the last item if none is false.  Like
# functools.reduce, it raises TypeError for an empty iterable unless an
# initial value is passed as a second argument.
allTrue = functools.partial(functools.reduce, _bothTrue)
568
def checkCommonBlocksizes(lengths):
    """Return the common block sizes that divide every given length.

    lengths -- An iterable of integer lengths.

    Returns the subset of (8, 16, 20), in that order, whose members
    divide all of the lengths.  Returns an empty list if no lengths
    are given.
    """
    common_block_sizes = (8,16,20)
    # Materialise once so an iterator can be tested against each size.
    lengths = list(lengths)
    if not lengths:
        return []

    # gcd(x, cbs) == cbs holds exactly when cbs divides x, so a plain
    # remainder test is enough.
    return [cbs for cbs in common_block_sizes
            if all(x % cbs == 0 for x in lengths)]
577
578
def int2binary(x, bits=8):
    """Return the lowest `bits` bits of the integer x as a string of
    '0' and '1' characters, most significant bit first.
    """
    digits = [str((x >> shift) & 1) for shift in reversed(range(bits))]
    return "".join(digits)
Note: See TracBrowser for help on using the repository browser.