diff options
| author | Cyrille Bagard <nocbos@gmail.com> | 2018-08-16 18:11:04 (GMT) | 
|---|---|---|
| committer | Cyrille Bagard <nocbos@gmail.com> | 2018-08-16 18:11:04 (GMT) | 
| commit | e0419341c4e0e1bd29ea5f6a48f1b9da6deb106c (patch) | |
| tree | 6b5d47d7cd16cc358c2e840b93f7d82826af5f45 | |
| parent | cb4bc9b61622803024345538fb55f6781ef872b9 (diff) | |
Used DeepLearning to find encrypted strings.
| -rw-r--r-- | python/alla_net.py | 367 | 
1 files changed, 367 insertions, 0 deletions
"""Find encrypted strings in Dex files with a small neural network.

Content of python/alla_net.py: a one-hidden-layer network (NeuralNetwork)
is trained on character-class ratios computed from strings (DeepLearning),
then used to guess which strings of another file are encrypted.
"""

import numpy as np
import os.path
import pickle
import random
import sys

try:
    import pychrysalide
    from pychrysalide.analysis.contents import FileContent
    from pychrysalide.analysis import LoadedBinary
    from pychrysalide.arch import ArchInstruction
    from pychrysalide.format import BinSymbol
    from pychrysalide.format.dex import DexFormat
except ImportError:
    # The bindings are only used by DeepLearning._get_input_strings(); the
    # built-in samples (selected with '-' on the command line) do not need them.
    pychrysalide = None


class NeuralNetwork():
    """Feed-forward neural network with one hidden layer and one output."""

    def __init__(self, inputs):
        """Initialize the neural network.

        Only the width of the first row of *inputs* is used, to size the
        input layer; weights are drawn from np.random.randn().
        """

        # Parameters
        input_size = len(inputs[0])
        output_size = 1

        # Hidden layer
        # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
        hidden_size = int((2 * input_size) / 3) + output_size

        # (mxn) weight matrix from input to hidden layer
        self._w1 = np.random.randn(input_size, hidden_size)

        # (nx1) weight matrix from hidden to output layer
        self._w2 = np.random.randn(hidden_size, output_size)

    def forward(self, x):
        """Forward propagation through the network.

        The intermediate values are stored on the instance because
        backward() reads self._z2.
        """

        self._z = np.dot(x, self._w1)
        self._z2 = self.sigmoid(self._z)
        self._z3 = np.dot(self._z2, self._w2)

        return self.sigmoid(self._z3)

    def sigmoid(self, s):
        """Activation function."""

        return 1 / (1 + np.exp(-s))

    def sigmoidPrime(self, s):
        """Derivative of sigmoid, where *s* is already a sigmoid output."""

        return s * (1 - s)

    def backward(self, x, y, o):
        """Backward propagate through the network.

        *o* must be the result of the latest forward(x) call, as the hidden
        activations stored by that call are reused here.
        """

        # Error in output
        self._o_error = y - o
        self._o_delta = self._o_error * self.sigmoidPrime(o)

        # How much the hidden layer weights contributed to output error ?
        self._z2_error = self._o_delta.dot(self._w2.T)
        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)

        # Adjusting set weights

        # (input --> hidden)
        self._w1 += x.T.dot(self._z2_delta)

        # (hidden --> output)
        self._w2 += self._z2.T.dot(self._o_delta)

    def train(self, x, y):
        """Train the neural network with samples.

        Runs one forward/backward pass and returns a true value when the
        mean squared error did not change, i.e. when training can stop.
        """

        o = self.forward(x)

        # Use this instance, not whatever a global named 'nn' may be bound to.
        old_loss = np.mean(np.square(y - o))

        self.backward(x, y, o)

        loss = np.mean(np.square(y - self.forward(x)))

        return old_loss == loss

    def predict(self, origin, x):
        """Guess a result with the trained neural network.

        Prints *origin* and the raw output, then returns whether the output
        is above 0.5.
        """

        got = self.forward(x)[0]

        print('Input:', origin)
        print('Output:', got)

        return got > 0.5


class DeepLearning():
    """Build training and prediction data sets from strings."""

    def __init__(self, keep):
        """Build a deep learning system.

        *keep* is the maximum number of plain and of encrypted strings kept
        for training.
        """

        self._keep = keep

    def _get_input_strings(self, filename):
        """Grab all plain and encrypted strings from a Dex file.

        Returns a list of [raw bytes, encrypted flag] pairs for the read-only
        strings of at least 5 bytes. Raises ImportError if the pychrysalide
        bindings are not available.
        """

        if pychrysalide is None:
            raise ImportError('pychrysalide is required to extract strings from a Dex file')

        cnt = FileContent(filename)
        fmt = DexFormat(cnt)
        binary = LoadedBinary(fmt)

        binary.analyze_and_wait()

        encrypted = []
        strings = []

        for sym in binary.format.symbols:

            if sym.target_type == BinSymbol.STP_DYN_STRING:

                ins = binary.processor.find_instr_by_addr(sym.range.addr)
                assert(ins)

                for slink, stype in ins.sources:
                    if stype == ArchInstruction.ILT_REF:
                        encrypted.append(slink.range.addr.phys)

            elif sym.target_type == BinSymbol.STP_RO_STRING:
                strings.append(sym)

        # NOTE(review): 'encrypted' holds .phys values while the lookup uses
        # the address object itself; this relies on how pychrysalide compares
        # both kinds of values -- confirm against the bindings.
        return [[s.raw, s.range.addr in encrypted] for s in strings if len(s.raw) >= 5]

    def _normalize_string(self, raw):
        """Produce an input using a common input format.

        Returns six ratios over the length of *raw* (a bytes object), in this
        order: non printable, punctuation, digit, upper case, lower case and
        descriptor characters ($-_</[;). Descriptor characters are not
        counted as punctuation. An empty input yields six zeros.
        """

        length = len(raw)

        if length == 0:
            return [0.0] * 6

        non_printable = 0
        punct = 0
        digit = 0
        upper = 0
        lower = 0
        descriptor = 0

        for b in raw:

            if b in b'$-_</[;':
                descriptor += 1

            elif b <= 0x20 or b >= 0x7f:
                non_printable += 1

            # Upper bounds are inclusive so that '9', 'Z' and 'z' land in
            # their own class.
            elif 0x30 <= b <= 0x39:
                digit += 1

            elif 0x41 <= b <= 0x5a:
                upper += 1

            elif 0x61 <= b <= 0x7a:
                lower += 1

            else:
                # Remaining printable ASCII: 0x21-0x2f, 0x3a-0x40, 0x5b-0x60, 0x7b-0x7e
                punct += 1

        return [non_printable / length, punct / length, digit / length,
                upper / length, lower / length, descriptor / length]

    def _build_inputs_and_outputs(self, strings):
        """Produce inputs and outputs from [raw, encrypted] pairs."""

        inputs = []
        outputs = []

        for raw, encrypted in strings:
            inputs.append(self._normalize_string(raw))
            # NOTE(review): plain strings are targeted at 0.1, not 0.0; kept as is.
            outputs.append([1.0 if encrypted else 0.1])

        return inputs, outputs

    def _balance_samples(self, strings):
        """Keep at most self._keep plain and self._keep encrypted strings."""

        kept = []
        counts = {True: 0, False: 0}

        for raw, encrypted in strings:

            label = bool(encrypted)

            if counts[label] < self._keep:
                kept.append([raw, label])
                counts[label] += 1

            # Stop only once both quotas are filled.
            if counts[True] >= self._keep and counts[False] >= self._keep:
                break

        return kept

    def get_training_data(self, filename):
        """Provide some training data.

        With a filename, the selected strings are cached in 'training.data'
        in the current directory and reloaded from there on later runs.
        Without one, a few built-in samples are used. Returns (inputs, outputs).
        """

        if filename:

            if not os.path.isfile('training.data'):

                strings = self._get_input_strings(filename)

                random.shuffle(strings)

                kept = self._balance_samples(strings)

                with open('training.data', 'wb') as fd:
                    pickle.dump(kept, fd)

            else:

                # NOTE: pickle can run arbitrary code on load; only keep a
                # 'training.data' file produced by this script.
                with open('training.data', 'rb') as fd:
                    kept = pickle.load(fd)

        else:

            kept = [
                [ b'versionNeededToExtract', False ],
                [ b'versionNumber', False ],
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'vhstxos(~hccyr9gtr~iy(TJXURYD_DRRKHB^G[IPU', True ],
            ]

        return self._build_inputs_and_outputs(kept)

    def get_predict_data(self, filename):
        """Provide some data to predict.

        With a filename, the extracted strings are cached in 'predict.data'
        in the current directory and reloaded from there on later runs.
        Without one, a few built-in samples are used.
        Returns (strings, inputs, outputs).
        """

        if filename:

            if not os.path.isfile('predict.data'):

                strings = self._get_input_strings(filename)

                with open('predict.data', 'wb') as fd:
                    pickle.dump(strings, fd)

            else:

                # NOTE: pickle can run arbitrary code on load; only keep a
                # 'predict.data' file produced by this script.
                with open('predict.data', 'rb') as fd:
                    strings = pickle.load(fd)

        else:

            strings = [
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'versionNeededToExtract', False ],
                [ b'jLoPdKo\x0cbL\x7fGeV%ChVbMe\x0cHcGn', True ],
                [ b'isWriteComprSizeInZip64ExtraRecord', False ]
            ]

        inputs, outputs = self._build_inputs_and_outputs(strings)

        return strings, inputs, outputs


def main():
    """Entry point: train on one data set, then predict on another."""

    if len(sys.argv) != 3:
        print('Usage: %s < training dex or - > < predict dex or - >' % sys.argv[0])
        sys.exit(1)

    training_dex = sys.argv[1] if sys.argv[1] != '-' else None
    predict_dex = sys.argv[2] if sys.argv[2] != '-' else None

    dl = DeepLearning(15)

    print()
    print('--- Training ---')
    print()

    if training_dex:
        print('Input file:', training_dex)

    inputs, outputs = dl.get_training_data(training_dex)

    nn = NeuralNetwork(inputs)

    x = np.array(inputs, dtype=float)
    y = np.array(outputs, dtype=float)

    for i in range(100000):

        print('#', i + 1, 'Loss:',  np.mean(np.square(y - nn.forward(x))), end='\r')

        # train() reports when the loss stopped moving
        if nn.train(x, y):
            break

    print()

    print()
    print('--- Predictions ---')
    print()

    if predict_dex:
        print('Predict file:', predict_dex)

    strings, inputs, outputs = dl.get_predict_data(predict_dex)

    right = 0

    for (raw, expected), features in zip(strings, inputs):

        encrypted = nn.predict(raw, np.array(features, dtype=float))

        if encrypted == expected:
            right += 1

    print()
    print('Right guessed:', (right * 100) / len(strings) if strings else 0.0)
    print()


if __name__ == '__main__':
    main()
