Used DeepLearning to find encrypted strings.

author: Cyrille Bagard <nocbos@gmail.com> 2018-08-16 18:11:04 (GMT)
committer: Cyrille Bagard <nocbos@gmail.com> 2018-08-16 18:11:04 (GMT)
commit: e0419341c4e0e1bd29ea5f6a48f1b9da6deb106c (patch)
tree: 6b5d47d7cd16cc358c2e840b93f7d82826af5f45
parent: cb4bc9b61622803024345538fb55f6781ef872b9 (diff)
1 files changed, 367 insertions, 0 deletions
diff --git a/python/alla_net.py b/python/alla_net.py
new file mode 100644
index 0000000..0ff0da1
--- /dev/null
+++ b/python/alla_net.py
@@ -0,0 +1,367 @@
+
+import numpy as np
+import os.path
+import pickle
+import pychrysalide
+from pychrysalide.analysis.contents import FileContent
+from pychrysalide.analysis import LoadedBinary
+from pychrysalide.arch import ArchInstruction
+from pychrysalide.format import BinSymbol
+from pychrysalide.format.dex import DexFormat
+import random
+import sys
+
+
+class NeuralNetwork():
+    """Small feed-forward neural network with one hidden layer.
+
+    Both layers use a sigmoid activation and have no bias term. Weights are
+    adjusted by adding the back-propagated deltas directly (there is no
+    learning-rate factor).
+    """
+
+
+    def __init__(self, inputs):
+        """Initialize the neural network.
+
+        Only the width of the first sample of 'inputs' is read: it defines
+        the size of the input layer. The output layer has a single neuron.
+        An empty 'inputs' raises IndexError.
+        """
+
+        # Parameters
+        input_size = len(inputs[0])
+        output_size = 1
+
+        # Hidden layer
+        # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
+        hidden_size = int((2 * input_size) / 3) + output_size
+
+        # (input_size x hidden_size) weight matrix from input to hidden layer
+        self._w1 = np.random.randn(input_size, hidden_size)
+
+        # (hidden_size x output_size) weight matrix from hidden to output layer
+        self._w2 = np.random.randn(hidden_size, output_size)
+
+
+    def forward(self, x):
+        """Forward propagation through the network.
+
+        The intermediate values are stored on the instance because
+        backward() reuses the hidden layer activations (self._z2).
+        """
+
+        self._z = np.dot(x, self._w1)
+        self._z2 = self.sigmoid(self._z)
+        self._z3 = np.dot(self._z2, self._w2)
+
+        return self.sigmoid(self._z3)
+
+
+    def sigmoid(self, s):
+        """Activation function."""
+
+        return 1 / (1 + np.exp(-s))
+
+
+    def sigmoidPrime(self, s):
+        """Derivative of sigmoid.
+
+        's' has to be the *output* of sigmoid(), not its input: this is how
+        backward() calls it.
+        """
+
+        return s * (1 - s)
+
+
+    def backward(self, x, y, o):
+        """Backward propagate through the network.
+
+        'o' is the output produced by forward(x); forward() must have been
+        called on the same 'x' just before, since self._z2 is reused here.
+        """
+
+        # Error in output
+        self._o_error = y - o
+        self._o_delta = self._o_error * self.sigmoidPrime(o)
+
+        # How much the hidden layer weights contributed to output error ?
+        self._z2_error = self._o_delta.dot(self._w2.T)
+        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)
+
+        # Adjusting set weights
+
+        # (input --> hidden)
+        self._w1 += x.T.dot(self._z2_delta)
+
+        # (hidden --> output)
+        self._w2 += self._z2.T.dot(self._o_delta)
+
+
+    def train(self, x, y):
+        """Train the neural network with samples.
+
+        Runs one forward/backward pass and returns a true value when the
+        mean squared error did not change at all, which callers use as a
+        stop condition.
+        """
+
+        o = self.forward(x)
+
+        # Loss before the update; 'o' is exactly what forward(x) returns,
+        # so there is no need to propagate a second time.
+        old_loss = np.mean(np.square(y - o))
+
+        self.backward(x, y, o)
+
+        # Always measure this very instance, never a global network object.
+        loss = np.mean(np.square(y - self.forward(x)))
+
+        return old_loss == loss
+
+
+    def predict(self, origin, x):
+        """Guess a result with the trained neural network.
+
+        'origin' is only printed next to the computed output. The returned
+        value is true when the first output value is above 0.5.
+        """
+
+        got = self.forward(x)[0]
+
+        print('Input:', origin)
+        print('Output:', got)
+
+        return got > 0.5
+
+
+class DeepLearning():
+    """Prepare training and prediction samples from the strings of Dex files."""
+
+
+    def __init__(self, keep):
+        """Build a deep learning system.
+
+        'keep' is the maximum number of plain strings and the maximum
+        number of encrypted strings retained for the training set.
+        """
+
+        self._keep = keep
+
+
+    def _get_input_strings(self, filename):
+        """Grab all plain and encrypted strings from a Dex file.
+
+        Returns a list of [ raw bytes, encrypted flag ] items; strings
+        shorter than 5 bytes are skipped.
+        """
+
+        cnt = FileContent(filename)
+        fmt = DexFormat(cnt)
+        binary = LoadedBinary(fmt)
+
+        binary.analyze_and_wait()
+
+        encrypted = []
+
+        strings = []
+
+        for sym in binary.format.symbols:
+
+            if sym.target_type == BinSymbol.STP_DYN_STRING:
+
+                ins = binary.processor.find_instr_by_addr(sym.range.addr)
+                assert(ins)
+
+                for slink, stype in ins.sources:
+
+                    if stype == ArchInstruction.ILT_REF:
+
+                        encrypted.append(slink.range.addr.phys)
+
+            elif sym.target_type == BinSymbol.STP_RO_STRING:
+
+                strings.append(sym)
+
+        final = []
+
+        for s in strings:
+
+            if len(s.raw) < 5:
+                continue
+
+            # NOTE(review): 'encrypted' holds physical offsets (addr.phys)
+            # while a full address object is looked up here; this relies on
+            # how pychrysalide compares both kinds of values - to confirm.
+            final.append( [ s.raw, s.range.addr in encrypted ] )
+
+        return final
+
+
+    def _normalize_string(self, raw):
+        """Produce an input using a common input format.
+
+        'raw' is a bytes object. The result is a list of six ratios, each
+        relative to the length of 'raw': non-printable bytes, punctuation,
+        digits, uppercase letters, lowercase letters and descriptor
+        characters. An empty 'raw' gives six zeros.
+        """
+
+        length = len(raw)
+
+        if length == 0:
+            return [ 0.0 ] * 6
+
+        non_printable = 0
+        punct = 0
+        digit = 0
+        upper = 0
+        lower = 0
+
+        descriptor = 0
+
+        for b in raw:
+
+            # Control characters, space, DEL and all non-ASCII bytes
+            if b <= 0x20 or b >= 0x7f:
+                non_printable += 1
+
+            # '0' .. '9'
+            elif 0x30 <= b <= 0x39:
+                digit += 1
+
+            # 'A' .. 'Z'
+            elif 0x41 <= b <= 0x5a:
+                upper += 1
+
+            # 'a' .. 'z'
+            elif 0x61 <= b <= 0x7a:
+                lower += 1
+
+            # Every remaining printable ASCII character
+            else:
+                punct += 1
+
+            # Presumably characters which are common in Dex descriptors and
+            # identifiers: they get their own counter instead of the
+            # punctuation one. All of them were counted as punctuation just
+            # above, so 'punct' can not become negative.
+            if b in b'$-_</[;':
+                descriptor += 1
+                punct -= 1
+
+        return [ non_printable / length, punct / length, digit / length, upper / length, lower / length,
+                 descriptor / length]
+
+
+    def _build_inputs_and_outputs(self, strings):
+        """Produces inputs and outputs.
+
+        Each [ raw, encrypted ] item gives one normalized input vector and
+        one expected output: [ 1.0 ] for encrypted strings, [ 0.1 ] for
+        the others.
+        """
+
+        inputs = []
+        outputs = []
+
+        for raw, encrypted in strings:
+
+            inputs.append( self._normalize_string(raw) )
+
+            outputs.append( [ 1.0 if encrypted else 0.1 ] )
+
+        return inputs, outputs
+
+
+    def _load_or_build(self, cache, builder):
+        """Load pickled data from 'cache' or create it by calling 'builder'."""
+
+        if os.path.isfile(cache):
+
+            # pickle is not safe against crafted files: the cache has to be
+            # a file written by this very script.
+            with open(cache, 'rb') as fd:
+                return pickle.load(fd)
+
+        data = builder()
+
+        with open(cache, 'wb') as fd:
+            pickle.dump(data, fd)
+
+        return data
+
+
+    def _select_training_samples(self, strings):
+        """Keep at most self._keep encrypted and self._keep plain strings."""
+
+        kept = []
+
+        plain_count = 0
+        encrypted_count = 0
+
+        for raw, encrypted in strings:
+
+            if encrypted:
+                if encrypted_count < self._keep:
+                    kept.append( [ raw, True ] )
+                    encrypted_count += 1
+
+            elif plain_count < self._keep:
+                kept.append( [ raw, False ] )
+                plain_count += 1
+
+            # Stop only once *both* quotas are filled
+            if encrypted_count >= self._keep and plain_count >= self._keep:
+                break
+
+        return kept
+
+
+    def _extract_training_samples(self, filename):
+        """Pick a random and balanced training set from a Dex file."""
+
+        strings = self._get_input_strings(filename)
+
+        random.shuffle(strings)
+
+        return self._select_training_samples(strings)
+
+
+    def get_training_data(self, filename):
+        """Provide some training data.
+
+        With a filename, samples are read from the 'training.data' cache
+        in the current directory, which gets created from the Dex file if
+        missing. Without filename, a small built-in set is used.
+
+        Returns the (inputs, outputs) lists for the training.
+        """
+
+        if filename:
+
+            kept = self._load_or_build('training.data',
+                                       lambda: self._extract_training_samples(filename))
+
+        else:
+
+            kept = [
+                [ b'versionNeededToExtract', False ],
+                [ b'versionNumber', False ],
+                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
+                [ b'vhstxos(~hccyr9gtr~iy(TJXURYD_DRRKHB^G[IPU', True ],
+            ]
+
+        return self._build_inputs_and_outputs(kept)
+
+
+    def get_predict_data(self, filename):
+        """Provide some data to predict.
+
+        With a filename, strings are read from the 'predict.data' cache in
+        the current directory, which gets created from the Dex file if
+        missing. Without filename, a small built-in set is used.
+
+        Returns the strings with their (inputs, outputs) lists.
+        """
+
+        if filename:
+
+            strings = self._load_or_build('predict.data',
+                                          lambda: self._get_input_strings(filename))
+
+        else:
+
+            strings = [
+                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
+                [ b'versionNeededToExtract', False ],
+                [ b'jLoPdKo\x0cbL\x7fGeV%ChVbMe\x0cHcGn', True ],
+                [ b'isWriteComprSizeInZip64ExtraRecord', False ]
+            ]
+
+        inputs, outputs = self._build_inputs_and_outputs(strings)
+
+        return strings, inputs, outputs
+
+
+if __name__ == '__main__':
+
+    # Entry point: train a network with the strings of a first Dex file (or
+    # a built-in set if '-' is given), then guess which strings of a second
+    # Dex file (or built-in set) are encrypted.
+
+    if len(sys.argv) != 3:
+        print('Usage: %s < training dex or - > < predict dex or - >' % sys.argv[0])
+        sys.exit(1)
+
+    training_dex = sys.argv[1] if sys.argv[1] != '-' else None
+    predict_dex = sys.argv[2] if sys.argv[2] != '-' else None
+
+    dl = DeepLearning(15)
+
+    print()
+    print('--- Training ---')
+    print()
+
+    if training_dex:
+        print('Input file:', training_dex)
+
+    inputs, outputs = dl.get_training_data(training_dex)
+
+    # The network sizes its input layer from the first sample
+    if not inputs:
+        print('No training data available.')
+        sys.exit(1)
+
+    nn = NeuralNetwork(inputs)
+
+    x = np.array(inputs, dtype=float)
+    y = np.array(outputs, dtype=float)
+
+    for i in range(100000):
+
+        print('#', i + 1, 'Loss:',  np.mean(np.square(y - nn.forward(x))), end='\r')
+
+        # train() reports when the loss does not evolve anymore
+        if nn.train(x, y):
+            break
+
+    print()
+
+    print()
+    print('--- Predictions ---')
+    print()
+
+    if predict_dex:
+        print('Predict file:', predict_dex)
+
+    strings, inputs, outputs = dl.get_predict_data(predict_dex)
+
+    right = 0
+
+    for (raw, expected), features in zip(strings, inputs):
+
+        sample = np.array(features, dtype=float)
+
+        encrypted = nn.predict(raw, sample)
+
+        if encrypted == expected:
+            right += 1
+
+    # Avoid a division by zero when there was nothing to predict
+    ratio = (right * 100 ) / len(strings) if strings else 0.0
+
+    print()
+    print('Right guessed:', ratio)
+    print()
author	Cyrille Bagard <nocbos@gmail.com>	2018-08-16 18:11:04 (GMT)
committer	Cyrille Bagard <nocbos@gmail.com>	2018-08-16 18:11:04 (GMT)
commit	e0419341c4e0e1bd29ea5f6a48f1b9da6deb106c (patch)
tree	6b5d47d7cd16cc358c2e840b93f7d82826af5f45
parent	cb4bc9b61622803024345538fb55f6781ef872b9 (diff)