"""Spot encrypted strings in Dex files with a small neural network.

The module trains a one-hidden-layer perceptron on the read-only strings of
a Dex file, then uses it to guess which strings of another Dex file are
encrypted.
"""

# Provenance: rebuilt from a whitespace-collapsed rendering of the git patch
#   e0419341c4e0e1bd29ea5f6a48f1b9da6deb106c
#   From: Cyrille Bagard, Thu, 16 Aug 2018 20:11:04 +0200
#   Subject: Used DeepLearning to find encrypted strings.
#   File: python/alla_net.py
#
# NOTE(review): the rendering lost a span of the original file, running from
# the middle of DeepLearning._normalize_string() to the usage message of the
# script.  Every piece rebuilt from call sites instead of from original text
# is flagged with NOTE(review) below and must be checked against the real
# commit.

import numpy as np
# os.path, pickle and random are imported by the original file; their users
# were presumably in the lost span, so the imports are kept untouched.
import os.path
import pickle
import random
import sys


# Bytes counted as "descriptor" characters by _normalize_string().
# NOTE(review): only the leading ``$-_`` of the original literal survived;
# the ``<`` is inferred from where the extraction damage starts and the rest
# of the literal is unknown.
_DESCRIPTOR_BYTES = b'$-_<'


class NeuralNetwork():
    """Feed-forward network with one hidden layer and a single output."""


    def __init__(self, inputs):
        """Size the layers from the first sample and draw random weights.

        inputs -- sequence of feature vectors; only the length of the first
                  one is read, so the sequence must not be empty.
        """

        # Parameters
        input_size = len(inputs[0])
        output_size = 1

        # Hidden layer
        # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
        hidden_size = int((2 * input_size) / 3) + output_size

        # (mxn) weight matrix from input to hidden layer
        self._w1 = np.random.randn(input_size, hidden_size)

        # (nx1) weight matrix from hidden to output layer
        self._w2 = np.random.randn(hidden_size, output_size)


    def forward(self, x):
        """Propagate x through the network and return the output activation.

        The intermediate values are cached on the instance because
        backward() reads the hidden activation (self._z2).
        """

        self._z = np.dot(x, self._w1)
        self._z2 = self.sigmoid(self._z)
        self._z3 = np.dot(self._z2, self._w2)

        return self.sigmoid(self._z3)


    def sigmoid(self, s):
        """Activation function."""

        return 1 / (1 + np.exp(-s))


    def sigmoidPrime(self, s):
        """Derivative of sigmoid, expressed from the sigmoid *output* s."""

        return s * (1 - s)


    def backward(self, x, y, o):
        """Adjust both weight matrices from the error between y and o.

        x -- samples fed to the forward() call which produced o.
        y -- expected outputs.
        o -- outputs returned by forward(x).
        """

        # Error in output
        self._o_error = y - o
        self._o_delta = self._o_error * self.sigmoidPrime(o)

        # How much did the hidden layer weights contribute to output error?
        self._z2_error = self._o_delta.dot(self._w2.T)
        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)

        # (input --> hidden)
        self._w1 += x.T.dot(self._z2_delta)

        # (hidden --> output)
        self._w2 += self._z2.T.dot(self._o_delta)


    def train(self, x, y):
        """Run one training step; return True once the loss stops changing.

        The losses are measured on this very network: the original code went
        through the module-level variable `nn`, which only worked when the
        instance happened to be bound to that global name.
        """

        o = self.forward(x)

        # Same value as a second forward pass: the weights are still untouched.
        old_loss = np.mean(np.square(y - o))

        self.backward(x, y, o)

        loss = np.mean(np.square(y - self.forward(x)))

        return bool(old_loss == loss)


    def predict(self, origin, x):
        """Print the guess for one sample and return True above 0.5.

        origin -- value displayed as the input (the raw string in the script).
        x      -- feature vector of that single sample.
        """

        got = self.forward(x)[0]

        print('Input:', origin)
        print('Output:', got)

        return got > 0.5


class DeepLearning():
    """Turn the strings of Dex files into samples for the neural network."""


    def __init__(self, keep):
        """Build a deep learning system.

        NOTE(review): `keep` is stored as in the original, but the code which
        used it was in the lost span; nothing reads self._keep here.
        """

        self._keep = keep


    def _get_input_strings(self, filename):
        """Grab all plain and encrypted strings from a Dex file.

        Returns a list of [raw bytes, encrypted flag] pairs for the read-only
        strings of at least 5 bytes.
        """

        # The Chrysalide bindings are only needed here; importing them lazily
        # keeps the network usable where they are not installed.
        import pychrysalide
        from pychrysalide.analysis.contents import FileContent
        from pychrysalide.analysis import LoadedBinary
        from pychrysalide.arch import ArchInstruction
        from pychrysalide.format import BinSymbol
        from pychrysalide.format.dex import DexFormat

        cnt = FileContent(filename)
        fmt = DexFormat(cnt)
        binary = LoadedBinary(fmt)

        binary.analyze_and_wait()

        encrypted = []
        strings = []

        for sym in binary.format.symbols:

            if sym.target_type == BinSymbol.STP_DYN_STRING:

                ins = binary.processor.find_instr_by_addr(sym.range.addr)
                assert ins

                # Locations referenced by the instruction at a dynamic string
                # symbol are recorded as encrypted ones.
                for slink, stype in ins.sources:
                    if stype == ArchInstruction.ILT_REF:
                        encrypted.append(slink.range.addr.phys)

            elif sym.target_type == BinSymbol.STP_RO_STRING:

                strings.append(sym)

        # NOTE(review): the membership test compares an address object with
        # the physical offsets collected above; kept as in the original --
        # confirm the bindings compare these as intended.
        return [[s.raw, s.range.addr in encrypted]
                for s in strings if len(s.raw) >= 5]


    def _normalize_string(self, raw):
        """Produce an input using a common input format.

        raw -- bytes of the string.

        Returns six ratios in [0, 1]: non-printable, punctuation, digit,
        upper case, lower case and descriptor bytes, each over len(raw).

        NOTE(review): the counters come from the original code, but its
        return statement was lost; the ratio vector is a reconstruction.
        """

        non_printable = 0
        punct = 0
        digit = 0
        upper = 0
        lower = 0

        descriptor = 0

        for b in raw:

            # Inclusive bounds: the original exclusive upper bounds dropped
            # '/', '9', '@', 'Z', '`', 'z' and '~' into the non-printable
            # bucket.
            if b <= 0x20 or b >= 0x7f:
                non_printable += 1

            elif 0x30 <= b <= 0x39:
                digit += 1

            elif 0x41 <= b <= 0x5a:
                upper += 1

            elif 0x61 <= b <= 0x7a:
                lower += 1

            else:
                punct += 1

            if b in _DESCRIPTOR_BYTES:
                descriptor += 1

        counters = [non_printable, punct, digit, upper, lower, descriptor]

        total = len(raw)

        if total == 0:
            return [0.0] * len(counters)

        return [c / total for c in counters]


    def _build_samples(self, strings):
        """Convert [raw, encrypted] pairs into network inputs and outputs."""

        inputs = [self._normalize_string(raw) for raw, _ in strings]
        outputs = [[1 if flag else 0] for _, flag in strings]

        return inputs, outputs


    def get_training_data(self, filename):
        """Return (inputs, outputs) built from the strings of a Dex file.

        NOTE(review): original body lost; rebuilt from the way the script
        unpacks and feeds the result to the network.
        """

        return self._build_samples(self._get_input_strings(filename))


    def get_predict_data(self, filename):
        """Return (strings, inputs, outputs) for the strings of a Dex file.

        NOTE(review): original body lost; rebuilt from the way the script
        indexes strings[i][0] (raw bytes) and strings[i][1] (expected flag).
        """

        strings = self._get_input_strings(filename)

        inputs, outputs = self._build_samples(strings)

        return strings, inputs, outputs


def main(argv=None):
    """Train on one Dex file, then rate the guesses made on another one.

    argv -- argument vector (program name first); defaults to sys.argv.
            Each of the two file arguments may be '-' to skip that stage.
    """

    if argv is None:
        argv = sys.argv

    # NOTE(review): the argument check and the beginning of the usage string
    # were lost; rebuilt from the surviving end of the message.
    if len(argv) != 3:
        print('Usage: %s <training dex or - > < predict dex or - >' % argv[0])
        sys.exit(1)

    training_dex = argv[1] if argv[1] != '-' else None
    predict_dex = argv[2] if argv[2] != '-' else None

    dl = DeepLearning(15)

    nn = None

    print()
    print('--- Training ---')
    print()

    if training_dex:
        print('Input file:', training_dex)

        inputs, outputs = dl.get_training_data(training_dex)

        if not inputs:
            print('No usable string found in', training_dex, file=sys.stderr)
            sys.exit(1)

        nn = NeuralNetwork(inputs)

        x = np.array(inputs, dtype=float)
        y = np.array(outputs, dtype=float)

        for i in range(100000):

            print('#', i + 1, 'Loss:', np.mean(np.square(y - nn.forward(x))), end='\r')

            if nn.train(x, y):
                break

        print()

    print()
    print('--- Predictions ---')
    print()

    if predict_dex:
        print('Predict file:', predict_dex)

        # The original reached this point with `nn` unbound when the
        # training stage had been skipped.
        if nn is None:
            print('No trained network: a training file is required.',
                  file=sys.stderr)
            sys.exit(1)

        strings, inputs, _ = dl.get_predict_data(predict_dex)

        right = 0

        for (raw, expected), features in zip(strings, inputs):

            encrypted = nn.predict(raw, np.array(features, dtype=float))

            if encrypted == expected:
                right += 1

        print()
        if strings:
            print('Right guessed:', (right * 100 ) / len(strings))
        else:
            # Avoids the division by zero of the original on an empty list.
            print('Right guessed: n/a (no string found)')
        print()


if __name__ == '__main__':
    main()