diff options
author | Cyrille Bagard <nocbos@gmail.com> | 2018-08-16 18:11:04 (GMT) |
---|---|---|
committer | Cyrille Bagard <nocbos@gmail.com> | 2018-08-16 18:11:04 (GMT) |
commit | e0419341c4e0e1bd29ea5f6a48f1b9da6deb106c (patch) | |
tree | 6b5d47d7cd16cc358c2e840b93f7d82826af5f45 | |
parent | cb4bc9b61622803024345538fb55f6781ef872b9 (diff) |
Used DeepLearning to find encrypted strings.
-rw-r--r-- | python/alla_net.py | 367 |
1 files changed, 367 insertions, 0 deletions
"""Find encrypted strings in Dex files with a tiny neural network.

Contents of python/alla_net.py.

Usage: alla_net.py < training dex or - > < predict dex or - >

Passing '-' instead of a Dex file selects a few built-in sample strings, so
the script can be tried without any input file.
"""

import os.path
import pickle
import random
import sys

import numpy as np


class NeuralNetwork():
    """Feed-forward network: one sigmoid hidden layer, one sigmoid output."""

    def __init__(self, inputs):
        """Initialize the neural network.

        inputs: non-empty sequence of feature vectors; only the length of
        the first vector is used, to size the input layer.

        Raises ValueError if no input is provided.
        """

        if len(inputs) == 0:
            raise ValueError('at least one input vector is required')

        # Parameters
        input_size = len(inputs[0])
        output_size = 1

        # Hidden layer
        # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
        hidden_size = (2 * input_size) // 3 + output_size

        # (mxn) weight matrix from input to hidden layer
        self._w1 = np.random.randn(input_size, hidden_size)

        # (nx1) weight matrix from hidden to output layer
        self._w2 = np.random.randn(hidden_size, output_size)

    def forward(self, x):
        """Forward propagation through the network.

        The intermediate activations are stored on the instance because
        backward() reuses them.
        """

        self._z = np.dot(x, self._w1)
        self._z2 = self.sigmoid(self._z)
        self._z3 = np.dot(self._z2, self._w2)

        return self.sigmoid(self._z3)

    def sigmoid(self, s):
        """Activation function."""

        return 1 / (1 + np.exp(-s))

    def sigmoidPrime(self, s):
        """Derivative of sigmoid, given an already activated value s."""

        return s * (1 - s)

    def backward(self, x, y, o):
        """Backward propagate through the network.

        x: inputs, y: expected outputs, o: outputs computed by forward(x).
        The weights are updated in place; no learning rate is applied.
        """

        # Error in output
        self._o_error = y - o
        self._o_delta = self._o_error * self.sigmoidPrime(o)

        # How much the hidden layer weights contributed to output error ?
        self._z2_error = self._o_delta.dot(self._w2.T)
        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)

        # Adjusting set weights

        # (input --> hidden)
        self._w1 += x.T.dot(self._z2_delta)

        # (hidden --> output)
        self._w2 += self._z2.T.dot(self._o_delta)

    def loss(self, x, y):
        """Return the mean squared error of the network on (x, y)."""

        return np.mean(np.square(y - self.forward(x)))

    def train(self, x, y):
        """Run one training step with samples.

        Returns a true value when the step left the loss strictly
        unchanged, which the caller uses as its stop condition.
        """

        # The network must only rely on its own state here, never on a
        # module-level instance.
        o = self.forward(x)

        old_loss = np.mean(np.square(y - o))

        self.backward(x, y, o)

        return old_loss == self.loss(x, y)

    def predict(self, origin, x):
        """Guess a result with the trained neural network.

        origin is only printed next to the computed output; x is a single
        feature vector. Returns a true value if the output exceeds 0.5.
        """

        got = self.forward(x)[0]

        print('Input:', origin)
        print('Output:', got)

        return got > 0.5


class DeepLearning():
    """Build training and prediction data sets from strings."""

    # Cache files, created in the current working directory.
    _TRAINING_CACHE = 'training.data'
    _PREDICT_CACHE = 'predict.data'

    # Bytes counted in a dedicated "descriptor" feature instead of as
    # punctuation.
    _DESCRIPTOR_BYTES = b'$-_</[;'

    def __init__(self, keep):
        """Build a deep learning system.

        keep: maximum number of training samples kept for each class
        (encrypted / plain).
        """

        self._keep = keep

    def _get_input_strings(self, filename):
        """Grab all plain and encrypted strings from a Dex file.

        Returns a list of [ raw bytes, encrypted flag ] pairs, restricted
        to strings of at least 5 bytes.

        Raises RuntimeError if no instruction exists at the address of a
        dynamic string symbol.
        """

        # Chrysalide is only needed to process real Dex files: importing
        # it here keeps the built-in samples usable without it.
        import pychrysalide  # noqa: F401
        from pychrysalide.analysis.contents import FileContent
        from pychrysalide.analysis import LoadedBinary
        from pychrysalide.arch import ArchInstruction
        from pychrysalide.format import BinSymbol
        from pychrysalide.format.dex import DexFormat

        cnt = FileContent(filename)
        fmt = DexFormat(cnt)
        binary = LoadedBinary(fmt)

        binary.analyze_and_wait()

        encrypted = []

        strings = []

        for sym in binary.format.symbols:

            if sym.target_type == BinSymbol.STP_DYN_STRING:

                ins = binary.processor.find_instr_by_addr(sym.range.addr)

                if not ins:
                    raise RuntimeError('no instruction found for a dynamic string symbol')

                # Presumably the referenced sources are the raw strings
                # decrypted at runtime -- TODO confirm.
                for slink, stype in ins.sources:

                    if stype == ArchInstruction.ILT_REF:
                        encrypted.append(slink.range.addr.phys)

            elif sym.target_type == BinSymbol.STP_RO_STRING:

                strings.append(sym)

        # NOTE(review): physical offsets are collected above, but whole
        # address objects are looked up here; this relies on how
        # pychrysalide compares addresses with integers -- confirm.
        return [ [ s.raw, s.range.addr in encrypted ] for s in strings if len(s.raw) >= 5 ]

    def _normalize_string(self, raw):
        """Produce an input using a common input format.

        Returns six ratios in [0, 1], relative to the length of raw:
        non printable bytes (space included), punctuation, digits,
        uppercase letters, lowercase letters and descriptor bytes.
        An empty string yields six zeros.
        """

        length = len(raw)

        if length == 0:
            return [ 0.0 ] * 6

        non_printable = 0
        punct = 0
        digit = 0
        upper = 0
        lower = 0

        descriptor = 0

        for b in raw:

            # All the bounds are inclusive, so that '9', 'Z', 'z', '/',
            # '@', '`' and '~' land in their real class.
            if 0x30 <= b <= 0x39:
                digit += 1

            elif 0x41 <= b <= 0x5a:
                upper += 1

            elif 0x61 <= b <= 0x7a:
                lower += 1

            elif 0x21 <= b <= 0x7e:
                punct += 1

            else:
                non_printable += 1

            # Every descriptor byte has just been counted as punctuation:
            # move it to its own counter.
            if b in self._DESCRIPTOR_BYTES:
                descriptor += 1
                punct -= 1

        return [ non_printable / length, punct / length, digit / length, upper / length, lower / length,
                 descriptor / length ]

    def _build_inputs_and_outputs(self, strings):
        """Produce inputs and outputs from [ raw, encrypted ] pairs."""

        inputs = []
        outputs = []

        for raw, encrypted in strings:

            inputs.append( self._normalize_string(raw) )

            # NOTE(review): the target for plain strings is 0.1, not 0.0;
            # kept as is -- confirm this is deliberate.
            outputs.append( [ 1.0 if encrypted else 0.1 ] )

        return inputs, outputs

    def _select_balanced(self, strings):
        """Keep at most self._keep samples of each class, in input order."""

        kept = []

        plain_count = 0
        encrypted_count = 0

        for raw, encrypted in strings:

            # Nothing more to collect once both quotas are filled.
            if encrypted_count >= self._keep and plain_count >= self._keep:
                break

            if encrypted:
                if encrypted_count < self._keep:
                    kept.append( [ raw, True ] )
                    encrypted_count += 1

            else:
                if plain_count < self._keep:
                    kept.append( [ raw, False ] )
                    plain_count += 1

        return kept

    def get_training_data(self, filename):
        """Provide some training data.

        filename: Dex file to learn from, or None for built-in samples.

        Returns the (inputs, outputs) lists for the selected samples.
        """

        if not filename:

            kept = [
                [ b'versionNeededToExtract', False ],
                [ b'versionNumber', False ],
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'vhstxos(~hccyr9gtr~iy(TJXURYD_DRRKHB^G[IPU', True ],
            ]

        elif os.path.isfile(self._TRAINING_CACHE):

            # NOTE(review): an existing cache is reused whatever the
            # provided Dex file is.
            # SECURITY: pickle can run arbitrary code; only load a cache
            # file created by this script.
            with open(self._TRAINING_CACHE, 'rb') as fd:
                kept = pickle.load(fd)

        else:

            strings = self._get_input_strings(filename)

            random.shuffle(strings)

            kept = self._select_balanced(strings)

            with open(self._TRAINING_CACHE, 'wb') as fd:
                pickle.dump(kept, fd)

        return self._build_inputs_and_outputs(kept)

    def get_predict_data(self, filename):
        """Provide some data to predict.

        filename: Dex file to inspect, or None for built-in samples.

        Returns (strings, inputs, outputs), where strings holds the
        [ raw, encrypted ] pairs the two other lists are built from.
        """

        if not filename:

            strings = [
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'versionNeededToExtract', False ],
                [ b'jLoPdKo\x0cbL\x7fGeV%ChVbMe\x0cHcGn', True ],
                [ b'isWriteComprSizeInZip64ExtraRecord', False ]
            ]

        elif os.path.isfile(self._PREDICT_CACHE):

            # NOTE(review): an existing cache is reused whatever the
            # provided Dex file is.
            # SECURITY: pickle can run arbitrary code; only load a cache
            # file created by this script.
            with open(self._PREDICT_CACHE, 'rb') as fd:
                strings = pickle.load(fd)

        else:

            strings = self._get_input_strings(filename)

            with open(self._PREDICT_CACHE, 'wb') as fd:
                pickle.dump(strings, fd)

        inputs, outputs = self._build_inputs_and_outputs(strings)

        return strings, inputs, outputs


def main(argv=None):
    """Entry point: train on one data set, then predict on another one."""

    argv = sys.argv if argv is None else argv

    if len(argv) != 3:
        print('Usage: %s < training dex or - > < predict dex or - >' % argv[0])
        sys.exit(1)

    training_dex = argv[1] if argv[1] != '-' else None
    predict_dex = argv[2] if argv[2] != '-' else None

    dl = DeepLearning(15)

    print()
    print('--- Training ---')
    print()

    if training_dex:
        print('Input file:', training_dex)

    inputs, outputs = dl.get_training_data(training_dex)

    if not inputs:
        print('No string available for training.')
        sys.exit(1)

    nn = NeuralNetwork(inputs)

    x = np.array(inputs, dtype=float)
    y = np.array(outputs, dtype=float)

    for i in range(100000):

        print('#', i + 1, 'Loss:', nn.loss(x, y), end='\r')

        # Stop as soon as a training step does not change the loss anymore.
        if nn.train(x, y):
            break

    print()

    print()
    print('--- Predictions ---')
    print()

    if predict_dex:
        print('Predict file:', predict_dex)

    strings, inputs, outputs = dl.get_predict_data(predict_dex)

    right = 0

    for (raw, expected), features in zip(strings, inputs):

        encrypted = nn.predict(raw, np.array(features, dtype=float))

        if encrypted == expected:
            right += 1

    print()

    if strings:
        print('Right guessed:', (right * 100) / len(strings))
    else:
        print('Right guessed: n/a (no string to predict)')

    print()


if __name__ == '__main__':
    main()