import numpy as np import os.path import pickle import pychrysalide from pychrysalide.analysis.contents import FileContent from pychrysalide.analysis import LoadedBinary from pychrysalide.arch import ArchInstruction from pychrysalide.format import BinSymbol from pychrysalide.format.dex import DexFormat import random import sys class NeuralNetwork(): """Neural network.""" def __init__(self, inputs): """Initialize the neural network.""" # Parameters input_size = len(inputs[0]) output_size = 1 # Hidden layer # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw hidden_size = int((2 * input_size) / 3) + output_size # (mxn) weight matrix from input to hidden layer self._w1 = np.random.randn(input_size, hidden_size) # (nx1) weight matrix from hidden to output layer self._w2 = np.random.randn(hidden_size, output_size) def forward(self, x): """Forward propagation through the network.""" self._z = np.dot(x, self._w1) self._z2 = self.sigmoid(self._z) self._z3 = np.dot(self._z2, self._w2) o = self.sigmoid(self._z3) return o def sigmoid(self, s): """Activation function.""" return 1 / (1 + np.exp(-s)) def sigmoidPrime(self, s): """Derivative of sigmoid.""" return s * (1 - s) def backward(self, x, y, o): """Backward propagate through the network.""" # Error in output self._o_error = y - o self._o_delta = self._o_error * self.sigmoidPrime(o) # How much the hidden layer weights contributed to output error? self._z2_error = self._o_delta.dot(self._w2.T) self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2) # Adjusting set weights # (input --> hidden) self._w1 += x.T.dot(self._z2_delta) # (hidden --> output) self._w2 += self._z2.T.dot(self._o_delta) def train(self, x, y): """Train the neural network with samples.""" o = self.forward(x) old_loss = np.mean(np.square(y - nn.forward(x))) self.backward(x, y, o) loss = np.mean(np.square(y - nn.forward(x))) return old_loss == loss def predict(self, origin, x): """Guess a results with the trained neural network.""" got = self.forward(x)[0] print('Input:', origin) print('Output:', got) return got > 0.5 class DeepLearning(): """Deep learning.""" def __init__(self, keep): """Build a deep learning system.""" self._keep = keep def _get_input_strings(self, filename): """Grab all plain and encrypted strings from a Dex file.""" cnt = FileContent(filename) fmt = DexFormat(cnt) binary = LoadedBinary(fmt) binary.analyze_and_wait() encrypted = [] strings = [] for sym in binary.format.symbols: if sym.target_type == BinSymbol.STP_DYN_STRING: ins = binary.processor.find_instr_by_addr(sym.range.addr) assert(ins) for slink, stype in ins.sources: if stype == ArchInstruction.ILT_REF: encrypted.append(slink.range.addr.phys) elif sym.target_type == BinSymbol.STP_RO_STRING: strings.append(sym) final = [] for s in strings: if len(s.raw) < 5: continue final.append( [ s.raw, s.range.addr in encrypted ] ) return final def _vectorize_string(self, raw): """Produce an input using a common input format.""" non_printable = 0 punct = 0 digit = 0 upper = 0 lower = 0 descriptor = 0 for b in raw: if b <= 0x20: non_printable += 1 elif b >= 0x21 and b < 0x2f: punct += 1 elif b >= 0x30 and b < 0x39: digit += 1 elif b >= 0x3a and b < 0x40: punct += 1 elif b >= 0x41 and b < 0x5a: upper += 1 elif b >= 0x5b and b < 0x60: punct += 1 elif b >= 0x61 and b < 0x7a: lower += 1 elif b >= 0x7b and b < 0x7e: punct += 1 else: non_printable += 1 if b in b'$-_ < predict dex or - >' % sys.argv[0]) sys.exit(1) training_dex = sys.argv[1] if sys.argv[1] != '-' else None predict_dex = sys.argv[2] if sys.argv[2] != '-' else None dl = DeepLearning(15) print() print('--- Training ---') print() if training_dex: print('Input file:', training_dex) inputs, outputs = dl.get_training_data(training_dex) nn = NeuralNetwork(inputs) x = np.array(inputs, dtype=float) y = np.array(outputs, dtype=float) for i in range(100000): print('#', i + 1, 'Loss:', np.mean(np.square(y - nn.forward(x))), end='\r') if nn.train(x, y): break print() print() print('--- Predictions ---') print() if predict_dex: print('Predict file:', predict_dex) strings, inputs, outputs = dl.get_predict_data(predict_dex) right = 0 for i in range(len(inputs)): x = np.array(inputs[i], dtype=float) encrypted = nn.predict(strings[i][0], x) if encrypted == strings[i][1]: right += 1 print() print('Right guessed:', (right * 100 ) / len(strings)) print()