summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Cyrille Bagard <nocbos@gmail.com> 2018-08-16 18:11:04 (GMT)
committer Cyrille Bagard <nocbos@gmail.com> 2018-08-16 18:11:04 (GMT)
commit e0419341c4e0e1bd29ea5f6a48f1b9da6deb106c (patch)
tree 6b5d47d7cd16cc358c2e840b93f7d82826af5f45
parent cb4bc9b61622803024345538fb55f6781ef872b9 (diff)
Used DeepLearning to find encrypted strings.
-rw-r--r-- python/alla_net.py 367
1 files changed, 367 insertions, 0 deletions
diff --git a/python/alla_net.py b/python/alla_net.py
new file mode 100644
index 0000000..0ff0da1
--- /dev/null
+++ b/python/alla_net.py
@@ -0,0 +1,367 @@
+
+import numpy as np
+import os.path
+import pickle
+import pychrysalide
+from pychrysalide.analysis.contents import FileContent
+from pychrysalide.analysis import LoadedBinary
+from pychrysalide.arch import ArchInstruction
+from pychrysalide.format import BinSymbol
+from pychrysalide.format.dex import DexFormat
+import random
+import sys
+
+
class NeuralNetwork():
    """Small feed-forward network with one hidden layer and one output.

    Both layers use a sigmoid activation. There are no bias terms and no
    learning-rate factor: each training step adds the raw delta products
    to the weight matrices.
    """

    def __init__(self, inputs):
        """Initialize the neural network.

        Args:
            inputs: non-empty sequence of feature vectors. Only the length
                of the first vector is read, to size the input layer.

        Raises:
            IndexError: if ``inputs`` is empty.
        """

        # Parameters
        input_size = len(inputs[0])
        output_size = 1

        # Hidden layer
        # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
        hidden_size = int((2 * input_size) / 3) + output_size

        # (mxn) weight matrix from input to hidden layer
        self._w1 = np.random.randn(input_size, hidden_size)

        # (nx1) weight matrix from hidden to output layer
        self._w2 = np.random.randn(hidden_size, output_size)

    def forward(self, x):
        """Forward propagation through the network.

        The intermediate products are stored on the instance because
        ``backward()`` reads the hidden activation ``self._z2``.

        Args:
            x: array of feature vectors (or a single feature vector).

        Returns:
            The sigmoid output of the last layer.
        """

        self._z = np.dot(x, self._w1)
        self._z2 = self.sigmoid(self._z)
        self._z3 = np.dot(self._z2, self._w2)

        return self.sigmoid(self._z3)

    def sigmoid(self, s):
        """Activation function: 1 / (1 + exp(-s))."""

        return 1 / (1 + np.exp(-s))

    def sigmoidPrime(self, s):
        """Derivative of sigmoid.

        ``s`` must already be a sigmoid *output* (an activation), not the
        pre-activation value: the derivative is expressed as s * (1 - s).
        """

        return s * (1 - s)

    def backward(self, x, y, o):
        """Backward propagate through the network and update both weight matrices.

        Args:
            x: inputs given to the ``forward()`` call which produced ``o``.
            y: expected outputs.
            o: outputs returned by ``forward(x)``.
        """

        # Error in output
        self._o_error = y - o
        self._o_delta = self._o_error * self.sigmoidPrime(o)

        # How much did the hidden layer weights contribute to the output error?
        self._z2_error = self._o_delta.dot(self._w2.T)
        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)

        # Adjusting set weights

        # (input --> hidden)
        self._w1 += x.T.dot(self._z2_delta)

        # (hidden --> output)
        self._w2 += self._z2.T.dot(self._o_delta)

    def train(self, x, y):
        """Run one training step on the given samples.

        Args:
            x: 2-D array of feature vectors.
            y: 2-D array of expected outputs, one row per sample.

        Returns:
            True when the mean squared error is exactly the same before and
            after the weight update, which callers use as a stop signal.
        """

        o = self.forward(x)

        # The losses are computed on this very instance; relying on a
        # module-level variable here would break any caller that does not
        # happen to store its network under one specific global name.
        old_loss = np.mean(np.square(y - o))

        self.backward(x, y, o)

        loss = np.mean(np.square(y - self.forward(x)))

        return old_loss == loss

    def predict(self, origin, x):
        """Guess a result with the trained neural network.

        Args:
            origin: original value the features were built from; only printed.
            x: feature vector to classify.

        Returns:
            True when the first output value is above 0.5.
        """

        got = self.forward(x)[0]

        print('Input:', origin)
        print('Output:', got)

        return got > 0.5
+
+
class DeepLearning():
    """Build training and prediction data sets of plain/encrypted strings.

    Strings are either extracted from a Dex file or taken from a few
    built-in samples, then turned into fixed-size feature vectors.
    """

    # Bytes counted in their own "descriptor" feature instead of as
    # punctuation (presumably characters common in Dex descriptors and
    # generated names -- TODO confirm).
    _DESCRIPTOR_BYTES = b'$-_</[;'

    def __init__(self, keep):
        """Build a deep learning system.

        Args:
            keep: maximum number of samples kept for each class (plain and
                encrypted) when building the training set from a file.
        """

        self._keep = keep

    def _get_input_strings(self, filename):
        """Grab all plain and encrypted strings from a Dex file.

        Args:
            filename: path of the Dex file to analyze.

        Returns:
            A list of ``[raw, encrypted]`` pairs for every read-only string
            symbol with at least 5 raw bytes.
        """

        cnt = FileContent(filename)
        fmt = DexFormat(cnt)
        binary = LoadedBinary(fmt)

        binary.analyze_and_wait()

        encrypted = []
        strings = []

        for sym in binary.format.symbols:

            if sym.target_type == BinSymbol.STP_DYN_STRING:

                ins = binary.processor.find_instr_by_addr(sym.range.addr)
                assert(ins)

                # Every referencing source of the instruction marks an
                # encrypted string location.
                for slink, stype in ins.sources:
                    if stype == ArchInstruction.ILT_REF:
                        encrypted.append(slink.range.addr.phys)

            elif sym.target_type == BinSymbol.STP_RO_STRING:

                strings.append(sym)

        # NOTE(review): 'encrypted' holds addr.phys values while the lookup
        # below uses the addr object itself; this relies on the address type
        # comparing equal to its physical offset -- confirm.
        return [
            [s.raw, s.range.addr in encrypted]
            for s in strings
            if len(s.raw) >= 5
        ]

    def _normalize_string(self, raw):
        """Produce an input using a common input format.

        Args:
            raw: bytes of the string to describe.

        Returns:
            A list of six ratios over the string length, in this order:
            non printable (including space), punctuation, digits, upper
            case letters, lower case letters, descriptor characters.
            An empty input gives six zeros.
        """

        length = len(raw)

        if length == 0:
            # Nothing to measure, and no division by zero.
            return [0.0] * 6

        non_printable = 0
        punct = 0
        digit = 0
        upper = 0
        lower = 0
        descriptor = 0

        for b in raw:

            if b in self._DESCRIPTOR_BYTES:
                # Counted apart from the regular punctuation.
                descriptor += 1

            elif b <= 0x20 or b >= 0x7f:
                non_printable += 1

            # All ranges are inclusive, so that '9', 'Z', 'z', '/', '@',
            # '`' and '~' land in their natural class.
            elif 0x30 <= b <= 0x39:
                digit += 1

            elif 0x41 <= b <= 0x5a:
                upper += 1

            elif 0x61 <= b <= 0x7a:
                lower += 1

            else:
                # Remaining printable ASCII: 0x21-0x2f, 0x3a-0x40,
                # 0x5b-0x60 and 0x7b-0x7e.
                punct += 1

        return [non_printable / length, punct / length, digit / length,
                upper / length, lower / length, descriptor / length]

    def _build_inputs_and_outputs(self, strings):
        """Produce inputs and outputs.

        Args:
            strings: iterable of ``[raw, encrypted]`` pairs.

        Returns:
            A ``(inputs, outputs)`` tuple: one feature vector and one
            single-value target per string.
        """

        inputs = []
        outputs = []

        for raw, encrypted in strings:

            inputs.append(self._normalize_string(raw))

            # NOTE(review): the target for plain strings is 0.1, not 0.0;
            # kept as is, confirm that this is intended.
            outputs.append([1.0 if encrypted else 0.1])

        return inputs, outputs

    def _select_balanced(self, strings):
        """Keep at most ``self._keep`` samples of each class, in input order."""

        kept = []

        plain_count = 0
        encrypted_count = 0

        for raw, encrypted in strings:

            if encrypted:
                if encrypted_count < self._keep:
                    kept.append([raw, True])
                    encrypted_count += 1

            elif plain_count < self._keep:
                kept.append([raw, False])
                plain_count += 1

            # Stop only once *both* quotas are full; stopping as soon as the
            # encrypted one is full would starve the plain class.
            if encrypted_count >= self._keep and plain_count >= self._keep:
                break

        return kept

    def get_training_data(self, filename):
        """Provide some training data.

        Args:
            filename: Dex file to extract strings from, or a false value to
                use the built-in samples.

        Returns:
            The ``(inputs, outputs)`` tuple built from the kept samples.

        When a filename is given, the selection is cached in the
        'training.data' file of the current directory and reloaded from
        there on later calls, whatever the filename is.
        """

        if not filename:

            kept = [
                [b'versionNeededToExtract', False],
                [b'versionNumber', False],
                [b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True],
                [b'vhstxos(~hccyr9gtr~iy(TJXURYD_DRRKHB^G[IPU', True],
            ]

        elif os.path.isfile('training.data'):

            # NOTE: pickle must only be used on trusted files; this cache is
            # expected to have been written by this very script.
            with open('training.data', 'rb') as fd:
                kept = pickle.load(fd)

        else:

            strings = self._get_input_strings(filename)

            random.shuffle(strings)

            kept = self._select_balanced(strings)

            with open('training.data', 'wb') as fd:
                pickle.dump(kept, fd)

        return self._build_inputs_and_outputs(kept)

    def get_predict_data(self, filename):
        """Provide some data to predict.

        Args:
            filename: Dex file to extract strings from, or a false value to
                use the built-in samples.

        Returns:
            A ``(strings, inputs, outputs)`` tuple, where ``strings`` is the
            list of ``[raw, encrypted]`` pairs the other two were built from.

        When a filename is given, the strings are cached in the
        'predict.data' file of the current directory and reloaded from
        there on later calls, whatever the filename is.
        """

        if not filename:

            strings = [
                [b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True],
                [b'versionNeededToExtract', False],
                [b'jLoPdKo\x0cbL\x7fGeV%ChVbMe\x0cHcGn', True],
                [b'isWriteComprSizeInZip64ExtraRecord', False]
            ]

        elif os.path.isfile('predict.data'):

            # NOTE: pickle must only be used on trusted files; this cache is
            # expected to have been written by this very script.
            with open('predict.data', 'rb') as fd:
                strings = pickle.load(fd)

        else:

            strings = self._get_input_strings(filename)

            with open('predict.data', 'wb') as fd:
                pickle.dump(strings, fd)

        inputs, outputs = self._build_inputs_and_outputs(strings)

        return strings, inputs, outputs
+
+
if __name__ == '__main__':
    # Entry point: train a network on the strings of a first Dex file, then
    # classify the strings of a second one. A '-' argument selects the
    # built-in samples instead of a file.

    if len(sys.argv) != 3:
        print('Usage: %s < training dex or - > < predict dex or - >' % sys.argv[0])
        sys.exit(1)

    training_dex = sys.argv[1] if sys.argv[1] != '-' else None
    predict_dex = sys.argv[2] if sys.argv[2] != '-' else None

    # Keep at most 15 samples of each class for the training.
    dl = DeepLearning(15)

    print()
    print('--- Training ---')
    print()

    if training_dex:
        print('Input file:', training_dex)

    inputs, outputs = dl.get_training_data(training_dex)

    nn = NeuralNetwork(inputs)

    x = np.array(inputs, dtype=float)
    y = np.array(outputs, dtype=float)

    for i in range(100000):

        # '\r' keeps the progress report on a single terminal line.
        print('#', i + 1, 'Loss:', np.mean(np.square(y - nn.forward(x))), end='\r')

        # train() returns True once the loss does not move anymore.
        if nn.train(x, y):
            break

    print()

    print()
    print('--- Predictions ---')
    print()

    if predict_dex:
        print('Predict file:', predict_dex)

    strings, inputs, outputs = dl.get_predict_data(predict_dex)

    right = 0

    for (raw, expected), features in zip(strings, inputs):

        x = np.array(features, dtype=float)

        encrypted = nn.predict(raw, x)

        if encrypted == expected:
            right += 1

    total = len(strings)

    print()
    # Without any string to classify, report 0 instead of dividing by zero.
    print('Right guessed:', (right * 100) / total if total else 0.0)
    print()