path: root/python/
diff options
authorCyrille Bagard <>2018-08-16 18:11:04 (GMT)
committerCyrille Bagard <>2018-08-16 18:11:04 (GMT)
commite0419341c4e0e1bd29ea5f6a48f1b9da6deb106c (patch)
tree6b5d47d7cd16cc358c2e840b93f7d82826af5f45 /python/
parentcb4bc9b61622803024345538fb55f6781ef872b9 (diff)
Used DeepLearning to find encrypted strings.
Diffstat (limited to 'python/')
1 files changed, 367 insertions, 0 deletions
diff --git a/python/ b/python/
new file mode 100644
index 0000000..0ff0da1
--- /dev/null
+++ b/python/
@@ -0,0 +1,367 @@
+import numpy as np
+import os.path
+import pickle
+import pychrysalide
+from pychrysalide.analysis.contents import FileContent
+from pychrysalide.analysis import LoadedBinary
+from pychrysalide.arch import ArchInstruction
+from pychrysalide.format import BinSymbol
+from pychrysalide.format.dex import DexFormat
+import random
+import sys
class NeuralNetwork():
    """Feed-forward neural network with one hidden layer and one output.

    Both layers use the sigmoid activation and no bias term. Weights are
    adjusted by plain gradient descent on the squared error, with an
    implicit learning rate of 1.
    """

    def __init__(self, inputs):
        """Initialize the neural network with random weights.

        Args:
            inputs: non-empty sequence of samples; only the length of the
                first sample is read, to size the input layer.
        """

        # Parameters
        input_size = len(inputs[0])
        output_size = 1

        # Hidden layer: two thirds of the input size, plus the output size
        hidden_size = int((2 * input_size) / 3) + output_size

        # (input_size x hidden_size) weight matrix from input to hidden layer
        self._w1 = np.random.randn(input_size, hidden_size)

        # (hidden_size x 1) weight matrix from hidden to output layer
        self._w2 = np.random.randn(hidden_size, output_size)

    def forward(self, x):
        """Forward propagation through the network.

        The intermediate activations are stored on the instance because
        backward() needs the hidden layer output of the latest pass.

        Args:
            x: array of samples (one row per sample) or a single sample.

        Returns:
            The network output, each value lying in (0, 1).
        """

        # NOTE(review): the left operands of both products were truncated in
        # the reviewed source; restored as the usual layer-by-layer products.
        self._z = np.dot(x, self._w1)
        self._z2 = self.sigmoid(self._z)
        self._z3 = np.dot(self._z2, self._w2)

        return self.sigmoid(self._z3)

    def sigmoid(self, s):
        """Activation function."""

        return 1 / (1 + np.exp(-s))

    def sigmoidPrime(self, s):
        """Derivative of sigmoid, expressed from the sigmoid *output* s."""

        return s * (1 - s)

    def backward(self, x, y, o):
        """Backward propagate through the network and update the weights.

        Must be called right after forward(x), as it relies on the hidden
        layer activations stored by that call.

        Args:
            x: 2-D array of samples given to forward().
            y: expected outputs, one row per sample.
            o: outputs returned by forward(x).
        """

        # Error in output
        self._o_error = y - o
        self._o_delta = self._o_error * self.sigmoidPrime(o)

        # How much the hidden layer weights contributed to output error ?
        # NOTE(review): the right-hand sides of the next assignment and of
        # both weight updates were truncated in the reviewed source; they are
        # restored as the standard backpropagation formulas -- confirm
        # against the upstream file.
        self._z2_error = self._o_delta.dot(self._w2.T)
        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)

        # Adjusting set weights

        # (input --> hidden)
        self._w1 += x.T.dot(self._z2_delta)

        # (hidden --> output)
        self._w2 += self._z2.T.dot(self._o_delta)

    def train(self, x, y):
        """Train the neural network with samples for one iteration.

        Args:
            x: 2-D array of samples.
            y: expected outputs, one row per sample.

        Returns:
            True when the mean squared error did not change at all during
            this iteration, which callers use as a stop condition.
        """

        o = self.forward(x)
        old_loss = np.mean(np.square(y - o))

        self.backward(x, y, o)

        # Use self here: relying on a module-level 'nn' variable only works
        # when the caller happens to give its instance that exact name.
        loss = np.mean(np.square(y - self.forward(x)))

        return old_loss == loss

    def predict(self, origin, x):
        """Guess a result with the trained neural network.

        Args:
            origin: original value behind the sample, only printed.
            x: 1-D array of features for a single sample.

        Returns:
            True if the network output is strictly greater than 0.5.
        """

        got = self.forward(x)[0]

        print('Input:', origin)
        print('Output:', got)

        return got > 0.5
class DeepLearning():
    """Build training and prediction data sets out of Dex strings.

    Each string is reduced to a fixed-size vector of character class
    ratios, paired with an expected output telling if it is encrypted.
    """

    # Bytes counted as "descriptor" characters rather than punctuation.
    _DESCRIPTOR_BYTES = b'$-_</[;'

    def __init__(self, keep, training_cache='training.pickle',
                 predict_cache='predict.pickle'):
        """Build a deep learning system.

        Args:
            keep: maximum number of encrypted strings, and of plain
                strings, kept for training.
            training_cache: file used to store the selected training
                strings between runs.
            predict_cache: file used to store the strings to predict
                between runs.

        NOTE(review): the cache file names were empty in the reviewed
        source, which makes open() fail; the defaults above are
        placeholders -- confirm the intended names.
        """

        self._keep = keep
        self._training_cache = training_cache
        self._predict_cache = predict_cache

    def _get_input_strings(self, filename):
        """Grab all plain and encrypted strings from a Dex file.

        Args:
            filename: path to the Dex file to analyze.

        Returns:
            A list of [ raw bytes, encrypted flag ] items, for all
            read-only strings of at least 5 bytes.
        """

        cnt = FileContent(filename)
        fmt = DexFormat(cnt)

        binary = LoadedBinary(fmt)
        binary.analyze_and_wait()

        encrypted = []
        strings = []

        for sym in binary.format.symbols:

            if sym.target_type == BinSymbol.STP_DYN_STRING:

                ins = binary.processor.find_instr_by_addr(sym.range.addr)
                assert ins

                # Remember each location referenced by the instruction
                for slink, stype in ins.sources:
                    if stype == ArchInstruction.ILT_REF:
                        encrypted.append(slink.range.addr.phys)

            elif sym.target_type == BinSymbol.STP_RO_STRING:
                strings.append(sym)

        # NOTE(review): 'encrypted' holds physical offsets while membership
        # is tested with the address object itself; this presumably relies
        # on the address type comparing equal to an offset -- confirm.
        return [ [ s.raw, s.range.addr in encrypted ]
                 for s in strings if len(s.raw) >= 5 ]

    def _normalize_string(self, raw):
        """Produce an input using a common input format.

        Args:
            raw: bytes of the string to describe.

        Returns:
            Six ratios relative to the length of raw, in this order:
            non-printable, punctuation, digit, uppercase, lowercase and
            descriptor bytes. All ratios are 0.0 for an empty input.
        """

        length = len(raw)

        # Nothing to measure, and no division by zero
        if length == 0:
            return [ 0.0 ] * 6

        non_printable = 0
        punct = 0
        digit = 0
        upper = 0
        lower = 0
        descriptor = 0

        for b in raw:

            # All upper bounds are inclusive, so that '/', '9', '@', 'Z',
            # '`', 'z' and '~' land in their real class instead of being
            # counted as non-printable.
            if b <= 0x20:
                non_printable += 1
            elif b <= 0x2f:
                punct += 1
            elif b <= 0x39:
                digit += 1
            elif b <= 0x40:
                punct += 1
            elif b <= 0x5a:
                upper += 1
            elif b <= 0x60:
                punct += 1
            elif b <= 0x7a:
                lower += 1
            elif b <= 0x7e:
                punct += 1
            else:
                non_printable += 1

            # Descriptor bytes were all counted as punctuation just above:
            # move them to their own counter.
            if b in self._DESCRIPTOR_BYTES:
                descriptor += 1
                punct -= 1

        return [ non_printable / length, punct / length, digit / length,
                 upper / length, lower / length, descriptor / length ]

    def _build_inputs_and_outputs(self, strings):
        """Produce inputs and outputs.

        Args:
            strings: iterable of [ raw bytes, encrypted flag ] items.

        Returns:
            A tuple (inputs, outputs): one feature vector and one
            single-value expected output (1.0 if encrypted, 0.1 otherwise)
            per string.
        """

        inputs = []
        outputs = []

        for raw, encrypted in strings:
            inputs.append( self._normalize_string(raw) )
            outputs.append( [ 1.0 if encrypted else 0.1 ] )

        return inputs, outputs

    def _load_or_build(self, cache, build):
        """Load pickled data from a cache file, or build and store it."""

        if os.path.isfile(cache):

            # NOTE: pickle must only be fed with trusted data; the cache is
            # assumed to have been written by a previous run of this script.
            with open(cache, 'rb') as fd:
                return pickle.load(fd)

        data = build()

        with open(cache, 'wb') as fd:
            pickle.dump(data, fd)

        return data

    def _pick_training_strings(self, filename):
        """Select at most 'keep' encrypted and 'keep' plain random strings."""

        strings = self._get_input_strings(filename)
        random.shuffle(strings)

        kept = []

        plain_count = 0
        encrypted_count = 0

        for raw, encrypted in strings:

            if encrypted:
                if encrypted_count < self._keep:
                    kept.append( [ raw, True ] )
                    encrypted_count += 1

            else:
                if plain_count < self._keep:
                    kept.append( [ raw, False ] )
                    plain_count += 1

            # Stop only once both quotas are full; stopping as soon as the
            # encrypted one is full would starve the plain samples.
            if encrypted_count >= self._keep and plain_count >= self._keep:
                break

        return kept

    def get_training_data(self, filename):
        """Provide some training data.

        Args:
            filename: Dex file to extract strings from, or a false value to
                rely on a small built-in data set. If the training cache
                file already exists, its content is used instead and the
                Dex file is not analyzed.

        Returns:
            A tuple (inputs, outputs) as built by
            _build_inputs_and_outputs().
        """

        if filename:
            kept = self._load_or_build(self._training_cache,
                                       lambda: self._pick_training_strings(filename))

        else:
            kept = [
                [ b'versionNeededToExtract', False ],
                [ b'versionNumber', False ],
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'vhstxos(~hccyr9gtr~iy(TJXURYD_DRRKHB^G[IPU', True ],
            ]

        return self._build_inputs_and_outputs(kept)

    def get_predict_data(self, filename):
        """Provide some data to predict.

        Args:
            filename: Dex file to extract strings from, or a false value to
                rely on a small built-in data set. If the predict cache
                file already exists, its content is used instead and the
                Dex file is not analyzed.

        Returns:
            A tuple (strings, inputs, outputs), where strings is the list
            of [ raw bytes, encrypted flag ] items the two other lists were
            built from.
        """

        if filename:
            strings = self._load_or_build(self._predict_cache,
                                          lambda: self._get_input_strings(filename))

        else:
            strings = [
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'versionNeededToExtract', False ],
                [ b'jLoPdKo\x0cbL\x7fGeV%ChVbMe\x0cHcGn', True ],
                [ b'isWriteComprSizeInZip64ExtraRecord', False ]
            ]

        inputs, outputs = self._build_inputs_and_outputs(strings)

        return strings, inputs, outputs
if __name__ == '__main__':

    # Entry point: train a network with the strings of a first Dex file,
    # then measure its guesses on the strings of a second one. A '-'
    # argument selects the matching built-in data set instead of a file.

    if len(sys.argv) != 3:
        print('Usage: %s < training dex or - > < predict dex or - >' % sys.argv[0])
        sys.exit(1)

    training_dex = sys.argv[1] if sys.argv[1] != '-' else None
    predict_dex = sys.argv[2] if sys.argv[2] != '-' else None

    dl = DeepLearning(15)

    print()
    print('--- Training ---')
    print()

    if training_dex:
        print('Input file:', training_dex)

    inputs, outputs = dl.get_training_data(training_dex)

    # The network is sized from the first sample: refuse an empty data set
    if not inputs:
        print('No training data available.')
        sys.exit(1)

    # Keep the 'nn' name at module level: it is the instance all the
    # training and prediction steps below work on.
    nn = NeuralNetwork(inputs)

    x = np.array(inputs, dtype=float)
    y = np.array(outputs, dtype=float)

    for i in range(100000):

        print('#', i + 1, 'Loss:', np.mean(np.square(y - nn.forward(x))), end='\r')

        # train() reports when the loss has stopped changing
        if nn.train(x, y):
            break

    print()
    print()
    print('--- Predictions ---')
    print()

    if predict_dex:
        print('Predict file:', predict_dex)

    strings, inputs, outputs = dl.get_predict_data(predict_dex)

    right = 0

    for (raw, expected), features in zip(strings, inputs):

        encrypted = nn.predict(raw, np.array(features, dtype=float))

        if encrypted == expected:
            right += 1

    print()

    # No ratio can be computed without any string to predict
    if strings:
        print('Right guessed:', (right * 100 ) / len(strings))
    else:
        print('No string to predict.')

    print()