# summary | refs | log | tree | commit | diff
# path: root/python/alla_net.py
# blob: 02c5f8d1e856f667a6be9e8500f5cf89a949193d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367

import numpy as np
import os.path
import pickle
import pychrysalide
from pychrysalide.analysis.contents import FileContent
from pychrysalide.analysis import LoadedBinary
from pychrysalide.arch import ArchInstruction
from pychrysalide.format import BinSymbol
from pychrysalide.format.dex import DexFormat
import random
import sys


class NeuralNetwork():
    """Feed-forward neural network with one hidden layer and one output.

    Both layers use a sigmoid activation. There are no bias terms and the
    weight updates are applied without any learning-rate factor.
    """


    def __init__(self, inputs):
        """Initialize the neural network with random weights.

        Args:
            inputs: non-empty sequence of feature vectors; only the length
                of the first vector is read, to size the input layer.
        """

        # Parameters
        input_size = len(inputs[0])
        output_size = 1

        # Hidden layer
        # See https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
        hidden_size = int((2 * input_size) / 3) + output_size

        # (input_size x hidden_size) weight matrix from input to hidden layer
        self._w1 = np.random.randn(input_size, hidden_size)

        # (hidden_size x 1) weight matrix from hidden to output layer
        self._w2 = np.random.randn(hidden_size, output_size)


    def forward(self, x):
        """Forward propagation through the network.

        The intermediate activations are stored on the instance because
        backward() reads the hidden layer output (self._z2).

        Args:
            x: array of samples whose last dimension matches the input size.

        Returns:
            The sigmoid-activated output of the network for x.
        """

        self._z = np.dot(x, self._w1)
        self._z2 = self.sigmoid(self._z)
        self._z3 = np.dot(self._z2, self._w2)

        return self.sigmoid(self._z3)


    def sigmoid(self, s):
        """Activation function: 1 / (1 + exp(-s))."""

        return 1 / (1 + np.exp(-s))


    def sigmoidPrime(self, s):
        """Derivative of sigmoid.

        Args:
            s: a value already produced by sigmoid(), not the raw
                pre-activation input.
        """

        return s * (1 - s)


    def backward(self, x, y, o):
        """Backward propagate through the network and update both weights.

        Must be called after forward(x), since it relies on the hidden
        activations stored by that call.

        Args:
            x: samples given to forward().
            y: expected outputs for x.
            o: outputs returned by forward(x).
        """

        # Error in output
        self._o_error = y - o
        self._o_delta = self._o_error * self.sigmoidPrime(o)

        # How much the hidden layer weights contributed to output error?
        self._z2_error = self._o_delta.dot(self._w2.T)
        self._z2_delta = self._z2_error * self.sigmoidPrime(self._z2)

        # Adjusting set weights

        # (input --> hidden)
        self._w1 += x.T.dot(self._z2_delta)

        # (hidden --> output)
        self._w2 += self._z2.T.dot(self._o_delta)


    def train(self, x, y):
        """Train the neural network with samples (one full-batch step).

        Args:
            x: 2-D array of samples.
            y: 2-D array of expected outputs, one row per sample.

        Returns:
            A true value when the mean squared error is exactly the same
            before and after the update, i.e. when training has stalled.
        """

        o = self.forward(x)

        # The loss before the update comes straight from the outputs that
        # were just computed; the network being trained is always self.
        old_loss = np.mean(np.square(y - o))

        self.backward(x, y, o)

        loss = np.mean(np.square(y - self.forward(x)))

        return old_loss == loss


    def predict(self, origin, x):
        """Guess a result with the trained neural network.

        Args:
            origin: human-readable form of the sample, only printed.
            x: 1-D feature vector of the sample.

        Returns:
            A true value when the network output is above 0.5.
        """

        got = self.forward(x)[0]

        print('Input:', origin)
        print('Output:', got)

        return got > 0.5


class DeepLearning():
    """Build training and prediction samples out of Dex strings.

    Each string is turned into a vector of six character-class ratios,
    paired with a label telling whether the string is encrypted.
    """


    def __init__(self, keep):
        """Build a deep learning system.

        Args:
            keep: maximum number of samples of each kind (encrypted and
                plain) retained when building training data from a file.
        """

        self._keep = keep


    def _get_input_strings(self, filename):
        """Grab all plain and encrypted strings from a Dex file.

        Args:
            filename: path of the Dex file to analyze.

        Returns:
            A list of [ raw bytes, encrypted flag ] pairs, one for each
            read-only string of at least 5 bytes.
        """

        cnt = FileContent(filename)
        fmt = DexFormat(cnt)
        binary = LoadedBinary(fmt)

        binary.analyze_and_wait()

        encrypted = []

        strings = []

        for sym in binary.format.symbols:

            if sym.target_type == BinSymbol.STP_DYN_STRING:

                ins = binary.processor.find_instr_by_addr(sym.range.addr)
                assert(ins)

                for slink, stype in ins.sources:

                    if stype == ArchInstruction.ILT_REF:

                        encrypted.append(slink.range.addr.phys)

            elif sym.target_type == BinSymbol.STP_RO_STRING:

                strings.append(sym)

        # NOTE(review): 'encrypted' holds addr.phys values while the lookup
        # below uses the addr object itself -- confirm both compare equal.
        return [ [ s.raw, s.range.addr in encrypted ]
                 for s in strings if len(s.raw) >= 5 ]


    def _vectorize_string(self, raw):
        """Produce an input using a common input format.

        Args:
            raw: the string content, as bytes.

        Returns:
            Six ratios relative to the length of raw, in this order:
            non printable, punctuation, digits, upper case, lower case
            and descriptor characters ($ - _ < / [ ;). Every byte belongs
            to exactly one category. An empty input gives six zeros.
        """

        length = len(raw)

        if length == 0:
            return [ 0.0 ] * 6

        non_printable = 0
        punct = 0
        digit = 0
        upper = 0
        lower = 0

        descriptor = 0

        for b in raw:

            # Descriptor characters are tested first so that they are
            # never counted as regular punctuation as well.
            if b in b'$-_</[;':
                descriptor += 1

            elif b <= 0x20 or b >= 0x7f:
                non_printable += 1

            elif 0x30 <= b <= 0x39:
                digit += 1

            elif 0x41 <= b <= 0x5a:
                upper += 1

            elif 0x61 <= b <= 0x7a:
                lower += 1

            else:
                # Remaining printable ASCII: 0x21-0x2f, 0x3a-0x40,
                # 0x5b-0x60 and 0x7b-0x7e.
                punct += 1

        return [ non_printable / length, punct / length, digit / length, upper / length, lower / length,
                 descriptor / length]


    def _build_inputs_and_outputs(self, strings):
        """Produces inputs and outputs.

        Args:
            strings: iterable of [ raw bytes, encrypted flag ] pairs.

        Returns:
            A (inputs, outputs) tuple of parallel lists: one feature vector
            and one single-value expected output per string.
        """

        inputs = [ self._vectorize_string(raw) for raw, _ in strings ]

        # NOTE(review): plain strings are labelled 0.1 rather than 0.0;
        # presumably deliberate -- confirm.
        outputs = [ [ 1.0 if encrypted else 0.1 ] for _, encrypted in strings ]

        return inputs, outputs


    def _load_or_build(self, cache, build):
        """Return the content of a pickle cache file, creating it if missing.

        Args:
            cache: path of the cache file.
            build: callable producing the data when the cache does not exist.
        """

        if os.path.isfile(cache):

            # NOTE(review): pickle.load() can run arbitrary code; the cache
            # file must not come from an untrusted source.
            with open(cache, 'rb') as fd:
                return pickle.load(fd)

        data = build()

        with open(cache, 'wb') as fd:
            pickle.dump(data, fd)

        return data


    def _pick_training_samples(self, filename):
        """Select at most self._keep random strings of each kind from a Dex file."""

        strings = self._get_input_strings(filename)

        random.shuffle(strings)

        kept = []

        counts = { True: 0, False: 0 }

        for raw, encrypted in strings:

            kind = bool(encrypted)

            if counts[kind] < self._keep:
                kept.append( [ raw, kind ] )
                counts[kind] += 1

            # Stop only once both kinds have reached their quota.
            if counts[True] >= self._keep and counts[False] >= self._keep:
                break

        return kept


    def get_training_data(self, filename):
        """Provide some training data.

        Args:
            filename: Dex file to extract strings from, or a false value to
                use a few built-in samples. When 'training.data' already
                exists in the current directory, its content is used and
                the Dex file is not read.

        Returns:
            The (inputs, outputs) tuple built from the selected samples.
        """

        if filename:

            kept = self._load_or_build('training.data',
                                       lambda: self._pick_training_samples(filename))

        else:

            kept = [
                [ b'versionNeededToExtract', False ],
                [ b'versionNumber', False ],
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'vhstxos(~hccyr9gtr~iy(TJXURYD_DRRKHB^G[IPU', True ],
            ]

        return self._build_inputs_and_outputs(kept)


    def get_predict_data(self, filename):
        """Provide some data to predict.

        Args:
            filename: Dex file to extract strings from, or a false value to
                use a few built-in samples. When 'predict.data' already
                exists in the current directory, its content is used and
                the Dex file is not read.

        Returns:
            A (strings, inputs, outputs) tuple, where strings is the list
            of [ raw bytes, encrypted flag ] pairs the vectors come from.
        """

        if filename:

            strings = self._load_or_build('predict.data',
                                          lambda: self._get_input_strings(filename))

        else:

            strings = [
                [ b'vhstxos(gtxp~brt9Rrjrv\x7fiy\x7f9GTR^IYYTNVHPCHBR@VS[R', True ],
                [ b'versionNeededToExtract', False ],
                [ b'jLoPdKo\x0cbL\x7fGeV%ChVbMe\x0cHcGn', True ],
                [ b'isWriteComprSizeInZip64ExtraRecord', False ]
            ]

        inputs, outputs = self._build_inputs_and_outputs(strings)

        return strings, inputs, outputs


if __name__ == '__main__':
    # Entry point: train a network on one set of strings, then measure how
    # many strings of a second set are correctly guessed as encrypted or not.
    # Each command line argument is a Dex file, or '-' for built-in samples.

    if len(sys.argv) != 3:
        print('Usage: %s < training dex or - > < predict dex or - >' % sys.argv[0])
        sys.exit(1)

    training_dex = sys.argv[1] if sys.argv[1] != '-' else None
    predict_dex = sys.argv[2] if sys.argv[2] != '-' else None

    dl = DeepLearning(15)

    print()
    print('--- Training ---')
    print()

    if training_dex:
        print('Input file:', training_dex)

    inputs, outputs = dl.get_training_data(training_dex)

    # The network sizes itself from the first sample, so an empty training
    # set cannot be used.
    if not inputs:
        print('No training sample available')
        sys.exit(1)

    nn = NeuralNetwork(inputs)

    x = np.array(inputs, dtype=float)
    y = np.array(outputs, dtype=float)

    for i in range(100000):

        print('#', i + 1, 'Loss:',  np.mean(np.square(y - nn.forward(x))), end='\r')

        # train() reports when the loss did not change anymore.
        if nn.train(x, y):
            break

    print()

    print()
    print('--- Predictions ---')
    print()

    if predict_dex:
        print('Predict file:', predict_dex)

    strings, inputs, outputs = dl.get_predict_data(predict_dex)

    right = 0

    for (raw, expected), vector in zip(strings, inputs):

        x = np.array(vector, dtype=float)

        encrypted = nn.predict(raw, x)

        if encrypted == expected:
            right += 1

    print()

    # A success rate is meaningless (and a division by zero) without strings.
    if strings:
        print('Right guessed:', (right * 100 ) / len(strings))
    else:
        print('Right guessed: n/a (no string to predict)')

    print()