summaryrefslogtreecommitdiff
path: root/gen-code.py
blob: 4328d2bfa3579b15cd51c254a60472ad818e36bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290

import json
import re
import subprocess
import sys

from ollama import chat, ChatResponse


# Name of the Ollama model, passed as `model=` to chat() in
# send_message_to_ollama(). The commented-out assignments below are
# alternative models kept at hand for quick switching.
__MODEL__ = 'qwen2.5-coder:7b'
#__MODEL__ = 'llama3.2:latest'
#__MODEL__ = 'qwen3:8b'
#__MODEL__ = 'mistral:7b'

# Generation options, passed as `options=` to chat() in
# send_message_to_ollama(). The repeat penalty is currently disabled.
__OPTIONS__ = dict(
    temperature=0.8,
    #'repeat_penalty': 1.2
)


# First draft of the system prompt.
# NOTE: this value is dead; it is unconditionally overwritten by the second
# assignment to __SYSTEM__PROMPT__ just below.
__SYSTEM__PROMPT__ = '''
You are an Android development engineer, writing low level code according to the instructions below.

You are a powerful code editing assistant capable of writing code and creating artifacts in conversations with users, or modifying and updating existing artifacts as requested by users.

An artifact refers to a runnable complete code snippet, you prefer to integrate and output such complete runnable code rather than breaking it down into several code blocks.

'''
# Disabled extra sentence for the prompt above:
#Output only the smali-compatible code, without any additional descriptive text.


# Effective system prompt (markdown-formatted text), referenced by the main
# block when it builds the initial 'system' message.
# NOTE(review): the main block resets `messages` to an empty list right after
# building that message, so this prompt does not appear to be sent at the
# moment - confirm whether that is intentional.
__SYSTEM__PROMPT__ = '''
Act as an expert instructor with decades of experience in the Android ecosystem, low-level coding, software security, and reverse engineering. You can list all the Dalvik bytecode mnemonics. You know how to deal with all the Dalvik bytecode mnemonics and what to make the world know it.

Your task is to write assembly code for the Dalvik Virtual Machine. The code has to be fully compatible with the smali assembler.

# Coding and Response Practices

- Use practical examples drawn from real-world open-source samples as inspiration to demonstrate techniques for coding complex structures.
- Encourage reasoning through original code writing before drawing conclusions and provide clear, concise explanations supported by annotated code and examples.
- After generation, please check the code execution again to ensure there are no errors in the output.

# Output Format

Avoid explanations and focus on code variety of Dalvik bytecode when producing code. Format the output in markdown for clarity, using only one code block for all code excerpts.

All the generated code has to be included into one class only.
'''
# Disabled extra sentence for the prompt above:
#  Include step-by-step reasoning where appropriate.



# File-name prefix of the generated smali files: dump_smali_code() and
# check_code() build names as __PREFIX__ + '%04u.smali', and extract_errors()
# uses the same prefix to recognise assembler error lines.
__PREFIX__ = 'code-'


def send_message_to_ollama(messages, ins):
    """Send a new user instruction to ollama and return the model's answer.

    The instruction is stripped of surrounding whitespace and appended to
    `messages` (mutated in place) as a 'user' entry. The whole conversation
    is then submitted with the module-level model and options, and the reply
    is appended to `messages` as a plain role/content dict.

    The conversation is written to 'messages.json' twice: once before the
    request is issued and once after the reply has been recorded. Both the
    prompt and the reply are echoed on stdout between marker lines.

    Args:
        messages: list of {'role': ..., 'content': ...} dicts holding the
            conversation so far.
        ins: text of the new user instruction.

    Returns:
        The text content of the model's reply.
    """

    def _save_history():
        # Rewrite the full conversation so far to 'messages.json'.
        with open('messages.json', 'w') as fd:
            json.dump(messages, fd)

    rule = '--------------------'
    prompt = ins.strip()

    print(rule, '<<', prompt, '<<', sep='\n')

    messages.append({'role': 'user', 'content': prompt})
    _save_history()

    resp: ChatResponse = chat(
        model=__MODEL__,
        options=__OPTIONS__,
        messages=messages,
    )

    answer = resp.message.content

    # A plain dict is stored instead of resp.message itself (presumably so
    # that the history remains serialisable by json.dump).
    messages.append({'role': resp.message.role, 'content': answer})

    print(rule, '>>', answer, '>>', rule, sep='\n')

    _save_history()

    return answer


def dump_smali_code(data, index):
    """Extract the first fenced code block of an ollama answer and save it.

    Only the first ``` ... ``` block of `data` is considered. Its content is
    written to the file __PREFIX__ + '%04u.smali' % index, and the answer
    with the code removed is echoed on stdout.

    The fence line (the optional language tag following the opening ``` and
    its newline) is removed before writing. Keeping that newline would put
    an empty line at the top of the file, so every line number reported by
    the assembler would be shifted by one compared to the code the model
    produced. Tags other than 'smali' are dropped as well instead of being
    written into the file.

    Args:
        data: raw text of the model's answer.
        index: sequence number used to build the output file name.

    Returns:
        True when a code block was found and written, False when the answer
        contains no fenced block (nothing is written in that case).
    """

    code_blocks = re.findall(r'```(.*?)```', data, re.DOTALL)

    if not code_blocks:
        return False

    blk = code_blocks[0]

    # A fence line is either empty or a single tag word. A first line that
    # contains anything else is taken to be code and is left in place.
    fence_line = re.match(r'[ \t]*[\w+#-]*[ \t]*\r?\n', blk)

    if fence_line:
        blk = blk[fence_line.end():]
    elif blk.startswith('smali'):
        # Tag directly followed by code on the same line.
        blk = blk[len('smali'):]

    with open(__PREFIX__ + '%04u.smali' % index, 'w') as fd:
        fd.write(blk)

    # Show what the model said besides the code itself.
    data = data.replace(blk, '')
    print(data)

    print('Code written!')

    return True


def compile_file(filename):
    """Assemble `filename` with smali and return the assembler's stderr.

    Runs 'java -jar smali-3.0.9-fat.jar a <filename> -o test.dex' with
    stdout discarded and stderr captured as text.

    Args:
        filename: path of the smali source file to assemble.

    Returns:
        Everything the process wrote on stderr, as a string.
    """

    # The argument vector is built directly rather than by splitting a
    # command string, so a file name containing spaces stays one argument.
    cmd = [
        'java', '-jar', 'smali-3.0.9-fat.jar',
        'a', filename,
        '-o', 'test.dex',
    ]

    # The exit status is deliberately not checked: according to the original
    # author's note, the return code is always 0, so errors can only be
    # detected from stderr.
    result = subprocess.run(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        check=False,
    )

    return result.stderr


def extract_errors(data, prefix=None):
    """List all the errors reported by the assembler.

    A line is recognised as an error when it has the form
    '<prefix><digits>.smali[<line>,<col>] <message>'; every other line is
    ignored.

    Args:
        data: text captured from the assembler's stderr.
        prefix: file-name prefix expected at the start of an error line.
            Defaults to the module-level __PREFIX__.

    Returns:
        A list of dicts with the keys 'line' (int), 'col' (int) and 'msg'
        (str), in order of appearance.
    """

    if prefix is None:
        prefix = __PREFIX__

    # Raw strings avoid invalid-escape warnings; the prefix and the dot of
    # the extension are escaped so that they only match literally.
    pat = re.compile(
        r'^' + re.escape(prefix) + r'\d+\.smali\[(\d+),(\d+)\] (.*)$'
    )

    matches = (pat.match(line) for line in data.splitlines())

    return [
        {
            'line': int(m.group(1)),
            'col': int(m.group(2)),
            'msg': m.group(3),
        }
        for m in matches
        if m
    ]


def check_code(index):
    """Build, if needed, a follow-up request asking to fix the provided code.

    The file __PREFIX__ + '%04u.smali' % index is assembled and the reported
    errors are turned into a prompt listing each error with its location
    and the offending source line, followed by fixed fixing guidelines.

    Args:
        index: sequence number of the generated smali file to check.

    Returns:
        The text of the follow-up prompt, or an empty string when the
        assembler reported no recognised error.
    """

    filename = __PREFIX__ + '%04u.smali' % index

    stderr = compile_file(filename)

    errors = extract_errors(stderr)

    print('Errors? %d' % len(errors))

    # Read unconditionally, as before: a missing file still fails loudly
    # here instead of being mistaken for error-free code.
    with open(filename, 'r') as fd:
        content = fd.read().split('\n')

    if not errors:
        return ''

    next_msg = [
        'The smali assembler encountered errors with the previously generated code.',
        '',
        'Here is the error list with location and bug origin:',
    ]

    for e in errors:

        line_no = e['line']

        # Line numbers are 1-based. A location outside the file must neither
        # crash the run nor silently pick an unrelated line, so the source
        # excerpt is simply left out in that case.
        if 1 <= line_no <= len(content):
            next_msg.append('- at line %d column %d: %s (erroneous line content : "%s" )'
                            % (line_no, e['col'], e['msg'], content[line_no - 1].lstrip()))
        else:
            next_msg.append('- at line %d column %d: %s' % (line_no, e['col'], e['msg']))

    # NOTE(review): the guideline below states that mul-int/lit16 does not
    # exist, but it is listed in the Dalvik bytecode reference (opcode 0xd2);
    # a mnemonic that is really missing (e.g. sub-int/lit16) would make a
    # better example. The text is left unchanged pending confirmation.
    guidelines = (
        'Please fix your code and provide an updated version of smali assembly code!',
        'Focus on name suffixes and operands while solving errors. Check if used instructions actually exist according to the Dalvik bytecode specifications. For instance, there is no mul-int/lit16 mnemonic.',
        'Remember to use only plain hexdecimal integers for numbers. Break long high level statements into several Dalvik instructions as much as possible.',
        'For instance, call to System.out.println has to translate to instructions relying on sget-object and invoke-virtual mnemonics.',
        'Line counter starts at 1.',
        'If you do not know how to fix the generated bytecode, rewrite it completely or remove the relative line. Do not hesitate to remove an entire function if there are too much issues inside it.',
    )

    # Each guideline is preceded by an empty line in the final prompt.
    for paragraph in guidelines:
        next_msg.append('')
        next_msg.append(paragraph)

    return '\n'.join(next_msg)


if __name__ == '__main__':
    # Script entry point. The string below is the original (French) label
    # "entry point"; as a bare expression statement it has no effect.
    """Point d'entrée."""

    # Without argument: generate code with the model, then loop on fixes.
    if len(sys.argv) == 1:

        # Conversation seeded with the system prompt...
        messages = [
            {
                'role': 'system',
                'content': __SYSTEM__PROMPT__ #.replace('\n', ' ')
            }
        ]

        # NOTE(review): ...but this reassignment discards the system message
        # built just above, so the conversation actually starts empty.
        # Confirm whether disabling the system prompt is intentional.
        messages = []

        # NOTE: this first version of the request is dead; it is overwritten
        # by the second assignment to `instructions` right after it.
        instructions = '''
Write me a bunch of valid Android smali bytecode. Your goal is to write a demonstration of the Dalvik bytecode features.

Try to use one function per instruction set category. Pick one target from the list below:
- Data handling
- Arithmetic and logic operations
- Control flow operations
- Specific instructions for highest Dex files versions (038 or 039)

As demonstration, include some function implementing well-known algorithms such as Fibonacci sequence, FNV1a or murmuhash3 hashes, aso.

Include all the created functions into one class only.

Try to make sure that a lot of different Dalvik mnemonics are used. Mix as much different mnemonics as possible.

Ensure that each instruction is used according its proper format.

The result code HAS TO BE able to get assembled using the smali assembler without modification.
'''
        # Effective initial request sent to the model.
        instructions = '''
Write me a bunch of valid Android smali bytecode. Your goal is to write a demonstration of the Dalvik bytecode features.

Try to use one function per instruction set category. Pick one target from the list below:
- Data handling
- Arithmetic and logic operations
- Control flow operations
- Specific instructions for highest Dex files versions (038 or 039)

Include all the created functions into one class only.

Try to make sure that a lot of different Dalvik mnemonics are used. Mix as much different mnemonics as possible.

Ensure that each instruction is used according its proper format.

Do not write explainations. Do not write comments in generated Dalvik code. Use only plain hexdecimal integers for numbers. Break long high level statements into several Dalvik instructions as much as possible.

For instance, call to System.out.println has to translate to instructions relying on sget-object and invoke-virtual mnemonics.

The result code HAS TO BE able to get assembled using the smali assembler without modification.
'''

        response = send_message_to_ollama(messages, instructions)

        # `counter` numbers the successive generated files (code-0000.smali,
        # code-0001.smali, ...).
        counter = 0
        dump_smali_code(response, counter)

        # Keep asking the model for fixes until check_code() returns an empty
        # request, i.e. the assembler reported no recognised error.
        # NOTE: there is no upper bound on the number of rounds.
        while True:

            instructions = check_code(counter)

            if len(instructions) == 0:
                break

            response = send_message_to_ollama(messages, instructions)

            counter += 1
            dump_smali_code(response, counter)

    # With an argument: only assemble the given file and report the outcome
    # through the exit status (0 = no error recognised, 1 otherwise).
    else:

        stderr = compile_file(sys.argv[1])

        # NOTE(review): extract_errors() only recognises lines starting with
        # __PREFIX__ followed by digits and '.smali'; for a file named
        # differently the exit status is presumably always 0 - verify.
        errors = extract_errors(stderr)

        sys.exit(0 if len(errors) == 0 else 1)