Colorizer Analysis Tutorial

We will solve the same problem twice using both Colorizer and ColorizerReadWrite analyses.

Take 1: Colorizer

The following function call_sys illustrates one way in which the colorizer analysis can be of use.

[0x00001060]> pdf @ sym.call_sys
            ; CALL XREF from sub.main_1243 @ 0x1260(x)
┌ 250: sym.call_sys (int64_t arg1);
│ `- args(rdi) vars(6:sp[0x10..0x30])
│           0x00001149      55             push rbp
│           0x0000114a      4889e5         mov rbp, rsp
│           0x0000114d      4883ec30       sub rsp, 0x30
│           0x00001151      48897dd8       mov qword [var_28h], rdi    ; arg1
│           0x00001155      488b45d8       mov rax, qword [var_28h]
│           0x00001159      488945f8       mov qword [string], rax
│           0x0000115d      488b45f8       mov rax, qword [string]
│           0x00001161      48c1e818       shr rax, 0x18
│           0x00001165      8945ec         mov dword [var_14h], eax
│           0x00001168      c745e80900..   mov dword [var_18h], 9
│           0x0000116f      8b45ec         mov eax, dword [var_14h]
│           0x00001172      488d158b0e..   lea rdx, str.n_d_n          ; 0x2004 ; "n=%d\n"
│           0x00001179      89c6           mov esi, eax
│           0x0000117b      4889d7         mov rdi, rdx                ; const char *format
│           0x0000117e      b800000000     mov eax, 0
│           0x00001183      e8b8feffff     call sym.imp.printf         ; int printf(const char *format)
│           0x00001188      488b45f8       mov rax, qword [string]
│           0x0000118c      488d15770e..   lea rdx, str.a_lX_n         ; 0x200a ; "a=%lX\n"
│           0x00001193      4889c6         mov rsi, rax
│           0x00001196      4889d7         mov rdi, rdx                ; const char *format
│           0x00001199      b800000000     mov eax, 0
│           0x0000119e      e89dfeffff     call sym.imp.printf         ; int printf(const char *format)
│           0x000011a3      b8efbeadde     mov eax, 0xdeadbeef
│           0x000011a8      483145f8       xor qword [string], rax
│           0x000011ac      c745f40000..   mov dword [var_ch], 0
│       ┌─< 0x000011b3      eb21           jmp 0x11d6
│       │   ; CODE XREF from sym.call_sys @ 0x11dc(x)
│      ┌──> 0x000011b5      8b45f4         mov eax, dword [var_ch]
│      ╎│   0x000011b8      4898           cdqe
│      ╎│   0x000011ba      488d148500..   lea rdx, [rax*4]
│      ╎│   0x000011c2      488d05772e..   lea rax, obj.mints          ; 0x4040 ; U"=J`\W9J\x1a\v"
│      ╎│   0x000011c9      8b0402         mov eax, dword [rdx + rax]
│      ╎│   0x000011cc      4898           cdqe
│      ╎│   0x000011ce      480145f8       add qword [string], rax
│      ╎│   0x000011d2      8345f401       add dword [var_ch], 1
│      ╎│   ; CODE XREF from sym.call_sys @ 0x11b3(x)
│      ╎└─> 0x000011d6      8b45f4         mov eax, dword [var_ch]
│      ╎    0x000011d9      3b45e8         cmp eax, dword [var_18h]
│      └──< 0x000011dc      7cd7           jl 0x11b5
│           0x000011de      488b45f8       mov rax, qword [string]
│           0x000011e2      488d15210e..   lea rdx, str.a_lX_n         ; 0x200a ; "a=%lX\n"
│           0x000011e9      4889c6         mov rsi, rax
│           0x000011ec      4889d7         mov rdi, rdx                ; const char *format
│           0x000011ef      b800000000     mov eax, 0
│           0x000011f4      e847feffff     call sym.imp.printf         ; int printf(const char *format)
│           0x000011f9      8b45e8         mov eax, dword [var_18h]
│           0x000011fc      83e801         sub eax, 1
│           0x000011ff      8945f0         mov dword [var_10h], eax
│       ┌─< 0x00001202      eb21           jmp 0x1225
│       │   ; CODE XREF from sym.call_sys @ 0x1229(x)
│      ┌──> 0x00001204      8b45f0         mov eax, dword [var_10h]
│      ╎│   0x00001207      4898           cdqe
│      ╎│   0x00001209      488d148500..   lea rdx, [rax*4]
│      ╎│   0x00001211      488d05282e..   lea rax, obj.mints          ; 0x4040 ; U"=J`\W9J\x1a\v"
│      ╎│   0x00001218      8b0402         mov eax, dword [rdx + rax]
│      ╎│   0x0000121b      4898           cdqe
│      ╎│   0x0000121d      482945f8       sub qword [string], rax
│      ╎│   0x00001221      836df001       sub dword [var_10h], 1
│      ╎│   ; CODE XREF from sym.call_sys @ 0x1202(x)
│      ╎└─> 0x00001225      837df000       cmp dword [var_10h], 0
│      └──< 0x00001229      79d9           jns 0x1204
│           0x0000122b      b8efbeadde     mov eax, 0xdeadbeef
│           0x00001230      483145f8       xor qword [string], rax
│           0x00001234      488b45f8       mov rax, qword [string]
│           0x00001238      4889c7         mov rdi, rax                ; const char *string
│           0x0000123b      e8f0fdffff     call sym.imp.system         ; int system(const char *string)
│           0x00001240      90             nop
│           0x00001241      c9             leave
└           0x00001242      c3             ret

There is a call to the libc function system near the end of this function, at instruction 0x123b. However, it is hard to tell what argument is being passed to it, i.e. what string is being pointed to. That is because the code has been obfuscated. But we can use the Colorizer to defeat this obfuscation directly, via dynamic execution and tracking values or colors.

First, we need to harness this function. This is something you have seen before, but here is the kind of code that will do this.

import copy
import logging

import smallworld
from smallworld.analyses import Colorizer
from smallworld.analyses.colorizer import randomize_uninitialized
from smallworld.hinting.hints import DynamicRegisterValueHint

smallworld.logging.setup_logging(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# configure the platform for emulation
platform = smallworld.platforms.Platform(
    smallworld.platforms.Architecture.X86_64, smallworld.platforms.Byteorder.LITTLE
)

machine = smallworld.state.Machine()

cpu = smallworld.state.cpus.CPU.for_platform(platform)

# load the ELF at base_address; every code address below is written as an
# offset from it so the harness stays correct if the base is changed
base_address = 0x0
# use a context manager so the file handle is closed once the ELF is loaded
with open("c-example", "rb") as elf_file:
    code = smallworld.state.memory.code.Executable.from_elf(
        elf_file, address=base_address
    )
machine.add(code)

# give the emulated function a stack and point rsp at it
stack = smallworld.state.memory.stack.Stack.for_platform(platform, 0x10000, 0x4000)
machine.add(stack)
rsp = stack.get_pointer()
cpu.rsp.set(rsp)

# first instruction of sym.call_sys
entry_point = 0x1149 + base_address

# call to "system"
exit_point = 0x123B + base_address

cpu.rip.set(entry_point)
machine.add(cpu)
machine.add_exit_point(exit_point)

# model printf at the address the function actually calls (0x1040, the PLT
# entry) so the calls can be emulated even though libc is not available
printf = smallworld.state.models.Model.lookup(
    "printf", platform, smallworld.platforms.ABI.SYSTEMV, 0x1040 + base_address
)
machine.add(printf)
printf.allow_imprecise = True
code.update_symbol_value("printf", printf._address)

A few things to point out here. First, we set, as our exit point, the call to system, because we want to see the color of the value in the argument to the call. That is, we want to know the dynamic value in rdi when we reach 0x123b call sym.imp.system and, if possible, to determine where that value came from. So we can stop analyzing when we reach that call. Second, we arrange to model the calls to printf in this function. We set up the model to take over for calls to 0x1040 + base_address. This is not obvious in the above disassembly since radare glosses such calls as call sym.imp.printf. If you disassemble, instead, with objdump (or use Binary Ninja or Ghidra), you can see that the call is really to 0x1040.

119e:   call  1040 <printf@plt>

Modeling printf has two benefits. We will be able to see what gets printed, which might be useful, but, more important, the harness will not fail on them since libc is not actually available.

We will use the colorizer twice, i.e. make two dynamic analyses of the function’s code. In the first, we determine the color for rax at the instruction immediately prior to the system call. This is the value that gets copied into rdi, the argument to system. Here is code that does this.

# first pass through to figure out color for rax/rdi
# at instruction
#   0x00001238      mov rdi, rax,
# immediately prior to call to system.

# color seen at pc=0x1238; assigned by collect_hints while the first pass runs
the_color = None


def collect_hints(hint):
    """Remember the color carried by a hint raised at pc=0x1238.

    Used as the callback for DynamicRegisterValueHint; hints raised at any
    other instruction are ignored. The color is printed and stored in the
    module-level ``the_color`` for use by the second pass.
    """
    global the_color  # noqa
    if hint.pc == 0x1238:
        print(f"First pass, color in rdi @ pc=0x{hint.pc:x} is {hint.color}")
        the_color = hint.color


# deliver the colorizer's register-value hints to the callback above
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints)
c = Colorizer(hinter, num_insns=1000, exec_id=1)

We register a callback collect_hints to collect hints coming from the colorizer and then look specifically at the one that corresponds to pc=0x1238 which copies rax into rdi. We save this color into the global the_color.

In a second pass of the colorizer, we determine when the_color was first observed, by looking for the hint involving it whose message indicates that it is a read-def (see above), meaning it is the first observation.

# run the first pass on a copy of the machine whose uninitialized values
# have been randomized with a fixed seed
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)

# second pass to figure out when we first saw that color


def collect_hints2(hint):
    """Print where the color saved by the first pass is first observed.

    Used as the callback for DynamicRegisterValueHint in the second pass.
    Only a hint that carries ``the_color`` and whose message is "read-def"
    (the first observation of a color) produces output.
    """
    global the_color  # noqa
    if hint.color == the_color and hint.message == "read-def":
        print(
            f"Second pass, first obs of color {the_color} is pc=0x{hint.pc:x}, in {hint.reg_name}"
        )


# fresh hinter and colorizer for the second pass; the same seed is reused so
# the same colors are assigned and the same trace is executed
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints2)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)

This is done by another callback, collect_hints2, which looks for a hint involving the_color where that color is first observed.

Note, in both passes we employ a call to randomize_uninitialized to ensure that values will get nice looking colors if not initialized already, and we use the same seed so that the same colors will get assigned in both and the same trace will get executed.

Here is the script in its entirety.

import copy
import logging

import smallworld
from smallworld.analyses import Colorizer
from smallworld.analyses.colorizer import randomize_uninitialized
from smallworld.hinting.hints import DynamicRegisterValueHint

smallworld.logging.setup_logging(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# configure the platform for emulation
platform = smallworld.platforms.Platform(
    smallworld.platforms.Architecture.X86_64, smallworld.platforms.Byteorder.LITTLE
)

machine = smallworld.state.Machine()

cpu = smallworld.state.cpus.CPU.for_platform(platform)

# load the ELF at base_address; every code address below is written as an
# offset from it so the harness stays correct if the base is changed
base_address = 0x0
# use a context manager so the file handle is closed once the ELF is loaded
with open("c-example", "rb") as elf_file:
    code = smallworld.state.memory.code.Executable.from_elf(
        elf_file, address=base_address
    )
machine.add(code)

# give the emulated function a stack and point rsp at it
stack = smallworld.state.memory.stack.Stack.for_platform(platform, 0x10000, 0x4000)
machine.add(stack)
rsp = stack.get_pointer()
cpu.rsp.set(rsp)

# first instruction of sym.call_sys
entry_point = 0x1149 + base_address

# call to "system"
exit_point = 0x123B + base_address

cpu.rip.set(entry_point)
machine.add(cpu)
machine.add_exit_point(exit_point)

# model printf at the address the function actually calls (0x1040, the PLT
# entry) so the calls can be emulated even though libc is not available
printf = smallworld.state.models.Model.lookup(
    "printf", platform, smallworld.platforms.ABI.SYSTEMV, 0x1040 + base_address
)
machine.add(printf)
printf.allow_imprecise = True
code.update_symbol_value("printf", printf._address)

# first pass through to figure out color for rax/rdi
# at instruction
#   0x00001238      mov rdi, rax,
# immediately prior to call to system.

# color seen at pc=0x1238; assigned by collect_hints while the first pass runs
the_color = None


def collect_hints(hint):
    """Remember the color carried by a hint raised at pc=0x1238.

    Used as the callback for DynamicRegisterValueHint; hints raised at any
    other instruction are ignored. The color is printed and stored in the
    module-level ``the_color`` for use by the second pass.
    """
    global the_color  # noqa
    if hint.pc == 0x1238:
        print(f"First pass, color in rdi @ pc=0x{hint.pc:x} is {hint.color}")
        the_color = hint.color


# deliver the colorizer's register-value hints to the callback above
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# analyze a deep copy so `machine` itself is left untouched for the second
# pass; uninitialized values are randomized with a fixed seed (1234)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)

# second pass to figure out when we first saw that color


def collect_hints2(hint):
    """Print where the color saved by the first pass is first observed.

    Used as the callback for DynamicRegisterValueHint in the second pass.
    Only a hint that carries ``the_color`` and whose message is "read-def"
    (the first observation of a color) produces output.
    """
    global the_color  # noqa
    if hint.color == the_color and hint.message == "read-def":
        print(
            f"Second pass, first obs of color {the_color} is pc=0x{hint.pc:x}, in {hint.reg_name}"
        )


# fresh hinter and colorizer for the second pass; the same seed (1234) is
# reused so the same colors are assigned and the same trace is executed
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints2)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)

We can run this script to answer the question “Where does the argument to system come from?” It takes no arguments, but we can filter the output to get just the operative parts which come from the print statements in those two hint callbacks.

$ python3 c-example.py 2> /dev/null | egrep '(First pass|Second pass)'
First pass, color in rdi @ pc=0x1238 is 8594311575614880821
Second pass, first obs of color 8594311575614880821 is pc=0x1151, in rdi

The script tells us that, for this trace, the value in rdi passed to system is the same as the value in rdi at the start of the function call_sys, i.e. in the instruction 0x00001151  mov qword [var_28h], rdi which copies the function argument into a local variable.

Take 2: ColorizerReadWrite

We can take a somewhat more straightforward approach by making use of the ColorizerReadWrite analysis which allows us to ask derivation questions about instructions and values.

The first part of the script, which harnesses the code to run, is unchanged. And we don’t need to collect any hints; we simply run the colorizer and arrange for its hints to be passed to a ColorizerReadWrite object, which will construct a read->write graph as described in Colorizer Concepts.

# both analyses share one hinter, so the hints emitted by the colorizer
# are passed on to the ColorizerReadWrite object
hinter = smallworld.hinting.Hinter()
crw = ColorizerReadWrite(hinter)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# analyze a deep copy of the machine whose uninitialized values have been
# randomized with a fixed seed (1234)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
# run the colorizer first, then the read/write analysis, on the same machine
c.run(perturbed_machine)
crw.run(perturbed_machine)

We can now derive where the value in rax comes from at instruction 0x1238 with the code

# directly ask for a derivation of the value in rax in instruction 0x1238
# (RegisterDef takes the register name and its size in bytes)
der = crw.graph.derive(0x1238, True, RegisterDef("rax", 8))
# print every derivation result: the pc it refers to and its read/write info
for r in der:
    print(f"derivation result is pc=0x{r.pc:x}: {r.wr}")

The complete script looks like this:

import copy
import logging

import smallworld
from smallworld.analyses import Colorizer, ColorizerReadWrite
from smallworld.analyses.colorizer import randomize_uninitialized
from smallworld.platforms.defs.platformdef import RegisterDef

smallworld.logging.setup_logging(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# configure the platform for emulation
platform = smallworld.platforms.Platform(
    smallworld.platforms.Architecture.X86_64, smallworld.platforms.Byteorder.LITTLE
)

machine = smallworld.state.Machine()

cpu = smallworld.state.cpus.CPU.for_platform(platform)

# load the ELF at base_address; every code address below is written as an
# offset from it so the harness stays correct if the base is changed
base_address = 0x0
# use a context manager so the file handle is closed once the ELF is loaded
with open("c-example", "rb") as elf_file:
    code = smallworld.state.memory.code.Executable.from_elf(
        elf_file, address=base_address
    )
machine.add(code)

# give the emulated function a stack and point rsp at it
stack = smallworld.state.memory.stack.Stack.for_platform(platform, 0x10000, 0x4000)
machine.add(stack)
rsp = stack.get_pointer()
cpu.rsp.set(rsp)

# first instruction of sym.call_sys
entry_point = 0x1149 + base_address

# call to "system"
exit_point = 0x123B + base_address

cpu.rip.set(entry_point)
machine.add(cpu)
machine.add_exit_point(exit_point)

# model printf at the address the function actually calls (0x1040, the PLT
# entry) so the calls can be emulated even though libc is not available
printf = smallworld.state.models.Model.lookup(
    "printf", platform, smallworld.platforms.ABI.SYSTEMV, 0x1040 + base_address
)
machine.add(printf)
printf.allow_imprecise = True
code.update_symbol_value("printf", printf._address)

# New and interesting stuff follows:

# both analyses share one hinter, so the hints emitted by the colorizer
# are passed on to the ColorizerReadWrite object
hinter = smallworld.hinting.Hinter()
crw = ColorizerReadWrite(hinter)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# analyze a deep copy of the machine whose uninitialized values have been
# randomized with a fixed seed (1234)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
# run the colorizer first, then the read/write analysis, on the same machine
c.run(perturbed_machine)
crw.run(perturbed_machine)

# directly ask for a derivation of the value in rax in instruction 0x1238
# (RegisterDef takes the register name and its size in bytes)
# NOTE(review): the meaning of the positional `True` is not shown in this
# tutorial -- confirm against the ColorizerReadWrite documentation
der = crw.graph.derive(0x1238, True, RegisterDef("rax", 8))
for r in der:
    print(f"derivation result is pc=0x{r.pc:x}: {r.wr}")

We can run this (discarding the stderr output which is considerable) with

$ python3 c-example3.py 2> /dev/null
n=1159780329
a=774520D7E98D7C35
a=774520D73720C51C
derivation result is pc=0x1151: ReadInfo(info=RegisterInfo(color=3, is_new=True, register=RegisterDef(name='rdi', size=8)))

We obtain the answer we wanted directly, namely that rax at pc=0x1238 is directly derived from the value of rdi at pc=0x1151 which is the input to the function.

Further Reading

See the Colorizer Concepts page for more details.