Colorizer Analysis Tutorial¶
We will solve the same problem twice using both Colorizer and
ColorizerReadWrite analyses.
Take 1: Colorizer¶
The following function call_sys illustrates one way in which the
colorizer analysis can be of use.
[0x00001060]> pdf @ sym.call_sys
; CALL XREF from sub.main_1243 @ 0x1260(x)
┌ 250: sym.call_sys (int64_t arg1);
│ `- args(rdi) vars(6:sp[0x10..0x30])
│ 0x00001149 55 push rbp
│ 0x0000114a 4889e5 mov rbp, rsp
│ 0x0000114d 4883ec30 sub rsp, 0x30
│ 0x00001151 48897dd8 mov qword [var_28h], rdi ; arg1
│ 0x00001155 488b45d8 mov rax, qword [var_28h]
│ 0x00001159 488945f8 mov qword [string], rax
│ 0x0000115d 488b45f8 mov rax, qword [string]
│ 0x00001161 48c1e818 shr rax, 0x18
│ 0x00001165 8945ec mov dword [var_14h], eax
│ 0x00001168 c745e80900.. mov dword [var_18h], 9
│ 0x0000116f 8b45ec mov eax, dword [var_14h]
│ 0x00001172 488d158b0e.. lea rdx, str.n_d_n ; 0x2004 ; "n=%d\n"
│ 0x00001179 89c6 mov esi, eax
│ 0x0000117b 4889d7 mov rdi, rdx ; const char *format
│ 0x0000117e b800000000 mov eax, 0
│ 0x00001183 e8b8feffff call sym.imp.printf ; int printf(const char *format)
│ 0x00001188 488b45f8 mov rax, qword [string]
│ 0x0000118c 488d15770e.. lea rdx, str.a_lX_n ; 0x200a ; "a=%lX\n"
│ 0x00001193 4889c6 mov rsi, rax
│ 0x00001196 4889d7 mov rdi, rdx ; const char *format
│ 0x00001199 b800000000 mov eax, 0
│ 0x0000119e e89dfeffff call sym.imp.printf ; int printf(const char *format)
│ 0x000011a3 b8efbeadde mov eax, 0xdeadbeef
│ 0x000011a8 483145f8 xor qword [string], rax
│ 0x000011ac c745f40000.. mov dword [var_ch], 0
│ ┌─< 0x000011b3 eb21 jmp 0x11d6
│ │ ; CODE XREF from sym.call_sys @ 0x11dc(x)
│ ┌──> 0x000011b5 8b45f4 mov eax, dword [var_ch]
│ ╎│ 0x000011b8 4898 cdqe
│ ╎│ 0x000011ba 488d148500.. lea rdx, [rax*4]
│ ╎│ 0x000011c2 488d05772e.. lea rax, obj.mints ; 0x4040 ; U"=J`\W9J\x1a\v"
│ ╎│ 0x000011c9 8b0402 mov eax, dword [rdx + rax]
│ ╎│ 0x000011cc 4898 cdqe
│ ╎│ 0x000011ce 480145f8 add qword [string], rax
│ ╎│ 0x000011d2 8345f401 add dword [var_ch], 1
│ ╎│ ; CODE XREF from sym.call_sys @ 0x11b3(x)
│ ╎└─> 0x000011d6 8b45f4 mov eax, dword [var_ch]
│ ╎ 0x000011d9 3b45e8 cmp eax, dword [var_18h]
│ └──< 0x000011dc 7cd7 jl 0x11b5
│ 0x000011de 488b45f8 mov rax, qword [string]
│ 0x000011e2 488d15210e.. lea rdx, str.a_lX_n ; 0x200a ; "a=%lX\n"
│ 0x000011e9 4889c6 mov rsi, rax
│ 0x000011ec 4889d7 mov rdi, rdx ; const char *format
│ 0x000011ef b800000000 mov eax, 0
│ 0x000011f4 e847feffff call sym.imp.printf ; int printf(const char *format)
│ 0x000011f9 8b45e8 mov eax, dword [var_18h]
│ 0x000011fc 83e801 sub eax, 1
│ 0x000011ff 8945f0 mov dword [var_10h], eax
│ ┌─< 0x00001202 eb21 jmp 0x1225
│ │ ; CODE XREF from sym.call_sys @ 0x1229(x)
│ ┌──> 0x00001204 8b45f0 mov eax, dword [var_10h]
│ ╎│ 0x00001207 4898 cdqe
│ ╎│ 0x00001209 488d148500.. lea rdx, [rax*4]
│ ╎│ 0x00001211 488d05282e.. lea rax, obj.mints ; 0x4040 ; U"=J`\W9J\x1a\v"
│ ╎│ 0x00001218 8b0402 mov eax, dword [rdx + rax]
│ ╎│ 0x0000121b 4898 cdqe
│ ╎│ 0x0000121d 482945f8 sub qword [string], rax
│ ╎│ 0x00001221 836df001 sub dword [var_10h], 1
│ ╎│ ; CODE XREF from sym.call_sys @ 0x1202(x)
│ ╎└─> 0x00001225 837df000 cmp dword [var_10h], 0
│ └──< 0x00001229 79d9 jns 0x1204
│ 0x0000122b b8efbeadde mov eax, 0xdeadbeef
│ 0x00001230 483145f8 xor qword [string], rax
│ 0x00001234 488b45f8 mov rax, qword [string]
│ 0x00001238 4889c7 mov rdi, rax ; const char *string
│ 0x0000123b e8f0fdffff call sym.imp.system ; int system(const char *string)
│ 0x00001240 90 nop
│ 0x00001241 c9 leave
└ 0x00001242 c3 ret
There is a call to the libc function system near the end of this
function, at instruction 0x123b . However, it is hard to tell what
argument is being passed to it, i.e. what string is being pointed to.
That is because the code has been obfuscated. But we can use the
Colorizer to defeat this obfuscation directly, via dynamic
execution and tracking values or colors.
First, we need to harness this function. This is something you have seen before, but here is the kind of code that will do this.
import copy
import logging
import smallworld
from smallworld.analyses import Colorizer
from smallworld.analyses.colorizer import randomize_uninitialized
from smallworld.hinting.hints import DynamicRegisterValueHint
smallworld.logging.setup_logging(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# configure the platform for emulation
platform = smallworld.platforms.Platform(
    smallworld.platforms.Architecture.X86_64, smallworld.platforms.Byteorder.LITTLE
)

machine = smallworld.state.Machine()
cpu = smallworld.state.cpus.CPU.for_platform(platform)

# load the binary under analysis; the context manager closes the file handle
# instead of leaking it
# NOTE(review): assumes from_elf reads the file eagerly -- confirm
base_address = 0x0
with open("c-example", "rb") as elf_file:
    code = smallworld.state.memory.code.Executable.from_elf(
        elf_file, address=base_address
    )
machine.add(code)

# give the function a stack and point rsp at it
stack = smallworld.state.memory.stack.Stack.for_platform(platform, 0x10000, 0x4000)
machine.add(stack)
rsp = stack.get_pointer()
cpu.rsp.set(rsp)

# first instruction of call_sys
entry_point = 0x1149 + base_address
# call to "system" (offset by base_address, like entry_point)
exit_point = 0x123B + base_address
cpu.rip.set(entry_point)
machine.add(cpu)
machine.add_exit_point(exit_point)

# model printf at its PLT address so the calls succeed without libc
printf = smallworld.state.models.Model.lookup(
    "printf", platform, smallworld.platforms.ABI.SYSTEMV, 0x1040 + base_address
)
machine.add(printf)
printf.allow_imprecise = True
code.update_symbol_value("printf", printf._address)
A few things to point out here. First, we set, as our exit point, the
call to system, because we want to see the color of the value in
the argument to the call. That is, what we want to know is what is the
dynamic value in rdi when we reach 0x123b call sym.imp.system,
and to determine where that value came from, if possible. So we can
stop analyzing when we reach that call. Second, we arrange to model
the calls to printf in this function. We set up the model to take
over for calls to 0x1040 + base_address. This is not obvious in
the above disassembly since radare glosses such calls as call
sym.imp.printf. If you disassemble, instead, with objdump (or
use Binary Ninja or Ghidra), you can see that the call is really to
0x1040.
119e: call 1040 <printf@plt>
Modeling printf has two benefits. We will be able to see what gets printed, which might be useful, but, more importantly, the harness will not fail on these calls, since libc is not actually available.
We will use the colorizer twice, i.e. make two dynamic analyses of the
function’s code. In the first, we determine the color for rax at
the instruction immediately prior to the system call. This is the
value that gets copied into rdi, the argument to system. Here
is code that does this.
# first pass through to figure out color for rax/rdi
# at instruction
# 0x00001238 mov rdi, rax,
# immediately prior to call to system.
the_color = None


def collect_hints(hint):
    """Record the color reported by the hint at pc 0x1238 (``mov rdi, rax``).

    Hints at any other pc are ignored. On a match the color is printed and
    stored in the module-level ``the_color``.

    Args:
        hint: hint object; only its ``pc`` and ``color`` attributes are read.
    """
    global the_color  # noqa
    if hint.pc == 0x1238:
        print(f"First pass, color in rdi @ pc=0x{hint.pc:x} is {hint.color}")
        the_color = hint.color
# create a hinter and route DynamicRegisterValueHint hints to collect_hints
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints)
# NOTE(review): num_insns presumably caps the number of emulated instructions -- confirm
c = Colorizer(hinter, num_insns=1000, exec_id=1)
We register a callback collect_hints to collect hints coming from
the colorizer and then look specifically at the one that corresponds
to pc=0x1238 which copies rax into rdi. We save this color
into the global the_color.
In a second pass with the colorizer, we determine when the_color
was first observed by looking for the hint involving it whose
message indicates that it is a read-def (see above), meaning it
is the first observation.
# run the first pass on a perturbed deep copy of the harnessed machine
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)


# second pass to figure out when we first saw that color
def collect_hints2(hint):
    """Report the hint at which ``the_color`` is first observed.

    Prints the pc and register name of any hint whose color equals
    ``the_color`` and whose message is ``"read-def"``.

    Args:
        hint: hint object; its ``color``, ``message``, ``pc`` and
            ``reg_name`` attributes are read.
    """
    if hint.color == the_color and hint.message == "read-def":
        print(
            f"Second pass, first obs of color {the_color} is pc=0x{hint.pc:x}, in {hint.reg_name}"
        )


# fresh hinter and colorizer for the second pass; the same seed (1234) is
# reused so the same colors are assigned and the same trace is executed
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints2)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)
This is done by another callback, collect_hints2, which looks for
a hint involving the_color where that color is first observed.
Note that in both passes we employ a call to randomize_uninitialized to
ensure that values will get nice looking colors if not initialized
already, and we use the same seed so that the same colors will get
assigned in both and the same trace will get executed.
Here is the script in its entirety.
import copy
import logging
import smallworld
from smallworld.analyses import Colorizer
from smallworld.analyses.colorizer import randomize_uninitialized
from smallworld.hinting.hints import DynamicRegisterValueHint
smallworld.logging.setup_logging(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# configure the platform for emulation
platform = smallworld.platforms.Platform(
    smallworld.platforms.Architecture.X86_64, smallworld.platforms.Byteorder.LITTLE
)

machine = smallworld.state.Machine()
cpu = smallworld.state.cpus.CPU.for_platform(platform)

# load the binary under analysis; the context manager closes the file handle
# instead of leaking it
# NOTE(review): assumes from_elf reads the file eagerly -- confirm
base_address = 0x0
with open("c-example", "rb") as elf_file:
    code = smallworld.state.memory.code.Executable.from_elf(
        elf_file, address=base_address
    )
machine.add(code)

# give the function a stack and point rsp at it
stack = smallworld.state.memory.stack.Stack.for_platform(platform, 0x10000, 0x4000)
machine.add(stack)
rsp = stack.get_pointer()
cpu.rsp.set(rsp)

# first instruction of call_sys
entry_point = 0x1149 + base_address
# call to "system" (offset by base_address, like entry_point)
exit_point = 0x123B + base_address
cpu.rip.set(entry_point)
machine.add(cpu)
machine.add_exit_point(exit_point)

# model printf at its PLT address so the calls succeed without libc
printf = smallworld.state.models.Model.lookup(
    "printf", platform, smallworld.platforms.ABI.SYSTEMV, 0x1040 + base_address
)
machine.add(printf)
printf.allow_imprecise = True
code.update_symbol_value("printf", printf._address)
# first pass through to figure out color for rax/rdi
# at instruction
# 0x00001238 mov rdi, rax,
# immediately prior to call to system.
the_color = None


def collect_hints(hint):
    """Record the color reported by the hint at pc 0x1238 (``mov rdi, rax``).

    Hints at any other pc are ignored. On a match the color is printed and
    stored in the module-level ``the_color``.

    Args:
        hint: hint object; only its ``pc`` and ``color`` attributes are read.
    """
    global the_color  # noqa
    if hint.pc == 0x1238:
        print(f"First pass, color in rdi @ pc=0x{hint.pc:x} is {hint.color}")
        the_color = hint.color
# create a hinter and route DynamicRegisterValueHint hints to collect_hints
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints)
# NOTE(review): num_insns presumably caps the number of emulated instructions -- confirm
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# run the first pass on a deep copy of the harnessed machine whose
# uninitialized values have been randomized with a fixed seed (1234)
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)
# second pass to figure out when we first saw that color
def collect_hints2(hint):
    """Report the hint at which ``the_color`` is first observed.

    Prints the pc and register name of any hint whose color equals the
    module-level ``the_color`` and whose message is ``"read-def"``.

    Args:
        hint: hint object; its ``color``, ``message``, ``pc`` and
            ``reg_name`` attributes are read.
    """
    # the_color is only read here, so no ``global`` declaration is needed
    if hint.color == the_color and hint.message == "read-def":
        print(
            f"Second pass, first obs of color {the_color} is pc=0x{hint.pc:x}, in {hint.reg_name}"
        )
# fresh hinter for the second pass, routing DynamicRegisterValueHint hints to collect_hints2
hinter = smallworld.hinting.Hinter()
hinter.register(DynamicRegisterValueHint, collect_hints2)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# same seed (1234) as the first pass, applied to a new deep copy of the machine
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
c.run(perturbed_machine)
We can run this script to answer the question “Where does the argument
to system come from?” The script takes no arguments, but we can filter the
output to get just the operative parts, which come from the print
statements in those two hint callbacks.
$ python3 c-example.py 2> /dev/null | egrep '(First pass|Second pass)'
First pass, color in rdi @ pc=0x1238 is 8594311575614880821
Second pass, first obs of color 8594311575614880821 is pc=0x1151, in rdi
The script tells us that, for this trace, the value in rdi passed to
system is the same as the value in rdi at the start of the function
call_sys, i.e. in the instruction 0x00001151 mov qword [var_28h], rdi,
which copies the function argument into a local variable.
Take 2: ColorizerReadWrite¶
We can take a somewhat more straightforward approach by making use of
the ColorizerReadWrite analysis which allows us to ask derivation
questions about instructions and values.
The first part of the script, which harnesses the code to run, is
unchanged. We don’t need to collect any hints ourselves; we simply run the
colorizer and arrange for its hints to be passed to a
ColorizerReadWrite object, which will construct a read->write graph
as described in Colorizer Concepts.
# one hinter is shared: the Colorizer and the ColorizerReadWrite analysis
# are both constructed over it
hinter = smallworld.hinting.Hinter()
crw = ColorizerReadWrite(hinter)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# work on a deep copy of the harnessed machine, perturbed with seed 1234
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
# run the colorizer first, then the read/write analysis
c.run(perturbed_machine)
crw.run(perturbed_machine)
We can now derive where the value in rax comes from at instruction
0x1238 with the code
# directly ask for a derivation of the value in rax in instruction 0x1238
# NOTE(review): the meaning of the second positional argument (True) is not shown here -- confirm
der = crw.graph.derive(0x1238, True, RegisterDef("rax", 8))
# each result exposes the pc and the read/write record it was derived from
for r in der:
    print(f"derivation result is pc=0x{r.pc:x}: {r.wr}")
The complete script looks like this:
import copy
import logging
import smallworld
from smallworld.analyses import Colorizer, ColorizerReadWrite
from smallworld.analyses.colorizer import randomize_uninitialized
from smallworld.platforms.defs.platformdef import RegisterDef
smallworld.logging.setup_logging(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# configure the platform for emulation
platform = smallworld.platforms.Platform(
    smallworld.platforms.Architecture.X86_64, smallworld.platforms.Byteorder.LITTLE
)

machine = smallworld.state.Machine()
cpu = smallworld.state.cpus.CPU.for_platform(platform)

# load the binary under analysis; the context manager closes the file handle
# instead of leaking it
# NOTE(review): assumes from_elf reads the file eagerly -- confirm
base_address = 0x0
with open("c-example", "rb") as elf_file:
    code = smallworld.state.memory.code.Executable.from_elf(
        elf_file, address=base_address
    )
machine.add(code)

# give the function a stack and point rsp at it
stack = smallworld.state.memory.stack.Stack.for_platform(platform, 0x10000, 0x4000)
machine.add(stack)
rsp = stack.get_pointer()
cpu.rsp.set(rsp)

# first instruction of call_sys
entry_point = 0x1149 + base_address
# call to "system" (offset by base_address, like entry_point)
exit_point = 0x123B + base_address
cpu.rip.set(entry_point)
machine.add(cpu)
machine.add_exit_point(exit_point)

# model printf at its PLT address so the calls succeed without libc
printf = smallworld.state.models.Model.lookup(
    "printf", platform, smallworld.platforms.ABI.SYSTEMV, 0x1040 + base_address
)
machine.add(printf)
printf.allow_imprecise = True
code.update_symbol_value("printf", printf._address)
# New and interesting stuff follows:
# one hinter is shared: the Colorizer and the ColorizerReadWrite analysis
# are both constructed over it
hinter = smallworld.hinting.Hinter()
crw = ColorizerReadWrite(hinter)
c = Colorizer(hinter, num_insns=1000, exec_id=1)
# work on a deep copy of the harnessed machine, perturbed with seed 1234
machine_copy = copy.deepcopy(machine)
perturbed_machine = randomize_uninitialized(machine_copy, 1234)
# run the colorizer first, then the read/write analysis
c.run(perturbed_machine)
crw.run(perturbed_machine)
# directly ask for a derivation of the value in rax in instruction 0x1238
# NOTE(review): the meaning of the second positional argument (True) is not shown here -- confirm
der = crw.graph.derive(0x1238, True, RegisterDef("rax", 8))
# each result exposes the pc and the read/write record it was derived from
for r in der:
    print(f"derivation result is pc=0x{r.pc:x}: {r.wr}")
We can run this (discarding the stderr output which is considerable) with
$ python3 c-example3.py 2> /dev/null
n=1159780329
a=774520D7E98D7C35
a=774520D73720C51C
derivation result is pc=0x1151: ReadInfo(info=RegisterInfo(color=3, is_new=True, register=RegisterDef(name='rdi', size=8)))
We obtain the answer we wanted directly, namely that rax at
pc=0x1238 is directly derived from the value of rdi at
pc=0x1151 which is the input to the function.
Further Reading¶
See the Colorizer Concepts page for more details.