# Credit: much of this comes from:
#   http://www.muppetlabs.com/~breadbox/software/tiny/teensy.html
# by Brian Raiter

# compile tiny1.c
gcc -Wall tiny1.c

# run it, print out the exit value from running it ($?)
./a.out ; echo $?

# see how big it is
wc -c a.out

# recompile it, "strip" it to remove all symbols ("man strip")
gcc -Wall -s tiny1.c
wc -c a.out

# recompile it, add optimizations that diminish code size
gcc -Wall -s -Os tiny1.c
wc -c a.out

# try a simpler program
gcc -Wall -s -Os tiny2.c
./a.out; echo $?
wc -c a.out

# maybe C is too inefficient -- try handcoding assembly.
# all we need to do is return 42 from main; in x86, this
# means setting the %eax register to value 42, then ret'ing.
gcc -Wall -s -m32 tiny3.s
./a.out; echo $?
wc -c a.out

# nope. let's actually see what's in the file.
objdump -xdst a.out | less

# yike! let's temporarily put the symbols back in, and look again
gcc -Wall -m32 tiny3.s
objdump -xdst a.out | less

# so, the linker is, by default, causing the first byte that is
# executed to be the symbol _start. When linux runs an executable,
# it starts the process by setting the PC to the virtual address
# of the first byte of the .text section; this is where the linker
# places _start.
#
# gcc is linking in a bunch of stuff -- including its own _start
# routine that sets up argc, argv, and so on, and then invokes main().
#
# maybe we can dispense with that and just define our own _start
# routine!
gcc -Wall -s -m32 tiny4.s

# close, but gcc is still linking in its own start files.
# turns out gcc has an option to not do that.
gcc -Wall -s -m32 -nostartfiles tiny4.s
./a.out; echo $?

# yike! what went wrong? well, our assembly language is following
# the C calling conventions; it assumed that _start was a C procedure,
# and tried to "ret" out of it. But, in fact, a program starts life
# out not as a subroutine, but as a naked "jmp" to the _start address.
# How does _start exit? Well, it calls the exit system call!
# It turns out that exit() is a high-level libc function that eventually
# calls the low-level _exit() function; _exit() is just a simple
# wrapper around the assembly we need to call the Linux exit system
# call. _exit takes a single integer argument, which is the exit code
# (42 in our case). We need to push that onto the stack, then call
# _exit. Let's try that.
gcc -Wall -s -m32 -nostartfiles tiny5.s
./a.out; echo $?
wc -c a.out

# closer! let's try the objdump
objdump -xdst a.out | less

# looks like gcc is still dynamically linking in libc and other things,
# and there is space being taken up to specify those dependencies.
# we can confirm with ldd:
ldd a.out

# gcc has an option to not include the standard libraries; let's
# try that. (-nostartfiles happens as a side-effect of -nostdlib)
gcc -Wall -s -m32 -nostdlib tiny5.s

# well, of course that happened -- _exit is part of libc. We
# can confirm by recompiling with libc statically linked in.
# we won't strip the symbols.
gcc -static -Wall -m32 -nostartfiles tiny5.s
./a.out ; echo $?
wc -c a.out
objdump -xdt a.out | less

# ok, so we just need to emulate the important part of exit,
# which uses the "int 0x80" assembly software interrupt; this
# is how linux system calls work. we need to put the system
# call number for exit in eax. let's figure out the system
# call number for exit():
less /usr/include/asm/unistd_32.h

# it's system call number 1. So, we have to put that in eax, and
# we have to put the argument (the exit status code) in ebx.
gcc -Wall -s -m32 -nostdlib tiny6.s
./a.out ; echo $?
wc -c a.out
objdump -xdst a.out | less

# much closer! we can trim down the code next. We can set %eax to
# one by xor'ing it to zero and then using a one-byte increment.
# and, instead of using all 32 bits of %ebx, we can just use the
# lowest 8 bits (register "%bl"), since that's all the OS uses.
# note -- we're getting very, very linux-specific now. not safe.
# but fun!
gcc -Wall -s -m32 -nostdlib tiny7.s
./a.out ; echo $?
wc -c a.out

# saved four bytes!
objdump -xdst a.out | less

# unlikely to make the program itself any smaller. what's left
# in the executable file?
wc -c a.out
objdump -xdst a.out | less

# looks like gcc puts some gunk in there we don't need. at this
# point, we might as well use as and ld directly, rather than gcc,
# since gcc isn't adding anything.
#
# (as --32 emits a 32-bit object, so ld is told to use the matching
# 32-bit emulation with -m elf_i386; a 64-bit ld otherwise defaults
# to x86-64 output and refuses the i386 input file.)
as --32 tiny7.s -o tiny7.o
ld -m elf_i386 -s tiny7.o
./a.out; echo $?
wc -c a.out
objdump -xdst a.out | less

# Great! We're down to the ELF file format itself. Our program is
# only 7 bytes long -- and, the .text section of the executable is
# only 7 bytes, so we have shaved down the executable part of the
# executable to the smallest we can get. The rest is ELF overhead.

# so, to get smaller, we'll need to understand more about ELF
# (the "EXECUTABLE AND LINKABLE FORMAT"). For the gory details
# about ELF, you can see:
#   http://refspecs.freestandards.org/elf/elf.pdf

# ELF starts with a 52 byte long header; it contains information that
# describes the contents of the file. The first 16 bytes contain an
# identifier, including the 4-byte magic number (7F 45 4C 46), and some
# fields that describe other aspects of the file, such as 32-bit vs 64-bit,
# endianness, target architecture, the program start address, and so on.
# You can see this stuff at the top of objdump's output:
objdump -xdst a.out | less

# as well, there are two tables:
#
#  - the program header table, which is used by the loader (the linux
#    OS, via the execve() system call) to figure out where the various
#    sections in the file should be loaded into virtual memory.
#
#  - the section header table, which is used by the linker (gcc/ld) to
#    declare where the various sections are within the file itself.
#
# We *don't need* the section header anymore, since we're post-link.
# How do we get rid of it? Well, we'll need to construct a valid ELF
# executable somewhat manually, and not include it.
#
# We could look through all of the elf specs, linux's elf header
# files (/usr/include/linux/elf.h), and some executables generated
# by standard tools. After doing that, we can create an assembly
# file that contains these things somewhat manually. We'll switch
# to "nasm" rather than "as", since it lets us control various things
# more carefully.
nasm -f bin -o a.out tiny8.s
chmod 755 a.out
./a.out; echo $?
wc -c a.out
objdump -xdst a.out | less

# getting very close to the end game. note that there are 8 zeroes at
# the end of the e_ident field of the elf header. pure padding. Can't
# we fit our 7 byte program in it? yes!
nasm -f bin -o a.out tiny9.s
chmod 755 a.out
./a.out; echo $?
wc -c a.out

# ok, now we're gonna get ugly. Note that the last 8 bytes
# in the elf header (dw 1, dw 0, dw 0, dw 0) are identical
# to the first 8 bytes in the program header table
# (dd 1, dd 0). So...
nasm -f bin -o a.out tiny10.s
chmod 755 a.out
./a.out; echo $?
wc -c a.out

# so, can we make the elf header and program header table
# overlap even more? turns out the answer, disgustingly, is
# yes. Linux quietly ignores a bunch of fields in the elf
# header, so we can contort them to match what we need in
# our program header table.

# --> first four bytes: magic number, linux requires them
# --> rest of e_ident: ignored by linux! (we've already
#     used 7 bytes of it to stuff our program into)
# --> e_type: has to be 2 to indicate executable (rather than .o file)
# --> e_machine: has to be 3 to indicate i386 target
# --> e_version: ignored
# --> e_entry: has to be correct
# --> e_phoff: has to be correct program header offset
# --> e_shoff: about section header, so ignored for us
# --> e_flags: unused on intel
# --> e_ehsize: ignored by Linux!
# --> e_phentsize: has to be correct (validating size of header table)
# --> e_phnum: has to contain the right number of ph entries (1 for us)
# --> e_shentsize, e_shnum, e_shstrndx: ignored!
#
# --> p_type: must contain 1 (marks it as a loadable segment)
# --> p_offset: must contain correct offset within file to start loading
# --> p_vaddr: must be present, but doesn't need to be 0x08048000.
#     anything page-aligned above 0x00000000 and below 0x80000000 works!
# --> p_paddr: ignored
# --> p_filesz: must be correct (# of bytes to load from file to mem)
# --> p_memsz: must be correct (how large mem segment should be)
# --> p_flags: must be readable + executable (0x4 | 0x1), other bits usable
# --> p_align: mostly used by PIC, so linux will ignore for us
#
# Given this, we can more aggressively interpose the two headers...
#
# The major "aha" in this is setting the virtual address at which the
# program loads to 0x00200000, which makes the top half 0x0020, which
# is exactly the program header size we need in the e_phentsize field!!
nasm -f bin -o a.out tiny11.s
chmod 755 a.out
./a.out; echo $?
wc -c a.out

# woohoo! down to 64 bytes. One last set of dirty, dirty tricks;
# read the last little bit of:
#   http://www.muppetlabs.com/~breadbox/software/tiny/teensy.html
# to understand this awful monstrosity:
nasm -f bin -o a.out tiny12.s
chmod 755 a.out
./a.out; echo $?
wc -c a.out