Introduction

The starting point of this tutorial is the following brainfsck interpreter:

#include <stdio.h>
#include <stdlib.h>

/* Number of one-byte cells on the tape; the interpreter wraps the data
   pointer around at both ends of this range. */
#define TAPE_SIZE 30000
/* Maximum number of simultaneously open '[' loops; deeper programs are
   rejected as bad programs. */
#define MAX_NESTING 100

/*
 * Execution state handed to the interpreter: the tape plus the I/O callbacks
 * used for the ',' and '.' commands. Each callback receives the state itself
 * as its first argument.
 */
typedef struct bf_state
{
  /* First cell of a TAPE_SIZE-byte tape. */
  unsigned char* tape;
  /* Called for ',' to fetch the byte that is stored into the current cell. */
  unsigned char (*get_ch)(struct bf_state*);
  /* Called for '.' with the value of the current cell. */
  void (*put_ch)(struct bf_state*, unsigned char);
} bf_state_t;

/*
 * Report a malformed program on stderr, quoting up to 16 characters of the
 * source starting at the current `program` position, then terminate.
 * Requires a variable named `program` to be in scope where it is expanded.
 * The process exit status is whatever fprintf returns (the number of
 * characters written), not a fixed code.
 */
#define bad_program(s) exit(fprintf(stderr, "bad program near %.16s: %s\n", program, s))

/*
 * Interpret a NUL-terminated brainfsck program against the tape and I/O
 * callbacks in `state`.
 *
 * Runs of identical < > + - commands are counted and applied in one step.
 * Characters that are not brainfsck commands are ignored. The data pointer
 * wraps around at both ends of the tape. Unbalanced brackets, or more than
 * MAX_NESTING open loops, are reported through bad_program, which terminates
 * the process.
 */
static void bf_interpret(const char* program, bf_state_t* state)
{
  /* Stack of program positions just after each currently open '['. */
  const char* loops[MAX_NESTING];
  int nloops = 0;
  /* Length of the current run of identical commands. */
  int n;
  /* Non-zero while inside a loop that was entered with a zero cell; while it
     is non-zero, commands are scanned but have no effect. */
  int nskip = 0;
  /* tape_begin is one before the first cell and tape_end is the last cell,
     so the wrap tests below are `<= tape_begin` and `> tape_end`.
     NOTE(review): `state->tape - 1` points before the start of the tape
     array, which strict C pointer-arithmetic rules do not permit - confirm
     this is acceptable on the targeted compilers. */
  unsigned char* tape_begin = state->tape - 1;
  unsigned char* ptr = state->tape;
  unsigned char* tape_end = state->tape + TAPE_SIZE - 1;
  for(;;) {
    switch(*program++) {
    case '<':
      /* Count the whole run of '<' so the move is applied at once. */
      for(n = 1; *program == '<'; ++n, ++program);
      if(!nskip) {
        ptr -= n;
        /* Wrap around to the far end, as many times as the run requires. */
        while(ptr <= tape_begin)
          ptr += TAPE_SIZE;
      }
      break;
    case '>':
      for(n = 1; *program == '>'; ++n, ++program);
      if(!nskip) {
        ptr += n;
        while(ptr > tape_end)
          ptr -= TAPE_SIZE;
      }
      break;
    case '+':
      for(n = 1; *program == '+'; ++n, ++program);
      if(!nskip)
        /* The cell is an unsigned char, so the result wraps modulo 256
           wherever bytes are 8 bits. */
        *ptr += n;
      break;
    case '-':
      for(n = 1; *program == '-'; ++n, ++program);
      if(!nskip)
        *ptr -= n;
      break;
    case ',':
      if(!nskip)
        *ptr = state->get_ch(state);
      break;
    case '.':
      if(!nskip)
        state->put_ch(state, *ptr);
      break;
    case '[':
      if(nloops == MAX_NESTING)
        bad_program("Nesting too deep");
      /* Remember where the loop body starts so ']' can jump back to it. */
      loops[nloops++] = program;
      /* A zero cell means the body must not run: start (or deepen) skipping. */
      if(!*ptr)
        ++nskip;
      break;
    case ']':
      if(nloops == 0)
        bad_program("] without matching [");
      /* Non-zero cell: repeat the body. Zero cell: the loop is finished. */
      if(*ptr)
        program = loops[nloops-1];
      else
        --nloops;
      /* Leaving a skipped loop undoes the increment made at its '['. */
      if(nskip)
        --nskip;
      break;
    case 0:
      /* End of program text; any loop still open is an error. */
      if(nloops != 0)
        program = "<EOF>", bad_program("[ without matching ]");
      return;
    }
  }
}

/*
 * Output callback for '.': write one cell value to standard output.
 * The state argument is not needed for plain stdio output.
 */
static void bf_putchar(bf_state_t* s, unsigned char c)
{
  (void)s;
  fputc(c, stdout);
}

/*
 * Input callback for ',': read one character from standard input.
 * The int returned by getchar is narrowed to unsigned char, so EOF (a
 * negative int, typically -1) is not distinguishable from a data byte and
 * typically arrives as 255.
 */
static unsigned char bf_getchar(bf_state_t* s)
{
  int ch = getchar();
  (void)s;
  return (unsigned char)ch;
}

static void bf_run(const char* program)
{
  bf_state_t state;
  unsigned char tape[TAPE_SIZE] = {0};
  state.tape = tape;
  state.get_ch = bf_getchar;
  state.put_ch = bf_putchar;
  bf_interpret(program, &state);
}

/*
 * Entry point: load the brainfsck source file named by the single
 * command-line argument into memory and run it.
 *
 * Returns 0 after the program has run, or 1 on a usage error or when the
 * file cannot be opened, measured, or loaded into memory.
 */
int main(int argc, char** argv)
{
  if(argc == 2) {
    long sz;
    size_t nread;
    char* program;
    FILE* f = fopen(argv[1], "r");
    if(!f) {
      fprintf(stderr, "Cannot open %s\n", argv[1]);
      return 1;
    }
    /* ftell reports failure as -1; using that as a size would request a
       zero-byte buffer and then ask fread for a huge count. */
    if(fseek(f, 0, SEEK_END) != 0 || (sz = ftell(f)) < 0 ||
       fseek(f, 0, SEEK_SET) != 0) {
      fprintf(stderr, "Cannot determine size of %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    program = (char*)malloc((size_t)sz + 1);
    if(!program) {
      fprintf(stderr, "Out of memory reading %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    /* Terminate at the number of bytes actually read, which may be fewer
       than the size reported by ftell. */
    nread = fread(program, 1, (size_t)sz, f);
    program[nread] = 0;
    fclose(f);
    bf_run(program);
    free(program);
    return 0;
  } else {
    fprintf(stderr, "Usage: %s INFILE.bf\n", argv[0]);
    return 1;
  }
}

Over the course of this tutorial, we'll use DynASM to transform this interpreter into a brainfsck JIT compiler, therein hopefully making it faster.

To follow along, clone this repository and start from bf_c.c:

git clone https://github.com/corsix/dynasm-doc.git
cd dynasm-doc
git submodule update --init
cp bf_c.c tutorial.c

The functionality of the starting point can be checked by running the following, which should very slowly render the Mandelbrot set:

gcc -o tutorial tutorial.c
./tutorial mandelbrot.bf

Groundwork

Before the real fun can begin, we need to lay a few pieces of groundwork.


Includes

First of all, we need to #include the DynASM headers:

#include "luajit-2.0/dynasm/dasm_proto.h"
#include "luajit-2.0/dynasm/dasm_x86.h"

As described in more detail on the reference page, dasm_proto.h defines the DynASM API, and dasm_x86.h contains the implementation of said API (for x86 / x64).


Types

Next, we'll rename bf_interpret to bf_compile and change its type signature:

static void bf_interpret(const char* program, bf_state_t* state)
static void(* bf_compile(const char* program) )(bf_state_t*)

Where previously bf_interpret accepted both a const char* and a bf_state_t*, bf_compile now accepts just the const char* portion, and will return a function pointer to the JIT-compiled code.

The code which calls bf_interpret also needs updating at this point:

bf_interpret(program, &state);
bf_compile(program)(&state);

Initialisation

With the groundwork done, the next task is creating and initialising a DynASM state.


Variables

We'll need a variable of type dasm_State* to contain the DynASM state, and two extra variables whose purpose will be explained later. We can also get rid of an interpreter variable at the same time:

int nskip = 0;
dasm_State* d;
unsigned npc = 8;
unsigned nextpc = 0;

.arch

We now reach the first of many DynASM directives, which are instructions to the DynASM preprocessor. In this case, we need to instruct it as to which architecture we're generating machine code for, which will either be x86 or x64:

|.if X64
|.arch x64
|.else
|.arch x86
|.endif

Lines starting with a vertical bar will be picked up by the DynASM preprocessor. The .if, .else, and .endif directives will be handled by DynASM's prepreprocessor, with semantics similar to C's preprocessor #if, #else, and #endif. As a result, exactly one .arch directive will take effect.


dasm_init

Having declared a variable of type dasm_State*, we need to actually allocate a dasm_State to put in it, which is done by calling dasm_init:

|.section code
dasm_init(&d, DASM_MAXSECTION);

Note that as well as a dasm_State**, dasm_init also requires an integer argument, which specifies the number of sections of machine code that'll be generated. We only need one code section, so we invoke the .section directive with one argument, which the DynASM preprocessor will rewrite to #define DASM_MAXSECTION 1 (amongst other things). This is a slightly convoluted way of passing 1 as the second argument to dasm_init, but is a good habit in case we need more sections in the future.


dasm_setupglobal

dasm_init will have allocated a dasm_State, but won't have fully initialised it. A few more calls are required to fully initialise the state, the first of which is dasm_setupglobal:

|.globals lbl_
void* labels[lbl__MAX];
dasm_setupglobal(&d, labels, lbl__MAX);

The .globals directive with the argument lbl_ will be rewritten by the DynASM preprocessor to become an enum containing several things, one of which will be lbl__MAX. This value must be passed to dasm_setupglobal, along with an array of void* of equal extent. We'll make use of this labels array much later.


dasm_setup

The next call in the initialisation sequence is to dasm_setup:

|.actionlist bf_actions
dasm_setup(&d, bf_actions);

The .actionlist directive with the argument bf_actions will be rewritten by the DynASM preprocessor to become a variable called bf_actions, and this variable must be passed to dasm_setup.


dasm_growpc

For a lot of use cases, the dasm_State would be fully initialised at this point. However, as we'll be making use of dynamic labels, there is one more initialisation call to make, which is to dasm_growpc:

dasm_growpc(&d, npc);

We're passing npc as an argument, which is a variable we declared earlier. Said variable represents the number of dynamic labels we've allocated, while the related variable nextpc represents the number of dynamic labels we've used. These dynamic labels will come into play when compiling [ and ].


Abstractions

Before we start emitting machine code, it is useful to define a few abstractions. The first few abstractions are to give slightly more meaningful names to the registers we'll be using:

Abstraction | Corresponding Interpreter Variable | Definition
aState | state | ebp or r12
aPtr | ptr | ebx or rbx
aTapeBegin | tape_begin | esi or rsi or r13
aTapeEnd | tape_end | edi or rdi or r14

The next group of useful abstractions relate to function calls:

Abstraction | Description
prologue | Set up the stack frame, and set aState from the passed parameter.
prepcall1 arg1 | Prepare to call a function with one argument, arg1.
prepcall2 arg1, arg2 | Prepare to call a function with two arguments, arg1 and arg2.
postcall n | Do cleanup after a call to a function with n arguments.
epilogue | Tear down the stack frame.

All of these abstractions are defined by means of .define (for simple substitutions) or .macro (for more complex constructions), and have different definitions for each of x86, x64 POSIX, and x64 Windows:

|.if X64
  |.define aPtr, rbx
  |.define aState, r12
  |.if WIN
    |.define aTapeBegin, rsi
    |.define aTapeEnd, rdi
    |.define rArg1, rcx
    |.define rArg2, rdx
  |.else
    |.define aTapeBegin, r13
    |.define aTapeEnd, r14
    |.define rArg1, rdi
    |.define rArg2, rsi
  |.endif
  |.macro prepcall1, arg1
    | mov rArg1, arg1
  |.endmacro
  |.macro prepcall2, arg1, arg2
    | mov rArg1, arg1
    | mov rArg2, arg2
  |.endmacro
  |.define postcall, .nop
  |.macro prologue
    | push aPtr
    | push aState
    | push aTapeBegin
    | push aTapeEnd
    | push rax
    | mov aState, rArg1
  |.endmacro
  |.macro epilogue
    | pop rax
    | pop aTapeEnd
    | pop aTapeBegin
    | pop aState
    | pop aPtr
    | ret
  |.endmacro
|.else
  |.define aPtr, ebx
  |.define aState, ebp
  |.define aTapeBegin, esi
  |.define aTapeEnd, edi
  |.macro prepcall1, arg1
    | push arg1
  |.endmacro
  |.macro prepcall2, arg1, arg2
    | push arg2
    | push arg1
  |.endmacro
  |.macro postcall, n
    | add esp, 4*n
  |.endmacro
  |.macro prologue
    | push aPtr
    | push aState
    | push aTapeBegin
    | push aTapeEnd
    | mov aState, [esp+20]
  |.endmacro
  |.macro epilogue
    | pop aTapeEnd
    | pop aTapeBegin
    | pop aState
    | pop aPtr
    | ret 4
  |.endmacro
|.endif

Having made all of these architecture and operating system dependent definitions for the DynASM preprocessor, it is useful to check that the architecture and operating system specified to the DynASM preprocessor match the architecture and operating system as known by the C preprocessor, which is done by the following:

||#if ((defined(_M_X64) || defined(__amd64__)) != X64) || (defined(_WIN32) != WIN)
#error "Wrong DynASM flags used: pass `-D X64` and/or `-D WIN` to dynasm.lua as appropriate"
#endif

Note the line starting with two vertical bars: such lines undergo .define substitution by the DynASM prepreprocessor (and can participate in .macro definitions), but are otherwise unchanged by the DynASM preprocessor. In particular, if X64 and/or WIN are defined (to 1) at DynASM prepreprocessing time, then they'll be replaced by 1. If they're not defined at DynASM prepreprocessing time, they'll be left unchanged, and be replaced by 0 by the C preprocessor.


Emitting Code

With all of that done, we're finally ready to emit some machine code.


Prologue

The first thing we need to emit is a prologue, which replaces some of the initialisation previously done by the interpreter:

unsigned char* tape_begin = state->tape - 1;
unsigned char* ptr = state->tape;
unsigned char* tape_end = state->tape + TAPE_SIZE - 1;
|.type state, bf_state_t, aState

dasm_State** Dst = &d;
|.code
|->bf_main:
| prologue
| mov aPtr, state->tape
| lea aTapeBegin, [aPtr-1]
| lea aTapeEnd, [aPtr+TAPE_SIZE-1]

The first item of interest here is the .type directive, which subsequently allows us to write state->tape as a shorthand for [aState + offsetof(bf_state_t,tape)].

The next line defines a variable called Dst, and initialises it to &d. This is done because the DynASM preprocessor will rewrite the subsequent lines to calls of the form dasm_put(Dst, ...), and like the previous calls we've made to dasm_ functions, the first argument wants to be &d.

The next line contains a .code directive. Said directive was introduced by the prior .section code directive, and states that subsequently emitted machine code should be placed in the code section (which happens to be the one and only section we're working with).

After this, we define the global label ->bf_main. After we've finished emitting machine code, we'll obtain the address of this global label and turn it into a function pointer.

We then invoke the prologue macro as defined earlier, which will cause a few instructions to be emitted.

Finally, we have a mov instruction and two lea instructions, which directly correspond to the removed interpreter code. As mentioned, the state->tape specified as an operand to mov is recognised as shorthand for [aState + offsetof(bf_state_t,tape)]. Note that both offsetof(bf_state_t,tape) and TAPE_SIZE-1 (part of the lea operand) are so-called encoding-time constants: DynASM doesn't understand what they mean, so it defers their computation to the C compiler. Both of these values happen to be compile-time constants in C, but encoding-time constants don't have to be compile-time constants (we'll see examples of this in just a minute).


Tape Movement

We've reached the guts of the interpreter now, and the first job is to replace the interpreter's handling of < with the compiler's interpretation:

if(!nskip) {
  ptr -= n;
  while(ptr <= tape_begin)
    ptr += TAPE_SIZE;
}
| sub aPtr, n%TAPE_SIZE
| cmp aPtr, aTapeBegin
| ja >1
| add aPtr, TAPE_SIZE
|1:

Note that the compiler doesn't have a notion of skipping over code like the interpreter does, so the outer if is dropped entirely. After that, ptr -= n; and some iterations of the subsequent loop have become | sub aPtr, n%TAPE_SIZE. Note that n%TAPE_SIZE is an encoding-time constant which isn't a compile-time constant in C: DynASM still doesn't understand what the operand means, but in this case the final value of the operand is computed when bf_compile is running.

After performing some iterations of the loop at compile time by means of %TAPE_SIZE, there might still be one iteration to perform at runtime, which corresponds to the cmp, ja, and add instructions. Note that the syntax >1 jumps forward to the next definition of the local label 1, which is just after the add instruction.

A similar transformation happens for >, but with add and sub transposed:

if(!nskip) {
  ptr += n;
  while(ptr > tape_end)
    ptr -= TAPE_SIZE;
}
| add aPtr, n%TAPE_SIZE
| cmp aPtr, aTapeEnd
| jbe >1
| sub aPtr, TAPE_SIZE
|1:

Arithmetic

The next instruction to be rewritten is +, which is relatively simple:

if(!nskip)
  *ptr += n;
| add byte [aPtr], n

The only notable thing is the presence of the memory size specifier byte before the memory operand [aPtr]. As neither the memory operand nor the immediate operand have a natural operand size, DynASM needs to be explicitly told. Note that our prior uses of memory operands didn't require memory size specifiers: lea instructions don't require them because the memory operands aren't memory accesses, and mov aPtr, state->tape didn't require one because the size of the memory operand was inferred to be equal to size of the register operand.

The handling of - is similar:

if(!nskip)
  *ptr -= n;
| sub byte [aPtr], n

I/O

The next job involves the logic for , (read char) and . (write char), which are notable because they involve calling other functions. The first of these is ,:

if(!nskip)
  *ptr = state->get_ch(state);
| prepcall1 aState
| call aword state->get_ch
| postcall 1
| mov byte [aPtr], al

Note the invocations of the prepcall1 and postcall abstractions that we defined earlier. Also note that state->get_ch is shorthand for [aState + offsetof(bf_state_t,get_ch)] courtesy of the earlier .type directive, and that memory size specifiers are still required when these shorthands are used: the size of the memory operand will not be automatically inferred to be equal to the size of the named C structure member. The aword (address-sized word) specifier refers to either 4 bytes (x86) or 8 bytes (x64).

The transformation of . is similar:

if(!nskip)
  state->put_ch(state, *ptr);
| movzx r0, byte [aPtr]
| prepcall2 aState, r0
| call aword state->put_ch
| postcall 2

Note that r0 is used as a register operand: it refers to either eax (x86) or rax (x64).


Loops

We now reach the really interesting instructions: [ and ]. The first of these has a rather complex transformation:

loops[nloops++] = program;
if(!*ptr)
  ++nskip;
if(program[0] == '-' && program[1] == ']') {
  program += 2;
  | xor eax, eax
  | mov byte [aPtr], al
} else {
  if(nextpc == npc) {
    npc *= 2;
    dasm_growpc(&d, npc);
  }
  | cmp byte [aPtr], 0
  | jz =>nextpc+1
  |=>nextpc:
  loops[nloops++] = nextpc;
  nextpc += 2;
}

First of all, we now recognise the instruction sequence [-] and emit optimised machine code for it. Having excluded this specific case, the general case requires two dynamic labels: one for jumping from [ to after ] (previously done by means of the nskip variable in the interpreter), and one for jumping from ] to after [ (previously done by means of the loops stack).

If the number of dynamic labels we've used equals the number we've allocated, then we call dasm_growpc in order to allocate some more. We then emit a cmp instruction, which does the obvious thing. If the byte at [aPtr] was zero, we jump to the dynamic label =>nextpc+1 (which we'll subsequently define when we see ]). After this, we define the dynamic label =>nextpc (which is what ] will jump back to). Note that both nextpc+1 and nextpc are encoding-time constants. Also note that the loops stack now holds dynamic label numbers rather than positions in the program text, so its declaration needs to change from const char* loops[MAX_NESTING] to unsigned loops[MAX_NESTING].

The second half of the magic comes from the handling of ]:

if(*ptr)
  program = loops[nloops-1];
else
  --nloops;
if(nskip)
  --nskip;
--nloops;
| cmp byte [aPtr], 0
| jnz =>loops[nloops]
|=>loops[nloops]+1:

Note the conditional jump to the dynamic label =>loops[nloops] (which jumps to the =>nextpc defined by the corresponding [), and the definition of the dynamic label =>loops[nloops]+1 (which is jumped to by jz =>nextpc+1 emitted by the corresponding [).


Epilogue

Having covered all of the instructions, all that is left is handling the epilogue and extracting a function pointer from DynASM:

return;
| epilogue
link_and_encode(&d);
dasm_free(&d);
return (void(*)(bf_state_t*))labels[lbl_bf_main];

The first of these lines invokes the epilogue macro we defined earlier. The next line calls out to link_and_encode, which is a function we'll define in just a minute. We then call dasm_free, which frees the DynASM state. Finally, we take the labels array we previously defined and passed to dasm_setupglobal, index it with lbl_bf_main (which was defined by .globals lbl_ and corresponds to the global label ->bf_main), and cast it to a function pointer.

The link_and_encode function is defined as follows:

#if _WIN32
#include <Windows.h>
#else
#include <sys/mman.h>
#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

/*
 * Link and encode the machine code accumulated in *d into a freshly
 * allocated block of memory, and return that block.
 *
 * The block is allocated read-write, filled by dasm_encode, and then switched
 * to read-execute. Any failure (linking, allocation, encoding, or changing
 * the protection) is reported on stderr and terminates the process, so the
 * returned pointer is always usable.
 */
static void* link_and_encode(dasm_State** d)
{
  size_t sz;
  void* buf;
  /* dasm_link computes the total code size; it reports success as
     DASM_S_OK, which is zero. */
  if(dasm_link(d, &sz) != 0) {
    fprintf(stderr, "dasm_link failed\n");
    exit(1);
  }
#ifdef _WIN32
  buf = VirtualAlloc(0, sz, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
  buf = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  /* mmap signals failure with MAP_FAILED rather than a null pointer. */
  if(buf == MAP_FAILED)
    buf = 0;
#endif
  if(!buf) {
    fprintf(stderr, "Cannot allocate %lu bytes for machine code\n", (unsigned long)sz);
    exit(1);
  }
  if(dasm_encode(d, buf) != 0) {
    fprintf(stderr, "dasm_encode failed\n");
    exit(1);
  }
#ifdef _WIN32
  {
    DWORD dwOld;
    if(!VirtualProtect(buf, sz, PAGE_EXECUTE_READ, &dwOld)) {
      fprintf(stderr, "Cannot make machine code executable\n");
      exit(1);
    }
  }
#else
  if(mprotect(buf, sz, PROT_READ | PROT_EXEC) != 0) {
    fprintf(stderr, "Cannot make machine code executable\n");
    exit(1);
  }
#endif
  return buf;
}

The particularly interesting calls are to dasm_link and dasm_encode. The remaining calls use operating system functionality to allocate a block of read-write memory and then convert said block to read-execute. Note that we could have allocated a block of read-write-execute memory, but it is generally considered bad form to have memory which is writable and executable at the same time.


Compiling

If you've been following along, your tutorial.c should now correspond to the following:

||#if ((defined(_M_X64) || defined(__amd64__)) != X64) || (defined(_WIN32) != WIN)
#error "Wrong DynASM flags used: pass `-D X64` and/or `-D WIN` to dynasm.lua as appropriate"
#endif
#include <stdio.h>
#include <stdlib.h>
#include "luajit-2.0/dynasm/dasm_proto.h"
#include "luajit-2.0/dynasm/dasm_x86.h"
#if _WIN32
#include <Windows.h>
#else
#include <sys/mman.h>
#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

/*
 * Link and encode the machine code accumulated in *d into a freshly
 * allocated block of memory, and return that block.
 *
 * The block is allocated read-write, filled by dasm_encode, and then switched
 * to read-execute. Any failure (linking, allocation, encoding, or changing
 * the protection) is reported on stderr and terminates the process, so the
 * returned pointer is always usable.
 */
static void* link_and_encode(dasm_State** d)
{
  size_t sz;
  void* buf;
  /* dasm_link computes the total code size; it reports success as
     DASM_S_OK, which is zero. */
  if(dasm_link(d, &sz) != 0) {
    fprintf(stderr, "dasm_link failed\n");
    exit(1);
  }
#ifdef _WIN32
  buf = VirtualAlloc(0, sz, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
  buf = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  /* mmap signals failure with MAP_FAILED rather than a null pointer. */
  if(buf == MAP_FAILED)
    buf = 0;
#endif
  if(!buf) {
    fprintf(stderr, "Cannot allocate %lu bytes for machine code\n", (unsigned long)sz);
    exit(1);
  }
  if(dasm_encode(d, buf) != 0) {
    fprintf(stderr, "dasm_encode failed\n");
    exit(1);
  }
#ifdef _WIN32
  {
    DWORD dwOld;
    if(!VirtualProtect(buf, sz, PAGE_EXECUTE_READ, &dwOld)) {
      fprintf(stderr, "Cannot make machine code executable\n");
      exit(1);
    }
  }
#else
  if(mprotect(buf, sz, PROT_READ | PROT_EXEC) != 0) {
    fprintf(stderr, "Cannot make machine code executable\n");
    exit(1);
  }
#endif
  return buf;
}

/* Number of one-byte cells on the tape; the generated code wraps the data
   pointer around at both ends of this range. */
#define TAPE_SIZE 30000
/* Maximum number of simultaneously open '[' loops; deeper programs are
   rejected as bad programs. */
#define MAX_NESTING 100

/*
 * Execution state passed to the compiled program: the tape plus the I/O
 * callbacks used for the ',' and '.' commands. The generated code addresses
 * these members through the `.type state, bf_state_t, aState` directive, and
 * passes the state itself as the first argument of each callback.
 */
typedef struct bf_state
{
  /* First cell of a TAPE_SIZE-byte tape. */
  unsigned char* tape;
  /* Called for ',' to fetch the byte that is stored into the current cell. */
  unsigned char (*get_ch)(struct bf_state*);
  /* Called for '.' with the value of the current cell. */
  void (*put_ch)(struct bf_state*, unsigned char);
} bf_state_t;

/*
 * Report a malformed program on stderr, quoting up to 16 characters of the
 * source starting at the current `program` position, then terminate.
 * Requires a variable named `program` to be in scope where it is expanded.
 * The process exit status is whatever fprintf returns (the number of
 * characters written), not a fixed code.
 */
#define bad_program(s) exit(fprintf(stderr, "bad program near %.16s: %s\n", program, s))

/*
 * Compile a NUL-terminated brainfsck program to machine code with DynASM and
 * return a pointer to the generated function, which takes the bf_state_t to
 * run against.
 *
 * Runs of identical < > + - commands are folded into single instructions, and
 * the sequence [-] is emitted as a direct store of zero. Characters that are
 * not brainfsck commands are skipped. A malformed program (unbalanced
 * brackets, or more than MAX_NESTING open loops) is reported through
 * bad_program, which terminates the process.
 *
 * The pointer returned by link_and_encode is not used here; the entry point
 * is read from the global label table instead.
 */
static void(* bf_compile(const char* program) )(bf_state_t*)
{
  /* Stack of dynamic label numbers, one entry per currently open '['. */
  unsigned loops[MAX_NESTING];
  int nloops = 0;
  /* Length of the current run of identical commands. */
  int n;
  dasm_State* d;
  /* npc: number of dynamic labels allocated so far.
     nextpc: number of dynamic labels already handed out. */
  unsigned npc = 8;
  unsigned nextpc = 0;
  |.if X64
  |.arch x64
  |.else
  |.arch x86
  |.endif
  |.section code
  dasm_init(&d, DASM_MAXSECTION);
  |.globals lbl_
  void* labels[lbl__MAX];
  dasm_setupglobal(&d, labels, lbl__MAX);
  |.actionlist bf_actions
  dasm_setup(&d, bf_actions);
  dasm_growpc(&d, npc);
  /*
   * Register aliases and call helpers: x64 passes arguments in registers
   * (rcx/rdx with WIN, rdi/rsi otherwise), x86 pushes them on the stack.
   * NOTE(review): the x64 prologue pushes rax as a fifth register,
   * presumably to keep the stack 16-byte aligned at the emitted calls -
   * confirm.
   * NOTE(review): the WIN variant reserves no extra stack space before
   * calling get_ch/put_ch; check this against the home-space requirement of
   * the Win64 calling convention.
   * NOTE(review): the x86 epilogue returns with `ret 4` (the callee pops its
   * argument) while the result is called through a plain function pointer;
   * confirm the caller's calling convention agrees.
   */
  |.if X64
    |.define aPtr, rbx
    |.define aState, r12
    |.if WIN
      |.define aTapeBegin, rsi
      |.define aTapeEnd, rdi
      |.define rArg1, rcx
      |.define rArg2, rdx
    |.else
      |.define aTapeBegin, r13
      |.define aTapeEnd, r14
      |.define rArg1, rdi
      |.define rArg2, rsi
    |.endif
    |.macro prepcall1, arg1
      | mov rArg1, arg1
    |.endmacro
    |.macro prepcall2, arg1, arg2
      | mov rArg1, arg1
      | mov rArg2, arg2
    |.endmacro
    |.define postcall, .nop
    |.macro prologue
      | push aPtr
      | push aState
      | push aTapeBegin
      | push aTapeEnd
      | push rax
      | mov aState, rArg1
    |.endmacro
    |.macro epilogue
      | pop rax
      | pop aTapeEnd
      | pop aTapeBegin
      | pop aState
      | pop aPtr
      | ret
    |.endmacro
  |.else
    |.define aPtr, ebx
    |.define aState, ebp
    |.define aTapeBegin, esi
    |.define aTapeEnd, edi
    |.macro prepcall1, arg1
      | push arg1
    |.endmacro
    |.macro prepcall2, arg1, arg2
      | push arg2
      | push arg1
    |.endmacro
    |.macro postcall, n
      | add esp, 4*n
    |.endmacro
    |.macro prologue
      | push aPtr
      | push aState
      | push aTapeBegin
      | push aTapeEnd
      | mov aState, [esp+20]
    |.endmacro
    |.macro epilogue
      | pop aTapeEnd
      | pop aTapeBegin
      | pop aState
      | pop aPtr
      | ret 4
    |.endmacro
  |.endif

  |.type state, bf_state_t, aState
  
  /* The DynASM preprocessor emits calls that take a variable named Dst. */
  dasm_State** Dst = &d;
  /* Function entry: load the tape pointer, then derive the two wrap bounds
     (one before the first cell, and the last cell). */
  |.code
  |->bf_main:
  | prologue
  | mov aPtr, state->tape
  | lea aTapeBegin, [aPtr-1]
  | lea aTapeEnd, [aPtr+TAPE_SIZE-1]
  for(;;) {
    switch(*program++) {
    case '<':
      for(n = 1; *program == '<'; ++n, ++program);
      /* Move left by the run length reduced modulo the tape size, then wrap
         once at run time if that went past the start. */
      | sub aPtr, n%TAPE_SIZE
      | cmp aPtr, aTapeBegin
      | ja >1
      | add aPtr, TAPE_SIZE
      |1:
      break;
    case '>':
      for(n = 1; *program == '>'; ++n, ++program);
      /* Mirror image of '<': move right, then wrap once if past the end. */
      | add aPtr, n%TAPE_SIZE
      | cmp aPtr, aTapeEnd
      | jbe >1
      | sub aPtr, TAPE_SIZE
      |1:
      break;
    case '+':
      for(n = 1; *program == '+'; ++n, ++program);
      | add byte [aPtr], n
      break;
    case '-':
      for(n = 1; *program == '-'; ++n, ++program);
      | sub byte [aPtr], n
      break;
    case ',':
      /* Call state->get_ch(state) and store the low byte of the result. */
      | prepcall1 aState
      | call aword state->get_ch
      | postcall 1
      | mov byte [aPtr], al
      break;
    case '.':
      /* Call state->put_ch(state, current cell). */
      | movzx r0, byte [aPtr]
      | prepcall2 aState, r0
      | call aword state->put_ch
      | postcall 2
      break;
    case '[':
      if(nloops == MAX_NESTING)
        bad_program("Nesting too deep");
      if(program[0] == '-' && program[1] == ']') {
        /* "[-]" just clears the cell: consume it and store zero directly. */
        program += 2;
        | xor eax, eax
        | mov byte [aPtr], al
      } else {
        /* Each loop uses two dynamic labels: nextpc marks the start of the
           body and nextpc+1 the point just after the matching ']'. Grow the
           label pool first if it is exhausted. */
        if(nextpc == npc) {
          npc *= 2;
          dasm_growpc(&d, npc);
        }
        | cmp byte [aPtr], 0
        | jz =>nextpc+1
        |=>nextpc:
        loops[nloops++] = nextpc;
        nextpc += 2;
      }
      break;
    case ']':
      if(nloops == 0)
        bad_program("] without matching [");
      --nloops;
      /* loops[nloops] is the label number recorded by the matching '[':
         jump back to the body while the cell is non-zero, and define the
         exit label that the '[' jumps to. */
      | cmp byte [aPtr], 0
      | jnz =>loops[nloops]
      |=>loops[nloops]+1:
      break;
    case 0:
      if(nloops != 0)
        program = "<EOF>", bad_program("[ without matching ]");
      /* End of source: close the function, turn the buffered code into
         executable memory, release the DynASM state, and return the address
         recorded for the bf_main global label. */
      | epilogue
      link_and_encode(&d);
      dasm_free(&d);
      return (void(*)(bf_state_t*))labels[lbl_bf_main];
    }
  }
}

/*
 * Output callback for '.': write one cell value to standard output.
 * The state argument is not needed for plain stdio output.
 */
static void bf_putchar(bf_state_t* s, unsigned char c)
{
  (void)s;
  fputc(c, stdout);
}

/*
 * Input callback for ',': read one character from standard input.
 * The int returned by getchar is narrowed to unsigned char, so EOF (a
 * negative int, typically -1) is not distinguishable from a data byte and
 * typically arrives as 255.
 */
static unsigned char bf_getchar(bf_state_t* s)
{
  int ch = getchar();
  (void)s;
  return (unsigned char)ch;
}

static void bf_run(const char* program)
{
  bf_state_t state;
  unsigned char tape[TAPE_SIZE] = {0};
  state.tape = tape;
  state.get_ch = bf_getchar;
  state.put_ch = bf_putchar;
  bf_compile(program)(&state);
}

/*
 * Entry point: load the brainfsck source file named by the single
 * command-line argument into memory and run it.
 *
 * Returns 0 after the program has run, or 1 on a usage error or when the
 * file cannot be opened, measured, or loaded into memory.
 */
int main(int argc, char** argv)
{
  if(argc == 2) {
    long sz;
    size_t nread;
    char* program;
    FILE* f = fopen(argv[1], "r");
    if(!f) {
      fprintf(stderr, "Cannot open %s\n", argv[1]);
      return 1;
    }
    /* ftell reports failure as -1; using that as a size would request a
       zero-byte buffer and then ask fread for a huge count. */
    if(fseek(f, 0, SEEK_END) != 0 || (sz = ftell(f)) < 0 ||
       fseek(f, 0, SEEK_SET) != 0) {
      fprintf(stderr, "Cannot determine size of %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    program = (char*)malloc((size_t)sz + 1);
    if(!program) {
      fprintf(stderr, "Out of memory reading %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    /* Terminate at the number of bytes actually read, which may be fewer
       than the size reported by ftell. */
    nread = fread(program, 1, (size_t)sz, f);
    program[nread] = 0;
    fclose(f);
    bf_run(program);
    free(program);
    return 0;
  } else {
    fprintf(stderr, "Usage: %s INFILE.bf\n", argv[0]);
    return 1;
  }
}

If you've not been following that closely, you can reach the same state by doing:

git clone https://github.com/corsix/dynasm-doc.git
cd dynasm-doc
git submodule update --init
cp bf_dynasm.c tutorial.c

In order to compile tutorial.c, we first need to run it through the DynASM preprocessor. Said preprocessor is written in Lua, so we'll first compile a minimal Lua interpreter:

gcc -o minilua luajit-2.0/src/host/minilua.c

With this interpreter in place, we can run the DynASM preprocessor:

./minilua luajit-2.0/dynasm/dynasm.lua -o tutorial.posix64.c -D X64 tutorial.c

With preprocessing done, we can now invoke a C compiler:

gcc -o tutorial tutorial.posix64.c

We can then run the resulting executable, which should fairly quickly render the Mandelbrot set:

./tutorial mandelbrot.bf