The starting point of this tutorial is the following brainfsck interpreter:
#include <stdio.h>
#include <stdlib.h>
/* Number of cells on the tape; the interpreter wraps the data pointer around at both ends. */
#define TAPE_SIZE 30000
/* Capacity of the open-loop stack: programs nesting [ ] deeper than this are rejected. */
#define MAX_NESTING 100
/* Everything a running program needs: its tape and its two I/O callbacks. */
typedef struct bf_state bf_state_t;
struct bf_state
{
  /* First cell of the tape. */
  unsigned char* tape;
  /* Called for ',': its result is stored in the current cell. */
  unsigned char (*get_ch)(bf_state_t*);
  /* Called for '.': receives the value of the current cell. */
  void (*put_ch)(bf_state_t*, unsigned char);
};
/*
 * Report a malformed program on stderr and terminate the process.
 * Expands to a single expression, so it may follow a comma operator.
 * Relies on a variable named `program` in the calling scope: up to 16
 * characters starting there are printed as context before the message s.
 * The exit status is fprintf's return value, not a fixed error code.
 */
#define bad_program(s) exit(fprintf(stderr, "bad program near %.16s: %s\n", program, s))
/*
 * Interpret a NUL-terminated brainfsck program against state->tape.
 *
 * program: source text; characters other than the eight commands match no
 *          case of the switch below and are skipped.
 * state:   supplies the tape and the get_ch / put_ch callbacks used by
 *          ',' and '.'.
 *
 * Returns when the terminating NUL is reached. Loops nested deeper than
 * MAX_NESTING and unbalanced brackets are reported through bad_program(),
 * which terminates the process.
 */
static void bf_interpret(const char* program, bf_state_t* state)
{
  /* Stack of open loops; each entry points just past the '[' that opened it. */
  const char* loops[MAX_NESTING];
  int nloops = 0;
  /* Length of the current run of identical commands. */
  int n;
  /* Non-zero while the body of a loop is being skipped; commands are then
     still parsed but have no effect on the tape or on I/O. */
  int nskip = 0;
  /* tape_begin is one position before the first cell and tape_end is the
     last cell; both are used only as wrap-around bounds for ptr. */
  unsigned char* tape_begin = state->tape - 1;
  unsigned char* ptr = state->tape;
  unsigned char* tape_end = state->tape + TAPE_SIZE - 1;
  for(;;) {
    /* program is advanced past the command before the case body runs. */
    switch(*program++) {
    case '<':
      /* Fold a run of '<' into a single move of n cells. */
      for(n = 1; *program == '<'; ++n, ++program);
      if(!nskip) {
        ptr -= n;
        /* Wrap around to the far end of the tape. */
        while(ptr <= tape_begin)
          ptr += TAPE_SIZE;
      }
      break;
    case '>':
      /* Fold a run of '>' into a single move of n cells. */
      for(n = 1; *program == '>'; ++n, ++program);
      if(!nskip) {
        ptr += n;
        /* Wrap around to the start of the tape. */
        while(ptr > tape_end)
          ptr -= TAPE_SIZE;
      }
      break;
    case '+':
      /* Fold a run of '+' into a single addition to the current cell. */
      for(n = 1; *program == '+'; ++n, ++program);
      if(!nskip)
        *ptr += n;
      break;
    case '-':
      /* Fold a run of '-' into a single subtraction from the current cell. */
      for(n = 1; *program == '-'; ++n, ++program);
      if(!nskip)
        *ptr -= n;
      break;
    case ',':
      /* Read one value through the input callback into the current cell. */
      if(!nskip)
        *ptr = state->get_ch(state);
      break;
    case '.':
      /* Pass the current cell to the output callback. */
      if(!nskip)
        state->put_ch(state, *ptr);
      break;
    case '[':
      if(nloops == MAX_NESTING)
        bad_program("Nesting too deep");
      /* Remember where the loop body starts so ']' can jump back to it. */
      loops[nloops++] = program;
      /* A zero cell means the body must not run: start (or deepen) skipping. */
      if(!*ptr)
        ++nskip;
      break;
    case ']':
      if(nloops == 0)
        bad_program("] without matching [");
      /* Non-zero cell: repeat the body, leaving the loop on the stack.
         Zero cell: the loop is finished, so pop it. */
      if(*ptr)
        program = loops[nloops-1];
      else
        --nloops;
      /* Leaving a skipped loop undoes the increment made at its '['. */
      if(nskip)
        --nskip;
      break;
    case 0:
      /* End of program. Any loop still open is an error; program is pointed
         at "<EOF>" first so that the message shows that text as context. */
      if(nloops != 0)
        program = "<EOF>", bad_program("[ without matching ]");
      return;
    }
  }
}
/* Output callback: writes the byte c to stdout. The state argument is not used. */
static void bf_putchar(bf_state_t* s, unsigned char c)
{
  (void)s;
  fputc(c, stdout);
}
/*
 * Input callback: reads one character from stdin. The state argument is not
 * used. End of input is not handled specially: whatever the read returns is
 * narrowed to unsigned char by the cast.
 */
static unsigned char bf_getchar(bf_state_t* s)
{
  int ch = fgetc(stdin);
  (void)s;
  return (unsigned char)ch;
}
static void bf_run(const char* program)
{
  bf_state_t state;
  unsigned char tape[TAPE_SIZE] = {0};
  state.tape = tape;
  state.get_ch = bf_getchar;
  state.put_ch = bf_putchar;
  bf_interpret(program, &state);
}
/*
 * Entry point: expects exactly one argument, the path of a brainfsck source
 * file. The whole file is read into a NUL-terminated buffer and handed to
 * bf_run().
 *
 * Returns 0 after the program has run, or 1 on a usage error, when the file
 * cannot be opened, sized or read, or when memory cannot be allocated.
 */
int main(int argc, char** argv)
{
  if(argc == 2) {
    long sz = -1;
    size_t nread;
    char* program;
    FILE* f = fopen(argv[1], "r");
    if(!f) {
      fprintf(stderr, "Cannot open %s\n", argv[1]);
      return 1;
    }
    /* Measure the file by seeking to its end; ftell reports failure as -1,
       which must not reach malloc or fread as a size. */
    if(fseek(f, 0, SEEK_END) == 0)
      sz = ftell(f);
    if(sz < 0 || fseek(f, 0, SEEK_SET) != 0) {
      fprintf(stderr, "Cannot determine size of %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    program = (char*)malloc((size_t)sz + 1);
    if(!program) {
      fprintf(stderr, "Out of memory reading %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    /* fread may deliver fewer than sz bytes (the file is opened in text
       mode), so the terminator goes after what was actually read. */
    nread = fread(program, 1, (size_t)sz, f);
    if(ferror(f)) {
      fprintf(stderr, "Cannot read %s\n", argv[1]);
      free(program);
      fclose(f);
      return 1;
    }
    program[nread] = 0;
    fclose(f);
    bf_run(program);
    free(program);
    return 0;
  } else {
    /* argv[0] may be a null pointer when argc is 0. */
    fprintf(stderr, "Usage: %s INFILE.bf\n", argc > 0 ? argv[0] : "tutorial");
    return 1;
  }
}
        Over the course of this tutorial, we'll use DynASM to transform this interpreter into a brainfsck JIT compiler, therein hopefully making it faster.
To follow along, clone this repository and start from bf_c.c:
git clone https://github.com/corsix/dynasm-doc.git
cd dynasm-doc
git submodule update --init
cp bf_c.c tutorial.c
The functionality of the starting point can be checked by running the following, which should very slowly render the Mandelbrot set:
gcc -o tutorial tutorial.c
./tutorial mandelbrot.bf
Before the real fun can begin, we need to lay a few pieces of groundwork.
First of all, we need to #include the DynASM headers:
#include "luajit-2.0/dynasm/dasm_proto.h"
#include "luajit-2.0/dynasm/dasm_x86.h"
As described in more detail on the reference page, dasm_proto.h
           defines the DynASM API, and dasm_x86.h contains the implementation
           of said API (for x86 / x64).
Next, we'll rename bf_interpret to bf_compile and change
           its type signature:
- static void bf_interpret(const char* program, bf_state_t* state)
+ static void(* bf_compile(const char* program) )(bf_state_t*)
Where previously bf_interpret accepted both a const char*
           and a bf_state_t*, bf_compile now accepts just the
           const char* portion, and will return a function pointer to the JIT-compiled
           code.
The code which calls bf_interpret also needs updating at this point:
- bf_interpret(program, &state);
+ bf_compile(program)(&state);
With the groundwork done, the next task is creating and initialising a DynASM state.
We'll need a variable of type dasm_State* to contain the DynASM state, and
           two extra variables whose purpose will be explained later. We can also get rid of an
           interpreter variable at the same time:
- int nskip = 0;
+ dasm_State* d;
+ unsigned npc = 8;
+ unsigned nextpc = 0;
We now reach the first of many DynASM directives, which are instructions to the DynASM preprocessor. In this case, we need to instruct it as to which architecture we're generating machine code for, which will either be x86 or x64:
|.if X64
|.arch x64
|.else
|.arch x86
|.endif
Lines starting with a vertical bar will be picked up by the DynASM preprocessor. The
           .if, .else, and .endif directives will be handled
           by DynASM's prepreprocessor, with semantics similar to C's preprocessor #if,
           #else, and #endif. As a result, exactly one .arch
           directive will take effect.
Having declared a variable of type dasm_State*, we need to actually
           allocate a dasm_State to put in it, which is done by calling dasm_init:
|.section code
dasm_init(&d, DASM_MAXSECTION);
Note that as well as a dasm_State**, dasm_init also requires
           an integer argument, which specifies the number of sections of machine code that'll be
           generated. We only need one code section, so we invoke the .section directive
           with one argument, which the DynASM preprocessor will rewrite to #define DASM_MAXSECTION 1
           (amongst other things). This is a slightly convoluted way of passing 1 as the
           second argument to dasm_init, but is a good habit in case we need more sections
           in the future.
dasm_init will have allocated a dasm_State, but won't have fully
           initialised it. A few more calls are required to fully initialise the state, the first of
           which is dasm_setupglobal:
|.globals lbl_
void* labels[lbl__MAX];
dasm_setupglobal(&d, labels, lbl__MAX);
The .globals directive with the argument lbl_ will be rewritten by the
           DynASM preprocessor to become an enum containing several things, one of which will
           be lbl__MAX. This value must be passed to dasm_setupglobal, along with
           an array of void* of equal extent. We'll make use of this labels array
           much later.
The next call in the initialisation sequence is to dasm_setup:
|.actionlist bf_actions
dasm_setup(&d, bf_actions);
The .actionlist directive with the argument bf_actions will be rewritten
           by the DynASM preprocessor to become a variable called bf_actions, and this variable
           must be passed to dasm_setup.
For a lot of use cases, the dasm_State would be fully initialised at this point.
           However, as we'll be making use of dynamic labels, there is one more initialisation call to
           make, which is to dasm_growpc:
dasm_growpc(&d, npc);
        We're passing npc as an argument, which is a variable we declared earlier. Said
           variable represents the number of dynamic labels we've allocated, while the related variable
           nextpc represents the number of dynamic labels we've used. These dynamic labels
           will come into play when compiling [ and ].
Before we start emitting machine code, it is useful to define a few abstractions. The first few abstractions are to give slightly more meaningful names to the registers we'll be using:
| Abstraction | Corresponding Interpreter Variable | Definition | 
|---|---|---|
| aState | state | ebp or r12 | 
| aPtr | ptr | ebx or rbx | 
| aTapeBegin | tape_begin | esi or rsi or r13 | 
| aTapeEnd | tape_end | edi or rdi or r14 | 
The next group of useful abstractions relate to function calls:
| Abstraction | Description | 
|---|---|
| prologue | Set up the stack frame, and set aState from the passed parameter. | 
| prepcall1 arg1 | Prepare to call a function with one argument, arg1. | 
| prepcall2 arg1, arg2 | Prepare to call a function with two arguments, arg1 and arg2. | 
| postcall n | Do cleanup after a call to a function with n arguments. | 
| epilogue | Tear down the stack frame. | 
All of these abstractions are defined by means of .define (for simple substitutions) or .macro (for more
           complex constructions), and have different definitions for each of x86, x64 POSIX, and x64 Windows:
|.if X64 |.define aPtr, rbx |.define aState, r12 |.if WIN |.define aTapeBegin, rsi |.define aTapeEnd, rdi |.define rArg1, rcx |.define rArg2, rdx |.else |.define aTapeBegin, r13 |.define aTapeEnd, r14 |.define rArg1, rdi |.define rArg2, rsi |.endif |.macro prepcall1, arg1 | mov rArg1, arg1 |.endmacro |.macro prepcall2, arg1, arg2 | mov rArg1, arg1 | mov rArg2, arg2 |.endmacro |.define postcall, .nop |.macro prologue | push aPtr | push aState | push aTapeBegin | push aTapeEnd | push rax | mov aState, rArg1 |.endmacro |.macro epilogue | pop rax | pop aTapeEnd | pop aTapeBegin | pop aState | pop aPtr | ret |.endmacro |.else |.define aPtr, ebx |.define aState, ebp |.define aTapeBegin, esi |.define aTapeEnd, edi |.macro prepcall1, arg1 | push arg1 |.endmacro |.macro prepcall2, arg1, arg2 | push arg2 | push arg1 |.endmacro |.macro postcall, n | add esp, 4*n |.endmacro |.macro prologue | push aPtr | push aState | push aTapeBegin | push aTapeEnd | mov aState, [esp+20] |.endmacro |.macro epilogue | pop aTapeEnd | pop aTapeBegin | pop aState | pop aPtr | ret 4 |.endmacro |.endif
Having made all of these architecture and operating system dependent definitions for the DynASM preprocessor, it is useful to check that the architecture and operating system specified to the DynASM preprocessor match the architecture and operating system as known by the C preprocessor, which is done by the following:
||#if ((defined(_M_X64) || defined(__amd64__)) != X64) || (defined(_WIN32) != WIN)
#error "Wrong DynASM flags used: pass `-D X64` and/or `-D WIN` to dynasm.lua as appropriate"
#endif
Note the line starting with two vertical bars: such lines undergo .define substitution by the DynASM
           prepreprocessor (and can participate in .macro definitions), but are otherwise unchanged by the DynASM
           preprocessor. In particular, if X64 and/or WIN are defined (to 1) at DynASM prepreprocessing time,
           then they'll be substituted for 1. If they're not defined at DynASM prepreprocessing time, they'll be
           left unchanged, and be substituted for 0 by the C preprocessor.
With all of that done, we're finally ready to emit some machine code.
The first thing we need to emit is a prologue, which replaces some of the initialisation previously done by the interpreter:
- unsigned char* tape_begin = state->tape - 1;
- unsigned char* ptr = state->tape;
- unsigned char* tape_end = state->tape + TAPE_SIZE - 1;
+ |.type state, bf_state_t, aState
+ dasm_State** Dst = &d;
+ |.code
+ |->bf_main:
+ | prologue
+ | mov aPtr, state->tape
+ | lea aTapeBegin, [aPtr-1]
+ | lea aTapeEnd, [aPtr+TAPE_SIZE-1]
The first item of interest here is the .type directive, which subsequently allows us to write state->tape
           as a shorthand for [aState + offsetof(bf_state_t,tape)].
The next line defines a variable called Dst, and initialises it to &d. This is done because the
           DynASM preprocessor will rewrite the subsequent lines to calls of the form dasm_put(Dst, ...), and like the
           previous calls we've made to dasm_ functions, the first argument wants to be &d.
The next line contains a .code directive. Said directive was introduced by the prior .section code
           directive, and states that subsequently emitted machine code should be placed in the code section (which happens
           to be the one and only section we're working with).
After this, we define the global label ->bf_main. After we've finished emitting machine code, we'll
           obtain the address of this global label and turn it into a function pointer.
We then invoke the prologue macro as defined earlier, which will cause a few instructions to be emitted.
Finally, we have a mov instruction and two lea instructions, which directly correspond to the
           removed interpreter code. As mentioned, the state->tape specified as an operand to mov is
           recognised as shorthand for [aState + offsetof(bf_state_t,tape)]. Note that both offsetof(bf_state_t,tape)
           and TAPE_SIZE-1 (part of the lea operand) are so-called encoding-time constants: DynASM doesn't
           understand what they mean, so it defers their computation to the C compiler. Both of these values happen to be compile-time
           constants in C, but encoding-time constants don't have to be compile-time constants (we'll see examples of this in just a minute).
We've reached the guts of the interpreter now, and the first job is to replace the interpreter's handling of < with
           the compiler's interpretation:
- if(!nskip) {
-   ptr -= n;
-   while(ptr <= tape_begin)
-     ptr += TAPE_SIZE;
- }
+ | sub aPtr, n%TAPE_SIZE
+ | cmp aPtr, aTapeBegin
+ | ja >1
+ | add aPtr, TAPE_SIZE
+ |1:
Note that the compiler doesn't have a notion of skipping over code like the interpreter does, so the outer if is
           dropped entirely. After that, ptr -= n; and some iterations of the subsequent loop have become | sub aPtr, n%TAPE_SIZE.
           Note that n%TAPE_SIZE is an encoding-time constant which isn't a compile-time constant in C: DynASM still doesn't
           understand what the operand means, but in this case the final value of the operand is computed when bf_compile is running.
After performing some iterations of the loop at compile time by means of %TAPE_SIZE, there might still be one iteration
           to perform at runtime, which corresponds to the cmp, ja, and add instructions. Note that
           the syntax >1 jumps forward to the next definition of the local label 1, which is just after the add
           instruction.
A similar transformation happens for >, but with add and sub transposed:
- if(!nskip) {
-   ptr += n;
-   while(ptr > tape_end)
-     ptr -= TAPE_SIZE;
- }
+ | add aPtr, n%TAPE_SIZE
+ | cmp aPtr, aTapeEnd
+ | jbe >1
+ | sub aPtr, TAPE_SIZE
+ |1:
The next instruction to be rewritten is +, which is relatively simple:
- if(!nskip)
-   *ptr += n;
+ | add byte [aPtr], n
The only notable thing is the presence of the memory size specifier byte before the memory operand [aPtr]. As neither
           the memory operand nor the immediate operand have a natural operand size, DynASM needs to be explicitly told. Note that our prior uses of
           memory operands didn't require memory size specifiers: lea instructions don't require them because the memory operands aren't memory
           accesses, and mov aPtr, state->tape didn't require one because the size of the memory operand was inferred to be equal to size of
           the register operand.
The handling of - is similar:
- if(!nskip)
-   *ptr -= n;
+ | sub byte [aPtr], n
The next job involves the logic for , (read char) and . (write char), which are notable because they involve
           calling other functions. The first of these is ,:
- if(!nskip)
-   *ptr = state->get_ch(state);
+ | prepcall1 aState
+ | call aword state->get_ch
+ | postcall 1
+ | mov byte [aPtr], al
Note the invocations of the prepcall1 and postcall abstractions that we defined earlier. Also note that
           state->get_ch is shorthand for [aState + offsetof(bf_state_t,get_ch)] courtesy of the earlier .type
           directive, and that memory size specifiers are still required when these shorthands are used: the size of the memory operand will
           not be automatically inferred to be equal to the size of the named C structure member. The aword (address-sized word)
           specifier refers to either 4 bytes (x86) or 8 bytes (x64).
The transformation of . is similar:
- if(!nskip)
-   state->put_ch(state, *ptr);
+ | movzx r0, byte [aPtr]
+ | prepcall2 aState, r0
+ | call aword state->put_ch
+ | postcall 2
Note that r0 is used as a register operand: it refers to either eax (x86) or rax (x64).
We now reach the really interesting instructions: [ and ]. The first of these has a rather complex transformation:
- loops[nloops++] = program;
- if(!*ptr)
-   ++nskip;
+ if(program[0] == '-' && program[1] == ']') {
+   program += 2;
+   | xor eax, eax
+   | mov byte [aPtr], al
+ } else {
+   if(nextpc == npc) {
+     npc *= 2;
+     dasm_growpc(&d, npc);
+   }
+   | cmp byte [aPtr], 0
+   | jz =>nextpc+1
+   |=>nextpc:
+   loops[nloops++] = nextpc;
+   nextpc += 2;
+ }
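First of all, we now recognise the instruction sequence [-] and emit optimised machine code for it. Having excluded this specific case, the
           general case requires two dynamic labels: one for jumping from [ to after ] (previously done by means of the nskip
           variable in the interpreter), and one for jumping from ] to after [ (previously done by means of the loops stack).
           Because the loops stack now holds dynamic label numbers rather than program positions, its declaration also has to change
           from const char* loops[MAX_NESTING] to unsigned loops[MAX_NESTING].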
If the number of dynamic labels we've used equals the number we've allocated, then we call dasm_growpc in order to allocate some more. We then
           emit a cmp instruction, which does the obvious thing. If the byte at [aPtr] was zero, we jump to the dynamic label =>nextpc+1
           (which we'll subsequently define when we see ]). After this, we define the dynamic label =>nextpc (which is what ] will
           jump back to). Note that both nextpc+1 and nextpc are encoding-time constants.
The second half of the magic comes from the handling of ]:
- if(*ptr)
-   program = loops[nloops-1];
- else
-   --nloops;
- if(nskip)
-   --nskip;
+ --nloops;
+ | cmp byte [aPtr], 0
+ | jnz =>loops[nloops]
+ |=>loops[nloops]+1:
Note the conditional jump to the dynamic label =>loops[nloops] (which jumps to the =>nextpc defined by the corresponding [),
           and the definition of the dynamic label =>loops[nloops]+1 (which is jumped to by jz =>nextpc+1 emitted by the corresponding [).
Having covered all of the instructions, all that is left is handling the epilogue and extracting a function pointer from DynASM:
- return;
+ | epilogue
+ link_and_encode(&d);
+ dasm_free(&d);
+ return (void(*)(bf_state_t*))labels[lbl_bf_main];
The first of these lines invokes the epilogue macro we defined earlier. The next line calls out to link_and_encode, which
           is a function we'll define in just a minute. We then call dasm_free, which frees the DynASM state. Finally, we take the labels
           array we previously defined and passed to dasm_setupglobal, index it with lbl_bf_main (which was defined by .globals lbl_ and corresponds
           to the global label ->bf_main), and cast it to a function pointer.
The link_and_encode function is defined as follows:
#if _WIN32 #include <Windows.h> #else #include <sys/mman.h> #if !defined(MAP_ANONYMOUS) && defined(MAP_ANON) #define MAP_ANONYMOUS MAP_ANON #endif #endif static void* link_and_encode(dasm_State** d) { size_t sz; void* buf; dasm_link(d, &sz); #ifdef _WIN32 buf = VirtualAlloc(0, sz, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); #else buf = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); #endif dasm_encode(d, buf); #ifdef _WIN32 {DWORD dwOld; VirtualProtect(buf, sz, PAGE_EXECUTE_READ, &dwOld); } #else mprotect(buf, sz, PROT_READ | PROT_EXEC); #endif return buf; }
The particularly interesting calls are to dasm_link and dasm_encode. The remaining calls use operating system functionality
           to allocate a block of read-write memory and then convert said block to read-execute. Note that we could have allocated a block of read-write-execute
           memory, but it is generally considered bad form to have memory which is writable and executable at the same time.
If you've been following along, your tutorial.c should now correspond to the following:
||#if ((defined(_M_X64) || defined(__amd64__)) != X64) || (defined(_WIN32) != WIN)
#error "Wrong DynASM flags used: pass `-D X64` and/or `-D WIN` to dynasm.lua as appropriate"
#endif
#include <stdio.h>
#include <stdlib.h>
#include "luajit-2.0/dynasm/dasm_proto.h"
#include "luajit-2.0/dynasm/dasm_x86.h"
#if _WIN32
#include <Windows.h>
#else
#include <sys/mman.h>
#if !defined(MAP_ANONYMOUS) && defined(MAP_ANON)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif
/* Report which step of link_and_encode failed on stderr and terminate. */
static void link_and_encode_fail(const char* what)
{
  fprintf(stderr, "link_and_encode: %s failed\n", what);
  exit(1);
}

/*
 * Turn the machine code accumulated in the DynASM state d into executable
 * memory: link it to learn its size, allocate a read-write block of that
 * size from the operating system, encode the code into the block, then
 * switch the block to read-execute so that it is never writable and
 * executable at the same time.
 *
 * Returns the start of the block. Every step is checked; a failure is
 * reported on stderr and terminates the process, because a caller has no
 * usable code to fall back on.
 */
static void* link_and_encode(dasm_State** d)
{
  size_t sz;
  void* buf;
  /* DynASM reports success as status 0 (DASM_S_OK). */
  if(dasm_link(d, &sz) != 0)
    link_and_encode_fail("dasm_link");
#ifdef _WIN32
  buf = VirtualAlloc(0, sz, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
  if(!buf)
    link_and_encode_fail("VirtualAlloc");
#else
  buf = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  /* mmap signals failure with MAP_FAILED, not with a null pointer. */
  if(buf == MAP_FAILED)
    link_and_encode_fail("mmap");
#endif
  if(dasm_encode(d, buf) != 0)
    link_and_encode_fail("dasm_encode");
#ifdef _WIN32
  {DWORD dwOld;
   if(!VirtualProtect(buf, sz, PAGE_EXECUTE_READ, &dwOld))
     link_and_encode_fail("VirtualProtect"); }
#else
  if(mprotect(buf, sz, PROT_READ | PROT_EXEC) != 0)
    link_and_encode_fail("mprotect");
#endif
  return buf;
}
/* Number of cells on the tape; the generated code wraps the data pointer around at both ends. */
#define TAPE_SIZE 30000
/* Capacity of the open-loop stack: programs nesting [ ] deeper than this are rejected. */
#define MAX_NESTING 100
/* Everything a running program needs: its tape and its two I/O callbacks. */
typedef struct bf_state bf_state_t;
struct bf_state
{
  /* First cell of the tape. */
  unsigned char* tape;
  /* Called for ',': its result is stored in the current cell. */
  unsigned char (*get_ch)(bf_state_t*);
  /* Called for '.': receives the value of the current cell. */
  void (*put_ch)(bf_state_t*, unsigned char);
};
/*
 * Report a malformed program on stderr and terminate the process.
 * Expands to a single expression, so it may follow a comma operator.
 * Relies on a variable named `program` in the calling scope: up to 16
 * characters starting there are printed as context before the message s.
 * The exit status is fprintf's return value, not a fixed error code.
 */
#define bad_program(s) exit(fprintf(stderr, "bad program near %.16s: %s\n", program, s))
/*
 * Compile a NUL-terminated brainfsck program to machine code with DynASM.
 *
 * This function is DynASM source: lines starting with a vertical bar are
 * rewritten by the DynASM preprocessor (mostly into dasm_put(Dst, ...)
 * calls) before the C compiler sees the file.
 *
 * program: source text; characters other than the eight commands match no
 *          case of the switch below and are skipped.
 *
 * Returns a pointer to the generated code, which takes the bf_state_t to
 * run against. Loops nested deeper than MAX_NESTING and unbalanced brackets
 * are reported through bad_program(), which terminates the process.
 * The block returned by link_and_encode() is not kept or released here.
 */
static void(* bf_compile(const char* program) )(bf_state_t*)
{
  /* Stack of open loops; each entry is the first of the two dynamic labels
     allocated for that '['. */
  unsigned loops[MAX_NESTING];
  int nloops = 0;
  /* Length of the current run of identical commands. */
  int n;
  dasm_State* d;
  /* npc is the number of dynamic labels allocated so far, nextpc the number
     already used. */
  unsigned npc = 8;
  unsigned nextpc = 0;
  /* Select the target architecture; exactly one .arch takes effect. */
  |.if X64
  |.arch x64
  |.else
  |.arch x86
  |.endif
  /* Declare the single code section; this also provides DASM_MAXSECTION. */
  |.section code
  dasm_init(&d, DASM_MAXSECTION);
  /* .globals produces an enum that includes lbl__MAX and lbl_bf_main;
     labels receives the addresses of the global labels. */
  |.globals lbl_
  void* labels[lbl__MAX];
  dasm_setupglobal(&d, labels, lbl__MAX);
  /* .actionlist produces the bf_actions variable that dasm_setup needs. */
  |.actionlist bf_actions
  dasm_setup(&d, bf_actions);
  /* Dynamic labels are used for loops, so reserve the initial batch. */
  dasm_growpc(&d, npc);
  /*
   * Register names (aPtr, aState, aTapeBegin, aTapeEnd) and call helpers
   * (prepcall1, prepcall2, postcall, prologue, epilogue), defined separately
   * for x64 Windows, x64 POSIX and x86.
   *
   * NOTE(review): the extra `push rax` in the x64 prologue presumably keeps
   * the stack 16-byte aligned at call sites - confirm.
   * NOTE(review): the Windows x64 convention expects the caller to reserve
   * 32 bytes of shadow space before a call; these macros reserve none -
   * verify on that target.
   * NOTE(review): on x86 the epilogue's `ret 4` removes the argument from
   * the stack, while the result is called through a plain
   * void(*)(bf_state_t*) pointer - confirm both sides agree on who pops it.
   */
  |.if X64
    |.define aPtr, rbx
    |.define aState, r12
    |.if WIN
      |.define aTapeBegin, rsi
      |.define aTapeEnd, rdi
      |.define rArg1, rcx
      |.define rArg2, rdx
    |.else
      |.define aTapeBegin, r13
      |.define aTapeEnd, r14
      |.define rArg1, rdi
      |.define rArg2, rsi
    |.endif
    |.macro prepcall1, arg1
      | mov rArg1, arg1
    |.endmacro
    |.macro prepcall2, arg1, arg2
      | mov rArg1, arg1
      | mov rArg2, arg2
    |.endmacro
    |.define postcall, .nop
    |.macro prologue
      | push aPtr
      | push aState
      | push aTapeBegin
      | push aTapeEnd
      | push rax
      | mov aState, rArg1
    |.endmacro
    |.macro epilogue
      | pop rax
      | pop aTapeEnd
      | pop aTapeBegin
      | pop aState
      | pop aPtr
      | ret
    |.endmacro
  |.else
    |.define aPtr, ebx
    |.define aState, ebp
    |.define aTapeBegin, esi
    |.define aTapeEnd, edi
    |.macro prepcall1, arg1
      | push arg1
    |.endmacro
    |.macro prepcall2, arg1, arg2
      | push arg2
      | push arg1
    |.endmacro
    |.macro postcall, n
      | add esp, 4*n
    |.endmacro
    |.macro prologue
      | push aPtr
      | push aState
      | push aTapeBegin
      | push aTapeEnd
      | mov aState, [esp+20]
    |.endmacro
    |.macro epilogue
      | pop aTapeEnd
      | pop aTapeBegin
      | pop aState
      | pop aPtr
      | ret 4
    |.endmacro
  |.endif
  /* Lets state->field be written for [aState + offsetof(bf_state_t, field)]. */
  |.type state, bf_state_t, aState
  
  /* The preprocessor's rewritten lines refer to a variable named Dst. */
  dasm_State** Dst = &d;
  /* Entry point: set up the frame, then load the tape pointer and compute
     the two wrap-around bounds (one before the first cell, and the last cell). */
  |.code
  |->bf_main:
  | prologue
  | mov aPtr, state->tape
  | lea aTapeBegin, [aPtr-1]
  | lea aTapeEnd, [aPtr+TAPE_SIZE-1]
  for(;;) {
    /* program is advanced past the command before the case body runs. */
    switch(*program++) {
    case '<':
      /* Fold a run of '<' into one move. The reduction modulo TAPE_SIZE is
         done while compiling, leaving at most one wrap to do at run time. */
      for(n = 1; *program == '<'; ++n, ++program);
      | sub aPtr, n%TAPE_SIZE
      | cmp aPtr, aTapeBegin
      | ja >1
      | add aPtr, TAPE_SIZE
      |1:
      break;
    case '>':
      /* Mirror image of '<', with add and sub exchanged. */
      for(n = 1; *program == '>'; ++n, ++program);
      | add aPtr, n%TAPE_SIZE
      | cmp aPtr, aTapeEnd
      | jbe >1
      | sub aPtr, TAPE_SIZE
      |1:
      break;
    case '+':
      /* Fold a run of '+' into a single byte-sized add. */
      for(n = 1; *program == '+'; ++n, ++program);
      | add byte [aPtr], n
      break;
    case '-':
      /* Fold a run of '-' into a single byte-sized sub. */
      for(n = 1; *program == '-'; ++n, ++program);
      | sub byte [aPtr], n
      break;
    case ',':
      /* Call state->get_ch(state) and store its result in the current cell. */
      | prepcall1 aState
      | call aword state->get_ch
      | postcall 1
      | mov byte [aPtr], al
      break;
    case '.':
      /* Call state->put_ch(state, current cell). */
      | movzx r0, byte [aPtr]
      | prepcall2 aState, r0
      | call aword state->put_ch
      | postcall 2
      break;
    case '[':
      if(nloops == MAX_NESTING)
        bad_program("Nesting too deep");
      if(program[0] == '-' && program[1] == ']') {
        /* The sequence [-] is compiled as a direct store of zero. */
        program += 2;
        | xor eax, eax
        | mov byte [aPtr], al
      } else {
        /* Each loop uses two dynamic labels, so allocate more once the
           used count reaches the allocated count. */
        if(nextpc == npc) {
          npc *= 2;
          dasm_growpc(&d, npc);
        }
        /* Label nextpc marks the start of the body (target of ']');
           label nextpc+1 is defined at the matching ']' (exit target). */
        | cmp byte [aPtr], 0
        | jz =>nextpc+1
        |=>nextpc:
        loops[nloops++] = nextpc;
        nextpc += 2;
      }
      break;
    case ']':
      if(nloops == 0)
        bad_program("] without matching [");
      /* Pop the loop: jump back to its body label while the cell is
         non-zero, and define its exit label here. */
      --nloops;
      | cmp byte [aPtr], 0
      | jnz =>loops[nloops]
      |=>loops[nloops]+1:
      break;
    case 0:
      /* End of program. Any loop still open is an error; program is pointed
         at "<EOF>" first so that the message shows that text as context. */
      if(nloops != 0)
        program = "<EOF>", bad_program("[ without matching ]");
      /* Emit the function exit, produce executable memory, release the
         DynASM state, and return the address of the bf_main label. */
      | epilogue
      link_and_encode(&d);
      dasm_free(&d);
      return (void(*)(bf_state_t*))labels[lbl_bf_main];
    }
  }
}
/* Output callback: writes the byte c to stdout. The state argument is not used. */
static void bf_putchar(bf_state_t* s, unsigned char c)
{
  (void)s;
  fputc(c, stdout);
}
/*
 * Input callback: reads one character from stdin. The state argument is not
 * used. End of input is not handled specially: whatever the read returns is
 * narrowed to unsigned char by the cast.
 */
static unsigned char bf_getchar(bf_state_t* s)
{
  int ch = fgetc(stdin);
  (void)s;
  return (unsigned char)ch;
}
static void bf_run(const char* program)
{
  bf_state_t state;
  unsigned char tape[TAPE_SIZE] = {0};
  state.tape = tape;
  state.get_ch = bf_getchar;
  state.put_ch = bf_putchar;
  bf_compile(program)(&state);
}
/*
 * Entry point: expects exactly one argument, the path of a brainfsck source
 * file. The whole file is read into a NUL-terminated buffer and handed to
 * bf_run().
 *
 * Returns 0 after the program has run, or 1 on a usage error, when the file
 * cannot be opened, sized or read, or when memory cannot be allocated.
 */
int main(int argc, char** argv)
{
  if(argc == 2) {
    long sz = -1;
    size_t nread;
    char* program;
    FILE* f = fopen(argv[1], "r");
    if(!f) {
      fprintf(stderr, "Cannot open %s\n", argv[1]);
      return 1;
    }
    /* Measure the file by seeking to its end; ftell reports failure as -1,
       which must not reach malloc or fread as a size. */
    if(fseek(f, 0, SEEK_END) == 0)
      sz = ftell(f);
    if(sz < 0 || fseek(f, 0, SEEK_SET) != 0) {
      fprintf(stderr, "Cannot determine size of %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    program = (char*)malloc((size_t)sz + 1);
    if(!program) {
      fprintf(stderr, "Out of memory reading %s\n", argv[1]);
      fclose(f);
      return 1;
    }
    /* fread may deliver fewer than sz bytes (the file is opened in text
       mode), so the terminator goes after what was actually read. */
    nread = fread(program, 1, (size_t)sz, f);
    if(ferror(f)) {
      fprintf(stderr, "Cannot read %s\n", argv[1]);
      free(program);
      fclose(f);
      return 1;
    }
    program[nread] = 0;
    fclose(f);
    bf_run(program);
    free(program);
    return 0;
  } else {
    /* argv[0] may be a null pointer when argc is 0. */
    fprintf(stderr, "Usage: %s INFILE.bf\n", argc > 0 ? argv[0] : "tutorial");
    return 1;
  }
}
        If you've not been following that closely, you can reach the same state by doing:
git clone https://github.com/corsix/dynasm-doc.git
cd dynasm-doc
git submodule update --init
cp bf_dynasm.c tutorial.c
In order to compile tutorial.c, we first need to run it through the DynASM preprocessor. Said preprocessor is written in Lua, so we'll
           first compile a minimal Lua interpreter:
gcc -o minilua luajit-2.0/src/host/minilua.c
        With this interpreter in place, we can run the DynASM preprocessor:
./minilua luajit-2.0/dynasm/dynasm.lua -o tutorial.posix64.c -D X64 tutorial.c
        With preprocessing done, we can now invoke a C compiler:
gcc -o tutorial tutorial.posix64.c
        We can then run the resulting executable, which should fairly quickly render the Mandelbrot set:
./tutorial mandelbrot.bf