bpo-40222: Mark exception table function in the dis module as private #95960

Closed

72 commits
6a99107
Parsing VM initial structure
gvanrossum May 26, 2020
8ffc315
Hook things up so we can test it
gvanrossum May 27, 2020
a9ba946
Add debugging printf()s in anger; fix bugs
gvanrossum May 27, 2020
8635a4c
Implement actions
gvanrossum May 27, 2020
1a34aeb
Support optional tokens
gvanrossum May 27, 2020
b4a2292
Support optional rules
gvanrossum May 27, 2020
b1f2a03
Add vmreadme.md; OP_SUCCESS has an argument
gvanrossum May 28, 2020
50aa868
Do optional items differently (with a postfix op)
gvanrossum May 28, 2020
320266b
Compute start/end line/col numbers; add some ideas to vmreadme.md
gvanrossum May 28, 2020
5bf3f9c
Tighten the code; add some speculation to vmreadme
gvanrossum May 28, 2020
816db75
Add OP_NOOP; add enums for rules & actions
gvanrossum May 28, 2020
d2a980f
Implement loops
gvanrossum May 29, 2020
2befb12
Add a few more rules to the grammar
gvanrossum May 29, 2020
175a127
Drop debug printf()s, more flexibility in parse_string()
gvanrossum May 29, 2020
f3f1665
Add memoization, some debug niceties
gvanrossum May 29, 2020
bda3517
Inline helper functions
gvanrossum May 29, 2020
881d756
Explain OP_OPTIONAL better
gvanrossum May 29, 2020
94cfb95
Skeleton of code generator
pablogsal May 29, 2020
4ba6c61
Simplify structure of OP_SUCCESS
gvanrossum May 29, 2020
db628e1
Move opcodes around
gvanrossum May 29, 2020
7798629
Add a 'grammar' for operations
gvanrossum May 29, 2020
412c741
Move generated part of vm.h into vmparse.h
gvanrossum May 29, 2020
020fbd1
Merge branch 'pegenvm_generator' into pegenvm
lysnikolaou May 29, 2020
8c7cffc
Merge branch 'master' into pegenvm
lysnikolaou May 29, 2020
ef1fabd
Clean skeleton of vm_generator
pablogsal May 30, 2020
b497b23
Merge branch 'pegenvm' of github.com:we-like-parsers/cpython into peg…
lysnikolaou May 30, 2020
8eab7e0
Better formatting of generated file; remove unneeded indentation
lysnikolaou May 30, 2020
4a5f823
Add OP_LOOP_COLLECT_NONEMPTY -- used for a+
gvanrossum May 30, 2020
3cee73c
Expand description of root rules
gvanrossum May 30, 2020
9b96df0
Initial support for repeat_0
pablogsal May 30, 2020
2f44ee9
Fix name rules for repeat0 nodes
pablogsal May 30, 2020
6dc7092
Eliminate OP_LOOP_START
gvanrossum May 30, 2020
9e7e12e
Do fewer reallocs (at the cost of an extra int per frame)
gvanrossum May 30, 2020
52f5a75
Speculate how to implement a.b+
gvanrossum May 30, 2020
3b93237
Make memo rule types distinct from token types
gvanrossum May 30, 2020
33522ae
Fix small issues in vmreadme.pm
gvanrossum May 30, 2020
cbd45a5
Add generation of root rules (very coarssely)
gvanrossum May 31, 2020
1a6531d
Add enum for rule types (R_)
gvanrossum May 31, 2020
0226dd5
Generate actions (primitively)
gvanrossum May 31, 2020
c64ff2f
Implement code generation for keywords
lysnikolaou May 31, 2020
e2c4a36
Refactor add_opcode to optionally accept a second argument
lysnikolaou May 31, 2020
21d8b83
Translate item names in actions; use the generated vmparse.h!
gvanrossum May 31, 2020
6fe4f0e
Fix mypy (in vm_generator)
gvanrossum May 31, 2020
6205002
Avoid name conflict for 'f'
gvanrossum May 31, 2020
f72c7b6
Generate code for repeat1 loops
gvanrossum Jun 1, 2020
fc3d4c4
Implement delimited loops (b.a+)
gvanrossum Jun 1, 2020
6c50468
Generate code for delimited loop
gvanrossum Jun 1, 2020
a10babb
Implement soft keywords (hand-written and code generation) (#129)
lysnikolaou Jun 1, 2020
b13169b
Update generated vmparse.h
gvanrossum Jun 1, 2020
c2a7cf6
Fix code generation for if_stmt
gvanrossum Jun 1, 2020
a862a69
Implement lookahead ops
gvanrossum Jun 1, 2020
ab863df
Generate code for lookaheads (only one token supported!)
gvanrossum Jun 1, 2020
1be4b62
Implement left-recursion (with hand-coded vmparse.h)
gvanrossum Jun 2, 2020
b53c2a4
Code generation for left-recursive rules
gvanrossum Jun 2, 2020
7a59aa0
Allow specifying different grammars
gvanrossum Jun 2, 2020
cac4149
Generate code for 'cut'
gvanrossum Jun 2, 2020
180dfff
Support groups and optional in code generator
gvanrossum Jun 2, 2020
e01e643
There's no need to special-case -> in actions
gvanrossum Jun 2, 2020
13c3bbb
Treat TYPE_COMMENT as a token (since it is)
gvanrossum Jun 3, 2020
65304e6
Generate code for Grammar/parser.gram
gvanrossum Jun 3, 2020
10f7be1
Group every opcode with its argument (#131)
pablogsal Jun 3, 2020
a9a4115
Add vm target to pegen script to generate the vm parser (#130)
lysnikolaou Jun 3, 2020
5bb2f57
Selective memoization
gvanrossum Jun 3, 2020
ba8783b
Don't call is_memoized in OP_RETURN_LEFT_REC
gvanrossum Jun 3, 2020
bccd5c8
Different way of doing left-recursion
gvanrossum Jun 5, 2020
2a138cc
Merge remote-tracking branch 'upstream/master' into pegenvm
gvanrossum Aug 16, 2020
2394b96
Remove leftover conflict markers
gvanrossum Aug 16, 2020
be34499
Fix deps for vm.o
gvanrossum Aug 16, 2020
b9394e4
Fix includes for vm.c
gvanrossum Aug 16, 2020
b64113b
Regenerated vmparse.h
gvanrossum Aug 16, 2020
9afa67e
Merge remote-tracking branch 'pegen/pegenvm'
pablogsal Aug 1, 2022
a65604e
bpo-40222: Mark exception table function in the dis module as private
pablogsal Aug 13, 2022
16 changes: 8 additions & 8 deletions Lib/dis.py
@@ -394,7 +394,7 @@ def _get_name_info(name_index, get_name, **extrainfo):
else:
return UNKNOWN, ''

def parse_varint(iterator):
def _parse_varint(iterator):
b = next(iterator)
val = b & 63
while b&64:
@@ -403,16 +403,16 @@ def parse_varint(iterator):
val |= b&63
return val

def parse_exception_table(code):
def _parse_exception_table(code):
iterator = iter(code.co_exceptiontable)
entries = []
try:
while True:
start = parse_varint(iterator)*2
length = parse_varint(iterator)*2
start = _parse_varint(iterator)*2
length = _parse_varint(iterator)*2
end = start + length
target = parse_varint(iterator)*2
dl = parse_varint(iterator)
target = _parse_varint(iterator)*2
dl = _parse_varint(iterator)
depth = dl >> 1
lasti = bool(dl&1)
entries.append(_ExceptionTableEntry(start, end, target, depth, lasti))
@@ -527,7 +527,7 @@ def _get_instructions_bytes(code, varname_from_oparg=None,
def disassemble(co, lasti=-1, *, file=None, show_caches=False, adaptive=False):
"""Disassemble a code object."""
linestarts = dict(findlinestarts(co))
exception_entries = parse_exception_table(co)
exception_entries = _parse_exception_table(co)
_disassemble_bytes(_get_code_array(co, adaptive),
lasti, co._varname_from_oparg,
co.co_names, co.co_consts, linestarts, file=file,
@@ -717,7 +717,7 @@ def __init__(self, x, *, first_line=None, current_offset=None, show_caches=False
self._linestarts = dict(findlinestarts(co))
self._original_object = x
self.current_offset = current_offset
self.exception_entries = parse_exception_table(co)
self.exception_entries = _parse_exception_table(co)
self.show_caches = show_caches
self.adaptive = adaptive

8 changes: 5 additions & 3 deletions Makefile.pre.in
@@ -346,16 +346,16 @@ PEGEN_OBJS= \
Parser/action_helpers.o \
Parser/parser.o \
Parser/string_parser.o \
Parser/peg_api.o

Parser/peg_api.o \
Parser/vm.o

PEGEN_HEADERS= \
$(srcdir)/Include/internal/pycore_parser.h \
$(srcdir)/Parser/pegen.h \
$(srcdir)/Parser/string_parser.h

POBJS= \
Parser/token.o \
Parser/token.o

PARSER_OBJS= $(POBJS) $(PEGEN_OBJS) Parser/myreadline.o Parser/tokenizer.o

@@ -1633,6 +1633,8 @@ PYTHON_HEADERS= \

$(LIBRARY_OBJS) $(MODOBJS) Programs/python.o: $(PYTHON_HEADERS)

$(srcdir)/Parser/vm.o: $(srcdir)/Parser/vm.h $(srcdir)/Parser/vmparse.h


######################################################################

10 changes: 9 additions & 1 deletion Parser/pegen.c
@@ -706,6 +706,9 @@ compute_parser_flags(PyCompilerFlags *flags)
if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
parser_flags |= PyPARSE_TYPE_COMMENTS;
}
if (flags->cf_flags & PyCF_VMPARSER) {
parser_flags |= PyPARSE_VMPARSER;
}
if ((flags->cf_flags & PyCF_ONLY_AST) && flags->cf_feature_version < 7) {
parser_flags |= PyPARSE_ASYNC_HACKS;
}
@@ -923,7 +926,12 @@ _PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filen
goto error;
}

result = _PyPegen_run_parser(p);
if (parser_flags & PyPARSE_VMPARSER) {
result = _PyPegen_vmparser(p);
}
else {
result = _PyPegen_run_parser(p);
}
_PyPegen_Parser_Free(p);

error:
11 changes: 10 additions & 1 deletion Parser/pegen.h
@@ -23,8 +23,8 @@
#define PyPARSE_TYPE_COMMENTS 0x0040
#define PyPARSE_ASYNC_HACKS 0x0080
#define PyPARSE_ALLOW_INCOMPLETE_INPUT 0x0100

#define CURRENT_POS (-5)
#define PyPARSE_VMPARSER 0x0100

typedef struct _memo {
int type;
@@ -284,6 +284,15 @@ INVALID_VERSION_CHECK(Parser *p, int version, char *msg, void *node)

arg_ty _PyPegen_add_type_comment_to_arg(Parser *, arg_ty, Token *);
PyObject *_PyPegen_new_identifier(Parser *, const char *);
Parser *_PyPegen_Parser_New(struct tok_state *, int, int, int, int *, PyArena *);
void _PyPegen_Parser_Free(Parser *);
mod_ty _PyPegen_run_parser_from_file_pointer(FILE *, int, PyObject *, const char *,
const char *, const char *, PyCompilerFlags *, int *, PyArena *);
void *_PyPegen_run_parser(Parser *);
void *_PyPegen_vmparser(Parser *);
mod_ty _PyPegen_run_parser_from_file(const char *, int, PyObject *, PyCompilerFlags *, PyArena *);
mod_ty _PyPegen_run_parser_from_string(const char *, int, PyObject *, PyCompilerFlags *, PyArena *);
void *_PyPegen_interactive_exit(Parser *);
asdl_seq *_PyPegen_singleton_seq(Parser *, void *);
asdl_seq *_PyPegen_seq_insert_in_front(Parser *, void *, asdl_seq *);
asdl_seq *_PyPegen_seq_append_to_end(Parser *, asdl_seq *, void *);
319 changes: 319 additions & 0 deletions Parser/vm.c
@@ -0,0 +1,319 @@
#include <Python.h>
#include <errcode.h>
#include "tokenizer.h"

#include "pegen.h"
#include "string_parser.h"

#include "vm.h"
#include "vmparse.h" // Generated parser tables

#undef D
#define DEBUG 0

#if DEBUG
#define D(x) x
#else
#define D(x)
#endif

#define MAXFRAMES 1000

typedef struct _stack {
Parser *p;
int top;
Frame frames[MAXFRAMES];
} Stack;

static inline Frame *
push_frame(Stack *stack, Rule *rule)
{
D(printf(" push %s\n", rule->name));
assert(stack->top < MAXFRAMES);
Frame *f = &stack->frames[stack->top++];
f->rule = rule;
f->mark = stack->p->mark;
f->ialt = 0;
f->iop = 0;
f->cut = 0;
f->ncollected = 0;
f->capacity = 0;
f->collection = NULL;
f->ival = 0;
return f;
}

static inline Frame *
pop_frame(Stack *stack, void *v)
{
assert(stack->top > 1);
Frame *f = &stack->frames[--stack->top]; // Frame being popped
if (f->collection) {
PyMem_Free(f->collection);
}
if (f->rule->memo) {
D(printf(" insert memo %s: val=%p, mark=%d\n", f->rule->name, v, stack->p->mark));
if (_PyPegen_insert_memo(stack->p, f->mark, f->rule->type + 1000, v) == -1) {
return NULL;
}
}
f = &stack->frames[stack->top - 1]; // New top of stack
D(printf(" pop %s\n", f->rule->name));
return f;
}

static inline asdl_seq *
make_asdl_seq(Parser *p, void *collection[], int ncollected)
{
asdl_seq *seq = _Py_asdl_seq_new(ncollected, p->arena);
if (!seq) {
return NULL;
}
for (int i = 0; i < ncollected; i++) {
asdl_seq_SET(seq, i, collection[i]);
}
return seq;
}

static void *
run_vm(Parser *p, Rule rules[], int root)
{
Stack stack = {p, 0, {{0}}};
Frame *f = push_frame(&stack, &rules[root]);
void *v;
int oparg;
int opc;

top:
opc = f->rule->opcodes[f->iop];
if (DEBUG) {
if (p->mark == p->fill)
_PyPegen_fill_token(p);
for (int i = 0; i < stack.top; i++) printf(" ");
printf("Rule: %s; ialt=%d; iop=%d; op=%s; arg=%d, mark=%d; token=%d; p^='%s'\n",
f->rule->name, f->ialt, f->iop, opcode_names[opc],
opc >= OP_TOKEN ? f->rule->opcodes[f->iop + 1] : -1,
p->mark, p->tokens[p->mark]->type,
p->fill > p-> mark ? PyBytes_AsString(p->tokens[p->mark]->bytes) : "<UNSEEN>");
}
f->iop++;
switch (opc) {
case OP_NOOP:
goto top;
case OP_CUT:
f->cut = 1;
goto top;
case OP_OPTIONAL:
goto top;

case OP_NAME:
v = _PyPegen_name_token(p);
break;
case OP_NUMBER:
v = _PyPegen_number_token(p);
break;
case OP_STRING:
v = _PyPegen_string_token(p);
break;

case OP_LOOP_ITERATE:
f->mark = p->mark;
assert(f->ival == 1 || f->ival == 2);
v = f->vals[0];
assert(v);
if (f->ncollected >= f->capacity) {
f->capacity = (f->ncollected + 1) * 2; // 2, 6, 14, 30, 62, ... (2**i - 2)
f->collection = PyMem_Realloc(f->collection, (f->capacity) * sizeof(void *));
if (!f->collection) {
return PyErr_NoMemory();
}
}
f->collection[f->ncollected++] = v;
f->iop = f->rule->alts[f->ialt];
f->ival = 0;
goto top;
case OP_LOOP_COLLECT_DELIMITED:
/* Collect one item */
assert(f->ival == 1);
if (f->ncollected >= f->capacity) {
f->capacity = f->ncollected + 1; // We know there won't be any more
f->collection = PyMem_Realloc(f->collection, (f->capacity) * sizeof(void *));
if (!f->collection) {
return PyErr_NoMemory();
}
}
f->collection[f->ncollected++] = v;
// Fallthrough!
case OP_LOOP_COLLECT_NONEMPTY:
if (!f->ncollected) {
D(printf(" Nothing collected for %s\n", f->rule->name));
v = NULL;
f = pop_frame(&stack, v);
if (!f) {
return NULL;
}
break;
}
// Fallthrough!
case OP_LOOP_COLLECT:
v = make_asdl_seq(p, f->collection, f->ncollected);
if (!v) {
return PyErr_NoMemory();
}
f = pop_frame(&stack, v);
if (!f) {
return NULL;
}
break;

case OP_SAVE_MARK:
f->savemark = p->mark;
goto top;
case OP_POS_LOOKAHEAD:
assert(f->ival > 0);
f->ival--; /* Back out last added value */
p->mark = f->savemark;
goto top;
case OP_NEG_LOOKAHEAD:
v = NULL;
break;

case OP_SUCCESS:
v = f->vals[0];
return v;
case OP_FAILURE:
return RAISE_SYNTAX_ERROR("A syntax error");

case OP_TOKEN:
oparg = f->rule->opcodes[f->iop++];
v = _PyPegen_expect_token(p, oparg);
break;
case OP_SOFT_KEYWORD:
oparg = f->rule->opcodes[f->iop++];
v = _PyPegen_expect_soft_keyword(p, soft_keywords[oparg]);
break;
case OP_RULE:
oparg = f->rule->opcodes[f->iop++];
Rule *rule = &rules[oparg];
if (rule->memo || rule->leftrec) {
v = NULL; // In case is_memoized ran into an error
int memo = _PyPegen_is_memoized(p, rule->type + 1000, &v);
if (memo) {
D(printf(" Memo hit %s\n", rule->name));
// The result is v; if v != NULL, p->mark has been updated
break;
}
}
f = push_frame(&stack, rule);
if (rule->leftrec) {
D(printf(" leftrec %s prep: lastval=NULL, lastmark=%d\n", rule->name, f->mark));
f->lastval = NULL;
f->lastmark = f->mark;
if (_PyPegen_insert_memo(p, f->mark, rule->type + 1000, NULL) == -1) {
return NULL;
}
}
goto top;
case OP_RETURN:
oparg = f->rule->opcodes[f->iop++];
v = call_action(p, f, oparg);
if (v) {
if (f->rule->leftrec) {
D(printf(" leftrec %s check\n", f->rule->name));
if (p->mark > f->lastmark) { // We improved, recurse again
D(printf(" leftrec improved: lastval=%p, lastmark=%d\n", v, p->mark));
f->lastval = v;
f->lastmark = p->mark;
if (_PyPegen_update_memo(p, f->mark, f->rule->type + 1000, v) == -1) {
return NULL;
}
f->ialt = 0;
f->iop = 0;
f->ival = 0;
p->mark = f->mark;
goto top;
}
else { // End recursion
D(printf(" leftrec end: lastval=%p, lastmark=%d\n", f->lastval, f->lastmark));
p->mark = f->lastmark;
v = f->lastval;
}
}
f = pop_frame(&stack, v);
if (!f) {
return NULL;
}
}
break;

default:
printf("opc=%d\n", opc);
assert(0);
}

ok:
if (v) {
D(printf(" OK\n"));
assert(f->ival < MAXVALS);
f->vals[f->ival++] = v;
goto top;
}
if (PyErr_Occurred()) {
D(printf(" PyErr\n"));
p->error_indicator = 1;
return NULL;
}

fail:
opc = f->rule->opcodes[f->iop];
if (opc == OP_OPTIONAL) {
D(printf(" OP_OPTIONAL\n"));
assert(f->ival < MAXVALS);
f->vals[f->ival++] = NULL;
f->iop++; // Skip over the OP_OPTIONAL opcode
goto top;
}
if (opc == OP_NEG_LOOKAHEAD) {
D(printf(" OP_NEG_LOOKAHEAD\n"));
p->mark = f->savemark;
f->iop++; // Skip over the OP_NEG_LOOKAHEAD opcode
goto top;
}

D(printf(" alternative fails\n"));
p->mark = f->mark;
if (f->cut)
goto pop;
f->iop = f->rule->alts[++f->ialt];
if (f->iop == -1)
goto pop;
f->ival = 0;
goto top;

pop:
if (f->rule->leftrec) {
D(printf(" leftrec %s pop!! lastval=%p, lastmark=%d\n", f->rule->name, f->lastval, f->lastmark));
v = f->lastval;
p->mark = f->lastmark;
if (v) {
D(printf(" leftrec pop okay\n"));
goto ok;
}
D(printf(" leftrec pop fail\n"));
}

f = pop_frame(&stack, NULL);
if (!f) {
return NULL;
}
goto fail;
}

void *
_PyPegen_vmparser(Parser *p)
{
p->keywords = reserved_keywords;
p->n_keyword_lists = n_keyword_lists;

return run_vm(p, all_rules, R_ROOT);
}
75 changes: 75 additions & 0 deletions Parser/vm.h
@@ -0,0 +1,75 @@
typedef enum _opcodes {
OP_NOOP,
OP_CUT,
OP_OPTIONAL,
OP_NAME,
OP_NUMBER,
OP_STRING,
OP_LOOP_ITERATE,
OP_LOOP_COLLECT,
OP_LOOP_COLLECT_NONEMPTY,
OP_LOOP_COLLECT_DELIMITED,
OP_SAVE_MARK,
OP_POS_LOOKAHEAD,
OP_NEG_LOOKAHEAD,
OP_SUCCESS,
OP_FAILURE,
// The rest have an argument
OP_SOFT_KEYWORD,
OP_TOKEN,
OP_RULE,
OP_RETURN,
} Opcode;

static char *opcode_names[] = {
"OP_NOOP",
"OP_CUT",
"OP_OPTIONAL",
"OP_NAME",
"OP_NUMBER",
"OP_STRING",
"OP_LOOP_ITERATE",
"OP_LOOP_COLLECT",
"OP_LOOP_COLLECT_NONEMPTY",
"OP_LOOP_COLLECT_DELIMITED",
"OP_SAVE_MARK",
"OP_POS_LOOKAHEAD",
"OP_NEG_LOOKAHEAD",
"OP_SUCCESS",
"OP_FAILURE",
// The rest have an argument
"OP_SOFT_KEYWORD",
"OP_TOKEN",
"OP_RULE",
"OP_RETURN",
};

#define MAXALTS 15
#define MAXOPCODES 100

typedef struct _rule {
char *name;
short type;
short memo; // memoized rule (not left-recursive)
short leftrec; // left-recursive rule (needs memo lookup)
short alts[MAXALTS];
short opcodes[MAXOPCODES];
} Rule;

#define MAXVALS 10

typedef struct _frame {
Rule *rule;
int mark;
int savemark;
int lastmark;
short ialt;
short iop;
short ival;
short cut;
int ncollected;
int capacity;
void *lastval;
void **collection;
void *vals[MAXVALS];
} Frame;
7,114 changes: 7,114 additions & 0 deletions Parser/vmparse.h

Large diffs are not rendered by default.

347 changes: 347 additions & 0 deletions Parser/vmreadme.md
@@ -0,0 +1,347 @@
Pegen Virtual Machine
=====================

The Pegen VM is an alternative to the recursive-descent Pegen parser.
The grammar (including actions) is identical, but execution does not
use the C stack. We expect this to be faster, and initial
measurements seem to bear this out. But we need to remain diligent, and
if it ever becomes clear that it will *not* be faster, we should stop
working on this project.

The runtime uses the same `Parser` structure as the recursive-descent
Pegen parser, and the same helper functions
(e.g., `_PyPegen_singleton_seq`).

The VM uses a stack to hold state during parsing. The grammar is
represented by a few read-only tables. The actions are represented by
a function containing a giant switch with one case per action. (An
optimization here could be to combine identical actions.)

The grammar tables and the action function are meant to be generated
by a parser generator similar to the current one. Because of the
actions, it needs to generate C code.
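
As a concrete illustration, the action function invoked by `OP_RETURN`
(called as `call_action(p, f, oparg)` in `Parser/vm.c`) might be generated
along these lines; the exact signature, action numbering, value indices,
and casts in the real `vmparse.h` are assumptions here:

```
/* A minimal sketch of the action dispatcher that vmparse.h is expected to
   define; vm.c includes pegen.h and vm.h before it, so Parser and Frame
   are available.  Action ids and value indices are assumptions. */
static void *
call_action(Parser *p, Frame *f, int iaction)
{
    switch (iaction) {
    case 0:  /* start: a=stmt+ ENDMARKER { _PyPegen_make_module(p, a) } */
        return _PyPegen_make_module(p, f->vals[0]);
    case 1:  /* factor: '(' a=expr ')' { a } */
        return f->vals[1];
    default:
        assert(0 && "unknown action id");
        return NULL;
    }
}
```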

The primary VM state is a stack of `Frame` structures. Each frame
represents a particular attempt to parse a rule at a given point in
the input. The only state separate from the stack is a pointer to the
`Parser` structure.

The main state in a frame is as follows:

- `Rule *rule` -- points to the rule being parsed
- `int mark` -- the input position at the start of the rule invocation
- `int ialt` -- indicates which alternative is currently being tried
- `int iop` -- indicates where we are in the current alternative
- `int cut` -- whether a "cut" was executed in the current alternative

State related to loops is described below.

Note that `rule` doesn't change after the frame is initialized. Also,
`mark` normally doesn't change, except for loop operations.

Each frame also has an array of values where successfully recognized
tokens and rules are stored. This uses:

- `int ival` -- number of values stored so far
- `void *vals[]` -- values stored (the type is `Token *` or an AST
node type; may be NULL)

A `Rule` structure has the following fields:

- `char *name` -- rule name, for debugging (e.g., `"start"`)
- `int type` -- rule type, used for memo lookup
- `int alts[]` -- index into `opcodes` array for each alternative,
terminated by `-1`
- `int opcodes[]` -- array of opcodes and their arguments

All rules are combined in a single array; the index in this array
is used by operations that reference other rules.

The `opcodes` array is a sequence of operation codes and arguments.
Some opcodes (e.g., `OP_TOKEN`) are followed by an argument; others
(e.g., `OP_NAME`) are not. Both are representable as integers.
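
For example, a single-alternative rule for a parenthesized group could be
laid out roughly like this (the struct fields match `Parser/vm.h`, but the
rule index, action id, and whether such a rule is memoized are assumptions):

```
/* Hypothetical rule-table entry for a group rule "'(' expr ')' { a }".
   LPAR/RPAR are token types from token.h; R_GROUP, R_EXPR and A_GROUP_0
   stand in for generated enum values and are assumptions. */
static Rule group_rule = {
    "group",                   /* name, for debugging                      */
    R_GROUP,                   /* rule type, used for memo lookup          */
    1,                         /* memo                                     */
    0,                         /* leftrec                                  */
    {0, -1},                   /* one alternative, starting at opcodes[0]  */
    {
        OP_TOKEN, LPAR,        /* opcode with argument: token type of '('  */
        OP_RULE, R_EXPR,       /* opcode with argument: index of rule expr */
        OP_TOKEN, RPAR,        /* token type of ')'                        */
        OP_RETURN, A_GROUP_0,  /* action id for { a }                      */
    },
};
```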


Operations
----------

Most operations can succeed or fail, and produce a value if they
succeed.

If an operation succeeds, the value is appended to the frame's values
array (`vals`), and the VM proceeds to the next opcode.

If an operation fails, the VM resets the input to the frame's mark,
and resets the value array. It then proceeds to the next alternative
of the frame's rule, if there is one and the frame's `cut` flag is not
set. If the frame's `cut` flag is set, or if its rule has no more
alternatives, the frame is popped off the frame stack and the VM
proceeds with failure there.

Some operations manipulate other frame fields.

Calls into the support runtime can produce *errors* -- when an error
is detected, the VM exits immediately, returning `NULL`.


### General operations

The following opcodes take no argument.

- `OP_NOOP` -- succeed without a value. (Used for opcode padding.)

- `OP_NAME` -- call `_PyPegen_name_token()`; fail if it returns
`NULL`, otherwise succeed with the return value.

- `OP_NUMBER` -- call `_PyPegen_number_token()`; same as `OP_NAME`.

- `OP_STRING` -- call `_PyPegen_string_token()`; same as `OP_NAME`.

- `OP_CUT` -- set the frame's `cut` flag; succeed without a value.

- `OP_OPTIONAL` -- succeed without a value; modifies the *previous*
operation to treat a `NULL` result as a success. (See below.)

The following operations are followed by a single integer argument.

- `OP_TOKEN(type)` -- call `_PyPegen_expect_token()` with the `type`
argument; processing is the same as for `OP_NAME`.

- `OP_RULE(rule)` -- push a new frame onto the stack, initializing it
with the given rule (by index) and the current input position (mark),
starting at the first alternative and opcode. Then proceed to the first
operation of the new frame.

- `OP_RETURN(action)` -- call the action given by the argument, then
pop the frame off the stack. Execution then proceeds (in the frame
newly revealed by that pop operation) as if the previous operation
succeeded or failed with the return value of the action.


### Operations for root rules only

A grammar must have one or more *root rules*. A root rule is a
synthetic rule that uses `OP_SUCCESS` and `OP_FAILURE` operations to
report overall success or failure. Only root rules may use these
operations, and they must be used in the right format.

Each root rule must have exactly two alternatives. The first
alternative must be a single operation (generally `OP_RULE`) that
stores a value in the values array, followed by `OP_SUCCESS`. The
second alternative must be the single operation `OP_FAILURE`.

- `OP_SUCCESS` -- exit the VM with the first value from the `vals`
array as the result.

- `OP_FAILURE` -- report a syntax error and exit the VM with a NULL
result.
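
Put together, a root rule in the generated table might look roughly like
this (the indices and the name `R_START` are assumptions; `R_ROOT` is the
index passed to `run_vm()` in `Parser/vm.c`):

```
/* Hypothetical root rule: the first alternative parses the real start
   rule and reports success; the second alternative reports failure. */
static Rule root_rule = {
    "root",
    R_ROOT,
    0, 0,                  /* not memoized, not left-recursive           */
    {0, 3, -1},            /* two alternatives, terminated by -1         */
    {
        OP_RULE, R_START,  /* store the start rule's value in vals[0]... */
        OP_SUCCESS,        /* ...and return it as the overall result     */
        OP_FAILURE,        /* reached only if the first alternative fails */
    },
};
```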


### Operations for loop rules only

For a loop such as `a*` or `a+`, a synthetic rule must be created with
the following structure:

```
# First alternative:
<one operation that produces a value, e.g. OP_NAME, OP_TOKEN, OP_RULE>
OP_LOOP_ITERATE
# Second alternative:
<either OP_LOOP_COLLECT or OP_LOOP_COLLECT_NONEMPTY>
```

The values being collected are stored in a `malloc`-ed array named
`collection` that is grown as needed. This uses the following
fields:

- `ncollected` -- the number of collected values.
- `collection` -- `malloc`-ed array of `void *` values representing
the collected values.

The operations are defined as follows:

- `OP_LOOP_ITERATE` -- append the current value to the `collection`
array, save the current input position, and start the next iteration
of the loop (resetting the instruction pointer).

- `OP_LOOP_COLLECT` -- restore the input position from the last saved
position and pop the frame off the stack, producing a new value that
is an `asdl_seq *` containing the collected values.

- `OP_LOOP_COLLECT_NONEMPTY` -- like `OP_LOOP_COLLECT` but fails if no
values are collected.
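
For instance, the synthetic rule generated for `stmt+` might be laid out
as follows (the rule name, indices, and ids are assumptions):

```
/* Hypothetical synthetic rule for stmt+; OP_LOOP_COLLECT_NONEMPTY is used
   because the loop must match at least one item. */
static Rule loop1_stmt_rule = {
    "_loop1_stmt",
    R_LOOP1_STMT,
    0, 0,
    {0, 3, -1},
    {
        OP_RULE, R_STMT,           /* parse one stmt...                 */
        OP_LOOP_ITERATE,           /* ...collect it and start over      */
        OP_LOOP_COLLECT_NONEMPTY,  /* second alt: build the asdl_seq    */
    },
};
```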

For a "delimited" loop, written in the metagrammar as `b.a+` (one or
more `a` items separated by the delimiter `b`), the format is
slightly different:

```
# First alternative:
<one operation that produces the value for 'a' (the "meat")>
<one operation that produces the value for 'b' (the delimiter)>
OP_LOOP_ITERATE
# Second alternative:
<one operation that produces the value for 'a' (the "meat")>
OP_LOOP_COLLECT_DELIMITED
```

The new operation is:

- `OP_LOOP_COLLECT_DELIMITED` -- add the first value from the values array
to the collection, then do everything that `OP_LOOP_COLLECT` does.
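
A synthetic rule for the delimited loop `','.expr+` (as used in the sample
grammar) could then look like this (names and indices are again assumptions):

```
/* Hypothetical synthetic rule for ','.expr+ . */
static Rule gather_expr_rule = {
    "_gather_expr",
    R_GATHER_EXPR,
    0, 0,
    {0, 5, -1},
    {
        /* First alternative: the "meat", the delimiter, then iterate. */
        OP_RULE, R_EXPR,
        OP_TOKEN, COMMA,
        OP_LOOP_ITERATE,
        /* Second alternative: the final item, then collect. */
        OP_RULE, R_EXPR,
        OP_LOOP_COLLECT_DELIMITED,
    },
};
```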


### Operations for lookaheads

Positive lookaheads use the following pattern:

```
OP_SAVE_MARK
<one operation that produces a value>
OP_POS_LOOKAHEAD
```

The operations work as follows:

- `OP_SAVE_MARK` -- saves the current input position in a dedicated
field of the frame, `savemark`.

- `OP_POS_LOOKAHEAD` -- restores the current input position from the
frame's `savemark` field. (It does not reset the values array;
values produced by positive lookaheads are ignored by the actions.)

Negative lookaheads use the following pattern:

```
OP_SAVE_MARK
<one operation that produces a value>
OP_NEG_LOOKAHEAD
```

The new operation works as follows:

- `OP_NEG_LOOKAHEAD` -- fails the current alternative.

In addition, the standard code for success/failure processing checks
whether the next operation is `OP_NEG_LOOKAHEAD`. If so, it treats
`NULL` as a success and restores the current input position from the
frame's `savemark` field.
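
As an example, an alternative starting with the negative lookahead `!','`
might compile to the following opcode stream (the rule and action ids are
assumptions; `COMMA` is the usual token type):

```
/* Hypothetical opcode stream for an alternative:  !',' expr { ... } */
static short no_comma_alt[] = {
    OP_SAVE_MARK,        /* remember the input position                */
    OP_TOKEN, COMMA,     /* try to match ','...                        */
    OP_NEG_LOOKAHEAD,    /* ...fail the alternative if it *did* match; */
                         /* a NULL from OP_TOKEN counts as success and */
                         /* the saved position is restored             */
    OP_RULE, R_EXPR,     /* then parse the real content                */
    OP_RETURN, A_ALT_0,
};
```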


More about `OP_OPTIONAL`
------------------------

`OP_OPTIONAL` is a "postfix" operation: it must *follow* an operation
that may produce a result. Normally the VM treats a `NULL` result as a
failure. But before doing so, it checks whether the next operation is
`OP_OPTIONAL`. If so, it treats `NULL` as a success: a `NULL` is
appended to the `vals` array and control flows to the operation after
`OP_OPTIONAL`.

When the operation preceding `OP_OPTIONAL` succeeds, `OP_OPTIONAL` is
executed as a regular operation and always succeeds.

`OP_NEG_LOOKAHEAD` works similarly (but it also restores the input
position).
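
For example, the alternative `a=term ['*'] b=factor` from the sample
grammar could compile to something like the following (ignoring the
left-recursion handling that `term` itself needs; the ids are assumptions):

```
/* Hypothetical opcode stream for:  a=term ['*'] b=factor { ... } */
static short term_alt0[] = {
    OP_RULE, R_TERM,      /* a=term                                     */
    OP_TOKEN, STAR,       /* ['*'] -- may fail with NULL...             */
    OP_OPTIONAL,          /* ...which is then recorded as a NULL value  */
    OP_RULE, R_FACTOR,    /* b=factor                                   */
    OP_RETURN, A_TERM_0,  /* action: _Py_BinOp(a, Mult, b, EXTRA)       */
};
```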


Constraints on operation order
------------------------------

Note that for operations that may succeed with a value or fail, there
is always a next operation (the VM peeks at it to handle `OP_OPTIONAL`
and `OP_NEG_LOOKAHEAD`). These operations are `OP_NAME`, `OP_NUMBER`,
`OP_STRING`, `OP_TOKEN`, and `OP_RULE`.

These operations always succeed: `OP_NOOP`, `OP_CUT`, `OP_OPTIONAL`,
`OP_SAVE_MARK`.

These operations must be last in their alternative: `OP_RETURN`,
`OP_SUCCESS`, `OP_FAILURE`, `OP_LOOP_ITERATE`, `OP_LOOP_COLLECT`,
`OP_LOOP_COLLECT_NONEMPTY`.

This operation must be first in its alternative: `OP_FAILURE`.


Grammar for lists of operations
-------------------------------

This shows the constraints on how operations can be used together.

```
rule: root_rule | normal_rule | loop_rule | delimited_rule
root_rule: success_alt failure_alt
success_alt: regular_op OP_SUCCESS
failure_alt: OP_FAILURE
normal_rule: alt+
loop_rule: loop_start_alt loop_collect_alt
loop_start_alt: regular_op OP_LOOP_ITERATE
loop_collect_alt: OP_LOOP_COLLECT | OP_LOOP_COLLECT_NONEMPTY
delimited_rule: delimited_start_alt delimited_collect_alt
delimited_start_alt: regular_op regular_op OP_LOOP_ITERATE
delimited_collect_alt: OP_LOOP_COLLECT_DELIMITED
alt: any_op+ return_op
any_op: regular_op [OP_OPTIONAL] | lookahead_block | special_op
regular_op: short_op | long_op
short_op: OP_NAME | OP_NUMBER | OP_STRING
long_op: OP_TOKEN <token_type> | OP_RULE <rule_id>
special_op: OP_NOOP | OP_CUT
return_op: OP_RETURN <action_id>
lookahead_block: OP_SAVE_MARK regular_op lookahead_op
lookahead_op: OP_POS_LOOKAHEAD | OP_NEG_LOOKAHEAD
```

Ideas
-----

### Left-recursion

- First opcode of first alt is `OP_SETUP_LEFT_REC`. This initializes
the memo cache for f->rule->type to `NULL`.

- All `OP_RETURN` opcodes in a left-rec rule are replaced with
`OP_RETURN_LEFT_REC`. This compares the current position with the
most recently cached position for this rule at this point in the
input. If the new match is *further*, it updates the memo cache,
resets `f->mark`, and resets `f->iop` and `f->ialt` to the start of
the rule. It then goes back to the top (possibly skipping the setup
op). If the new match is not further, it is discarded and the most
recent match from the memo cache is returned as the result (also
updating the end position).

### Selective memoization

- We could have a flag in the rule that prevents memo lookups and
inserts. Or we could have separate opcodes, e.g. `OP_RULE_NOMEMO`
and `OP_RETURN_NOMEMO`.

### Performance tuning

- To make frames smaller, we could have a separate values stack; the
frame would have a `void** vals` instead of `void *vals[]`. Most
frames won't need 20 values, but we could ensure there are always at
least that many on the stack.

- Is it faster to check for flags in the rule object (e.g. leftrec) or
is it faster to have dedicated opcodes? My current hunch is that
dedicated opcodes are faster, but I really don't know. Maybe having
fewer opcodes is more important than having smaller Rule objects.

- Random other C tricks, esp. tricks that might increase the hit rate
in the CPU's L1 cache. (Remember in modern CPUs memory access is
10-100x slower than cache access.) Who can we tap that knows this
stuff?

- If we know `x >= -1`, which is faster: `if (x < 0)` or `if (x == -1)`?
3 changes: 3 additions & 0 deletions Tools/peg_generator/Makefile
@@ -25,6 +25,9 @@ build: peg_extension/parse.c
peg_extension/parse.c: $(GRAMMAR) $(TOKENS) pegen/*.py peg_extension/peg_extension.c ../../Parser/pegen.c ../../Parser/pegen_errors.c ../../Parser/string_parser.c ../../Parser/action_helpers.c ../../Parser/*.h pegen/grammar_parser.py
$(PYTHON) -m pegen -q c $(GRAMMAR) $(TOKENS) -o peg_extension/parse.c --compile-extension

generate_vm: $(GRAMMAR) $(TOKENS) pegen/*.py ../../Parser/pegen/pegen.c ../../Parser/pegen/parse_string.c ../../Parser/pegen/*.h
$(PYTHON) -m pegen -q vm $(GRAMMAR) $(TOKENS) -o ../../Parser/pegen/vmparse.h

clean:
-rm -f peg_extension/*.o peg_extension/*.so peg_extension/parse.c
-rm -f data/xxl.py
18 changes: 18 additions & 0 deletions Tools/peg_generator/data/simple.gram
@@ -0,0 +1,18 @@
start: a=stmt+ ENDMARKER { _PyPegen_make_module(p, a) }
stmt:
| !'if' a=expr NEWLINE { _Py_Expr(a, EXTRA) }
| &'if' a=if_stmt
if_stmt:
| 'if' ~ a=NAME ':' b=stmt { _Py_If(a, CHECK(_PyPegen_singleton_seq(p, b)), NULL, EXTRA) }
expr:
| a=expr '+' b=term { _Py_BinOp(a, Add, b, EXTRA) }
| term
term:
| a=term ['*'] b=factor { _Py_BinOp(a, Mult, b, EXTRA) }
| factor
factor:
| '(' a=expr ')' { a }
| '[' a=','.expr+ ']' { _Py_List(a, Load, EXTRA) }
| NUMBER
| "__peg_parser__" { RAISE_SYNTAX_ERROR("You found it!") }
| NAME
37 changes: 37 additions & 0 deletions Tools/peg_generator/pegen/__main__.py
@@ -16,6 +16,31 @@
from pegen.validator import validate_grammar


def generate_vm_code(
args: argparse.Namespace,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
from pegen.build import build_vm_parser_and_generator

verbose = args.verbose
verbose_tokenizer = verbose >= 3
verbose_parser = verbose == 2 or verbose >= 4
try:
grammar, parser, tokenizer, gen = build_vm_parser_and_generator(
args.grammar_filename,
args.tokens_filename,
args.output,
verbose_tokenizer,
verbose_parser,
)
return grammar, parser, tokenizer, gen
except Exception as err:
if args.verbose:
raise # Show traceback
traceback.print_exception(err.__class__, err, None)
sys.stderr.write("For full traceback, use -v\n")
sys.exit(1)


def generate_c_code(
args: argparse.Namespace,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
@@ -120,6 +145,18 @@ def generate_python_code(
help="Suppress code emission for rule actions",
)

vm_parser = subparsers.add_parser("vm", help="Generate the new VM parser generator")
vm_parser.set_defaults(func=generate_vm_code)
vm_parser.add_argument("grammar_filename", help="Grammar description")
vm_parser.add_argument("tokens_filename", help="Tokens description")
vm_parser.add_argument(
"-o",
"--output",
metavar="OUT",
default="vmparse.h",
help="Where to write the generated parser",
)


def main() -> None:
from pegen.testutil import print_memstats
37 changes: 37 additions & 0 deletions Tools/peg_generator/pegen/build.py
@@ -7,6 +7,7 @@
from typing import IO, Dict, List, Optional, Set, Tuple

from pegen.c_generator import CParserGenerator
from pegen.vm_generator import VMParserGenerator
from pegen.grammar import Grammar
from pegen.grammar_parser import GeneratedParser as GrammarParser
from pegen.parser import Parser
@@ -249,6 +250,19 @@ def build_python_generator(
return gen


def build_vm_generator(
grammar: Grammar, grammar_file: str, tokens_file: str, output_file: str,
) -> ParserGenerator:
with open(tokens_file, "r") as tok_file:
all_tokens, exact_tok, non_exact_tok = generate_token_definitions(tok_file)
with open(output_file, "w") as file:
gen: ParserGenerator = VMParserGenerator(
grammar, all_tokens, exact_tok, non_exact_tok, file
)
gen.generate(grammar_file)
return gen


def build_c_parser_and_generator(
grammar_file: str,
tokens_file: str,
@@ -319,3 +333,26 @@ def build_python_parser_and_generator(
skip_actions=skip_actions,
)
return grammar, parser, tokenizer, gen


def build_vm_parser_and_generator(
grammar_file: str,
tokens_file: str,
output_file: str,
verbose_tokenizer: bool = False,
verbose_parser: bool = False,
) -> Tuple[Grammar, Parser, Tokenizer, ParserGenerator]:
"""Generate rules, C parser, tokenizer, parser generator for a given grammar
Args:
grammar_file (string): Path for the grammar file
tokens_file (string): Path for the tokens file
output_file (string): Path for the output file
verbose_tokenizer (bool, optional): Whether to display additional output
when generating the tokenizer. Defaults to False.
verbose_parser (bool, optional): Whether to display additional output
when generating the parser. Defaults to False.
"""
grammar, parser, tokenizer = build_parser(grammar_file, verbose_tokenizer, verbose_parser)
gen = build_vm_generator(grammar, grammar_file, tokens_file, output_file)
return grammar, parser, tokenizer, gen
541 changes: 541 additions & 0 deletions Tools/peg_generator/pegen/vm_generator.py

Large diffs are not rendered by default.