import{_ as s,c as a,e,o as t}from"./app-CjdILvTf.js";const p="/assets/lexer-indent-fsm-voG48p68.png",o="/assets/stmt-list-eqd-C4PljLFj.png",i="/assets/leading-empty-line-error-CPphCvem.png",l="/assets/leading-empty-line-good-DBlZdZSh.png",c="/assets/first-nonempty-line-indent-error-z8NUgPsE.png",u="/assets/block-end-error-CmIYQwGr.png",d="/assets/last-line-whitespace-Kz0g0fV-.png",r={};function k(m,n){return t(),a("div",null,n[0]||(n[0]=[e(`<h1 id="indentation" tabindex="-1"><a class="header-anchor" href="#indentation"><span>Indentation</span></a></h1><p>Nim is an indentation-based language, like Python. This means that the structure of the code is determined by the indentation level of the lines, and that whitespace is significant. Consider the following example:</p><div class="language-nim line-numbers-mode" data-highlighter="prismjs" data-ext="nim" data-title="nim"><pre><code><span class="line"><span class="token keyword">proc</span> <span class="token function">foo</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">=</span>                   <span class="token comment"># ──────────────────┐0</span></span>
<span class="line"><span class="space"> </span><span class="space"> </span><span class="token keyword">for</span> i <span class="token operator">in</span> <span class="token number">0</span> <span class="token operator">..&lt;</span> <span class="token number">10</span><span class="token operator">:</span>           <span class="token comment"># ─────────────┐1   │</span></span>
<span class="line"><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span>echo i                     <span class="token comment"># ────────┐2   │    │</span></span>
<span class="line"><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="token keyword">if</span> i <span class="token operator">mod</span> <span class="token number">2</span> <span class="token operator">==</span> <span class="token number">0</span><span class="token operator">:</span>           <span class="token comment">#         │    │    │</span></span>
<span class="line"><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span>echo <span class="token string">&quot;even&quot;</span>              <span class="token comment"># ── 3    │    │    │</span></span>
<span class="line"><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="token keyword">else</span><span class="token operator">:</span>                      <span class="token comment">#         │    │    │</span></span>
<span class="line"><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span><span class="space"> </span>echo <span class="token string">&quot;odd&quot;</span>               <span class="token comment"># ── 3 ◄──┘ ◄──┘ ◄──┘<span class="space"> </span></span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>There are four levels of indentation here. The first level is the global scope, which is at indentation 0. Block-defining elements like <code>proc</code>, <code>for</code>, and <code>if</code> statements increase the indentation level. Typically, Nim code uses 2 spaces for indentation, but this is not a strict requirement. And although not recommended, we can use a different number of spaces for different levels of indentation, as long as the indentation level within the same block is consistent.</p><p>As one might expect, parsing indentation-based languages is more difficult than parsing languages with explicit block delimiters like curly braces. We need to keep track of the indentation level of each line, and we often need to compare the indentation level of a line with the previous lines to determine the structure of the code.</p><p>There are two places where we can track indentation: in the lexer or in the parser.</p><ul><li>If we track indentation in the lexer, we need to recognize the indentation level of each line and emit tokens that represent indentation changes, as well as tokens for staying at the same indentation level. These tokens are then consumed by the parser grammar to delimit blocks.</li><li>If we track indentation in the parser, we need to be able to access the length of the leading whitespace of the first token of each line. This is not always possible with lexer generators, as they typically don&#39;t provide metadata about the tokens they emit. 
Also, using a parser generator like Grammar-Kit makes it even more challenging to define rules based on the leading whitespace of a token.</li></ul><p>While there is a way in Grammar-Kit to write custom utility functions that can be used as grammar rules (which we can use to access the underlying text and calculate the indentation levels), it is not a straightforward process. So, I&#39;m going to take the route of tracking indentation in the lexer. This comes with a different problem though: leading whitespace becomes part of the token stream, and is not ignored by the parser anymore. The grammar rules have to account for cases where indentation tokens are present, when we wish to treat them as whitespace (e.g. at the end of the file).</p><h2 id="indentation-tokens" tabindex="-1"><a class="header-anchor" href="#indentation-tokens"><span>Indentation Tokens</span></a></h2><p>We will need three kinds of indentation tokens: <code>IND</code> (for increasing indentation), <code>DED</code> (for decreasing indentation), and <code>EQD</code> (for staying at the same indentation). The reason for the <code>EQD</code> token is that we will use it as a separator between successive statements, much the same way as a semicolon in C-like languages.</p><p>Let&#39;s use these tokens with the example above and see what indentation token(s) we need to emit at each line:</p><div class="language-nim line-numbers-mode" data-highlighter="prismjs" data-ext="nim" data-title="nim"><pre><code><span class="line"><span class="token keyword">proc</span> <span class="token function">foo</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">=</span></span>
<span class="line">  <span class="token keyword">for</span> i <span class="token operator">in</span> <span class="token number">0</span> <span class="token operator">..&lt;</span> <span class="token number">10</span><span class="token operator">:</span>       <span class="token comment"># IND</span></span>
<span class="line">    echo i                 <span class="token comment"># IND</span></span>
<span class="line">    <span class="token keyword">if</span> i <span class="token operator">mod</span> <span class="token number">2</span> <span class="token operator">==</span> <span class="token number">0</span><span class="token operator">:</span>       <span class="token comment"># EQD</span></span>
<span class="line">      echo <span class="token string">&quot;even&quot;</span>          <span class="token comment"># IND</span></span>
<span class="line">    <span class="token keyword">else</span><span class="token operator">:</span>                  <span class="token comment"># DED, EQD</span></span>
<span class="line">      echo <span class="token string">&quot;odd&quot;</span>           <span class="token comment"># IND</span></span>
<span class="line"><span class="token comment"># back to top level        # DED, DED, DED</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Most of the lines should be easy to understand. But let&#39;s look closer at lines 6 and 8 in particular, where we emit multiple tokens. On line 6, the first part of the <code>if</code>/<code>else</code> block ends, and we go back to the same level as the <code>if</code> statement. So we emit both a <code>DED</code> token (to close the <code>if</code> block) and an <code>EQD</code> token (to separate the <code>if</code> block from the <code>else</code> block). On line 8, we close the <code>else</code> block and go back to the top level, which requires emitting three <code>DED</code> tokens to close the <code>else</code> block, the <code>for</code> block, and the <code>proc</code> block.</p><h2 id="lexer-state-machine" tabindex="-1"><a class="header-anchor" href="#lexer-state-machine"><span>Lexer State Machine</span></a></h2><p>We&#39;ll need to introduce a new <code>BEGIN_LINE</code> state in our lexer, and switch to it once we encounter a new line. We&#39;ll also need a stack to keep track of the indentation levels we are in. The stack is initialized with indentation level 0. 
The following diagram shows the state machine we need to implement.</p><p><img src="`+p+`" alt="Lexer Indentation State Machine" width="500"></p><p>Here&#39;s how it works:</p><ul><li>We start in the <code>YYINITIAL</code> state (the default lexer state).</li><li>Since we always start at the beginning of a line, we switch to the <code>BEGIN_LINE</code> state.</li><li>In the <code>BEGIN_LINE</code> state, we treat any empty lines as whitespace.</li><li>In the <code>BEGIN_LINE</code> state, we use a regex to match the leading whitespace of the line and get its length. This is the indentation level of the line (which could be zero).</li><li>We have three cases: <ul><li>If the indentation level is greater than the top of the stack, we emit an <code>IND</code> token, push the new indentation level to the stack, and switch to the <code>DEFAULT</code> state.</li><li>If the indentation level is equal to the top of the stack, we emit an <code>EQD</code> token, and switch back to the <code>DEFAULT</code> state (we don&#39;t modify the stack).</li><li>The third case handles decrease in indentation, and is more involved than the other two. That&#39;s because the decrease in indentation could be <em>insufficient</em> relative to the parent block. So, the rule is: <ul><li>If there&#39;s more than one level of indentation on the stack, and the current indentation level is less than or equal to the second-to-top level of the stack, we emit a <code>DED</code> token, pop the stack, but we do <em>not</em> switch back to the <code>DEFAULT</code> state just yet. Instead, we stay in the <code>BEGIN_LINE</code> state, push back the whitespace text to the lexer, and let the lexer reprocess the line. 
This allows us to emit the correct number of <code>DED</code> tokens until we reach the correct indentation level.</li></ul></li><li>If none of the above cases are met, then we have a case where the decrease in indentation is insufficient, and we emit an <code>INVALID_IND</code> token.</li></ul></li></ul><p>The logic is a bit intricate, especially for the decrease in indentation case, but it&#39;s the price we&#39;re willing to pay to keep the parser simple. Let&#39;s go ahead and add a <code>processIndent</code> method to our lexer so that we can call it while in the <code>BEGIN_LINE</code> state.</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token comment">// src/main/kotlin/khaledh/nimjet/lexer/Nim.flex</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token comment">// lexer class code</span></span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">Stack</span><span class="token generics"><span class="token punctuation">&lt;</span><span class="token class-name">Integer</span><span class="token punctuation">&gt;</span></span> indentStack <span class="token operator">=</span> <span class="token keyword">new</span> <span class="token class-name">Stack</span><span class="token generics"><span class="token punctuation">&lt;</span><span class="token punctuation">&gt;</span></span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span>init<span class="token punctuation">{</span></span>
<span class="line">  <span class="token comment">// initial indentation level is 0</span></span>
<span class="line">  indentStack<span class="token punctuation">.</span><span class="token function">push</span><span class="token punctuation">(</span><span class="token number">0</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line"><span class="token operator">%</span>init<span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">IElementType</span> <span class="token function">processIndent</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">    <span class="token keyword">int</span> currIndent <span class="token operator">=</span> <span class="token function">yylength</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line"></span>
<span class="line">    <span class="token keyword">if</span> <span class="token punctuation">(</span>currIndent <span class="token operator">&gt;</span> indentStack<span class="token punctuation">.</span><span class="token function">peek</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">      <span class="token comment">// new indent level</span></span>
<span class="line">      indentStack<span class="token punctuation">.</span><span class="token function">push</span><span class="token punctuation">(</span>currIndent<span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token function">yybegin</span><span class="token punctuation">(</span><span class="token constant">DEFAULT</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">IND</span><span class="token punctuation">;</span></span>
<span class="line">    <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line">    <span class="token keyword">if</span> <span class="token punctuation">(</span>currIndent <span class="token operator">==</span> indentStack<span class="token punctuation">.</span><span class="token function">peek</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">      <span class="token comment">// same indent level</span></span>
<span class="line">      <span class="token function">yybegin</span><span class="token punctuation">(</span><span class="token constant">DEFAULT</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">EQD</span><span class="token punctuation">;</span></span>
<span class="line">    <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line">    <span class="token keyword">if</span> <span class="token punctuation">(</span>indentStack<span class="token punctuation">.</span><span class="token function">size</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">&gt;</span> <span class="token number">1</span> <span class="token operator">&amp;&amp;</span> currIndent <span class="token operator">&lt;=</span> indentStack<span class="token punctuation">.</span><span class="token function">get</span><span class="token punctuation">(</span>indentStack<span class="token punctuation">.</span><span class="token function">size</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">-</span> <span class="token number">2</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">      <span class="token comment">// We can only dedent one level at a time, so don&#39;t switch back to DEFAULT just yet,</span></span>
<span class="line">      <span class="token comment">// and keep returning DED tokens as long as there&#39;s more dedent levels.</span></span>
<span class="line">      indentStack<span class="token punctuation">.</span><span class="token function">pop</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token function">yypushback</span><span class="token punctuation">(</span><span class="token function">yylength</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">DED</span><span class="token punctuation">;</span></span>
<span class="line">    <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line">     <span class="token comment">// invalid indentation</span></span>
<span class="line">     <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">INVALID_IND</span><span class="token punctuation">;</span></span>
<span class="line">  <span class="token punctuation">}</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line"><span class="token comment">// lexer states</span></span>
<span class="line"><span class="token operator">%</span>state <span class="token constant">DEFAULT</span></span>
<span class="line"><span class="token operator">%</span>state <span class="token constant">BEGIN_LINE</span></span>
<span class="line"></span>
<span class="line"><span class="token comment">// macros</span></span>
<span class="line"><span class="token constant">EOL</span> <span class="token operator">=</span> \\r\\n<span class="token operator">|</span>\\r<span class="token operator">|</span>\\n</span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span><span class="token operator">%</span></span>
<span class="line"></span>
<span class="line"><span class="token generics"><span class="token punctuation">&lt;</span>YYINITIAL<span class="token punctuation">&gt;</span></span> <span class="token punctuation">[</span><span class="token operator">^</span><span class="token punctuation">]</span>           <span class="token punctuation">{</span> <span class="token function">yypushback</span><span class="token punctuation">(</span><span class="token number">1</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token function">yybegin</span><span class="token punctuation">(</span><span class="token constant">BEGIN_LINE</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line"><span class="token generics"><span class="token punctuation">&lt;</span>DEFAULT<span class="token punctuation">&gt;</span></span> <span class="token punctuation">{</span></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line">  <span class="token punctuation">{</span><span class="token constant">EOL</span><span class="token punctuation">}</span>                   <span class="token punctuation">{</span> <span class="token function">yybegin</span><span class="token punctuation">(</span><span class="token constant">BEGIN_LINE</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token keyword">return</span> <span class="token class-name">TokenType</span><span class="token punctuation">.</span><span class="token constant">WHITE_SPACE</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"><span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line"><span class="token generics"><span class="token punctuation">&lt;</span>BEGIN_LINE<span class="token punctuation">&gt;</span></span> <span class="token punctuation">{</span></span>
<span class="line">  <span class="token punctuation">[</span> \\t<span class="token punctuation">]</span><span class="token operator">*</span><span class="token punctuation">{</span><span class="token constant">EOL</span><span class="token punctuation">}</span>             <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token class-name">TokenType</span><span class="token punctuation">.</span><span class="token constant">WHITE_SPACE</span><span class="token punctuation">;</span> <span class="token comment">/* skip empty lines */</span> <span class="token punctuation">}</span></span>
<span class="line">  <span class="token punctuation">[</span> \\t<span class="token punctuation">]</span><span class="token operator">*</span>                  <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token function">processIndent</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line"><span class="token punctuation">}</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div 
class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>When I tested this on a code sample, the IDE threw an error saying <code>&quot;Lexer is not progressing after calling advance()&quot;</code>. At first, I was puzzled by this, so I fired up the debugger and traced the code. It turns out that the IDE tries to validate that the lexer is making progress by ensuring that it doesn&#39;t produce the same token multiple times in a row at the same location while in the same state. Unfortunately, that&#39;s exactly what we&#39;re trying to do when we need to emit multiple <code>DED</code> tokens in a row when the indentation decreases more than one level.</p><p>To work around this issue, I created two identical copies of the <code>BEGIN_LINE</code> state: <code>BEGIN_LINE</code> and <code>BEGIN_LINE_2</code>, and updated the <code>DED</code> case to toggle between the two states. It&#39;s a hack, but it works.</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token comment">// src/main/kotlin/khaledh/nimjet/lexer/Nim.flex</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token comment">// lexer class code</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">IElementType</span> <span class="token function">processIndent</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">    <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line">    <span class="token keyword">if</span> <span class="token punctuation">(</span>indentStack<span class="token punctuation">.</span><span class="token function">size</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">&gt;</span> <span class="token number">1</span> <span class="token operator">&amp;&amp;</span> currIndent <span class="token operator">&lt;=</span> indentStack<span class="token punctuation">.</span><span class="token function">get</span><span class="token punctuation">(</span>indentStack<span class="token punctuation">.</span><span class="token function">size</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">-</span> <span class="token number">2</span><span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">      <span class="token comment">// We can only dedent one level at a time, so don&#39;t switch back to DEFAULT just yet,</span></span>
<span class="line">      <span class="token comment">// and keep returning DED tokens as long as there&#39;s more dedent levels.</span></span>
<span class="line">      <span class="token comment">//</span></span>
<span class="line highlighted">      <span class="token comment">// Also, IntelliJ&#39;s lexer validation doesn&#39;t like returning the same token multiple</span></span>
<span class="line highlighted">      <span class="token comment">// times in a row at the same location while in the same state (throws an exception</span></span>
<span class="line highlighted">      <span class="token comment">// about &quot;Lexer is not progressing&quot;), so as a workaround we toggle between two</span></span>
<span class="line highlighted">      <span class="token comment">// identical states to avoid this issue.</span></span>
<span class="line highlighted">      <span class="token keyword">int</span> nextState <span class="token operator">=</span> <span class="token function">yystate</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">==</span> <span class="token constant">BEGIN_LINE</span> <span class="token operator">?</span> <span class="token constant">BEGIN_LINE_2</span> <span class="token operator">:</span> <span class="token constant">BEGIN_LINE</span><span class="token punctuation">;</span></span>
<span class="line highlighted">      <span class="token function">yybegin</span><span class="token punctuation">(</span>nextState<span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      indentStack<span class="token punctuation">.</span><span class="token function">pop</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token function">yypushback</span><span class="token punctuation">(</span><span class="token function">yylength</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">DED</span><span class="token punctuation">;</span></span>
<span class="line">    <span class="token punctuation">}</span></span>
<span class="line">  <span class="token punctuation">}</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line"><span class="token comment">// lexer states</span></span>
<span class="line highlighted"><span class="token operator">%</span>state <span class="token constant">BEGIN_LINE</span> <span class="token constant">BEGIN_LINE_2</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"><span class="token operator">%</span><span class="token operator">%</span></span>
<span class="line"></span>
<span class="line highlighted"><span class="token comment">// We use two identical states to avoid a lexer validation issue; see processIndent.</span></span>
<span class="line highlighted"><span class="token generics"><span class="token punctuation">&lt;</span>BEGIN_LINE<span class="token punctuation">,</span> BEGIN_LINE_2<span class="token punctuation">&gt;</span></span></span>
<span class="line">  <span class="token punctuation">[</span> \\t<span class="token punctuation">]</span><span class="token operator">*</span><span class="token punctuation">{</span><span class="token constant">EOL</span><span class="token punctuation">}</span>                    <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token class-name">TokenType</span><span class="token punctuation">.</span><span class="token constant">WHITE_SPACE</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line">  <span class="token punctuation">[</span> \\t<span class="token punctuation">]</span><span class="token operator">*</span>                         <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token function">processIndent</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line"><span class="token punctuation">}</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>This keeps the lexer validation happy, and we can now emit multiple <code>DED</code> tokens in a row from the same location.</p><h2 id="parsing-top-level-code" tabindex="-1"><a class="header-anchor" href="#parsing-top-level-code"><span>Parsing Top-Level Code</span></a></h2><p>Now that we have the indentation tokens in the token stream, we need to modify our parser to use them to delimit blocks. The first obvious change is to delimit the top-level statement list with <code>EQD</code> tokens, since all top-level statements should be at indentation level 0. 
Let&#39;s modify the <code>StmtList</code> rule to account for this.</p><div class="language-bnf line-numbers-mode" data-highlighter="prismjs" data-ext="bnf" data-title="bnf"><pre><code><span class="line">// src/main/kotlin/khaledh/nimjet/parser/Nim.bnf</span>
<span class="line"><span class="token operator">{</span></span>
<span class="line">  <span class="token operator">...</span></span>
<span class="line"><span class="token operator">}</span></span>
<span class="line"></span>
<span class="line">Module     <span class="token operator">::=</span> !&lt;<span class="token rule"><span class="token punctuation">&lt;</span>eof<span class="token punctuation">&gt;</span></span>&gt; StmtList</span>
<span class="line highlighted">StmtList   <span class="token operator">::=</span> Stmt <span class="token operator">(</span>EQD Stmt<span class="token operator">)</span><span class="token operator">*</span></span>
<span class="line">Stmt       <span class="token operator">::=</span> LetSection</span>
<span class="line">             <span class="token operator">|</span> Command</span>
<span class="line"></span>
<span class="line">LetSection <span class="token operator">::=</span> LET IdentDecl EQ STRING_LIT</span>
<span class="line"></span>
<span class="line">Command    <span class="token operator">::=</span> IdentRef IdentRef</span>
<span class="line"></span>
<span class="line">IdentDecl  <span class="token operator">::=</span> IDENT</span>
<span class="line">IdentRef   <span class="token operator">::=</span> IDENT</span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Let&#39;s generate the parser (<code>Cmd+Shift+G</code>) and try it out.</p><p><img src="`+o+'" alt="StmtList with EQD" width="650"></p><p>Although there are no visible changes in the PSI tree, the code is parsed correctly as expected. Unfortunately we don&#39;t see the <code>EQD</code> token element in the tree as I was expecting. But when I debug the lexer I see it emitting the token. My assumption is that the PSI tree builder doesn&#39;t allow multiple elements at the same location, as in this case where the <code>EQD</code> token occupies the same start location as the second <code>Stmt</code> element, and uses the last element (in sibling order) as the element for that location. It&#39;s a bit annoying not to be able to verify the existence of the token, but it doesn&#39;t impact the correctness of the parsed tree. In fact, it could be considered a good thing, since, conceptually, indentation elements should be considered whitespace, which normally doesn&#39;t show up in the tree.</p><p>This works, but there are a few issues we need to address. Let&#39;s introduce an empty line at the beginning of the file.</p><p><img src="'+i+`" alt="Leading Empty Line Error" width="650"></p><p>We get an error saying that <code>&lt;stmt list&gt;</code> was expected at the beginning of the second line. 
The reason is that the lexer emitted an <code>EQD</code> token for the empty line, which is not expected by the parser at that location. We can easily fix this by skipping whitespace at the beginning of the file. Instead of introducing more rules to handle this, we can just use a flag <code>firstNonEmptyLine</code> (defaults to <code>true</code>), to decide whether we&#39;re processing the first line or not, and act accordingly.</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token comment">// src/main/kotlin/khaledh/nimjet/lexer/Nim.flex</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">Stack</span><span class="token generics"><span class="token punctuation">&lt;</span><span class="token class-name">Integer</span><span class="token punctuation">&gt;</span></span> indentStack <span class="token operator">=</span> <span class="token keyword">new</span> <span class="token class-name">Stack</span><span class="token generics"><span class="token punctuation">&lt;</span><span class="token punctuation">&gt;</span></span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line highlighted">  <span class="token keyword">private</span> <span class="token class-name">Boolean</span> firstNonEmptyLine <span class="token operator">=</span> <span class="token boolean">true</span><span class="token punctuation">;</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">IElementType</span> <span class="token function">processIndent</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">    <span class="token keyword">int</span> currIndent <span class="token operator">=</span> <span class="token function">yylength</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line"></span>
<span class="line highlighted">    <span class="token comment">// we don&#39;t want to emit EQD for the first non-empty line</span></span>
<span class="line highlighted">    <span class="token keyword">if</span> <span class="token punctuation">(</span>firstNonEmptyLine <span class="token operator">&amp;&amp;</span> currIndent <span class="token operator">==</span> <span class="token number">0</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line highlighted">      firstNonEmptyLine <span class="token operator">=</span> <span class="token boolean">false</span><span class="token punctuation">;</span></span>
<span class="line highlighted">      <span class="token function">yybegin</span><span class="token punctuation">(</span><span class="token constant">DEFAULT</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line highlighted">      <span class="token keyword">return</span> <span class="token class-name">TokenType</span><span class="token punctuation">.</span><span class="token constant">WHITE_SPACE</span><span class="token punctuation">;</span></span>
<span class="line highlighted">    <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line">    <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>This should fix the issue with the first non-empty line. Let&#39;s test it out.</p><p><img src="`+l+'" alt="Leading Empty Line Good" width="650"></p><p>Great! Let&#39;s also test the case where the first non-empty line has leading whitespace.</p><p><img src="'+c+`" alt="First Non-Empty Line Indent Error" width="650"></p><p>We get an error as expected! Our lexer and parser can now recognize and handle top-level indentation correctly.</p><h2 id="parsing-blocks" tabindex="-1"><a class="header-anchor" href="#parsing-blocks"><span>Parsing Blocks</span></a></h2><p>Now that we have the top-level statement list working, let&#39;s move on to parsing blocks with actual indentation. 
The simplest construct in Nim that uses indentation is the <code>block</code> statement, which contains an indented <code>StmtList</code> inside it, including nested <code>block</code> statements.</p><p>Let&#39;s start by adding a <code>BLOCK</code> token to our lexer (and define the corresponding token in <code>NimToken</code>).</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token comment">// src/main/kotlin/khaledh/nimjet/lexer/Nim.flex</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token generics"><span class="token punctuation">&lt;</span>DEFAULT<span class="token punctuation">&gt;</span></span> <span class="token punctuation">{</span></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line">  <span class="token string">&quot;let&quot;</span>                          <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">LET</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line highlighted">  <span class="token string">&quot;block&quot;</span>                        <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">BLOCK</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"><span class="token punctuation">}</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Next, let&#39;s add a new rule for <code>BlockStmt</code> in our parser.</p><div class="language-bnf line-numbers-mode" data-highlighter="prismjs" data-ext="bnf" data-title="bnf"><pre><code><span class="line">// src/main/kotlin/khaledh/nimjet/parser/Nim.bnf</span>
<span class="line"><span class="token operator">...</span></span>
<span class="line"></span>
<span class="line">BlockStmt  <span class="token operator">::=</span> BLOCK COLON IND StmtList DED</span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>The rule is simple: a <code>block</code> statement starts with the <code>block</code> keyword, followed by a colon, and a <code>StmtList</code> enclosed between <code>IND</code> and <code>DED</code> tokens. Let&#39;s generate the parser and test it out.</p><p><img src="`+u+`" alt="Block End Error" width="700"></p><p>We get an error saying that the parser was expecting either a <code>DED</code> or <code>EQD</code> at the end. That&#39;s because the file ends right after the last statement, and so there are no indentation tokens to close the block. This is a problem we cannot solve at the lexer level, unfortunately. What we can do is allow blocks to end with either a <code>DED</code> token or the <code>&lt;&lt;eof&gt;&gt;</code> special marker. Since we&#39;re going to need this for other rules, let&#39;s introduce a private rule <code>ded_or_eof</code> that matches either a <code>DED</code> token or the end of file, and use it in places where we expect a <code>DED</code> token. A private rule means it doesn&#39;t get a dedicated node in the PSI tree.</p><div class="language-bnf line-numbers-mode" data-highlighter="prismjs" data-ext="bnf" data-title="bnf"><pre><code><span class="line">// src/main/kotlin/khaledh/nimjet/parser/Nim.bnf</span>
<span class="line"><span class="token operator">...</span></span>
<span class="line"></span>
<span class="line">BlockStmt  <span class="token operator">::=</span> BLOCK COLON IND StmtList ded_or_eof</span>
<span class="line"></span>
<span class="line">private ded_or_eof <span class="token operator">::=</span> DED <span class="token operator">|</span> &lt;<span class="token rule"><span class="token punctuation">&lt;</span>eof<span class="token punctuation">&gt;</span></span>&gt;</span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>This solves the problem of blocks ending at the end of the file. Let&#39;s test another scenario where we create a new line that has the same indentation level inside the block, but that doesn&#39;t contain a statement, i.e. the end of file comes right after the leading whitespace.</p><p><img src="`+d+`" alt="Last Line Whitespace" width="700"></p><p>We get an error that says <code>&lt;stmt&gt;</code> expected. This is an issue similar to the one above. The lexer had emitted an <code>EQD</code> token at that location, and so the parser expects a statement to come after it. While the behaviour here is correct, and can be fixed by expecting the user to remove the leading whitespace, it&#39;s not a good experience. Users expect empty lines anywhere in the file to be ignored.</p><p>We <em>can</em> do something similar to allowing blocks to end with <code>&lt;&lt;eof&gt;&gt;</code> by modifying the <code>StmtList</code> rule to make the <code>Stmt</code> instance that comes after <code>EQD</code> optional, but this situation might come up in other places in the future as well. So it&#39;s better to solve it at the lexer level by treating a line with only whitespace at the end of the file as whitespace.</p><p>Unfortunately, there&#39;s no official way to match the end of file as part of a regex in JFlex. There&#39;s an <code>&lt;&lt;EOF&gt;&gt;</code> rule that matches the end of file, but it can only be used alone, and not as part of a regex. If we try to use it alone, it would be too late, since the leading whitespace would already have been processed according to our indentation rules, and we may have already emitted an indentation token. 
So, I&#39;m going to rely on a private variable in the generated lexer class called <code>zzEndRead</code> to get the end of file position (I know, I shouldn&#39;t use unofficial features, but I have no other way), and use it to determine if the end of the current line is at the end of the file. If it is, we return a whitespace token.</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token comment">// src/main/kotlin/khaledh/nimjet/lexer/Nim.flex</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">IElementType</span> <span class="token function">processIndent</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">    <span class="token keyword">int</span> currIndent <span class="token operator">=</span> <span class="token function">yylength</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line"></span>
<span class="line highlighted">    <span class="token comment">// handle a line with only whitespace at the end of the file</span></span>
<span class="line highlighted">    <span class="token keyword">if</span> <span class="token punctuation">(</span><span class="token function">getTokenEnd</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">==</span> zzEndRead<span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line highlighted">      <span class="token keyword">return</span> <span class="token class-name">TokenType</span><span class="token punctuation">.</span><span class="token constant">WHITE_SPACE</span><span class="token punctuation">;</span></span>
<span class="line highlighted">    <span class="token punctuation">}</span></span>
<span class="line"></span>
<span class="line">    <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"><span class="token punctuation">}</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>This should solve the issue, treating the last line as whitespace if it&#39;s empty. The parser should be happy, since there are no extra indentation tokens at the end of the file.</p><p>With this modification in place, we can actually revisit the situation that required us to add the <code>ded_or_eof</code> rule to handle the end of file in blocks. We can now add a lexer rule to match the EOF in any state, and pop all the remaining indentation levels from the stack, emitting a <code>DED</code> for each of them. The reason we couldn&#39;t do this before is that, in the case where the last line contains only whitespace, we would have emitted an <code>EQD</code> first, followed by the <code>DED</code> tokens from the stack, which would have been incorrect. But now that we ignore the last line if it&#39;s empty, we can safely emit those <code>DED</code> tokens once we encounter the end of file.</p><p>Let&#39;s add a new method <code>processEof()</code> to our lexer to handle this, and call it when we encounter the end of file from any state.</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token keyword">private</span> <span class="token class-name">IElementType</span> <span class="token function">processEof</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">  <span class="token comment">// return DED tokens (one at a time) for all remaining indent levels</span></span>
<span class="line">  <span class="token keyword">if</span> <span class="token punctuation">(</span>indentStack<span class="token punctuation">.</span><span class="token function">size</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">&gt;</span> <span class="token number">1</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">    indentStack<span class="token punctuation">.</span><span class="token function">pop</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">    <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">DED</span><span class="token punctuation">;</span></span>
<span class="line">  <span class="token punctuation">}</span></span>
<span class="line">  <span class="token keyword">return</span> <span class="token keyword">null</span><span class="token punctuation">;</span></span>
<span class="line"><span class="token punctuation">}</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line"><span class="token generics"><span class="token punctuation">&lt;</span><span class="token punctuation">&lt;</span>EOF<span class="token punctuation">&gt;</span><span class="token punctuation">&gt;</span></span>                          <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token function">processEof</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p><code>processEof()</code> will be called multiple times (lexer stays at EOF) until only one entry remains on the indent stack, at which point we return <code>null</code> to indicate that there are no more tokens.</p><p>If we test this we run into the same issue as before, where the lexer doesn&#39;t progress because it&#39;s returning the same token (<code>DED</code>) multiple times in a row at the same location. So, we need to use the same trick as before, and toggle between two identical states to avoid this issue.</p><div class="language-java line-numbers-mode" data-highlighter="prismjs" data-ext="java" data-title="java"><pre><code><span class="line"><span class="token operator">%</span><span class="token punctuation">{</span></span>
<span class="line">  <span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line">  <span class="token keyword">private</span> <span class="token class-name">IElementType</span> <span class="token function">processEof</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">    <span class="token comment">// return DED tokens (one at a time) for all remaining indent levels</span></span>
<span class="line">    <span class="token keyword">if</span> <span class="token punctuation">(</span>indentStack<span class="token punctuation">.</span><span class="token function">size</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">&gt;</span> <span class="token number">1</span><span class="token punctuation">)</span> <span class="token punctuation">{</span></span>
<span class="line">      indentStack<span class="token punctuation">.</span><span class="token function">pop</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line highlighted">      <span class="token keyword">int</span> nextState <span class="token operator">=</span> <span class="token function">yystate</span><span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">==</span> <span class="token constant">AT_EOF</span> <span class="token operator">?</span> <span class="token constant">AT_EOF_2</span> <span class="token operator">:</span> <span class="token constant">AT_EOF</span><span class="token punctuation">;</span></span>
<span class="line highlighted">      <span class="token function">yybegin</span><span class="token punctuation">(</span>nextState<span class="token punctuation">)</span><span class="token punctuation">;</span></span>
<span class="line">      <span class="token keyword">return</span> <span class="token class-name">NimToken</span><span class="token punctuation">.</span><span class="token constant">DED</span><span class="token punctuation">;</span></span>
<span class="line">    <span class="token punctuation">}</span></span>
<span class="line">    <span class="token keyword">return</span> <span class="token keyword">null</span><span class="token punctuation">;</span></span>
<span class="line">  <span class="token punctuation">}</span></span>
<span class="line"><span class="token operator">%</span><span class="token punctuation">}</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line highlighted"><span class="token operator">%</span>states <span class="token constant">AT_EOF</span> <span class="token constant">AT_EOF_2</span></span>
<span class="line"></span>
<span class="line"><span class="token operator">%</span><span class="token operator">%</span></span>
<span class="line"><span class="token punctuation">.</span><span class="token punctuation">.</span><span class="token punctuation">.</span></span>
<span class="line"></span>
<span class="line highlighted"><span class="token generics"><span class="token punctuation">&lt;</span><span class="token punctuation">&lt;</span>EOF<span class="token punctuation">&gt;</span><span class="token punctuation">&gt;</span></span>                          <span class="token punctuation">{</span> <span class="token function">yybegin</span><span class="token punctuation">(</span><span class="token constant">AT_EOF</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line highlighted"><span class="token generics"><span class="token punctuation">&lt;</span>AT_EOF<span class="token punctuation">,</span> AT_EOF_2<span class="token punctuation">&gt;</span></span> <span class="token generics"><span class="token punctuation">&lt;</span><span class="token punctuation">&lt;</span>EOF<span class="token punctuation">&gt;</span><span class="token punctuation">&gt;</span></span>       <span class="token punctuation">{</span> <span class="token keyword">return</span> <span class="token function">processEof</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span> <span class="token punctuation">}</span></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>This should handle the end-of-file issue correctly, emitting the right number of <code>DED</code> tokens to close any open blocks.</p><p>Now, let&#39;s remove the <code>ded_or_eof</code> rule and revert the <code>BlockStmt</code> rule to its simpler form.</p><div class="language-text line-numbers-mode" data-highlighter="prismjs" data-ext="text" data-title="text"><pre class="has-diff language-text"><code><span class="line diff remove">BlockStmt  ::= BLOCK COLON IND StmtList ded_or_eof</span>
<span class="line diff add">BlockStmt  ::= BLOCK COLON IND StmtList DED</span>
<span class="line"></span>
<span class="line diff remove">private ded_or_eof ::= DED | &lt;&lt;eof&gt;&gt;</span>
<span class="line"></span>
<span class="line"></span></code></pre><div class="line-numbers" aria-hidden="true" style="counter-reset:line-number 0;"><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div><div class="line-number"></div></div></div><p>Much better!</p><p>I believe this takes care of all the indentation issues. Not bad; for a few dozen lines of lexer code we have a working indentation-based parser. We can now focus on adding more grammar rules that build on top of this.</p>`,67)]))}const v=s(r,[["render",k],["__file","11-indentation.html.vue"]]),b=JSON.parse('{"path":"/nimjet/11-indentation.html","title":"Indentation","lang":"en-US","frontmatter":{},"headers":[{"level":2,"title":"Indentation Tokens","slug":"indentation-tokens","link":"#indentation-tokens","children":[]},{"level":2,"title":"Lexer State Machine","slug":"lexer-state-machine","link":"#lexer-state-machine","children":[]},{"level":2,"title":"Parsing Top-Level Code","slug":"parsing-top-level-code","link":"#parsing-top-level-code","children":[]},{"level":2,"title":"Parsing Blocks","slug":"parsing-blocks","link":"#parsing-blocks","children":[]}],"git":{"updatedTime":1728140284000},"filePathRelative":"nimjet/11-indentation.md","excerpt":"\\n<p>Nim is an indentation-based language, like Python. This means that the structure of the\\ncode is determined by the indentation level of the lines, and that whitespace is\\nsignificant. 
Consider the following example:</p>\\n<div class=\\"language-nim line-numbers-mode\\" data-highlighter=\\"prismjs\\" data-ext=\\"nim\\" data-title=\\"nim\\"><pre><code><span class=\\"line\\"><span class=\\"token keyword\\">proc</span> <span class=\\"token function\\">foo</span><span class=\\"token punctuation\\">(</span><span class=\\"token punctuation\\">)</span> <span class=\\"token operator\\">=</span>                   <span class=\\"token comment\\"># ──────────────────┐0</span></span>\\n<span class=\\"line\\"><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"token keyword\\">for</span> i <span class=\\"token operator\\">in</span> <span class=\\"token number\\">0</span> <span class=\\"token operator\\">..&lt;</span> <span class=\\"token number\\">10</span><span class=\\"token operator\\">:</span>           <span class=\\"token comment\\"># ─────────────┐1   │</span></span>\\n<span class=\\"line\\"><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span>echo i                     <span class=\\"token comment\\"># ────────┐2   │    │</span></span>\\n<span class=\\"line\\"><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"token keyword\\">if</span> i <span class=\\"token operator\\">mod</span> <span class=\\"token number\\">2</span> <span class=\\"token operator\\">==</span> <span class=\\"token number\\">0</span><span class=\\"token operator\\">:</span>           <span class=\\"token comment\\">#         │    │    │</span></span>\\n<span class=\\"line\\"><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span>echo <span class=\\"token string\\">\\"even\\"</span>              <span class=\\"token comment\\"># ── 3    │    │    
│</span></span>\\n<span class=\\"line\\"><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"token keyword\\">else</span><span class=\\"token operator\\">:</span>                      <span class=\\"token comment\\">#         │    │    │</span></span>\\n<span class=\\"line\\"><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span><span class=\\"space\\"> </span>echo <span class=\\"token string\\">\\"odd\\"</span>               <span class=\\"token comment\\"># ── 3 ◄──┘ ◄──┘ ◄──┘<span class=\\"space\\"> </span></span></span>\\n<span class=\\"line\\"></span></code></pre>\\n<div class=\\"line-numbers\\" aria-hidden=\\"true\\" style=\\"counter-reset:line-number 0\\"><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div><div class=\\"line-number\\"></div></div></div>"}');export{v as comp,b as data};
