// str/src/vm/lexer.ts

import {BehaviorSubject} from '../util/subject.js'
import {Input} from './input.js'
import {log} from '../log.js'
import {StreamLogger} from '../util/log.js'

/**
 * End-of-statement marker. The Lexer publishes one whenever it meets `;` or a
 * newline outside a quoted string (a `;` inside a comment is skipped).
 */
export type LexTerminator = { type: 'terminator' }
/**
 * One token of input text.
 *  - `value` is the accumulated token text: escapes are already resolved and
 *    the delimiting single quotes, if any, are not included.
 *  - `literal` is `true` only for tokens emitted when a single-quoted string
 *    closes; `tokenIsLVal` never treats such tokens as lvalues.
 */
export type LexInput = { type: 'input', value: string, literal?: true }

/** Any token the Lexer can publish, discriminated by its `type` field. */
export type LexToken = LexTerminator | LexInput

// Stream logger obtained under the name 'lexer'. Used by Lexer.emitToken and
// Lexer.lexInput; the Lexer class also requests its own instance under the
// same name for per-character state logging.
const logger = log.getStreamLogger('lexer')

// Escape translations: when a backslash is followed by one of these keys, the
// lexer appends the mapped character to the current token instead of the key.
// Any escaped character that is not listed here is appended unchanged.
const LITERAL_MAP: Record<string, string> = {
    'n': '\n',  // \n -> line feed
    'r': '\r',  // \r -> carriage return
    't': '\t',  // \t -> tab
    's': ' ',   // \s -> space
}

/**
 * Reports whether a token has the shape of an lvalue: a non-literal token
 * whose whole text is `$` followed by one or more letters, digits or
 * underscores.
 *
 * @param input The token to inspect.
 * @returns `true` for tokens such as `$foo` or `$a_1`; `false` for literal
 *          (quoted) tokens and for any other text.
 */
export const tokenIsLVal = (input: LexInput): boolean => {
    // Quoted tokens are plain text, whatever they look like.
    if ( input.literal ) {
        return false
    }

    return /^\$[a-zA-Z0-9_]+$/.test(input.value)
}

/**
 * Streaming lexer. Subscribes to an `Input`, breaks every chunk of text it
 * receives into tokens and publishes them to its own subscribers via `next`.
 *
 * Lexer state lives on the instance, so a token, quoted string, escape or
 * comment may span several chunks. A token is only published when a delimiter
 * is seen; nothing is flushed at the end of a chunk.
 *
 * Rules, in the order they are applied to each character:
 *  1. Inside a comment everything up to (not including) the next newline is dropped.
 *  2. The character after a backslash is appended to the token verbatim, or as
 *     its LITERAL_MAP translation when one exists.
 *  3. `;` or newline outside quotes publishes the pending token (if any) and
 *     then a terminator token.
 *  4. Space, tab or carriage return outside quotes publishes the pending token (if any).
 *  5. `--` at the very start of a token, outside quotes, starts a comment.
 *  6. A single quote opens a quoted string, or closes it and publishes the
 *     token with `literal: true`. Double quotes are ordinary characters.
 *  7. Anything else is appended to the pending token.
 */
export class Lexer extends BehaviorSubject<LexToken> {
    /** True when the previous character was a backslash that has not been consumed yet. */
    private isEscape = false
    /** True from an unquoted `--` until the next newline. */
    private inComment = false
    /** The quote character that opened the current quoted string, if one is open. */
    private inQuote?: '"'|"'"
    /** Text of the token currently being assembled. */
    private tokenAccumulator = ''

    private logger: StreamLogger

    /**
     * @param input Source of text chunks; every chunk it delivers is lexed.
     */
    constructor(input: Input) {
        super()
        this.logger = log.getStreamLogger('lexer')
        input.subscribe(chunk => this.lexInput(chunk))
    }

    /** Logs the complete lexer state together with the character about to be processed. */
    private logState(c: string): void {
        this.logger.verbose({
            c,
            isEscape: this.isEscape,
            inComment: this.inComment,
            inQuote: this.inQuote,
            tokenAccumulator: this.tokenAccumulator,
        })
    }

    /**
     * Publishes the pending token text as an input token and resets the accumulator.
     *
     * @param reason  Why the token ended; only used for logging.
     * @param literal Pass `true` when the token came from a quoted string.
     */
    private async emitToken(reason: string, literal?: true): Promise<void> {
        logger.verbose({ emitToken: reason })

        // Snapshot and clear BEFORE awaiting: if anything appends to the
        // accumulator while `next` is pending (e.g. another chunk being lexed
        // -- NOTE(review): confirm whether Input awaits its subscribers),
        // those characters must not be wiped when this call resumes.
        const value = this.tokenAccumulator
        this.tokenAccumulator = ''

        await this.next({ type: 'input', value, literal })
    }

    /**
     * Lexes one chunk of text, publishing tokens as delimiters are encountered.
     *
     * @param input The chunk to process; may begin or end mid-token.
     */
    private async lexInput(input: string): Promise<void> {
        logger.debug({ input })

        // Iterate the string directly instead of split('') + shift(), which
        // re-indexes the remaining array for every character consumed.
        for ( const c of input ) {
            this.logState(c)

            // We're in a comment. Ignore everything except newlines.
            if ( this.inComment && c !== '\n' ) {
                continue
            }

            // We got the 2nd character after an escape
            if ( this.isEscape ) {
                this.tokenAccumulator += LITERAL_MAP[c] || c
                this.isEscape = false
                continue
            }

            // We are about to get an escape character
            if ( c === '\\' ) {
                this.isEscape = true
                continue
            }

            // We got a statement terminator
            if ( (c === ';' || c === '\n') && !this.inQuote ) {
                if ( this.tokenAccumulator ) {
                    await this.emitToken('terminator')
                }
                // A newline is the only way to get here while in a comment.
                this.inComment = false
                await this.next({ type: 'terminator' })
                continue
            }

            // Whitespace separates tokens
            if ( (c === ' ' || c === '\t' || c === '\r') && !this.inQuote ) {
                if ( this.tokenAccumulator ) {
                    await this.emitToken('whitespace')
                }
                continue
            }

            // Comments start with -- (only when the dashes begin a token)
            if ( this.tokenAccumulator === '-' && c === '-' && !this.inQuote ) {
                this.tokenAccumulator = ''
                this.inComment = true
                continue
            }

            // We are either starting or ending an unescaped matching quote.
            // For now, only parse single quotes. Makes it nicer to type " in commands.
            if ( c === `'` ) {
                if ( c === this.inQuote ) {
                    this.inQuote = undefined
                    await this.emitToken('quote', true)
                    continue
                } else if ( !this.inQuote ) {
                    this.inQuote = c
                    continue
                }
            }

            this.tokenAccumulator += c
        }
    }
}