Source: assembler/passes/lexer.js

// Make sure the shared `yasp` namespace object exists before anything is attached to it.
// An already existing `yasp` is left untouched.
if (typeof yasp === "undefined") {
  yasp = {};
}

(function () {
  // Every character of this string ends the token that is currently being read.
  var splitter = ' \t,\n;:"';
  // Delimiters that never become tokens themselves (blank and tab).
  var deadSplitter = " \t";
  // Starts a comment that runs up to the end of the line.
  var commentSplitter = ';';
  // Starts a string literal.
  var stringSplitter = '"';

  // Characters a label may consist of (compared against the upper-cased token text).
  var validLabel = "ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890_-";

  // Builds the 32 register names <prefix>0 ... <prefix>31.
  function buildRegisterNames(prefix) {
    var names = [];
    for (var number = 0; number < 32; number++) {
      names.push(prefix + number);
    }
    return names;
  }

  var validByteRegisters = buildRegisterNames("B");
  var validWordRegisters = buildRegisterNames("W");

  // Finds a 'B' or 'W' followed by digits. The pattern is not anchored, so exec()
  // also finds such a run inside a longer string.
  var validUnknownRegister = /[BW]\d+/;
  
  /** Tokenizes (= Lexer) all the tokens.
   * Every instance starts with an empty token list; pass() fills it through newToken().
   * @class
   */
  yasp.Lexer = function () {
    // tokens collected so far, in the order they were found
    this.tokens = [];
  };

  /**
   * Returns all delimiters as a string. Every character of that string is one delimiter;
   * it is the same string pass() uses to decide where a token ends.
   * @returns {string}
   */
  yasp.Lexer.getDelimiters = function() {
    return splitter;
  };

  /**
   * Returns the names of all valid registers as an array:
   * the byte registers first, followed by the word registers.
   * A new array is built on every call, the internal lists are not exposed.
   * @returns {Array}
   */
  yasp.Lexer.getRegisters = function() {
    return validByteRegisters.concat(validWordRegisters);
  };

  /**
   * Does the tokenize step: splits the input at the delimiter characters and appends
   * the resulting tokens to this.tokens.
   *
   * - Text between two delimiters becomes one token.
   * - A delimiter becomes a token of its own; blanks and tabs are dropped by newToken().
   * - ';' starts a comment: it and everything up to (not including) the next line break
   *   is skipped without producing a token.
   * - '"' starts a string literal that runs to the closing '"', to a line break or to the
   *   end of the input. Its token text is always wrapped in double quotes, even when the
   *   closing quote is missing. A line break that ends the literal is consumed together
   *   with it and does not produce a token of its own.
   * - The token list always ends with a "\n" token.
   *
   * Every token carries its 1-based line and column. The column counter is advanced past
   * skipped comments and string literals, so tokens following them report the right column.
   *
   * @param assembler The assembler in which it is executed, this parameter is completely ignored
   * @param input What should be tokenized?
   * @returns {Array} this.tokens (also contains tokens of earlier calls on this instance)
   */
  yasp.Lexer.prototype.pass = function (assembler, input) {
    var lastFound = 0; // index of the first character that is not part of a token yet
    var line = 1;
    var char = 1;      // 1-based column of input.charAt(i) in the current line
    var text;
    var i;

    for (i = 0; i < input.length; i++) {
      var token = input.charAt(i);
      if (splitter.indexOf(token) !== -1) {
        // flush the text that was collected since the previous delimiter
        text = input.substring(lastFound, i);
        this.newToken(new yasp.Token(text, line, char - text.length));

        var start = i; // index of the delimiter itself
        if (token === commentSplitter) {
          // stop on the last character in front of the line break, so the next
          // iteration sees the "\n" and handles it like any other delimiter
          while (input.charAt(i) !== '\n' && i < input.length) i++;
          i--;
        } else if (token === stringSplitter) {
          i++;
          while (i < input.length && input.charAt(i) !== '"' && input.charAt(i) !== '\n') i++;
          // the token is positioned at the opening quote
          this.newToken(new yasp.Token('"' + input.substring(start + 1, i) + '"', line, char));
        } else {
          this.newToken(new yasp.Token(token, line, char));
        }

        // keep the column in sync with the characters consumed by the branches above
        char += i - start;
        lastFound = i + 1;
      }

      char++;
      if (input.charAt(i) === '\n') {
        line++;
        char = 1;
      }
    }

    // text after the last delimiter
    if (lastFound < input.length) {
      text = input.substring(lastFound, i);
      this.newToken(new yasp.Token(text, line, char - text.length));
    }

    // guarantee a terminating line break token
    if (this.tokens.length === 0 || this.tokens[this.tokens.length - 1].text !== '\n') {
      this.newToken(new yasp.Token("\n", line, char));
    }

    return this.tokens;
  };

  /**
   * Appends a token to the token list of this lexer.
   * Nothing is added when no token is given, when its text is empty
   * or when its text is found in the deadSplitter string (blank or tab).
   * @param token Which token should be added
   */
  yasp.Lexer.prototype.newToken = function (token) {
    if (!token) return;

    var content = token.text;
    if (!(content.length > 0)) return;
    if (deadSplitter.indexOf(content) !== -1) return;

    this.tokens.push(token);
  };

  /**
   * The token categories yasp.Token.prototype.getType() distinguishes.
   * Note: getType() does not return UNKNOWN_REGISTER as is, it appends the
   * upper-cased token text in square brackets to it.
   */
  yasp.TokenType = {
    COMMAND: "command", // matches a name of an entry in yasp.commands
    LABEL: "label", // consists only of the characters in validLabel
    NUMBER: "number", // pin / byte_literal / word_literal
    NEGATIVE_NUMBER: "negative number", // -42 for example
    BYTE_REGISTER: "byte register", // listed in validByteRegisters
    WORD_REGISTER: "word register", // listed in validWordRegisters
    DIRECTIVE: "directive", // DEFINE, ORG, STRING, DB, DA, DW, END
    UNKNOWN_REGISTER: "unknown register", // looks like a register but is in neither register list
    UNKNOWN: "unknown", // none of the other categories applies
    DELIMITER: "delimiter", // contained in the splitter string
    STRING: "string" // starts and ends with a double quote
  };

  /**
   * A Token representing an atomic group of characters in the assembler code
   * @param text The content of the token (getType() calls toUpperCase() on it)
   * @param line The line in which this token is (used for error messages); the lexer counts from 1
   * @param char The character where exactly this token is (used for error messages); the lexer counts from 1
   * @constructor
   */
  yasp.Token = function (text, line, char) {
    this.text = text;
    this.line = line;
    this.char = char;
  };

  /** Returns the type of this Token (Literal, Label, ...)
   *
   * The text is upper-cased first, so the classification is case-insensitive.
   * The checks run in this order and the first hit wins: number, string, byte register,
   * word register, directive, command, unknown register, delimiter, label.
   * Everything else is yasp.TokenType.UNKNOWN.
   *
   * @returns {string} a yasp.TokenType value. For a name that looks like a register but is
   *   in neither register list (e.g. "B99") the result is yasp.TokenType.UNKNOWN_REGISTER
   *   followed by the upper-cased name in square brackets.
   */
  yasp.Token.prototype.getType = function () {
    // TODO: optimize this function => cache values and dont iterate through everything
    var name = this.text.toUpperCase();
    var i, j;

    // am i a number (the sign decides which kind)
    if (!isNaN(parseFloat(name)) && isFinite(name)) {
      return (+name < 0) ? yasp.TokenType.NEGATIVE_NUMBER : yasp.TokenType.NUMBER;
    }

    // am i a string
    if (name.charAt(0) === '"' && name.charAt(name.length - 1) === '"') {
      return yasp.TokenType.STRING;
    }

    // am i a byte register
    if (validByteRegisters.indexOf(name) !== -1) {
      return yasp.TokenType.BYTE_REGISTER;
    }

    // am i a word register
    if (validWordRegisters.indexOf(name) !== -1) {
      return yasp.TokenType.WORD_REGISTER;
    }

    // am i a directive
    switch (name) {
      case "DEFINE":
      case "ORG":
      case "STRING":
      case "DB":
      case "DA":
      case "DW":
      case "END":
        return yasp.TokenType.DIRECTIVE;
    }

    // am i a command? (a command has either one name or an array of names)
    for (i = 0; i < yasp.commands.length; i++) {
      var names = yasp.commands[i].name;
      if (!(names instanceof Array)) names = [names];
      for (j = 0; j < names.length; j++) {
        if (names[j].toUpperCase() === name) return yasp.TokenType.COMMAND;
      }
    }

    // am i an unknown register
    // The register pattern has to cover the whole name. A name that merely starts with
    // B/W and contains a register-like run somewhere (BLINK_B0, WRITE_W1) is a label.
    var unknownRegister = validUnknownRegister.exec(name);
    if (unknownRegister !== null && unknownRegister[0] === name) {
      return yasp.TokenType.UNKNOWN_REGISTER + "[" + name + "]";
    }

    // am i a delimiter?
    if (splitter.indexOf(name) !== -1) {
      return yasp.TokenType.DELIMITER;
    }

    // am i a label? every character of the upper-cased name has to be allowed
    for (i = 0; i < name.length; i++) {
      if (validLabel.indexOf(name.charAt(i)) === -1) {
        return yasp.TokenType.UNKNOWN;
      }
    }
    return yasp.TokenType.LABEL;
  };

  /**
   * Returns the string representation of a token: its text in single quotes,
   * with a line break token shown as "↵". This should only be used for testing purpose
   * @returns {string}
   */
  yasp.Token.prototype.toString = function () {
    var shown = this.text;
    if (shown == '\n') {
      shown = "↵";
    }
    return "'" + shown + "'";
  };
})();