#include #include "Assembler.h" // labelinstructionfield0field1field2#comments #define LINE_REGEX R"~(^([[:alnum:]]{1,12})?\s+(.+?)(\s+(#.*))?$)~" #define BLANK_REGEX R"~(^\s*$)~" #define COMMENT_REGEX R"~(^#.*$)~" #define INST_REGEX R"~(^([[:alpha:]]+(?:\.[[:alpha:]]+)?)(\s+.*)?$)~" #define DIRECTIVE_REGEX R"~(^\.dfill\s+([[:alnum:]]{1,12}|-?\d+\.?\d*)$)~" Assembler::Assembler(std::vector asmLines_) : asmLines(asmLines_) { ih = new InstHandler(&labels); } void Assembler::parse() { std::regex lineRegex(LINE_REGEX); std::regex blankRegex(BLANK_REGEX); std::regex commentRegex(COMMENT_REGEX); std::regex directiveRegex(DIRECTIVE_REGEX); std::smatch matches; std::smatch dfMatches; uint32_t icount = 0; // initial pass: handle dfill directive, track labels, filter out bad formatting, parse instructions for (uint64_t i = 0; i < asmLines.size(); i++) { if (std::regex_match(asmLines[i], matches, lineRegex)) { // handle .dfill directive bool dfill = false; std::string inst(matches.str(2)); if (std::regex_match(inst, dfMatches, directiveRegex)) { // a .dfill directive means we will need to increment icount extra later, and might have to align to 8 bytes dfill = true; if (icount % 2 == 1) { // alignment placeholder instructions.push_back((inst_t){i, {"BLANK"}, icount, false, 0}); icount++; } } // record location of label, if we're creating one if (matches.str(1).size() > 0 && !newLabel(matches.str(1), icount)) return error(i, "label already exists"); // parse and save if (dfill) { if (!parseDfill(dfMatches.str(1), icount, i)) return error(i, "dfill parse error"); } else { if (!parseInst(inst, icount, i)) return error(i, "inst parse error"); } icount += (dfill ? 2 : 1); } else if (!std::regex_match(asmLines[i], blankRegex) && !std::regex_match(asmLines[i], commentRegex)) { return error(i, "invalid syntax"); } } // pass instructions to handler to fill out hex, and resolve dfill labels for (uint32_t i = 0; i < instructions.size(); i++) { if (instructions[i].tokens[0] == ".dfill" || instructions[i].tokens[0] == "BLANK") { // don't do anything with a BLANK if (instructions[i].usesLabel) { // resolve dfill label uint64_t labelLoc = ih->findLabel(instructions[i].tokens[1]); if (labelLoc == (uint64_t)-1) return error(instructions[i].lineNumber, "invalid label"); labelLoc <<= 2; instructions[i].hex = (uint32_t)labelLoc; instructions[i+1].hex = (uint32_t)(labelLoc>>32); i++; } } else { // call handler on potential instruction to do individual format checks and assembling if (!ih->handle(&instructions[i])) return error(instructions[i].lineNumber, "couldn't handle"); } } } // interpret dfill directive and put appropriate lines of hex into output int Assembler::parseDfill(std::string arg, uint32_t icount, uint64_t ln) { uint64_t dec = arg.find('.'); if (dec > 0 && dec < arg.size()-1) { // is a floating point try { // union to make access to underlying bytes easier Fp val = {std::stod(arg)}; instructions.push_back((inst_t){ln, {".dfill", arg}, icount, false, val.i[0]}); instructions.push_back((inst_t){ln, {".dfill", arg}, icount+1, false, val.i[1]}); return 1; } catch (...) { // if stod fails for whatever reason, probably the user's fault return 0; } } else if (std::regex_match(arg, std::regex("^-?\\d+$"))) { // is an integer try { // have to use stoll to correctly interpret negative! uint64_t val = std::stoll(arg); instructions.push_back((inst_t){ln, {".dfill", arg}, icount, false, (uint32_t)val}); instructions.push_back((inst_t){ln, {".dfill", arg}, icount+1, false, (uint32_t)(val>>32)}); return 1; } catch (...) { // if stoll fails for whatever reason, probably the user's fault return 0; } } else if (std::regex_match(arg, std::regex("^[[:alnum:]]{1,12}$"))) { // is a label. mark it to resolve later instructions.push_back((inst_t){ln, {".dfill", arg}, icount, true, 0}); instructions.push_back((inst_t){ln, {".dfill", arg}, icount+1, true, 0}); return 1; } else { return 0; } } // tokenize the instruction and make sure the format makes sense int Assembler::parseInst(std::string inst, uint32_t icount, uint64_t ln) { //printf("%s\n", inst.c_str()); // tokenize std::vector tokens = tokenizeInst(inst); // sanity check. never more than 4 tokens (1 instruction + 3 fields) uint64_t numTokens = tokens.size(); if (numTokens > 4 || numTokens < 1) return 0; inst_t i = (inst_t){ln, tokens, icount, false, 0}; instructions.push_back(i); return 1; } // attempt to record a new label, or error if it exists already int Assembler::newLabel(std::string name, uint32_t icount) { for (uint64_t i = 0; i < labels.size(); i++) { if (labels[i].name == name) return 0; } labels.push_back((label_t){name, icount}); return 1; } // split instruction string into lowercase instruction and all existing fields std::vector Assembler::tokenizeInst(std::string inst) { std::vector tks; static std::regex emptyRegex("^\\s*$"); static std::regex instRegex(INST_REGEX); static std::regex fieldRegex("^\\s+(-?[[:alnum:]]+)(\\s+.*)?$"); static std::smatch matches; // if instruction isn't right format, stop if (!std::regex_match(inst, matches, instRegex)) return tks; tks.push_back(toLowerCase(matches.str(1))); inst = matches.str(2); while (!std::regex_match(inst, emptyRegex)) { std::regex_match(inst, matches, fieldRegex); tks.push_back(matches.str(1)); inst = matches.str(2); } return tks; } // print a custom error and exit void Assembler::error(uint64_t ln, std::string msg) { printf("Error on line %ld: \"%s\"\n%s\n", ln+1, asmLines[ln].c_str(), msg.c_str()); exit(0); } // utility to convert string to lower case std::string Assembler::toLowerCase(std::string str) { for (uint64_t i = 0; i < str.length(); i++) { if (str[i] >= 'A' && str[i] <= 'Z') str[i] += 32; } return str; } // output pure hex std::vector Assembler::assemble() { std::vector hexCode; for (uint32_t i = 0; i < instructions.size(); i++) { hexCode.push_back(instructions[i].hex); } return hexCode; }