mips64-assembler/Assembler.C
2025-11-04 14:00:16 -05:00

185 lines
7 KiB
C

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <regex>
#include <string>
#include <utility>
#include <vector>
#include "Assembler.h"
// label<whitespace>instruction<whitespace>field0<whitespace>field1<whitespace>field2<whitespace>#comments
#define LINE_REGEX R"~(^([[:alnum:]]{1,12})?\s+(.+?)(\s+(#.*))?$)~"
#define BLANK_REGEX R"~(^\s*$)~"
#define COMMENT_REGEX R"~(^#.*$)~"
#define INST_REGEX R"~(^([[:alpha:]]+(?:\.[[:alpha:]]+)?)(\s+.*)?$)~"
#define DIRECTIVE_REGEX R"~(^\.dfill\s+([[:alnum:]]{1,12}|-?\d+\.?\d*)$)~"
// Build an assembler over the given source text.
//
// asmLines_ : the assembly source, one line per element. It is taken by
//             value and moved into the member, so the caller's vector is
//             copied at most once (and not at all when the caller passes a
//             temporary).
//
// An InstHandler is allocated and given the address of this object's label
// table.
// NOTE(review): `ih` comes from a raw `new` and no matching `delete` is
// visible in this file -- confirm the destructor in Assembler.h releases it.
Assembler::Assembler(std::vector<std::string> asmLines_) :
    asmLines(std::move(asmLines_))
{
    ih = new InstHandler(&labels);
}
void Assembler::parse() {
std::regex lineRegex(LINE_REGEX);
std::regex blankRegex(BLANK_REGEX);
std::regex commentRegex(COMMENT_REGEX);
std::regex directiveRegex(DIRECTIVE_REGEX);
std::smatch matches;
std::smatch dfMatches;
uint32_t icount = 0;
// initial pass: handle dfill directive, track labels, filter out bad formatting, parse instructions
for (uint64_t i = 0; i < asmLines.size(); i++) {
if (std::regex_match(asmLines[i], matches, lineRegex)) {
// handle .dfill directive
bool dfill = false;
std::string inst(matches.str(2));
if (std::regex_match(inst, dfMatches, directiveRegex)) {
// a .dfill directive means we will need to increment icount extra later, and might have to align to 8 bytes
dfill = true;
if (icount % 2 == 1) {
// alignment placeholder
instructions.push_back((inst_t){i, {"BLANK"}, icount, false, 0});
icount++;
}
}
// record location of label, if we're creating one
if (matches.str(1).size() > 0 && !newLabel(matches.str(1), icount)) return error(i, "label already exists");
// parse and save
if (dfill) {
if (!parseDfill(dfMatches.str(1), icount, i)) return error(i, "dfill parse error");
} else {
if (!parseInst(inst, icount, i)) return error(i, "inst parse error");
}
icount += (dfill ? 2 : 1);
} else if (!std::regex_match(asmLines[i], blankRegex) && !std::regex_match(asmLines[i], commentRegex)) {
return error(i, "invalid syntax");
}
}
// pass instructions to handler to fill out hex, and resolve dfill labels
for (uint32_t i = 0; i < instructions.size(); i++) {
if (instructions[i].tokens[0] == ".dfill" || instructions[i].tokens[0] == "BLANK") {
// don't do anything with a BLANK
if (instructions[i].usesLabel) {
// resolve dfill label
uint64_t labelLoc = ih->findLabel(instructions[i].tokens[1]);
if (labelLoc == (uint64_t)-1) return error(instructions[i].lineNumber, "invalid label");
labelLoc <<= 2;
instructions[i].hex = (uint32_t)labelLoc;
instructions[i+1].hex = (uint32_t)(labelLoc>>32);
i++;
}
} else {
// call handler on potential instruction to do individual format checks and assembling
if (!ih->handle(&instructions[i])) return error(instructions[i].lineNumber, "couldn't handle");
}
}
}
// Interpret the argument of a `.dfill` directive and append its two 32-bit
// words (low word first) to `instructions`.
//
// arg    : the directive's argument text. Three forms are understood:
//            - anything containing '.' is parsed as a double and stored as
//              its 64-bit bit pattern;
//            - an optionally negative decimal integer is stored as a 64-bit
//              value (negative values as two's complement, non-negative
//              values up to 2^64-1);
//            - 1-12 alphanumeric characters is a label reference, stored as
//              zero and flagged so that a later pass can resolve it.
// icount : slot index of the first word; the second word gets icount + 1.
// ln     : zero-based source line index, kept for error reporting.
//
// Returns 1 on success, 0 if the argument is malformed or out of range.
int Assembler::parseDfill(std::string arg, uint32_t icount, uint64_t ln) {
    // Compiled once; building a std::regex on every call is expensive.
    static const std::regex intRegex("^-?\\d+$");
    static const std::regex labelRegex("^[[:alnum:]]{1,12}$");

    uint64_t bits = 0;
    bool isLabel = false;
    try {
        if (arg.find('.') != std::string::npos) {
            // Floating point. Any '.' selects this branch, so "5." (which
            // DIRECTIVE_REGEX lets through) is accepted as 5.0.
            std::size_t used = 0;
            const double value = std::stod(arg, &used);
            // stod stops at the first character it cannot use; reject
            // trailing junk such as "1.2.3" instead of silently storing 1.2.
            if (used != arg.size()) return 0;
            static_assert(sizeof(value) == sizeof(bits), "double must be 64 bits wide");
            // memcpy rather than a union: well-defined, and the low/high
            // split below is then the same as for integers on any host.
            std::memcpy(&bits, &value, sizeof(bits));
        } else if (std::regex_match(arg, intRegex)) {
            // stoll for negatives (two's complement), stoull otherwise so
            // the upper half of the unsigned 64-bit range is usable too.
            bits = (arg[0] == '-') ? static_cast<uint64_t>(std::stoll(arg))
                                   : static_cast<uint64_t>(std::stoull(arg));
        } else if (std::regex_match(arg, labelRegex)) {
            // is a label. mark it to resolve later
            isLabel = true;
        } else {
            return 0;
        }
    } catch (...) {
        // stod/stoll/stoull throw on unparsable or out-of-range input:
        // that is a user error, reported through the return value
        return 0;
    }

    const uint32_t lo = static_cast<uint32_t>(bits);
    const uint32_t hi = static_cast<uint32_t>(bits >> 32);
    instructions.push_back(inst_t{ln, {".dfill", arg}, icount, isLabel, lo});
    instructions.push_back(inst_t{ln, {".dfill", arg}, icount + 1, isLabel, hi});
    return 1;
}
// Tokenize one instruction and queue it as a record in `instructions`.
//
// inst   : instruction text (mnemonic followed by its operand fields).
// icount : slot index the instruction will occupy.
// ln     : zero-based source line index, kept for error reporting.
//
// Returns 1 if a record was queued, 0 if tokenizing produced no tokens or
// more than four (one mnemonic plus at most three fields).
int Assembler::parseInst(std::string inst, uint32_t icount, uint64_t ln) {
    const std::vector<std::string> pieces = tokenizeInst(inst);

    // sanity check on the token count before anything is stored
    const bool countOk = pieces.size() >= 1 && pieces.size() <= 4;
    if (!countOk) return 0;

    instructions.push_back(inst_t{ln, pieces, icount, false, 0});
    return 1;
}
// Register a label at the given slot index.
//
// name   : label text.
// icount : slot index the label refers to.
//
// Returns 1 if the label was added, 0 if a label with that name already
// exists (in which case the table is left unchanged).
int Assembler::newLabel(std::string name, uint32_t icount) {
    for (const auto& known : labels) {
        if (known.name == name) return 0;
    }
    labels.push_back(label_t{name, icount});
    return 1;
}
// Split instruction text into a lower-cased mnemonic followed by its fields.
//
// inst : instruction text. It must start with a mnemonic made of letters
//        (optionally with one ".suffix" of letters), followed by zero or
//        more whitespace-separated fields, each an optional '-' plus
//        alphanumerics.
//
// Returns the tokens (mnemonic first), or an EMPTY vector when the mnemonic
// or any field is malformed. Callers treat an empty result as a parse
// error, so a bad field is reported instead of being turned into an empty
// token with the rest of the line silently discarded.
std::vector<std::string> Assembler::tokenizeInst(std::string inst) {
    // The patterns never change, so compile them once. They are const and
    // therefore safe to share; the match state below is deliberately local.
    static const std::regex emptyRegex("^\\s*$");
    static const std::regex instRegex(INST_REGEX);
    static const std::regex fieldRegex("^\\s+(-?[[:alnum:]]+)(\\s+.*)?$");

    std::vector<std::string> tks;
    std::smatch matches;

    // if instruction isn't right format, stop
    if (!std::regex_match(inst, matches, instRegex)) return tks;
    tks.push_back(toLowerCase(matches.str(1)));

    // Peel one field at a time off the front of the remaining text.
    std::string rest = matches.str(2);
    while (!std::regex_match(rest, emptyRegex)) {
        // A failed match leaves `matches` empty, so its result must be
        // checked before any sub-match is read.
        if (!std::regex_match(rest, matches, fieldRegex)) return std::vector<std::string>();
        tks.push_back(matches.str(1));
        // str(2) builds a fresh string before `rest` (which `matches`
        // points into) is overwritten.
        rest = matches.str(2);
    }
    return tks;
}
// Report a fatal assembly error and terminate the process.
//
// ln  : zero-based index into asmLines of the offending line; it is shown
//       one-based, together with the line's text.
// msg : short description of what went wrong.
//
// Does not return. The process exits with EXIT_FAILURE so that scripts and
// build tools can tell that assembly failed.
void Assembler::error(uint64_t ln, std::string msg) {
    // Guard the lookup so a bad index cannot read past the end of asmLines.
    const char* text = (ln < asmLines.size()) ? asmLines[ln].c_str() : "";
    // uint64_t is not `long` on every platform; widen explicitly so the
    // argument always matches the %llu conversion.
    printf("Error on line %llu: \"%s\"\n%s\n",
           static_cast<unsigned long long>(ln + 1), text, msg.c_str());
    exit(EXIT_FAILURE);
}
// Return a copy of `str` with the ASCII letters 'A'-'Z' replaced by their
// lower-case counterparts. All other characters are left untouched.
std::string Assembler::toLowerCase(std::string str) {
    for (char& ch : str) {
        const bool isUpper = (ch >= 'A' && ch <= 'Z');
        if (isUpper) ch += ('a' - 'A');  // distance between the two ASCII cases
    }
    return str;
}
// Collect the encoded 32-bit word of every record in `instructions`, in
// order, and return them as one vector.
std::vector<uint32_t> Assembler::assemble() {
    std::vector<uint32_t> words;
    words.reserve(instructions.size());
    for (const auto& record : instructions) {
        words.push_back(record.hex);
    }
    return words;
}