From 978c1f9fdb1edfc9a859df2304b0966db631ecde Mon Sep 17 00:00:00 2001
From: snedmore
Date: Tue, 4 Nov 2025 14:00:16 -0500
Subject: [PATCH] copy project to git

---
 .gitignore    |  36 +----
 Assembler.C   | 203 ++++++++++++++++++++++++++
 Assembler.h   |  34 +++++
 InstHandler.C | 388 ++++++++++++++++++++++++++++++++++++++++++++++++++
 InstHandler.h |  63 ++++++++
 README.md     |  12 +-
 ma.C          |  64 +++++++++
 makefile      |  21 +++
 8 files changed, 788 insertions(+), 33 deletions(-)
 create mode 100644 Assembler.C
 create mode 100644 Assembler.h
 create mode 100644 InstHandler.C
 create mode 100644 InstHandler.h
 create mode 100644 ma.C
 create mode 100644 makefile

diff --git a/.gitignore b/.gitignore
index e257658..b23f30f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,34 +1,6 @@
-# ---> C++
-# Prerequisites
-*.d
-
-# Compiled Object files
-*.slo
-*.lo
 *.o
-*.obj
-
-# Precompiled Headers
-*.gch
-*.pch
-
-# Compiled Dynamic libraries
-*.so
-*.dylib
-*.dll
-
-# Fortran module files
-*.mod
-*.smod
-
-# Compiled Static libraries
-*.lai
-*.la
-*.a
-*.lib
-
-# Executables
-*.exe
-*.out
-*.app
 
+ma
+.vscode/
+Tests/
+run.pl
diff --git a/Assembler.C b/Assembler.C
new file mode 100644
index 0000000..9655192
--- /dev/null
+++ b/Assembler.C
@@ -0,0 +1,203 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <exception>
+#include <regex>
+#include <utility>
+#include "Assembler.h"
+
+// source line layout: [label] <whitespace> instruction [field0 [field1 [field2]]] [#comment]
+#define LINE_REGEX R"~(^([[:alnum:]]{1,12})?\s+(.+?)(\s+(#.*))?$)~"
+#define BLANK_REGEX R"~(^\s*$)~"
+#define COMMENT_REGEX R"~(^#.*$)~"
+
+#define INST_REGEX R"~(^([[:alpha:]]+(?:\.[[:alpha:]]+)?)(\s+.*)?$)~"
+#define DIRECTIVE_REGEX R"~(^\.dfill\s+([[:alnum:]]{1,12}|-?\d+\.?\d*)$)~"
+
+// append the two 32-bit words (low word first) that make up one .dfill doubleword
+static void pushDoubleword(std::vector<inst_t>& out, uint64_t ln, const std::string& arg,
+                           uint32_t icount, bool usesLabel, uint64_t bits) {
+    out.push_back(inst_t{ln, {".dfill", arg}, icount, usesLabel, (uint32_t)bits});
+    out.push_back(inst_t{ln, {".dfill", arg}, icount+1, usesLabel, (uint32_t)(bits>>32)});
+}
+
+// takes ownership of the source lines; the handler is given a pointer to our label table
+Assembler::Assembler(std::vector<std::string> asmLines_) :
+    ih(new InstHandler(&labels)),
+    asmLines(std::move(asmLines_))
+{
+}
+
+// pass 1 tokenizes every line, records labels and expands .dfill directives;
+// pass 2 encodes instructions and resolves .dfill label references.
+// any problem is reported through error(), which terminates the program
+void Assembler::parse() {
+    std::regex lineRegex(LINE_REGEX);
+    std::regex blankRegex(BLANK_REGEX);
+    std::regex commentRegex(COMMENT_REGEX);
+    std::regex directiveRegex(DIRECTIVE_REGEX);
+    std::smatch matches;
+    std::smatch dfMatches;
+    uint32_t icount = 0;
+
+    // initial pass: handle dfill directive, track labels, filter out bad formatting, parse instructions
+    for (uint64_t i = 0; i < asmLines.size(); i++) {
+        if (std::regex_match(asmLines[i], matches, lineRegex)) {
+            // handle .dfill directive
+            bool dfill = false;
+            std::string inst(matches.str(2));
+            if (std::regex_match(inst, dfMatches, directiveRegex)) {
+                // a .dfill directive means we will need to increment icount extra later, and might have to align to 8 bytes
+                dfill = true;
+                if (icount % 2 == 1) {
+                    // alignment placeholder
+                    instructions.push_back(inst_t{i, {"BLANK"}, icount, false, 0});
+                    icount++;
+                }
+            }
+
+            // record location of label, if we're creating one
+            if (matches.str(1).size() > 0 && !newLabel(matches.str(1), icount)) return error(i, "label already exists");
+
+            // parse and save
+            if (dfill) {
+                if (!parseDfill(dfMatches.str(1), icount, i)) return error(i, "dfill parse error");
+            } else {
+                if (!parseInst(inst, icount, i)) return error(i, "inst parse error");
+            }
+
+            icount += (dfill ? 2 : 1);
+        } else if (!std::regex_match(asmLines[i], blankRegex) && !std::regex_match(asmLines[i], commentRegex)) {
+            return error(i, "invalid syntax");
+        }
+    }
+
+    // pass instructions to handler to fill out hex, and resolve dfill labels
+    for (uint32_t i = 0; i < instructions.size(); i++) {
+        if (instructions[i].tokens[0] == ".dfill" || instructions[i].tokens[0] == "BLANK") {
+            // don't do anything with a BLANK
+            if (instructions[i].usesLabel) {
+                // resolve dfill label. findLabel() reports a miss as (uint32_t)-1, so the
+                // check has to happen at 32 bits, before the value is widened and shifted
+                uint32_t labelIndex = ih->findLabel(instructions[i].tokens[1]);
+                if (labelIndex == (uint32_t)-1) return error(instructions[i].lineNumber, "invalid label");
+                uint64_t labelLoc = (uint64_t)labelIndex << 2;
+                instructions[i].hex = (uint32_t)labelLoc;
+                instructions[i+1].hex = (uint32_t)(labelLoc>>32);
+                i++;
+            }
+        } else {
+            // call handler on potential instruction to do individual format checks and assembling
+            if (!ih->handle(&instructions[i])) return error(instructions[i].lineNumber, "couldn't handle");
+        }
+    }
+}
+
+// interpret dfill directive and put appropriate lines of hex into output.
+// returns 1 on success, 0 if the argument is not a usable float, integer or label name
+int Assembler::parseDfill(std::string arg, uint32_t icount, uint64_t ln) {
+    uint64_t dec = arg.find('.');
+    if (dec > 0 && dec < arg.size()-1) {
+        // is a floating point
+        try {
+            // copy out the raw bit pattern with memcpy rather than punning through a union
+            double val = std::stod(arg);
+            uint64_t bits = 0;
+            static_assert(sizeof(bits) == sizeof(val), "double must be 64 bits wide");
+            std::memcpy(&bits, &val, sizeof(bits));
+            pushDoubleword(instructions, ln, arg, icount, false, bits);
+            return 1;
+        } catch (const std::exception&) {
+            // if stod fails for whatever reason, probably the user's fault
+            return 0;
+        }
+    } else if (std::regex_match(arg, std::regex("^-?\\d+$"))) {
+        // is an integer
+        try {
+            // have to use stoll to correctly interpret negative!
+            uint64_t val = std::stoll(arg);
+            pushDoubleword(instructions, ln, arg, icount, false, val);
+            return 1;
+        } catch (const std::exception&) {
+            // if stoll fails for whatever reason, probably the user's fault
+            return 0;
+        }
+    } else if (std::regex_match(arg, std::regex("^[[:alnum:]]{1,12}$"))) {
+        // is a label. mark it to resolve later
+        pushDoubleword(instructions, ln, arg, icount, true, 0);
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+// tokenize the instruction and make sure the format makes sense
+int Assembler::parseInst(std::string inst, uint32_t icount, uint64_t ln) {
+    // tokenize
+    std::vector<std::string> tokens = tokenizeInst(inst);
+
+    // sanity check. never more than 4 tokens (1 instruction + 3 fields)
+    uint64_t numTokens = tokens.size();
+    if (numTokens > 4 || numTokens < 1) return 0;
+    instructions.push_back(inst_t{ln, std::move(tokens), icount, false, 0});
+
+    return 1;
+}
+
+// attempt to record a new label, or error if it exists already
+int Assembler::newLabel(std::string name, uint32_t icount) {
+    for (const label_t& label : labels) {
+        if (label.name == name) return 0;
+    }
+    labels.push_back(label_t{name, icount});
+    return 1;
+}
+
+// split instruction string into lowercase instruction and all existing fields
+std::vector<std::string> Assembler::tokenizeInst(std::string inst) {
+    std::vector<std::string> tks;
+    static const std::regex emptyRegex("^\\s*$");
+    static const std::regex instRegex(INST_REGEX);
+    static const std::regex fieldRegex("^\\s+(-?[[:alnum:]]+)(\\s+.*)?$");
+    std::smatch matches;
+
+    // if instruction isn't right format, stop
+    if (!std::regex_match(inst, matches, instRegex)) return tks;
+    tks.push_back(toLowerCase(matches.str(1)));
+
+    inst = matches.str(2);
+    while (!std::regex_match(inst, emptyRegex)) {
+        if (!std::regex_match(inst, matches, fieldRegex)) {
+            // malformed field: record an empty token so the instruction gets rejected later on
+            tks.push_back(std::string());
+            break;
+        }
+        tks.push_back(matches.str(1));
+        inst = matches.str(2);
+    }
+    return tks;
+}
+
+// print a custom error and exit with a failure status
+void Assembler::error(uint64_t ln, std::string msg) {
+    printf("Error on line %llu: \"%s\"\n%s\n", (unsigned long long)(ln+1), asmLines[ln].c_str(), msg.c_str());
+    exit(EXIT_FAILURE);
+}
+
+// utility to convert string to lower case (only 'A'-'Z' are touched)
+std::string Assembler::toLowerCase(std::string str) {
+    for (char& c : str) {
+        if (c >= 'A' && c <= 'Z') c += 32;
+    }
+    return str;
+}
+
+// output pure hex: one 32-bit word per stored instruction entry, in order
+std::vector<uint32_t> Assembler::assemble() {
+    std::vector<uint32_t> hexCode;
+    hexCode.reserve(instructions.size());
+    for (const inst_t& inst : instructions) {
+        hexCode.push_back(inst.hex);
+    }
+    return hexCode;
+}
diff --git a/Assembler.h b/Assembler.h
new file mode 100644
index 0000000..e8e3e28
--- /dev/null
+++ b/Assembler.h
@@ -0,0 +1,34 @@
+#ifndef ASSEMBLER_H
+#define ASSEMBLER_H
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+#include "InstHandler.h"
+
+// Two-pass assembler over a vector of source lines: parse() builds the label and
+// instruction tables (terminating the program on the first error), and assemble()
+// returns the encoded 32-bit words in order.
+class Assembler {
+    private:
+        // declared first so it is initialized first; it only stores the address of labels
+        std::unique_ptr<InstHandler> ih;
+
+        std::vector<std::string> asmLines;
+        std::vector<label_t> labels;
+        std::vector<inst_t> instructions;
+
+        int parseDfill(std::string arg, uint32_t icount, uint64_t ln);
+        int parseInst(std::string inst, uint32_t icount, uint64_t ln);
+        int newLabel(std::string name, uint32_t icount);
+        std::vector<std::string> tokenizeInst(std::string inst);
+        std::string toLowerCase(std::string str);
+        void error(uint64_t ln, std::string msg);
+    public:
+        explicit Assembler(std::vector<std::string>);
+        void parse();
+        std::vector<uint32_t> assemble();
+};
+
+#endif
diff --git a/InstHandler.C b/InstHandler.C
new file mode 100644
index 0000000..a2b069a
--- /dev/null
+++ b/InstHandler.C
@@ -0,0 +1,388 @@
+#include <exception>
+#include <regex>
+#include <string>
+#include "InstHandler.h"
+
+// wrap a member function so it can be stored in funcMap
+#define HANDLER(f) [this](inst_t* inst){return f(inst);}
+
+// operand checks: each one makes the enclosing handler return 0 on failure
+#define REQ_TKS(n) if (inst->tokens.size() != n) return 0
+#define PARSE_REG(n, r) if (!parseRegister(inst->tokens[n], r)) return 0
+#define PARSE_FPREG(n, r) if (!parseFPRegister(inst->tokens[n], r)) return 0
+#define PARSE_IMM(n, i, d) if (!parseImmediate(inst->tokens[n], i, d)) return 0
+
+// field placement inside the 32-bit instruction word
+#define OPCODE(o) ((uint32_t)(o) << 26)
+#define RS(r) ((r) << 21)
+#define RT(r) ((r) << 16)
+#define RD(r) ((r) << 11)
+#define IMM16(i) ((i) & 0x0000FFFF)
+
+// these two read the enclosing handler's local rs/rt/rd/imm variables
+#define COMB_I(o) (OPCODE(o) | RS(rs) | RT(rt) | IMM16(imm))
+#define COMB_R(o, f) (OPCODE(o) | RS(rs) | RT(rt) | RD(rd) | ((uint32_t)(f) & 0x0000003F))
+
+
+// set up function map
+InstHandler::InstHandler(std::vector<label_t>* labels_) :
+    labels(labels_)
+{
+    funcMap["ld"] = HANDLER(handleLD);
+    funcMap["l.d"] = HANDLER(handleLfD);
+    funcMap["sd"] = HANDLER(handleSD);
+    funcMap["s.d"] = HANDLER(handleSfD);
+    funcMap["daddi"] = HANDLER(handleDADDI);
+    funcMap["daddiu"] = HANDLER(handleDADDIU);
+    funcMap["beq"] = HANDLER(handleBEQ);
+    funcMap["bne"] = HANDLER(handleBNE);
+    funcMap["dadd"] = HANDLER(handleDADD);
+    funcMap["dsub"] = HANDLER(handleDSUB);
+    funcMap["add.d"] = HANDLER(handleADDfD);
+    funcMap["sub.d"] = HANDLER(handleSUBfD);
+    funcMap["mul.d"] = HANDLER(handleMULfD);
+    funcMap["div.d"] = HANDLER(handleDIVfD);
+    funcMap["j"] = HANDLER(handleJ);
+    funcMap["halt"] = HANDLER(handleHALT);
+    funcMap["nop"] = HANDLER(handleNOP);
+    funcMap["dump"] = HANDLER(handleDUMP);
+}
+
+// call the appropriate function. this is where the instruction op is validated
+int InstHandler::handle(inst_t* inst) {
+    auto handler = funcMap.find(inst->tokens[0]);
+    // not found in map
+    if (handler == funcMap.end()) return 0;
+    return handler->second(inst);
+}
+
+
+// ld rt imm rs  (opcode 55)
+int InstHandler::handleLD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_REG(1, rt);
+    PARSE_REG(3, rs);
+
+    uint32_t imm;
+    PARSE_IMM(2, imm, false);
+
+    inst->hex = COMB_I(55);
+    return 1;
+}
+
+// l.d ft imm rs  (opcode 53)
+int InstHandler::handleLfD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_FPREG(1, rt);
+    PARSE_REG(3, rs);
+
+    uint32_t imm;
+    PARSE_IMM(2, imm, false);
+
+    inst->hex = COMB_I(53);
+    return 1;
+}
+
+// sd rt imm rs  (opcode 63)
+int InstHandler::handleSD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_REG(1, rt);
+    PARSE_REG(3, rs);
+
+    uint32_t imm;
+    PARSE_IMM(2, imm, false);
+
+    inst->hex = COMB_I(63);
+    return 1;
+}
+
+// s.d ft imm rs  (opcode 61)
+int InstHandler::handleSfD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_FPREG(1, rt);
+    PARSE_REG(3, rs);
+
+    uint32_t imm;
+    PARSE_IMM(2, imm, false);
+
+    inst->hex = COMB_I(61);
+    return 1;
+}
+
+// daddi rt rs imm  (opcode 24)
+int InstHandler::handleDADDI(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_REG(1, rt);
+    PARSE_REG(2, rs);
+
+    uint32_t imm;
+    PARSE_IMM(3, imm, false);
+
+    inst->hex = COMB_I(24);
+    return 1;
+}
+
+// daddiu rt rs imm  (opcode 25)
+int InstHandler::handleDADDIU(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_REG(1, rt);
+    PARSE_REG(2, rs);
+
+    uint32_t imm;
+    PARSE_IMM(3, imm, false);
+
+    inst->hex = COMB_I(25);
+    return 1;
+}
+
+// beq rt rs target  (opcode 4)
+int InstHandler::handleBEQ(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_REG(1, rt);
+    PARSE_REG(2, rs);
+
+    uint32_t imm;
+    if (!parseImmediateBranch(inst->tokens[3], imm, inst->loc)) return 0;
+
+    inst->hex = COMB_I(4);
+    return 1;
+}
+
+// bne rt rs target  (opcode 5)
+int InstHandler::handleBNE(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rt, rs;
+    PARSE_REG(1, rt);
+    PARSE_REG(2, rs);
+
+    uint32_t imm;
+    if (!parseImmediateBranch(inst->tokens[3], imm, inst->loc)) return 0;
+
+    inst->hex = COMB_I(5);
+    return 1;
+}
+
+// dadd rd rs rt  (opcode 0, funct 44)
+int InstHandler::handleDADD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rd, rs, rt;
+    PARSE_REG(1, rd);
+    PARSE_REG(2, rs);
+    PARSE_REG(3, rt);
+
+    inst->hex = COMB_R(0, 44);
+    return 1;
+}
+
+// dsub rd rs rt  (opcode 0, funct 46)
+int InstHandler::handleDSUB(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rd, rs, rt;
+    PARSE_REG(1, rd);
+    PARSE_REG(2, rs);
+    PARSE_REG(3, rt);
+
+    inst->hex = COMB_R(0, 46);
+    return 1;
+}
+
+// add.d fd fs ft  (opcode 0, funct 47)
+int InstHandler::handleADDfD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rd, rs, rt;
+    PARSE_FPREG(1, rd);
+    PARSE_FPREG(2, rs);
+    PARSE_FPREG(3, rt);
+
+    inst->hex = COMB_R(0, 47);
+    return 1;
+}
+
+// sub.d fd fs ft  (opcode 0, funct 48)
+int InstHandler::handleSUBfD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rd, rs, rt;
+    PARSE_FPREG(1, rd);
+    PARSE_FPREG(2, rs);
+    PARSE_FPREG(3, rt);
+
+    inst->hex = COMB_R(0, 48);
+    return 1;
+}
+
+// mul.d fd fs ft  (opcode 0, funct 49)
+int InstHandler::handleMULfD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rd, rs, rt;
+    PARSE_FPREG(1, rd);
+    PARSE_FPREG(2, rs);
+    PARSE_FPREG(3, rt);
+
+    inst->hex = COMB_R(0, 49);
+    return 1;
+}
+
+// div.d fd fs ft  (opcode 0, funct 50)
+int InstHandler::handleDIVfD(inst_t* inst) {
+    REQ_TKS(4);
+
+    uint32_t rd, rs, rt;
+    PARSE_FPREG(1, rd);
+    PARSE_FPREG(2, rs);
+    PARSE_FPREG(3, rt);
+
+    inst->hex = COMB_R(0, 50);
+    return 1;
+}
+
+// j target  (opcode 2, low 26 bits hold the target)
+int InstHandler::handleJ(inst_t* inst) {
+    REQ_TKS(2);
+
+    uint32_t imm;
+    if (!parseImmediateJump(inst->tokens[1], imm)) return 0;
+
+    inst->hex = 0u | OPCODE(2) | (imm & 0x03FFFFFF);
+    return 1;
+}
+
+// halt  (opcode 1)
+int InstHandler::handleHALT(inst_t* inst) {
+    REQ_TKS(1);
+
+    inst->hex = 0u | OPCODE(1);
+    return 1;
+}
+
+// nop  (opcode 3)
+int InstHandler::handleNOP(inst_t* inst) {
+    REQ_TKS(1);
+
+    inst->hex = 0u | OPCODE(3);
+    return 1;
+}
+
+// dump n  (opcode 44; the argument must be numeric, labels are not accepted)
+int InstHandler::handleDUMP(inst_t* inst) {
+    REQ_TKS(2);
+
+    uint32_t imm;
+    PARSE_IMM(1, imm, true);
+
+    inst->hex = 0u | OPCODE(44) | (imm & 0x03FFFFFF);
+    return 1;
+}
+
+// verifies register arguments are written correctly and sets the reg var to the reg number
+int InstHandler::parseRegister(std::string arg, uint32_t& reg) {
+    static const std::regex regRegex("^[rR](\\d{1,2})$");
+    std::smatch matches;
+    if (!std::regex_match(arg, matches, regRegex)) return 0;
+    reg = std::stoul(matches.str(1));
+    if (reg > 31) return 0;
+    return 1;
+}
+
+// same check as parseRegister, but for registers written f0-f31 / F0-F31
+int InstHandler::parseFPRegister(std::string arg, uint32_t& reg) {
+    static const std::regex regRegex("^[fF](\\d{1,2})$");
+    std::smatch matches;
+    if (!std::regex_match(arg, matches, regRegex)) return 0;
+    reg = std::stoul(matches.str(1));
+    if (reg > 31) return 0;
+    return 1;
+}
+
+// for most immediate functions; parses the immediate value or looks up the label value.
+// label values are shifted left by 2; labels are refused when isDump is set
+int InstHandler::parseImmediate(std::string arg, uint32_t& imm, bool isDump) {
+    static const std::regex immRegex("^-?\\d+$");
+    static const std::regex labelRegex("^[[:alnum:]]{1,12}$");
+
+    if (std::regex_match(arg, immRegex)) {
+        try {
+            imm = std::stol(arg);
+            return 1;
+        } catch (const std::exception&) {
+            return 0;
+        }
+    } else if (!isDump && std::regex_match(arg, labelRegex)) {
+        imm = findLabel(arg);
+        if (imm == (uint32_t)-1) return 0;
+        imm <<= 2;
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+// jumps don't shift labels left at all, but also aren't pc-relative?
+int InstHandler::parseImmediateJump(std::string arg, uint32_t& imm) {
+    static const std::regex immRegex("^-?\\d+$");
+    static const std::regex labelRegex("^[[:alnum:]]{1,12}$");
+
+    if (std::regex_match(arg, immRegex)) {
+        try {
+            imm = std::stol(arg);
+            return 1;
+        } catch (const std::exception&) {
+            return 0;
+        }
+    } else if (std::regex_match(arg, labelRegex)) {
+        imm = findLabel(arg);
+        if (imm == (uint32_t)-1) return 0;
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+// branches to labels are pc-relative i guess
+int InstHandler::parseImmediateBranch(std::string arg, uint32_t& imm, uint32_t icount) {
+    static const std::regex immRegex("^-?\\d+$");
+    static const std::regex labelRegex("^[[:alnum:]]{1,12}$");
+
+    if (std::regex_match(arg, immRegex)) {
+        try {
+            imm = std::stol(arg);
+            return 1;
+        } catch (const std::exception&) {
+            return 0;
+        }
+    } else if (std::regex_match(arg, labelRegex)) {
+        imm = findLabel(arg);
+        if (imm == (uint32_t)-1) return 0;
+        imm -= icount + 1;
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+// looks up label; returns its stored location, or (uint32_t)-1 when no such label exists
+uint32_t InstHandler::findLabel(std::string l) {
+    for (const label_t& label : *labels) {
+        if (label.name == l) return label.loc;
+    }
+    return (uint32_t)-1;
+}
diff --git a/InstHandler.h b/InstHandler.h
new file mode 100644
index 0000000..64731c8
--- /dev/null
+++ b/InstHandler.h
@@ -0,0 +1,63 @@
+#ifndef INSTHANDLER_H
+#define INSTHANDLER_H
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <string>
+#include <vector>
+
+// one emitted 32-bit word plus the source information it came from
+struct inst_t {
+    uint64_t lineNumber;              // 0-based index of the source line
+    std::vector<std::string> tokens;  // mnemonic (or ".dfill"/"BLANK") followed by its fields
+    uint32_t loc;                     // word index of this entry in the output
+    bool usesLabel;                   // set on .dfill entries whose value is a label to resolve
+    uint32_t hex;                     // encoded word
+    //xstd::string comment;
+};
+
+// a label name and the word index recorded for it
+struct label_t {
+    std::string name;
+    uint32_t loc;
+};
+
+// validates and encodes single instructions; mnemonics are dispatched through funcMap
+class InstHandler {
+    private:
+        typedef std::function<int(inst_t*)> lf_t;
+        std::map<std::string, lf_t> funcMap;
+        std::vector<label_t>* labels;
+
+        int handleLD(inst_t* inst);
+        int handleLfD(inst_t* inst);
+        int handleSD(inst_t* inst);
+        int handleSfD(inst_t* inst);
+        int handleDADDI(inst_t* inst);
+        int handleDADDIU(inst_t* inst);
+        int handleBEQ(inst_t* inst);
+        int handleBNE(inst_t* inst);
+        int handleDADD(inst_t* inst);
+        int handleDSUB(inst_t* inst);
+        int handleADDfD(inst_t* inst);
+        int handleSUBfD(inst_t* inst);
+        int handleMULfD(inst_t* inst);
+        int handleDIVfD(inst_t* inst);
+        int handleJ(inst_t* inst);
+        int handleHALT(inst_t* inst);
+        int handleNOP(inst_t* inst);
+        int handleDUMP(inst_t* inst);
+
+        int parseRegister(std::string arg, uint32_t& reg);
+        int parseFPRegister(std::string arg, uint32_t& reg);
+        int parseImmediate(std::string arg, uint32_t& imm, bool isDump);
+        int parseImmediateJump(std::string arg, uint32_t& imm);
+        int parseImmediateBranch(std::string arg, uint32_t& imm, uint32_t icount);
+    public:
+        InstHandler(std::vector<label_t>* labels_);
+        int handle(inst_t* inst);
+        uint32_t findLabel(std::string l);
+};
+
+#endif
diff --git a/README.md b/README.md
index 9f73cf5..8344d82 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,12 @@
-# mips64-assembler
+# CS5483 Computer Architecture Project 1: MIPS Assembler
+## Ryan Densmore
 
+## Compiling and Running
+1. To compile my assembler, simply run the Makefile via `make`.
+2. Run the assembler with `./ma <file>.asm`.
+
+## Tests
+While the provided `run.pl` seems a little janky to use, I currently pass 29/29 tests. Running the `run.pl` program currently in my directory will run every test, and clean up the files for the tests that pass.
+
+## Todos
+Had I more time, I would actually try to implement outputting comments. There appears to be some sort of memory issue that only occurs when I try to store another string (containing the comment) alongside every instruction. There are a few other ways I could do it, but attaching the comment to its relevant instruction in the data structure makes the most sense.
diff --git a/ma.C b/ma.C
new file mode 100644
index 0000000..359307c
--- /dev/null
+++ b/ma.C
@@ -0,0 +1,64 @@
+/* CS5483 Project 1: MIPS64 Assembler
+ * Ryan Densmore
+*/
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <utility>
+#include <vector>
+#include "Assembler.h"
+
+std::string parseArgs(int argc, char ** argv);
+void usage();
+
+// read <name>.asm, assemble it, and write one 8-digit hex word per line to <name>.hex
+int main(int argc, char ** argv) {
+    std::string name = parseArgs(argc, argv);
+
+    std::ifstream asmFile(argv[1]);
+    if (!asmFile.is_open()) usage();
+
+    // loop on getline itself so a failed read at EOF never appends a phantom line
+    std::string line;
+    std::vector<std::string> lines;
+    while (std::getline(asmFile, line)) {
+        lines.push_back(line);
+    }
+
+    Assembler assembler(std::move(lines));
+    assembler.parse();
+
+    // assemble
+    std::vector<uint32_t> hexCode = assembler.assemble();
+    std::ofstream hexFile(name + ".hex");
+    if (!hexFile.is_open()) {
+        printf("could not open %s.hex for writing\n", name.c_str());
+        return EXIT_FAILURE;
+    }
+    char hex[9];
+    for (uint32_t word : hexCode) {
+        snprintf(hex, sizeof(hex), "%.8X", word);
+        hexFile << hex << "\n";
+    }
+    return EXIT_SUCCESS;
+}
+
+// validate the command line and return the input path without its .asm extension
+std::string parseArgs(int argc, char ** argv) {
+    // make sure only one arg
+    if (argc != 2) usage();
+
+    // make sure it's a .asm
+    std::string fileArg(argv[1]);
+    if (fileArg.length() < 4 || fileArg.substr(fileArg.length()-4, 4) != ".asm") usage();
+    return fileArg.substr(0, fileArg.length()-4);
+}
+
+// print the expected invocation and exit with a failure status
+void usage() {
+    printf("usage: ma <file>.asm\n");
+    exit(EXIT_FAILURE);
+}
diff --git a/makefile b/makefile
new file mode 100644
index 0000000..c8f689d
--- /dev/null
+++ b/makefile
@@ -0,0 +1,21 @@
+CC = g++
+CFLAGS = -c -std=c++11 -O2 -Wall -Werror
+OBJS = InstHandler.o Assembler.o ma.o
+
+.PHONY: clean
+
+ma: $(OBJS)
+	$(CC) $(OBJS) -o ma
+
+ma.o: ma.C Assembler.h InstHandler.h
+	$(CC) $(CFLAGS) ma.C -o ma.o
+
+Assembler.o: Assembler.C Assembler.h InstHandler.h
+	$(CC) $(CFLAGS) Assembler.C -o Assembler.o
+
+InstHandler.o: InstHandler.C InstHandler.h
+	$(CC) $(CFLAGS) InstHandler.C -o InstHandler.o
+
+# NOTE(review): this also deletes every *.asm here, presumably test inputs copied in by run.pl - confirm
+clean:
+	rm -f *.o ma *.asm *.problems tmp1 tmp2