diff --git a/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryCpuCode.c b/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryCpuCode.c new file mode 100644 index 0000000..203b8f7 --- /dev/null +++ b/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryCpuCode.c @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include + +#include "MaxSLiCInterface.h" +#include "MaxRingMulticastVary.h" + +char* strcat_int(const char *str_in, int i){ + char *str, str_c[2]; + str = (char*)malloc(50); + strcpy(str, str_in); + sprintf(str_c, "%d", i); + strcat(str, str_c); + return str; +} + +int main(void) +{ + + const int numTotalData[] = {4, 8, 12, 24, 20, 16}; + // Since data are sent in multiple of 32 bytes, dataPerRound should + // be a multiple of 8 since each data is 32 bits + int numTotalDataMax = 0, numTotalDataSum = 0; + const int dataPerRound = 8; + const int numFPGAs = 6; + const bool MAX4 = 1; + const bool sim = true; + const int FPGA_id_map[6] = {1, 0, 2, 3, 5, 4}; + char eng_string[50], kernel_name[50]; + uint32_t* data[numFPGAs]; + int i, j, kernel_ticks; + struct timeval tv1, tv2; + const int dataSize = sizeof(uint32_t); // 4 bytes + + + for (i = 0; i < numFPGAs; i++){ + data[i] = (uint32_t*)malloc(numTotalData[i]*numFPGAs*dataSize); + if (numTotalData[i] > numTotalDataMax){ + numTotalDataMax = numTotalData[i]; + } + numTotalDataSum += numTotalData[i]; + } + + if (MAX4){ + strcpy(eng_string, "192.168.0.10"); + } + else{ + strcpy(eng_string, "local"); + } + + for (i = 0; i < numFPGAs; i++){ + for (j = 0; j < numTotalData[i]*numFPGAs; j++){ + data[i][j] = 0; + } + } + + + max_file_t* maxfile = MaxRingMulticastVary_init(); + + if (!sim){ + max_actarray_t *action_array = max_actarray_init(maxfile, numFPGAs); + max_actions_t *action[numFPGAs]; + max_engarray_t* engine_array = + max_load_array(maxfile, numFPGAs, eng_string); + + strcpy(kernel_name, "MaxRingMulticastVaryKernel"); + kernel_ticks = numFPGAs*2 *ceil((float)(numTotalDataMax+1) / dataPerRound) * dataPerRound - dataPerRound; + for (i = 0; i < numFPGAs; i++){ + action[i] = max_actions_init(maxfile, NULL); + max_set_uint64t(action[i], kernel_name, "dataPerRound", dataPerRound); + max_set_uint64t(action[i], kernel_name, "numTotalData", numTotalData[i]); + max_set_uint64t(action[i], kernel_name, "numFPGAs", numFPGAs); + max_set_uint64t(action[i], kernel_name, "FPGA_id", i); + max_set_ticks(action[i], kernel_name, kernel_ticks); + max_queue_output(action[i], "dataOut", data[i], numTotalDataSum*dataSize); + max_set_action(action_array, i, action[i]); + + } + + printf(">> Run DFE for %d ticks\n", kernel_ticks); + gettimeofday(&tv1, NULL); + max_run_array(engine_array, action_array); + gettimeofday(&tv2, NULL); + max_actarray_free(action_array); + max_unload_array(engine_array); + } + else + { + max_actions_t *actions = max_actions_init(maxfile, NULL); + max_engine_t* engine = max_load(maxfile, "local"); + + kernel_ticks = numFPGAs*2 *ceil((float)(numTotalDataMax+1) / dataPerRound) * dataPerRound - dataPerRound; + for (i = 0; i < numFPGAs; i++){ + strcpy(kernel_name, strcat_int("MaxRingMulticastVaryKernel", i)); + max_set_uint64t(actions, kernel_name, "dataPerRound", dataPerRound); + max_set_uint64t(actions, kernel_name, "numTotalData", numTotalData[i]); + max_set_uint64t(actions, kernel_name, "numFPGAs", numFPGAs); + max_set_uint64t(actions, kernel_name, "FPGA_id", i); + max_set_ticks(actions, kernel_name, kernel_ticks); + max_queue_output(actions, strcat_int("dataOut", i), data[i], numTotalDataSum*dataSize); + } + + printf(">> Run Sim for %d ticks\n", kernel_ticks); + gettimeofday(&tv1, NULL); + max_run(engine, actions); + gettimeofday(&tv2, NULL); + max_actions_free(actions); + max_unload(engine); + } + + /* + for (i = 0; i < numFPGAs; i++){ + for (j = 0; j < numTotalData[i]*numFPGAs; j++){ + if (data[i][j] != 0){ + printf("[FPGA%d:%d] Card%u. %u\n", i, j, (data[i][j] / 0x10000000)-8, data[i][j] % 0x10000000); + } + } + } + */ + double runtime = ((tv2.tv_sec-tv1.tv_sec)*1000000+(tv2.tv_usec-tv1.tv_usec))/1000000.0; + printf(">> Finish!\n"); + printf(">> Runtime: %0.3fs\n", runtime); +} diff --git a/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryKernel.maxj b/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryKernel.maxj new file mode 100644 index 0000000..5fdae01 --- /dev/null +++ b/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryKernel.maxj @@ -0,0 +1,285 @@ +import com.maxeler.maxcompiler.v2.kernelcompiler.Kernel; +import com.maxeler.maxcompiler.v2.kernelcompiler.KernelParameters; +import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEType; +import com.maxeler.maxcompiler.v2.kernelcompiler.types.base.DFEVar; +import com.maxeler.maxcompiler.v2.kernelcompiler.stdlib.core.Count; +import com.maxeler.maxcompiler.v2.kernelcompiler.stdlib.core.Count.*; +import com.maxeler.maxcompiler.v2.kernelcompiler.stdlib.memory.Memory; +import com.maxeler.maxcompiler.v2.utils.MathUtils; +import com.maxeler.maxcompiler.v2.kernelcompiler.stdlib.Reductions; + +class MaxRingMulticastVaryKernel extends Kernel { + + private static final int counterWidth = 32; + private static final int dataWidth = 28; + private static final int maxCards = 8; + private static final int maxDataPerRound = 2048; + //private static final int loopDepth = 2; + private static final int cardCntBits = MathUtils.bitsToAddress(maxCards*2); + private static final int cardBits = MathUtils.bitsToAddress(maxCards); + private static final int maxCntBits = MathUtils.bitsToAddress(maxDataPerRound+1); + //private static final int loopDepthBits = MathUtils.bitsToAddress(loopDepth); + private static final DFEType dataType = dfeRawBits(dataWidth+cardBits+1); + private static final DFEType dataInType = dfeUInt(dataWidth); + private static final DFEType cntType = dfeUInt(counterWidth); + private static final DFEType cardCntType = dfeUInt(cardCntBits); + private static final DFEType cardType = dfeUInt(cardBits); + private static final DFEType maxCntType = dfeUInt(maxCntBits); + + private DFEVar shifter(DFEVar in, int in_bits, int shift_bits){ + return (in # constant.var(dfeUInt(shift_bits), 0)).cast(dfeUInt(in_bits+shift_bits)); + } + + protected MaxRingMulticastVaryKernel(KernelParameters parameters) + { + super(parameters); + + DFEVar dataPerRound = io.scalarInput("dataPerRound", maxCntType); + DFEVar numTotalData = io.scalarInput("numTotalData", cntType); + DFEVar numFPGAs = io.scalarInput("numFPGAs", cardType); + DFEVar FPGA_id = io.scalarInput("FPGA_id", cardType); + DFEVar FPGA_id_long = FPGA_id.cast(cardCntType); + DFEVar reset = constant.var(dfeBool(), 0); + + + // ******************** Algorithm ******************** + // card id 0 1 2 3 4 5 + // 1. data to receive/pass-on* to R 0 1 2 3 4 5 + // 2. send own data to pass on to R 1 1 1 1 1 0 + // 3. idle# 5 4 3 2 1 1 + // 4. data to receive/pass-on* to L 5 4 3 2 1 0 + // 5. send own data to pass on to L 0 1 1 1 1 1 + // 6. idle# 1 1 2 3 4 5 + + // * Numbers = data sent in multiple of dataPerRound + // # 1 cardCnt cycle extra to account for the pipeline depth + // When all data is sent, valid bit is set to 0 and then complete the round + + // Right Phase = 1 + 2 + 3, Left Phase = 4 + 5 + 6 + + // < Switch for each phase > + // 1. inL open, outR open + // 2. outR open + // 3. all close + // 4. inR open, outL open + // 5. outL open + // 6. all close + // (inL/outL disabled for leftmost card, oposite for rightmost card) + + // Total cnt before switching direction (i.e. (1)+(2)+(3)) + // = numFPGAs-1 + + // Total cnt for both direction in 1 round (i.e. sum of (1) to (6)) + // = (numFPGAs-1)*2 + + Count.Params ctrParams; + Counter ctr; + + // Boolean for checking rightmost and leftmost FPGAs + DFEVar rightEnd = FPGA_id === numFPGAs-1; + DFEVar leftEnd = FPGA_id === 0; + + + + + // ******************** Counters ******************** + // Loop level 1: dataCnt + // Switches and inputs from other FPGAs, determined by the FPGA id + ctrParams = control.count.makeParams(maxCntBits) + .withMax(dataPerRound) + //.withReset() + .withWrapMode(WrapMode.COUNT_LT_MAX_THEN_WRAP); + ctr = control.count.makeCounter(ctrParams); + DFEVar dataCnt = ctr.getCount(); + DFEVar dataCntWrap = ctr.getWrap(); + + // Loop level 2: cards (L/R) + DFEVar cardCntMax = shifter(numFPGAs, cardBits, 1); + ctrParams = control.count.makeParams(cardCntBits) + .withMax(cardCntMax) + //.withReset() + .withEnable(dataCntWrap) + .withWrapMode(WrapMode.COUNT_LT_MAX_THEN_WRAP); + ctr = control.count.makeCounter(ctrParams); + DFEVar cardCnt = ctr.getCount(); + DFEVar cardWrap = ctr.getWrap(); + + // Loop level 3: check 1st round (for maxCount) + ctrParams = control.count.makeParams(1) + .withEnable(cardWrap) + //.withReset() + .withWrapMode(WrapMode.STOP_AT_MAX); + ctr = control.count.makeCounter(ctrParams); + DFEVar firstRound = ~ctr.getCount(); + + + + + + // ******************** Control Signals ******************** + // Send data when passing on all data from FPGAs from the other direction + DFEVar leftPhase = cardCnt >= numFPGAs.cast(cardCntType); // 0: send to right, 1: send to left + DFEVar ownDataToR = (cardCnt === FPGA_id_long) & ~rightEnd; // phase 2, send own data + DFEVar ownDataToL = (cardCnt === cardCntMax - FPGA_id_long - 1) & ~leftEnd; // phase 5, send own data + DFEVar idle = (~leftPhase & cardCnt >= FPGA_id_long & ~ownDataToR) | (leftPhase & cardCnt >= cardCntMax - FPGA_id_long - 1 & ~ownDataToL); + + + // pass all data to other direction, then add data + // from own FPGA + DFEVar ctrlInL = ~leftEnd & ~leftPhase & ~ownDataToR & ~idle; + DFEVar ctrlOutR = ~rightEnd & ~leftPhase & ~idle; + + DFEVar ctrlInR = ~rightEnd & leftPhase & ~ownDataToL & ~idle; + DFEVar ctrlOutL = ~leftEnd & leftPhase & ~idle; + + + // Convert left & right to maxring I/O A & B (Note: A/B not equivalent to L/R) + DFEVar A_equals_L = FPGA_id.slice(0); + + DFEVar ctrlInA = A_equals_L? ctrlInL: ctrlInR; + DFEVar ctrlInB = A_equals_L? ctrlInR: ctrlInL; + + // Total number of data sent + DFEVar ctrlIn_cardCnt = cardCnt === (FPGA_id_long + rightEnd.cast(cardCntType)); + ctrParams = control.count.makeParams(counterWidth) + .withEnable(ctrlIn_cardCnt) + //.withReset() + .withWrapMode(WrapMode.STOP_AT_MAX); + ctr = control.count.makeCounter(ctrParams); + DFEVar numTotalDataCnt = ctr.getCount(); + + // CHECK VALID (own data end): + // validBit: 1 if data is new data from source (when ctrl = 0, data remains + // same as the last one, not required to send from source) + DFEVar validBit = numTotalDataCnt < numTotalData+1; + DFEVar ctrlIn_data = ctrlIn_cardCnt & validBit; + + /* + // CHECK END: + // Check if kernel finishes using maxCount + DFEVar maxTotalDataNew = cntType.newInstance(this); + DFEVar maxTotalData = loopCnt === loopDepth? maxTotalDataNew: numTotalData; + DFEVar kernel_run = numTotalDataCnt < round((maxTotalData+1)/dataPerRound)*numFPGAs*dataPerRound; // +1 for the totalCnt send at 1st cycle + */ + DFEVar kernel_run = constant.var(dfeBool(), 1); + + // Send data after sending numTotalData + ctrParams = control.count.makeParams(1) + .withMax(1) + //.withReset() + .withWrapMode(WrapMode.STOP_AT_MAX) + .withEnable(ctrlIn_cardCnt); + ctr = control.count.makeCounter(ctrParams); + DFEVar startDataSend = ctr.getCount(); // false if it is the first cycle to send/receive totalNumCnt + + // ctrlIn_data: both count and actual input, ctrlIn: only actual input + DFEVar ctrlIn = ctrlIn_data & startDataSend; + + + + + // ******************** Data I/O and Data Routing ******************** + // ******************************************************************** + // The following code replaces the source from another kernel, + // the source need to send number of 'maxTotalData' data to this kernel + // + // DFEVar dataIn = io.input("dataIn", dataInType, ctrlIn); + // + // ******************************************************************** + + // Data from this FPGA to be sent + // validBit and FPGA_id padded in front of actual data + ctrParams = control.count.makeParams(dataWidth) + .withEnable(ctrlIn); + ctr = control.count.makeCounter(ctrParams); + DFEVar dataIn = ctr.getCount(); + + // Send numTotalData as 1st data to other FPGAs + dataIn = startDataSend? dataIn: numTotalData.slice(0, dataWidth).cast(dataInType); + + DFEVar dataInPadded = validBit # FPGA_id # dataIn; // # = cat + + + // Store the data into BRAM buffer for later use for leftward transfer + Memory dataIn_buffer = mem.alloc(dataType, maxDataPerRound); + DFEVar buffer_addr; + if (MathUtils.isPowerOf2(maxDataPerRound)){ + int maxCntBits_ = MathUtils.bitsToAddress(maxDataPerRound); + buffer_addr = dataCnt.slice(0, maxCntBits_).cast(dfeUInt(maxCntBits_)); + } + else { + buffer_addr = dataCnt; + } + dataIn_buffer.write(buffer_addr, dataInPadded, ctrlIn_cardCnt); // addr, data, enable + DFEVar dataBufferOut = dataIn_buffer.read(buffer_addr); // addr + + DFEVar dataToSend = ctrlIn_cardCnt? dataInPadded : dataBufferOut; + // Note: ctrlIn is a bit cheeky, when it's 1 dataIn writes into buffer, + // but for the right border case (FPGA_id == num_FPGAs -1) in the leftward phase + // the dataIn is also used instead of dataBramOut + + // Data from other FPGAs + DFEVar dataInA = io.input("dataInA", dataType, ctrlInA & kernel_run); + DFEVar dataInB = io.input("dataInB", dataType, ctrlInB & kernel_run); + + DFEVar dataInL = A_equals_L? dataInA: dataInB; + DFEVar dataInR = A_equals_L? dataInB: dataInA; + + DFEVar dataOutL = ownDataToL? dataToSend: dataInR; + DFEVar dataOutR = ownDataToR? dataToSend: dataInL; + + + // Output to FPGAs + DFEVar dataOutA = A_equals_L? dataOutL: dataOutR; + DFEVar dataOutB = A_equals_L? dataOutR: dataOutL; + + DFEVar ctrlOutA = A_equals_L? ctrlOutL: ctrlOutR; + DFEVar ctrlOutB = A_equals_L? ctrlOutR: ctrlOutL; + + io.output("dataOutA", dataOutA, dataType, ctrlOutA & kernel_run); + io.output("dataOutB", dataOutB, dataType, ctrlOutB & kernel_run); + + + // Output to downstream computation + DFEVar dataOutPadded = leftPhase? dataInR: dataInL; + dataOutPadded = ctrlIn_cardCnt? dataInPadded : dataOutPadded; + + // Recv numTotalData as 1st data from other FPGAs + DFEVar recvNumTotalData = firstRound & dataCnt === 0; + + DFEVar dataOut = dataOutPadded.slice(0, dataWidth).cast(dataInType); + DFEVar cardNumOut = dataOutPadded.slice(dataWidth, cardBits); + DFEVar validBitOut = dataOutPadded.slice(dataWidth + cardBits, 1); + + DFEVar maxTotalData = Reductions.streamMax(dataOut, reset, recvNumTotalData); + + // use 'valid' bit to see if it should be sent downstream + DFEVar ctrlOut = (ctrlIn | ctrlInL | ctrlInR) & validBitOut & ~recvNumTotalData; + + // Output to CPU/other kernels + // like dataIn, dataOut outputs data if it is valid + io.output("dataOut", dataOutPadded, dataType, ctrlOut & kernel_run); // actual dataOut + + + + + // ******************** DEBUG ******************** + debug.simPrintf("[SIM%d: Counter] FPGACnt:%d dataCnt:%d maxTotalData:%d totalCnt:%d/%d recvNumTotalData:%d firstRound:%d\n", FPGA_id, cardCnt, dataCnt, maxTotalData, numTotalDataCnt, numTotalData, recvNumTotalData, firstRound); + debug.simPrintf("[SIM%d: Data] dataIn:%d dataInL:%o dataInR:%o dataInPadded:%o\n", FPGA_id, dataIn, dataInL, dataInR, dataInPadded); + debug.simPrintf("[SIM%d: Data] dataOut:%d dataOutL:%o dataOutR:%o dataOutPadded:%o\n", FPGA_id, dataOut, dataOutL, dataOutR, dataOutPadded); + debug.simPrintf("[SIM%d: Ctrl] ctrlIn:%d ctrlInL:%d ctrlInR:%d\n", FPGA_id, ctrlIn_data, ctrlInL, ctrlInR); + debug.simPrintf("[SIM%d: Ctrl] ctrlOut:%d ctrlOutL:%d ctrlOutR:%d\n", FPGA_id, ctrlOut, ctrlOutL, ctrlOutR); + debug.simPrintf("[SIM%d: Ctrl] leftPhase:%d ownDataToL:%d ownDataToR:%d\n", FPGA_id, leftPhase, ownDataToL, ownDataToR); + debug.simPrintf(ctrlOut, "[SIM%d: Out] FromFPGA:%d Data:%d\n", FPGA_id, cardNumOut, dataOut); + + + ctrParams = control.count.makeParams(counterWidth) + .withEnable(ctrlOut); + ctr = control.count.makeCounter(ctrParams); + DFEVar dataOutCnt = ctr.getCount(); + io.scalarOutput("dataOutCnt", dataOutCnt, cntType); + io.scalarOutput("cardCnt", cardCnt, cardCntType); + io.scalarOutput("dataCnt", dataCnt, maxCntType); + } + +} diff --git a/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryManager.maxj b/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryManager.maxj new file mode 100644 index 0000000..8d6f357 --- /dev/null +++ b/test/Infrastructure/MaxRingMulticastVary/src/MaxRingMulticastVaryManager.maxj @@ -0,0 +1,72 @@ +import com.maxeler.maxcompiler.v2.managers.custom.CustomManager; +import com.maxeler.maxcompiler.v2.managers.custom.blocks.KernelBlock; +import com.maxeler.maxcompiler.v2.build.EngineParameters; +import com.maxeler.maxcompiler.v2.managers.custom.stdlib.Max3RingConnection; +import com.maxeler.maxcompiler.v2.managers.custom.stdlib.Max4MAIARingConnection; +import com.maxeler.maxcompiler.v2.managers.custom.stdlib.MaxRingBidirectionalStream; +import com.maxeler.maxcompiler.v2.managers.DFEModel; + +public class MaxRingMulticastVaryManager extends CustomManager{ + + private static final String s_kernelName = "MaxRingMulticastVaryKernel"; + private static final int sim_kernels = 6; + + MaxRingMulticastVaryManager(EngineParameters ep) + { + super(ep); + + if (ep.getTarget() == EngineParameters.Target.DFE){ + // Build target = DFE, connect MaxRing I/O + KernelBlock k = addKernel( + new MaxRingMulticastVaryKernel(makeKernelParameters(s_kernelName))); + + MaxRingBidirectionalStream maxRingA, maxRingB; + if (ep.getDFEModel() == DFEModel.MAX4848A | ep.getDFEModel() == DFEModel.MAIA){ + maxRingA = addMaxRingBidirectionalStream("maxRingA", Max4MAIARingConnection.MAXRING_A); + maxRingB = addMaxRingBidirectionalStream("maxRingB", Max4MAIARingConnection.MAXRING_B); + } + else { + maxRingA = addMaxRingBidirectionalStream("maxRingA", Max3RingConnection.MAXRING_A); + maxRingB = addMaxRingBidirectionalStream("maxRingB", Max3RingConnection.MAXRING_B); + } + + maxRingA.getLinkToRemoteDFE() <== k.getOutput("dataOutA"); + k.getInput("dataInA") <== maxRingA.getLinkFromRemoteDFE(); + + maxRingB.getLinkToRemoteDFE() <== k.getOutput("dataOutB"); + k.getInput("dataInB") <== maxRingB.getLinkFromRemoteDFE(); + + addStreamToCPU("dataOut") <== k.getOutput("dataOut"); + } + else { + // Build target = sim, instantiate 6 kernels instead + // since in/out can't be unconnected, 0 and 5 are connected + // but isn't used + KernelBlock[] k = new KernelBlock[sim_kernels]; + for (int i = 0; i < sim_kernels; i++){ + k[i] = addKernel( + new MaxRingMulticastVaryKernel( + makeKernelParameters(s_kernelName+i))); + addStreamToCPU("dataOut"+i) <== k[i].getOutput("dataOut"); + } + for (int i = 0; i < sim_kernels; i+=2){ + k[i].getInput("dataInA") <== k[i+1].getOutput("dataOutA"); + k[i+1].getInput("dataInA") <== k[i].getOutput("dataOutA"); + k[(i-1+sim_kernels) % sim_kernels].getInput("dataInB") <== k[i].getOutput("dataOutB"); + k[i].getInput("dataInB") <== k[(i-1+sim_kernels) % sim_kernels].getOutput("dataOutB"); + } + } + + + } + + + public static void main(String[] args) { + MaxRingMulticastVaryManager manager = + new MaxRingMulticastVaryManager(new EngineParameters(args)); + + // this generates C defines in Maxfiles.h so that CPU code + // knows the constants + manager.build(); + } +}