diff --git a/.gitignore b/.gitignore index b447d2d498f5423ef4a743935c5317b40f33d294..432deab8752b31c2ff7c3863141b3136e8b63fcd 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ venv .vscode host_aurora_flow_test +host_aurora_flow_ring ip_creation* aurora_flow_0_project* aurora_flow_1_project* @@ -34,3 +35,4 @@ eval/.ipynb_checkpoints *.wdb *.wcfg build +scripts/run_ring_n*.sh diff --git a/Makefile b/Makefile index ae65e40f02077ca69563373ed85f9b17f833f6c2..f7adf10d636324f6609d6d7fddb19088c3d93a07 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ ECHO=@echo aurora: aurora_flow_0.xo aurora_flow_1.xo CXX=c++ +MPICXX=mpic++ # change here to test different boards PART := xcu280-fsvh2892-2L-e @@ -161,8 +162,14 @@ recv_$(TARGET).xo: ./hls/recv.cpp send_$(TARGET).xo: ./hls/send.cpp v++ $(HLSCFLAGS) --temp_dir _x_send --kernel send --output $@ $^ +send_recv_$(TARGET).xo: ./hls/send_recv.cpp + v++ $(HLSCFLAGS) --temp_dir _x_send_recv --kernel send_recv --output $@ $^ + aurora_flow_test_hw.xclbin: aurora send_$(TARGET).xo recv_$(TARGET).xo aurora_flow_test_$(TARGET).cfg - v++ $(LINKFLAGS) --temp_dir _x_aurora_flow_$(TARGET) --config aurora_flow_test_$(TARGET).cfg --output $@ aurora_flow_0.xo aurora_flow_1.xo recv_$(TARGET).xo send_$(TARGET).xo + v++ $(LINKFLAGS) --temp_dir _x_aurora_flow_test_$(TARGET) --config aurora_flow_test_$(TARGET).cfg --output $@ aurora_flow_0.xo aurora_flow_1.xo recv_$(TARGET).xo send_$(TARGET).xo + +aurora_flow_ring_hw.xclbin: aurora send_recv_$(TARGET).xo aurora_flow_ring_$(TARGET).cfg + v++ $(LINKFLAGS) --temp_dir _x_aurora_flow_ring_$(TARGET) --config aurora_flow_ring_$(TARGET).cfg --output $@ aurora_flow_0.xo aurora_flow_1.xo send_recv_$(TARGET).xo aurora_flow_test_sw_emu_loopback.xclbin: send_$(TARGET).xo recv_$(TARGET).xo aurora_flow_test_$(TARGET)_loopback.cfg v++ $(LINKFLAGS) --temp_dir _x_aurora_flow_$(TARGET) --config aurora_flow_test_$(TARGET)_loopback.cfg --output $@ recv_$(TARGET).xo send_$(TARGET).xo @@ -181,7 +188,11 @@ 
LDFLAGS += $(LDFLAGS) -lxrt_coreutil -luuid host_aurora_flow_test: ./host/host_aurora_flow_test.cpp ./host/Aurora.hpp ./host/Results.hpp ./host/Configuration.hpp ./host/Kernel.hpp $(CXX) -o host_aurora_flow_test $< $(CXXFLAGS) $(LDFLAGS) -host: host_aurora_flow_test +host_aurora_flow_ring: ./host/host_aurora_flow_ring.cpp ./host/Aurora.hpp ./host/Results.hpp ./host/Configuration.hpp ./host/Kernel.hpp + $(MPICXX) -o host_aurora_flow_ring $< $(CXXFLAGS) $(LDFLAGS) + + +host: host_aurora_flow_test host_aurora_flow_ring # verilog testbenches diff --git a/aurora_flow_ring_hw.cfg b/aurora_flow_ring_hw.cfg new file mode 100644 index 0000000000000000000000000000000000000000..0e2597e6b2009cfebe667b87c3c7c2c94e058c4e --- /dev/null +++ b/aurora_flow_ring_hw.cfg @@ -0,0 +1,24 @@ +[connectivity] +nk=aurora_flow_0:1:aurora_flow_0 +nk=aurora_flow_1:1:aurora_flow_1 +nk=send_recv:2:send_recv_0,send_recv_1 + +# SLR bindings +slr=aurora_flow_0:SLR2 +slr=aurora_flow_1:SLR2 + +# AXI connections +stream_connect=aurora_flow_0.rx_axis:send_recv_0.data_input +stream_connect=send_recv_0.data_output:aurora_flow_1.tx_axis + +stream_connect=aurora_flow_1.rx_axis:send_recv_1.data_input +stream_connect=send_recv_1.data_output:aurora_flow_0.tx_axis + +# QSFP ports +connect=io_clk_qsfp0_refclkb_00:aurora_flow_0/gt_refclk_0 +connect=aurora_flow_0/gt_port:io_gt_qsfp0_00 +connect=aurora_flow_0/init_clk:ii_level0_wire/ulp_m_aclk_freerun_ref_00 + +connect=io_clk_qsfp1_refclkb_00:aurora_flow_1/gt_refclk_1 +connect=aurora_flow_1/gt_port:io_gt_qsfp1_00 +connect=aurora_flow_1/init_clk:ii_level0_wire/ulp_m_aclk_freerun_ref_00 diff --git a/env.sh b/env.sh index e8c0c7dd518875671c9d4f5aa03a8c2d62c4b7c0..a9985708c1756a58400855929f587f4542526e0c 100644 --- a/env.sh +++ b/env.sh @@ -1,3 +1,3 @@ #!/usr/bin/bash module reset -ml fpga devel lib tools && ml xilinx/xrt/2.14 changeFPGAlinks git-lfs +ml fpga devel lib tools toolchain && ml xilinx/xrt/2.14 changeFPGAlinks gompi/2024a diff --git a/hls/dump.cpp 
b/hls/dump.cpp deleted file mode 100644 index bbe3c8dda592ee39dbff8ea712556a9e67d3559e..0000000000000000000000000000000000000000 --- a/hls/dump.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright 2023-2025 Gerrit Pape (papeg@mail.upb.de) - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <hls_stream.h> -#include <ap_int.h> -#include <ap_axi_sdata.h> - -#ifndef DATA_WIDTH_BYTES -#define DATA_WIDTH_BYTES 64 -#endif - - -#define DATA_WIDTH (DATA_WIDTH_BYTES * 8) - -#define STREAM_DEPTH 256 - -extern "C" -{ - void recv_data( - unsigned int iterations, - unsigned int chunks, - hls::stream<ap_axiu<DATA_WIDTH, 0, 0, 0>> &data_input, - hls::stream<ap_uint<DATA_WIDTH>, STREAM_DEPTH> &data_stream, - unsigned int ack_mode, - hls::stream<ap_axiu<1, 0, 0, 0>>& loopback_ack_stream, - hls::stream<ap_axiu<1, 0, 0, 0>>& pair_ack_stream - ) { - recv_iterations: - for (unsigned int n = 0; n < iterations; n++) { - recv_chunks: - for (int i = 0; i < chunks; i++) { -#pragma HLS PIPELINE II = 1 - data_stream.write(data_input.read().data); - } - ap_axiu<1, 0, 0, 0> ack; - if (ack_mode == 0) { - loopback_ack_stream.write(ack); - } else if (ack_mode == 1) { - pair_ack_stream.write(ack); - } - } - } - - void write_data( - unsigned int iterations, - unsigned int chunks, - hls::stream<ap_uint<DATA_WIDTH>, STREAM_DEPTH> &data_stream, - ap_uint<DATA_WIDTH> *data_output - ) { - write_iterations: - for (unsigned int n = 0; n < iterations; n++) { - write_chunks: - for (int i 
= 0; i < chunks; i++) { -#pragma HLS PIPELINE II = 1 - data_output[i] = data_stream.read(); - } - } - } - - void recv( - hls::stream<ap_axiu<DATA_WIDTH, 0, 0, 0>> &data_input, - ap_uint<DATA_WIDTH> *data_output, - unsigned int byte_size, - unsigned int iterations, - unsigned int ack_mode, - hls::stream<ap_axiu<1, 0, 0, 0>> &loopback_ack_stream, - hls::stream<ap_axiu<1, 0, 0, 0>> &pair_ack_stream - ) { -#pragma HLS dataflow - int chunks = byte_size / DATA_WIDTH_BYTES; - hls::stream<ap_uint<DATA_WIDTH>, STREAM_DEPTH> data_stream; - - recv_data(iterations, chunks, data_input, data_stream, ack_mode, loopback_ack_stream, pair_ack_stream); - write_data(iterations, chunks, data_stream, data_output); - } -} - - diff --git a/hls/send_recv.cpp b/hls/send_recv.cpp new file mode 100644 index 0000000000000000000000000000000000000000..737b3d42757beacf9234a5889fba14f5f6da23b8 --- /dev/null +++ b/hls/send_recv.cpp @@ -0,0 +1,45 @@ +/* + * Copyright 2023-2025 Gerrit Pape (papeg@mail.upb.de) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <hls_stream.h>
+#include <ap_int.h>
+#include <ap_axi_sdata.h>
+
+#ifndef DATA_WIDTH_BYTES
+#define DATA_WIDTH_BYTES 64
+#endif
+
+#define DATA_WIDTH (DATA_WIDTH_BYTES * 8)
+// send_recv: forwards every word read from data_input unchanged to data_output
+extern "C"
+{
+    void send_recv(
+        hls::stream<ap_axiu<DATA_WIDTH, 0, 0, 0>> &data_input,
+        hls::stream<ap_axiu<DATA_WIDTH, 0, 0, 0>> &data_output,
+        unsigned int byte_size, // bytes per message; only whole DATA_WIDTH_BYTES chunks are forwarded
+        unsigned int iterations // number of messages of byte_size bytes to forward
+    ) {
+        const unsigned int chunks = byte_size / DATA_WIDTH_BYTES; // unsigned like the counter it bounds
+        send_recv_iterations:
+        for (unsigned int n = 0; n < iterations; n++) {
+            send_recv_chunks:
+            for (unsigned int i = 0; i < chunks; i++) {
+            #pragma HLS PIPELINE II = 1
+                data_output.write(data_input.read());
+            }
+        }
+    }
+}
diff --git a/host/Kernel.hpp b/host/Kernel.hpp index 67d92809d6252f7df0fe8ab29a95444dd39ae661..c244a3a07367f1212cec28787a1e05e755bb5ca7 100644 --- a/host/Kernel.hpp +++ b/host/Kernel.hpp @@ -115,3 +115,55 @@ private: Configuration config; };
+// Host-side handle for one send_recv compute unit.
+//
+// Only the two scalar kernel arguments (indices 2 and 3) are set from the host;
+// the stream arguments (indices 0 and 1) are not touched by this class.
+class SendRecvKernel
+{
+public:
+    // Opens compute unit send_recv_<instance> of the given xclbin on the given device.
+    SendRecvKernel(uint32_t instance, xrt::device &device, xrt::uuid &xclbin_uuid, Configuration &config) : instance(instance), config(config)
+    {
+        char name[100];
+        // sizeof keeps the bound in sync with the buffer declaration
+        snprintf(name, sizeof(name), "send_recv:{send_recv_%u}", instance);
+        kernel = xrt::kernel(device, xclbin_uuid, name);
+    }
+
+    // Creates an object without a kernel, e.g. a container element that is assigned later.
+    SendRecvKernel() {}
+
+    // Creates a new run and sets the message size (argument 2) and the iteration
+    // count (argument 3) configured for the given repetition.
+    void prepare_repetition(uint32_t repetition)
+    {
+        run = xrt::run(kernel);
+
+        run.set_arg(2, config.message_sizes[repetition]);
+        run.set_arg(3, config.iterations_per_message[repetition]);
+    }
+
+    // Starts the run created by prepare_repetition().
+    void start()
+    {
+        run.start();
+    }
+
+    // Waits up to config.timeout_ms for the run; returns true if the wait timed out.
+    bool timeout()
+    {
+        return run.wait(std::chrono::milliseconds(config.timeout_ms)) == ERT_CMD_STATE_TIMEOUT;
+    }
+
+    std::vector<char> data; // not used by any member function of this class
+private:
+    xrt::bo data_bo; // not used by any member function of this class
+    xrt::kernel kernel;
+    xrt::run run;
+    uint32_t instance = 0; // initialized so a default-constructed object holds a defined value
+    Configuration config;
+};
+
+
diff --git a/host/host_aurora_flow_ring.cpp b/host/host_aurora_flow_ring.cpp new file mode 100644 index 0000000000000000000000000000000000000000..706562d5425e0f187584dcd2d8891da83fc4d801 --- /dev/null +++ b/host/host_aurora_flow_ring.cpp @@ -0,0
+1,288 @@
+/*
+ * Copyright 2023-2024 Gerrit Pape (papeg@mail.upb.de)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include <mpi.h>
+
+#include "Aurora.hpp"
+
+#include "Configuration.hpp"
+#include "Results.hpp"
+#include "Kernel.hpp"
+
+// Generates `size` buffers of `num_bytes` pseudo-random bytes each.
+//
+// Buffer r is filled after seeding rand() with r, offset by the numeric value of the
+// SLURM_JOB_ID environment variable when it is set.
+std::vector<std::vector<char>> generate_data(uint32_t num_bytes, uint32_t size)
+{
+    const char *slurm_job_id = std::getenv("SLURM_JOB_ID");
+    // strtoul yields 0 for a non-numeric value, where std::stoi would throw
+    const unsigned int seed_offset = (slurm_job_id == nullptr) ? 0u : static_cast<unsigned int>(std::strtoul(slurm_job_id, nullptr, 10));
+    std::vector<std::vector<char>> data(size);
+    for (uint32_t r = 0; r < size; r++) {
+        std::srand(r + seed_offset);
+        data[r].resize(num_bytes);
+        for (uint32_t b = 0; b < num_bytes; b++) {
+            data[r][b] = std::rand() % 256;
+        }
+    }
+    return data;
+}
+
+// Checks both local Aurora cores on every rank and aborts the MPI job if any core
+// reports a problem. Collective over MPI_COMM_WORLD: every rank has to call it.
+//
+// timeout_ms is passed to Aurora::core_status_ok(); rank and size are the rank of the
+// calling process and the number of ranks.
+void check_core_status_global(Aurora &aurora_0, Aurora &aurora_1, size_t timeout_ms, int rank, int size)
+{
+    // barrier so timeout is working for all configurations
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    // gathered as int, which has an exact MPI datatype and avoids a variable-length array of bool
+    int local_core_ok[2];
+    local_core_ok[0] = aurora_0.core_status_ok(timeout_ms) ? 1 : 0;
+    local_core_ok[1] = aurora_1.core_status_ok(timeout_ms) ? 1 : 0;
+
+    // the receive buffer is only significant on the root rank
+    std::vector<int> core_ok(rank == 0 ? 2 * static_cast<size_t>(size) : 0);
+    MPI_Gather(local_core_ok, 2, MPI_INT, core_ok.data(), 2, MPI_INT, 0, MPI_COMM_WORLD);
+
+    if (rank == 0) {
+        int errors = 0;
+        for (int i = 0; i < (2 * size); i++) {
+            if (!core_ok[i]) {
+                std::cout << "problem with core " << i % 2 << " on rank " << i / 2 << std::endl;
+                errors += 1;
+            }
+        }
+        if (errors) {
+            MPI_Abort(MPI_COMM_WORLD, errors);
+        }
+    }
+}
+
+// Maps a device index (0 to 2) to the BDF string used to open that device.
+// The emulation flag is not evaluated. Throws std::invalid_argument for any other index.
+std::string bdf_map(uint32_t device_id, bool emulation)
+{
+    (void)emulation;
+    switch (device_id) {
+    case 0:
+        return "0000:a1:00.1";
+    case 1:
+        return "0000:81:00.1";
+    case 2:
+        return "0000:01:00.1";
+    default:
+        throw std::invalid_argument("Invalid device id");
+    }
+}
+
+// Appends one CSV row (job id, world size, iterations, message size, latency) to
+// ring_results.csv.
+//
+// With semaphore set, ring_results.csv is renamed to ring_results.csv.lock while the row
+// is written and renamed back afterwards, so writers using the same scheme take turns.
+// NOTE(review): in that mode the wait loop only ends once ring_results.csv exists;
+// confirm that the file is created before the first run.
+void write_results(bool semaphore, int32_t world_size, uint32_t iterations, uint32_t message_size, double latency)
+{
+    const char *results_path = "ring_results.csv";
+    const char *lock_path = "ring_results.csv.lock";
+
+    if (semaphore) {
+        while (std::rename(results_path, lock_path) != 0) {
+            // back off briefly instead of hammering the file system while waiting
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+        }
+    }
+
+    const char *job_id = std::getenv("SLURM_JOB_ID");
+    const std::string job_id_str(job_id == nullptr ? "none" : job_id);
+
+    const char *target_path = semaphore ? lock_path : results_path;
+    std::ofstream of(target_path, std::ios_base::app);
+    if (of) {
+        of << job_id_str << ","
+           << world_size << ","
+           << iterations << ","
+           << message_size << ","
+           << latency << std::endl;
+    } else {
+        std::cerr << "could not open " << target_path << std::endl;
+    }
+    of.close();
+
+    // the file is handed back even after a failed write, otherwise other writers would wait forever
+    if (semaphore && std::rename(lock_path, results_path) != 0) {
+        std::cerr << "could not rename " << lock_path << " to " << results_path << std::endl;
+    }
+}
+
+// Runs the ring benchmark: rank 0 drives the send and recv kernels and measures the
+// time, every other rank runs a send_recv kernel.
+int main(int argc, char *argv[])
+{
+    Configuration config(argc, argv);
+
+    MPI_Init(&argc, &argv);
+
+    int size, rank;
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+    bool emulation = (std::getenv("XCL_EMULATION_MODE") != nullptr);
+
+    // NOTE(review): finish_setup() is called again further down, also in emulation; confirm this is intended
+    if (emulation) {
+        config.finish_setup(64, false, emulation);
+    }
+
+    uint32_t device_id;
+    std::string device_bdf;
+    xrt::device device;
+    xrt::uuid xclbin_uuid;
+
+    // outside of emulation, ranks are mapped round-robin onto device indices 0 to 2
+    device_id = emulation ? 0 : (rank % 3);
+
+    device_bdf = bdf_map(device_id, emulation);
+
+    std::cout << "Programming device " << device_bdf << std::endl;
+    device = xrt::device(device_bdf);
+
+    // rank 0 uses the design with separate send and recv kernels, all other ranks the ring design
+    if (rank == 0) {
+        xclbin_uuid = device.load_xclbin("aurora_flow_test_hw.xclbin");
+    } else {
+        xclbin_uuid = device.load_xclbin("aurora_flow_ring_hw.xclbin");
+    }
+
+    std::vector<Aurora> aurora(2);
+    aurora[0] = Aurora(0, device, xclbin_uuid);
+    aurora[1] = Aurora(1, device, xclbin_uuid);
+
+    check_core_status_global(aurora[0], aurora[1], 3000, rank, size);
+
+    if (rank == 0) {
+        std::cout << "All links are ready" << std::endl;
+    }
+
+    config.finish_setup(aurora[0].fifo_width, aurora[0].has_framing(), emulation);
+
+    if (rank == 0) {
+        config.print();
+
+        std::cout << "Aurora core has framing " << (aurora[0].has_framing() ? "enabled" : "disabled")
+                  << " and input width of " << aurora[0].fifo_width << " bytes" << std::endl;
+    }
+
+    std::vector<std::vector<char>> data = generate_data(config.max_num_bytes, 2);
+
+    // create kernel objects
+    std::vector<SendKernel> send_kernels(2);
+    std::vector<RecvKernel> recv_kernels(2);
+    std::vector<SendRecvKernel> send_recv_kernels(2);
+
+    if (rank == 0) {
+        for (uint32_t i = 0; i < 2; i++) {
+            send_kernels[i] = SendKernel(i, device, xclbin_uuid, config, data[i]);
+            recv_kernels[i] = RecvKernel(i, device, xclbin_uuid, config);
+        }
+    } else {
+        for (uint32_t i = 0; i < 2; i++) {
+            send_recv_kernels[i] = SendRecvKernel(i, device, xclbin_uuid, config);
+        }
+    }
+
+    for (uint32_t r = 0; r < config.repetitions; r++) {
+        if (rank == 0) {
+            std::cout << "Repetition " << r << " with " << config.message_sizes[r] << " bytes" << std::endl;
+        }
+        try {
+            uint32_t i_send = 1;
+            uint32_t i_recv = 0;
+            SendKernel &send = send_kernels[i_send];
+            RecvKernel &recv = recv_kernels[i_recv];
+            SendRecvKernel &send_recv = send_recv_kernels[i_recv];
+            if (rank == 0) {
+                send.prepare_repetition(r);
+                recv.prepare_repetition(r);
+                recv.start();
+            } else {
+                send_recv.prepare_repetition(r);
+                send_recv.start();
+            }
+
+            // every rank has started its receiving kernel before rank 0 starts sending
+            MPI_Barrier(MPI_COMM_WORLD);
+            double start_time = get_wtime();
+            if (rank == 0) {
+                send.start();
+
+                if (recv.timeout()) {
+                    std::cout << "Recv " << i_recv << " timeout" << std::endl;
+                }
+
+                if (send.timeout()) {
+                    std::cout << "Send " << i_send << " timeout" << std::endl;
+                }
+
+                double end_time = get_wtime();
+
+                recv.write_back();
+
+                uint32_t errors = recv.compare_data(data[i_send].data(), r);
+                if (errors) {
+                    std::cout << errors << " byte errors" << std::endl;
+                }
+                double latency = (end_time - start_time);
+                double latency_per_iteration = latency / config.iterations_per_message[r];
+                // 8.0 keeps the bit count in double, so large message sizes cannot overflow an integer type
+                double gigabits_per_iteration = config.message_sizes[r] * 8.0 / 1000000000.0;
+                double gigabits = config.iterations_per_message[r] * gigabits_per_iteration;
+
+                std::cout << "Latency per iteration (us): " << (latency_per_iteration) * 1000000.0 << std::endl;
+                std::cout << "Throughput: " << gigabits / latency << std::endl;
+
+                write_results(config.semaphore, size, config.iterations_per_message[r], config.message_sizes[r], latency);
+
+            }
+        } catch (const std::runtime_error &e) {
+            std::cout << "caught runtime error: " << e.what() << std::endl;
+        } catch (const std::exception &e) {
+            std::cout << "caught unexpected error: " << e.what() << std::endl;
+        } catch (...) {
+            std::cout << "caught unknown exception" << std::endl;
+        }
+    }
+
+    MPI_Finalize();
+
+    return EXIT_SUCCESS;
+}
+
diff --git a/scripts/run_ring.sh b/scripts/run_pair_pressure.sh similarity index 59% rename from scripts/run_ring.sh rename to scripts/run_pair_pressure.sh index 644e2208ced29cc129283ed080fc252b28c944a6..d4fa87d93d56a479c72606a8062784a89cda9e5e 100755 --- a/scripts/run_ring.sh +++ b/scripts/run_pair_pressure.sh @@ -1,9 +1,8 @@ #!/usr/bin/bash #SBATCH -p fpga -#SBATCH -t 00:30:00 +#SBATCH -t 28:00:00 #SBATCH -N 1 #SBATCH --constraint=xilinx_u280_xrt2.14 -#SBATCH --tasks-per-node 6 #SBATCH --mail-type=ALL if ! command -v v++ &> /dev/null @@ -13,6 +12,7 @@ fi ./scripts/reset.sh -./scripts/configure_ring.sh +./scripts/configure_pair.sh -./host_aurora_flow_test -m 2 $@ +# approx.
2:15 hours
+./host_aurora_flow_test -d 1 -m 1 -b 268435456 -i 184320 $@
diff --git a/scripts/run_ring.py b/scripts/run_ring.py new file mode 100755 index 0000000000000000000000000000000000000000..ecbbcdcbf679a4f064d52cfbe68edb644fc9a37b --- /dev/null +++ b/scripts/run_ring.py @@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Generate and submit one Slurm job script per ring size from run_ring_template.sh.
+
+Usage: run_ring.py [NUM | START END [STEP]]; without arguments the sizes 3, 6 and 9 are used.
+"""
+import math
+import subprocess
+import sys
+import urllib.request
+
+
+def parse_ring_sizes(argv):
+    """Return the list of ring sizes selected by the command line arguments."""
+    if len(argv) == 2:
+        return [int(argv[1])]
+    start_num, end_num, step_num = 3, 9, 3
+    if len(argv) > 2:
+        start_num = int(argv[1])
+        end_num = int(argv[2])
+    if len(argv) > 3:
+        step_num = int(argv[3])
+    return list(range(start_num, end_num + 1, step_num))
+
+
+def create_ring_linkconfig(n):
+    """Return the --fpgalink options that connect n devices into a ring.
+
+    Device i is acl(i % 3) on node n(i // 3); its ch0 is linked to ch1 of device i - 1,
+    and device 0 wraps around to the last device.
+    """
+    nodes = ['n{:02d}'.format(i // 3) for i in range(n)]
+    devices = ['acl{}'.format(i % 3) for i in range(n)]
+    conf = ''
+    for i in range(n):
+        # index i - 1 is -1 for i == 0, which selects the last device and closes the ring
+        conf += '--fpgalink={}:{}:ch0-{}:{}:ch1 '.format(nodes[i], devices[i], nodes[i-1], devices[i-1])
+    return conf
+
+
+def main():
+    """Write scripts/run_ring_n<num>.sh for every selected size and submit it with sbatch."""
+    with open('./scripts/run_ring_template.sh') as f:
+        template = f.read()
+    for num in parse_ring_sizes(sys.argv):
+        linkconfig = create_ring_linkconfig(num)
+        script = template.format(num, math.ceil(num / 3), linkconfig, urllib.request.pathname2url(linkconfig))
+        path = './scripts/run_ring_n{}.sh'.format(num)
+        with open(path, 'w') as f:
+            f.write(script)
+        result = subprocess.run(['sbatch', path])
+        if result.returncode != 0:
+            print('sbatch failed for {}'.format(path), file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/run_ring_template.sh b/scripts/run_ring_template.sh new file mode 100644 index 0000000000000000000000000000000000000000..5e5439bb6ebbb6be7d1667a060559136aff42be9 --- /dev/null +++ b/scripts/run_ring_template.sh @@ -0,0 +1,19 @@
+#!/bin/bash
+
+#SBATCH -t 02:00:00
+#SBATCH -n {0}
+#SBATCH --ntasks-per-node 3
+#SBATCH -J "afr_{0}"
+#SBATCH -o aurora_flow_ring_n{0}_%j.out
+#SBATCH -p fpga
+#SBATCH -A hpc-lco-kenter
+#SBATCH --constraint xilinx_u280_xrt2.14
+
+## Load environment modules
+source env.sh
+
+#https://pc2.github.io/fpgalink-gui/index.html?import={3}
+changeFPGAlinksXilinx {2}
+srun -l -n {1} --spread-job ./scripts/reset.sh
+
+srun -l -n {0} ./host_aurora_flow_ring -s -m 2 -i 1024
diff --git a/scripts/synth.sh b/scripts/synth.sh index 111025c5193b04d600fc65ad299e76b15c20787f..08e16076e64de6d08dc0bf275f18f65ee13fa457 100755 --- a/scripts/synth.sh +++ b/scripts/synth.sh @@ -8,4 +8,4 @@ source env.sh -make xclbin -j6 $@ \ No newline at end of file +make aurora_flow_ring_hw.xclbin -j6 $@