diff --git a/08_bilateral_filter/Jason-Young123/.gitignore b/08_bilateral_filter/Jason-Young123/.gitignore
new file mode 100644
index 0000000..01e7a30
--- /dev/null
+++ b/08_bilateral_filter/Jason-Young123/.gitignore
@@ -0,0 +1,8 @@
+/obj/
+/useful/
+/useless/
+/analysis/
+/runTester
+/tmp_env/
+/result/
+init.mk
diff --git a/08_bilateral_filter/Jason-Young123/Makefile b/08_bilateral_filter/Jason-Young123/Makefile
new file mode 100644
index 0000000..15ff528
--- /dev/null
+++ b/08_bilateral_filter/Jason-Young123/Makefile
@@ -0,0 +1,145 @@
+WDIR = .
+SRC_DIR = $(WDIR)/src
+OBJ_DIR = $(WDIR)/obj
+INCLUDES_DIR = $(WDIR)/include
+ANALYSIS_DIR = $(WDIR)/analysis
+ENV_DIR = $(WDIR)/tmp_env
+SCRIPTS_DIR = $(WDIR)/scripts
+MK = $(WDIR)/init.mk
+RESULT_DIR = ./result
+LOG = $(RESULT_DIR)/runtime.log
+# init.mk is written by `make init` (HAS_CV_ENV, CVFLAGS, CVLIBS, CVRUNTIME_LIBS); it is optional until then.
+-include $(MK)
+
+PLATFORM ?= nvidia
+# Resolve the compiler, flags, preprocessor define and source suffix for the selected PLATFORM.
+ifeq ($(PLATFORM),nvidia)
+    PLATFORM_DEFINE := -DPLATFORM_NVIDIA
+    PREFIX := /usr/local/cuda-12.8/bin/
+    CXX := $(PREFIX)nvcc
+    GDB := $(PREFIX)cuda-gdb
+    NCU := $(PREFIX)ncu
+    NCU-GUI := $(PREFIX)ncu-ui
+    NSYS := $(PREFIX)nsys
+    CXXFLAGS := -Xcompiler -std=c++17 -O3 -use_fast_math -Xcudafe --diag_suppress=611 -Wno-deprecated-gpu-targets -I$(INCLUDES_DIR)
+    SUFFIX := cu
+else ifeq ($(PLATFORM),iluvatar)
+    PLATFORM_DEFINE := -DPLATFORM_ILUVATAR
+    CXX := clang++
+    CXXFLAGS := -std=c++17 -O3 -I$(INCLUDES_DIR) -Wno-implicit-const-int-float-conversion -Wno-literal-range
+    LIBS := -lcudart -I/usr/local/corex/include -L/usr/local/corex/lib64 -fPIC
+    SUFFIX := cu
+else ifeq ($(PLATFORM),moore)
+    PLATFORM_DEFINE := -DPLATFORM_MOORE
+    CXX := mcc
+    CXXFLAGS := -std=c++17 -O3 -I$(INCLUDES_DIR) -I/usr/local/musa/include
+    LIBS := -L$(ENV_DIR)/lib -L/usr/lib/gcc/x86_64-linux-gnu/11/ -L/usr/local/musa/lib -lmusart
+    RUNTIME_LIBS += /usr/local/musa/lib
+    SUFFIX := mu
+else ifeq ($(PLATFORM),metax)
+    PLATFORM_DEFINE := -DPLATFORM_METAX
+    CXX := mxcc
+    CXXFLAGS := -std=c++17 -O3 -I$(INCLUDES_DIR)
+    SUFFIX := maca
+else
+    $(error Unsupported PLATFORM '$(PLATFORM)' (expected: nvidia, iluvatar, moore, metax))
+endif
+# Optional per-platform fragment; presumably where PLATFORM_INIT_ENV (used by init) is defined - confirm.
+-include $(SCRIPTS_DIR)/$(PLATFORM).mk
+# Compile the OpenCV-dependent code only when init.mk recorded an OpenCV installation.
+ifeq ($(HAS_CV_ENV), yes)
+    CXXFLAGS += -DHAS_CV
+endif
+
+
+# One object per platform source file; := so the wildcard is evaluated once instead of on every use.
+SRCS := $(wildcard $(SRC_DIR)/*.$(SUFFIX))
+OBJS := $(patsubst $(SRC_DIR)/%.$(SUFFIX),$(OBJ_DIR)/%.o,$(SRCS))
+EXEC = $(WDIR)/runTester
+
+.DEFAULT_GOAL := all
+
+all: $(EXEC)
+
+# init: probe for OpenCV (pkg-config first, then a conda env in $(ENV_DIR)) and record the result in $(MK).
+# An undefined PLATFORM_INIT_ENV becomes the no-op `:`; a failed `conda create` falls through to HAS_CV_ENV = no.
+init:
+	@rm -rf $(ENV_DIR) $(MK)
+	@echo "[INIT] Checking OpenCV Environment..."
+	@$(if $(strip $(PLATFORM_INIT_ENV)),$(PLATFORM_INIT_ENV),:); \
+	if pkg-config --exists opencv4; then \
+		echo "[INFO] Built-in OpenCV Found."; \
+		echo "HAS_CV_ENV = yes" > $(MK); \
+		echo "CVFLAGS = $$(pkg-config --cflags opencv4)" >> $(MK); \
+		echo "CVLIBS = $$(pkg-config --libs opencv4)" >> $(MK); \
+		echo "CVRUNTIME_LIBS = $${LD_LIBRARY_PATH}" >> $(MK); \
+	elif command -v conda > /dev/null 2>&1 \
+		&& echo "[INFO] Using Conda to Create $(ENV_DIR)..." \
+		&& conda create --prefix $(ENV_DIR) -y libopencv; then \
+		echo "HAS_CV_ENV = yes" > $(MK); \
+		echo "CVFLAGS = -I$(ENV_DIR)/include/opencv4" >> $(MK); \
+		echo "CVLIBS = -L$(ENV_DIR)/lib -lopencv_core -lopencv_imgproc -lopencv_imgcodecs" >> $(MK); \
+		echo "CVRUNTIME_LIBS = $(ENV_DIR)/lib:$${LD_LIBRARY_PATH}" >> $(MK); \
+	else \
+		echo "[WARN] No OpenCV is Available on this Server."; \
+		echo "HAS_CV_ENV = no" > $(MK); \
+	fi
+	@echo "[INFO] Project Init Done."
+	@echo "[INFO] Enter \`make (run) PLATFORM=[nvidia, moore, metax, iluvatar]\` to Compile & Run."
+
+
+
+# Link: object files -> executable. Libraries come after the objects so that single-pass
+# linkers (and --as-needed defaults) can resolve the symbols the objects reference.
+$(EXEC): $(OBJS)
+	@echo "[INFO] $^ -> $@"
+	@$(CXX) $(CXXFLAGS) $(CVFLAGS) -o $@ $^ $(LIBS) $(CVLIBS)
+
+# Compile: source -> object. Every header under $(INCLUDES_DIR) is a prerequisite, so editing
+# a header rebuilds the objects; the source stays first, so $< is still the file to compile.
+$(OBJ_DIR)/%.o: $(SRC_DIR)/%.$(SUFFIX) $(wildcard $(INCLUDES_DIR)/*.h)
+	@mkdir -p $(@D)
+	@echo "[INFO] $< -> $@"
+	@$(CXX) $(CXXFLAGS) $(CVFLAGS) $(PLATFORM_DEFINE) -c -o $@ $<
+
+# Run via the `exec` target of the platform fragment (not shown here); output is echoed and copied to $(LOG).
+run: $(EXEC)
+	@mkdir -p $(RESULT_DIR)
+	@export LD_LIBRARY_PATH=$(CVRUNTIME_LIBS):$$LD_LIBRARY_PATH; \
+	$(MAKE) -s -f $(SCRIPTS_DIR)/$(PLATFORM).mk exec LD_LIBRARY_PATH="$$LD_LIBRARY_PATH" 2>&1 | tee $(LOG)
+	@echo "[INFO] Runtime log has been written to $(LOG)"
+
+
+# Debug/profile helpers. GDB, NSYS, NCU and NCU-GUI are assigned only in the PLATFORM=nvidia branch above.
+gdb: $(EXEC)
+	@$(GDB) $<
+
+
+
+
+# Nsight Systems: system-level profiler
+nsys: $(EXEC)
+	@mkdir -p $(ANALYSIS_DIR)
+	@$(NSYS) profile -t cuda,nvtx,osrt -o $(ANALYSIS_DIR)/$(EXEC) -f true $<
+	@$(NSYS) stats $(ANALYSIS_DIR)/$(EXEC).nsys-rep --force-export=true
+
+# Nsight Compute: kernel-level profiler
+ncu: $(EXEC)
+	@mkdir -p $(ANALYSIS_DIR)
+	@$(NCU) --print-details all --nvtx --call-stack --set full $<
+# Same collection, exported to a report file that is then opened in the Nsight Compute GUI.
+ncu-gui: $(EXEC)
+	@mkdir -p $(ANALYSIS_DIR)
+	@$(NCU) --nvtx --call-stack --set full -f --export $(ANALYSIS_DIR)/$(EXEC).ncu-rep $<
+	@$(NCU-GUI) $(ANALYSIS_DIR)/$(EXEC).ncu-rep
+
+# Remove build, analysis and run outputs.
+clean:
+	rm -rf $(OBJ_DIR) $(ANALYSIS_DIR) $(EXEC) $(RESULT_DIR)
+# As clean, plus the generated init.mk and the conda environment.
+cleanup:
+	rm -rf $(OBJ_DIR) $(ANALYSIS_DIR) $(EXEC) $(RESULT_DIR)
+	rm -rf $(MK) $(ENV_DIR)
+
+# These targets are commands, not files; declaring them keeps a same-named file from masking them.
+.PHONY: all init run gdb nsys ncu ncu-gui clean cleanup
diff --git a/08_bilateral_filter/Jason-Young123/README.md b/08_bilateral_filter/Jason-Young123/README.md
new file mode 100644
index 0000000..7b56c3e
--- /dev/null
+++ b/08_bilateral_filter/Jason-Young123/README.md
@@ -0,0 +1,38 @@
+# Bilateral-Filter
+
+
+## 介绍
+InfiniTensor 2025冬训练营项目阶段, CUDA方向题目八: 基于CUDA的实时图像双边滤波
+
+
+## 使用说明
+### step1: 环境初始化
+在开始编译前需配置本地openCV环境(优先使用系统自带openCV, 若缺失则尝试基于conda建立虚拟环境tmp_env并安装openCV; 若conda也不可用则项目将在缺少openCV对照的情况下直接运行)
+```bash
+make init PLATFORM=[nvidia, moore, metax, iluvatar]
+```
+
+### step2: 配置参数文件
+修改`tester/config.txt`以配置radius, sigma_spatial和sigma_color(当radius设置为 <= 0时,代码将自适应选取滤波半径); 
+在`tester/gray/4K`和`tester/rgb/4K`路径下存放待测试的4K(3840 * 2160)图片的bin文件,分别包含10张灰白壁纸和15张RGB壁纸(具体jpeg图片参见resource下的相应路径); +你也可以在`tester/gray/4K`和`tester/rgb/4K`中自行添加待测试.bin文件 + +### step3: 编译运行 +输出.bin文件和性能日志(runtime.log)将自动保存至`result`路径下 +```bash +make run PLATFORM=[nvidia, moore, metax, iluvatar] +``` + + +## 其他 +清除编译和运行结果 +```bash +make clean +``` + +清除所有生成文件(包括虚拟环境和辅助脚本) +```bash +make cleanup +``` + +基于各平台的结果和性能分析参见`report`路径 diff --git a/08_bilateral_filter/Jason-Young123/include/auxiliary.h b/08_bilateral_filter/Jason-Young123/include/auxiliary.h new file mode 100644 index 0000000..4b7a123 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/include/auxiliary.h @@ -0,0 +1,83 @@ +//文件读写等辅助函数 +#pragma once + +#include +#include +#include +#include +#include + + +bool getCfg(const std::string& cfgPath, int& radius, float& sigma_spatial, float& sigma_color); + +bool getBin(const std::string& binPath, int& width, int& height, uint8_t*& src1, myPixel*& src2); + +float binDiff(const std::string& binPath_ref, const std::string& binPath_test); + + +//生成bin文件 +template +bool genBin(const std::string& outBinPath, const T* img, int width, int height){ + std::ofstream file(outBinPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << outBinPath << std::endl; + return false; + } + + int channels = std::is_same_v ? 
3 : 1; + + file.write(reinterpret_cast(&width), sizeof(int)); + file.write(reinterpret_cast(&height), sizeof(int)); + file.write(reinterpret_cast(&channels), sizeof(int)); + if constexpr(std::is_same_v){ + for(size_t i = 0; i < width * height; ++i){ + uint8_t rgb[3] = {img[i].R(), img[i].G(), img[i].B() }; + file.write(reinterpret_cast(rgb), 3); + } + } + else{ + file.write(reinterpret_cast(img), width * height); + } + + file.close(); + //std::cout << "Successfully genBin: " << outBinPath << std::endl; + return true; +} + + +//大图坐标相对小图坐标而言,取值范围在-radius ~ len + radius - 1 +inline int mapReflect101(int p, int len){ + if(p < 0){//左越界 + return -p; + } + else if(p >= len){//右越界 + return 2 * (len - 1) - p; + } + else{//不越界 + return p; + } +} + +//将输入图像以reflect101模式进行边缘延拓 +template +T* Reflect101(const T* src, int width, int height, int radius){ + if(radius < 0 || radius >= width || radius >= height){ + std::cerr << "Error: unsupported radius" << std::endl; + return nullptr; + } + + int pixel_count_r101 = (width + 2 * radius) * (height + 2 * radius); + T* src_r101 = new T[pixel_count_r101]; + + for(int i = 0; i < height + 2 * radius; ++i){ + for(int j = 0; j < width + 2 * radius; ++j){ + int relative_i = i - radius; + int relative_j = j - radius; + int src_y = mapReflect101(relative_i, height);//纵坐标 + int src_x = mapReflect101(relative_j, width);//横坐标 + + src_r101[i * (width + 2 * radius) + j] = src[src_y * width + src_x]; + } + } + return src_r101; +} \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/include/bilateral.h b/08_bilateral_filter/Jason-Young123/include/bilateral.h new file mode 100644 index 0000000..026129a --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/include/bilateral.h @@ -0,0 +1,219 @@ +//基于gpu的双边滤波器 +#pragma once + +#include +#include +#include +#include +#include + + +//格式化输出 +#define RESET "\033[0m" +#define BOLD "\033[1m" +#define UNDERLINE "\033[4m" +#define RED "\033[31m" +#define GREEN "\033[32m" +#define BLUE 
"\033[34m" + +#define EMSG "\033[31m???????\033[0m" + + +//常量内存(lut)声明 +static __constant__ float spatial_lut_data[6][6]; +static __constant__ float color_lut_data[768]; + +__device__ inline float spatial_lut(int delta_y, int delta_x){ + return spatial_lut_data[delta_y][delta_x]; +} + + +__device__ inline float color_lut(int L1_diff){ + return color_lut_data[L1_diff]; +} + + + + +//主体函数: 基于gpu的双边滤波, T = uint8_t / myPixel +template +__global__ void bilateralFilter(T* src, T* dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq){ + int x_dst = blockIdx.x * blockDim.x + threadIdx.x;//小图dst的x坐标 + int y_dst = blockIdx.y * blockDim.y + threadIdx.y;//小图dst的y坐标 + int x_src = x_dst + radius;//大图src的x坐标 + int y_src = y_dst + radius;//大图src的y坐标 + + //一些宏定义, 计算常量和累加器 + int width_plus_radius = width + 2 * radius; + #define SRC_AT(y, x) src[(y) * width_plus_radius + (x)] + #define DST_AT(y, x) dst[(y) * width + (x)] + float weight_sum = 0; + float3 RGB_product_sum = make_float3(0.0f, 0.0f, 0.0f); + float Gray_product_sum = 0.0f; + + + //主要处理逻辑 + if(x_dst < width && y_dst < height){//如果在范围内才计算(考虑边界block里面的部分thread可能越界),注意是针对被生成对象(小图dst)而言 + T p = SRC_AT(y_src, x_src);//获取center_pixel + for(int i = - radius; i <= radius; i += 1){//行 + int bound_j = sqrtf(radius * radius - i * i);//代替if(){continue}, 消除分支分歧; + //int bound_j = dist_lut[radius][abs(i)];//注意这里查表和weight查表的区别(broudcast vs bank conflict) + //int bound_j = dist_lut1[radius][i + radius]; + for(int j = - bound_j; j <= bound_j; j += 1){ + //step1: 计算weight_spatial + int dist_sq = i * i + j * j; + //if(dist_sq > radius * radius){ + // continue; + //} + + //step2: 计算weight_color/ + T q = SRC_AT(i + y_src, j + x_src);//获取neighbor_pixel + float diff_color = q - p; + float w = __expf( - dist_sq / sigma_spatial_sq - diff_color * diff_color / sigma_color_sq);//这里直接计算性能显著高于查表 + //float weight_spatial = spatial_lut(abs(i), abs(j)); + //float weight_color = color_lut(abs(q - p)); + //float w = 
weight_spatial * weight_color; + + + weight_sum += w; + + if constexpr(std::is_same_v){//RGB + uint32_t q_val = *(uint32_t*)(&q); + RGB_product_sum.x += (q_val & 0xff) * w; RGB_product_sum.y += ((q_val >> 8) & 0xFF) * w; RGB_product_sum.z += ((q_val >> 16) & 0xFF) * w; + } + else{//灰度 + Gray_product_sum += q * w; + } + } + } + + float weight_inv = 1.0f / weight_sum; + if constexpr(std::is_same_v){ + DST_AT(y_dst, x_dst) = myPixel( + (uint8_t)__float2uint_rn(RGB_product_sum.x * weight_inv), + (uint8_t)__float2uint_rn(RGB_product_sum.y * weight_inv), + (uint8_t)__float2uint_rn(RGB_product_sum.z * weight_inv) + ); + } + else{ + DST_AT(y_dst, x_dst) = (uint8_t)__float2uint_rn(Gray_product_sum * weight_inv); + } + + } + + #undef SRC_AT + #undef DST_AT +} + + + + +#ifdef PLATFORM_NVIDIA +//不包含malloc和free的纯粹版本 +template +void runFilterPure(const T* h_src, T* d_src, T* d_dst, T* h_dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq, size_t src_size, size_t dst_size, cudaStream_t stream){ + if(h_src == nullptr){ + return; + } + + //copy from host to device + RUNTIME_CHECK(cudaMemcpyAsync(d_src, h_src, src_size, cudaMemcpyHostToDevice, stream)); + + //kernel + dim3 block_dim(16, 16);//blockDim固定为32 x 32 + dim3 grid_dim((width + 15) / 16, (height + 15) / 16);//gridDim根据img尺寸适配 + bilateralFilter<<>>(d_src, d_dst, radius, width, height, sigma_spatial_sq, sigma_color_sq); + + //copy from device to host + RUNTIME_CHECK(cudaMemcpyAsync(h_dst, d_dst, dst_size, cudaMemcpyDeviceToHost, stream)); + + cudaStreamSynchronize(stream); +} +#endif + + + + +#ifdef PLATFORM_MOORE +//不包含malloc和free的纯粹版本 +template +void runFilterPure(const T* h_src, T* d_src, T* d_dst, T* h_dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq, size_t src_size, size_t dst_size, musaStream_t stream){ + if(h_src == nullptr){ + return; + } + + //copy from host to device + RUNTIME_CHECK(musaMemcpyAsync(d_src, h_src, src_size, musaMemcpyHostToDevice, 
stream)); + + //kernel + dim3 block_dim(16, 16);//blockDim固定为32 x 32 + dim3 grid_dim((width + 15) / 16, (height + 15) / 16);//gridDim根据img尺寸适配 + bilateralFilter<<>>(d_src, d_dst, radius, width, height, sigma_spatial_sq, sigma_color_sq); + + //copy from device to host + RUNTIME_CHECK(musaMemcpyAsync(h_dst, d_dst, dst_size, musaMemcpyDeviceToHost, stream)); + + musaStreamSynchronize(stream); +} + +#endif + + + + +#ifdef PLATFORM_METAX +template +void runFilterPure(const T* h_src, T* d_src, T* d_dst, T* h_dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq, size_t src_size, size_t dst_size, mcStream_t stream){ + if(h_src == nullptr){ + return; + } + + //copy from host to device + RUNTIME_CHECK(mcMemcpyAsync(d_src, h_src, src_size, mcMemcpyHostToDevice, stream)); + + //kernel + dim3 block_dim(16, 16);//blockDim�~[��~Z为32 x 32 + dim3 grid_dim((width + 15) / 16, (height + 15) / 16);//gridDim�| ��~M�img尺寸�~@~B�~E~M + bilateralFilter<<>>(d_src, d_dst, radius, width, height, sigma_spatial_sq, sigma_color_sq); + + //copy from device to host + RUNTIME_CHECK(mcMemcpyAsync(h_dst, d_dst, dst_size, mcMemcpyDeviceToHost, stream)); + + mcStreamSynchronize(stream); +} + +#endif + + + +#ifdef PLATFORM_ILUVATAR +template +void runFilterPure(const T* h_src, T* d_src, T* d_dst, T* h_dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq, size_t src_size, size_t dst_size, cudaStream_t stream){ + if(h_src == nullptr){ + return; + } + + //copy from host to device + RUNTIME_CHECK(cudaMemcpyAsync(d_src, h_src, src_size, cudaMemcpyHostToDevice, stream)); + + //kernel + dim3 block_dim(16, 16);//blockDim�~[��~Z为32 x 32 + dim3 grid_dim((width + 15) / 16, (height + 15) / 16);//gridDim�| ��~M�img尺寸�~@~B�~E~M + bilateralFilter<<>>(d_src, d_dst, radius, width, height, sigma_spatial_sq, sigma_color_sq); + + //copy from device to host + RUNTIME_CHECK(cudaMemcpyAsync(h_dst, d_dst, dst_size, cudaMemcpyDeviceToHost, stream)); + + 
cudaStreamSynchronize(stream); +} +#endif + + + + +//非模板函数声明 +void runSingleCase(const std::filesystem::path& casePath, int radius, float sigma_spatial, float sigma_color, int warmup_round, int test_round); + +void runAll(const std::string testerPath, int warmup_round, int test_round); + +void runTester(const std::string& testerPath, int warmup_round, int test_round, bool only4K = true); diff --git a/08_bilateral_filter/Jason-Young123/include/myPixel.h b/08_bilateral_filter/Jason-Young123/include/myPixel.h new file mode 100644 index 0000000..a1453b1 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/include/myPixel.h @@ -0,0 +1,79 @@ +//辅助类定义 +#pragma once + +#include +#include + +//辅助类定义,四字节对齐 +class __align__(4) myPixel{ +private: + uint8_t _R; + uint8_t _G; + uint8_t _B; + uint8_t _A; +public: + __host__ __device__ + myPixel() : _R(0), _G(0), _B(0), _A(255) {}//默认初始化 + + __host__ __device__ + myPixel(uint8_t R, uint8_t G, uint8_t B): _R(R), _G(G), _B(B), _A(255){}//显式初始化 + + __host__ __device__ + myPixel(uint8_t common): _R(common), _G(common), _B(common), _A(common){}//显式初始化 + + __host__ __device__ + myPixel(const myPixel& other){//拷贝初始化 + _R = other._R; + _G = other._G; + _B = other._B; + _A = other._A; + //*(uint32_t*)this = *(const uint32_t*)&other; + } + + __host__ __device__ + ~myPixel(){}//析构 + + __host__ __device__ + myPixel& operator=(const myPixel& other){//拷贝赋值 + if(this != &other){ + _R = other._R; + _G = other._G; + _B = other._B; + _A = other._A; + //*(uint32_t*)this = *(const uint32_t*)&other; + } + return *this; + } + + __host__ __device__ + uint8_t R() const { return _R; } + + __host__ __device__ + uint8_t G() const { return _G; } + + __host__ __device__ + uint8_t B() const { return _B; } + +#if defined(PLATFORM_METAX) || defined(PLATFORM_ILUVATAR) + __host__ __device__ + int operator-(const myPixel& other) const{ + int r_d = (int)_R - (int)other._R; + int g_d = (int)_G - (int)other._G; + int b_d = (int)_B - (int)other._B; + //float sum = 
fabsf(r_d) + fabsf(g_d) + fabsf(b_d); + return abs(r_d) + abs(g_d) + abs(b_d); + } +#else + __device__ + int operator-(const myPixel& other) const { + unsigned int p1, p2; + p1 = *(unsigned int*)this; p2 = *(unsigned int*)&other; + + //莫名其妙的bug: 5090不支持__vabsdiffu4等__v*的SIMD函数,直接将两个操作数操作而非先拆分后操作? 只能采用PTX代替 + unsigned int result; + asm("vabsdiff4.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(result) : "r"(p1), "r"(p2), "r"(0)); + return result; + } +#endif + +}; diff --git a/08_bilateral_filter/Jason-Young123/include/myopencv.h b/08_bilateral_filter/Jason-Young123/include/myopencv.h new file mode 100644 index 0000000..281c0de --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/include/myopencv.h @@ -0,0 +1,88 @@ +//包含和cv有关的函数, 如果服务器不包含opencv库则忽略该文件 +#pragma once + +#include +#include +#include +#include + +#ifdef HAS_CV + +#ifdef PLATFORM_ILUVATAR + #include + namespace std { + template + auto max(T1 a, T2 b) -> typename std::common_type::type { + return (a > b) ? a : b; + } + } + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wliteral-range" +#endif + +#include + +#ifdef PLATFORM_ILUVATAR + #pragma clang diagnostic pop +#endif + +bool genBin_cv(const std::string& outBinPath, const cv::Mat& img); + +void genTester(); + +bool genImg_cv(const std::string& outImgPath, const cv::Mat& img, bool isRGB); + +//cv版本bilateralFilter +template +void runFilter_cv(const T* src, int radius, int width, int height, float sigma_spatial, float sigma_color, T* dst){ + if(src == nullptr){ + return; + } + + int type = std::is_same_v ? 
CV_8UC4 : CV_8UC1; + cv::Mat inputMat(height, width, type, const_cast(src)); + cv::Mat outputMat; + + if constexpr(std::is_same_v){ + cv::Mat tempBGR; + cv::cvtColor(inputMat, tempBGR, cv::COLOR_RGBA2BGR); // 4 -> 3 + cv::Mat filteredBGR; + cv::bilateralFilter(tempBGR, filteredBGR, 2 * radius + 1, sigma_color, sigma_spatial); + cv::cvtColor(filteredBGR, outputMat, cv::COLOR_BGR2RGBA); // 3 -> 4 + } + else{ + cv::bilateralFilter(inputMat, outputMat, 2 * radius + 1, sigma_color, sigma_spatial); + } + + size_t total_bytes = width * height * sizeof(T); + std::memcpy(dst, outputMat.data, total_bytes); +} + + +//生成图片, T = uint8_t/myPixel +template +bool genImg(const std::string& outImgPath, const T* img, int width, int height){ + if constexpr(std::is_same_v){ + cv::Mat imgRGB(height, width, CV_8UC3, (void*)img); + cv::Mat imgBGR; + cv::cvtColor(imgRGB, imgBGR, cv::COLOR_RGB2BGR); + if (!cv::imwrite(outImgPath, imgBGR)){ + return false; + } + } + else if constexpr(std::is_same_v) { + cv::Mat imgGray(height, width, CV_8UC1, (void*)img); + if (!cv::imwrite(outImgPath, imgGray)){ + return false; + } + } + else{ + std::cerr << "Error: Unsupported type for genImg." 
<< std::endl; + return false; + } + + std::cout << "Successfully genImg: " << outImgPath << std::endl; + return true; +} + +#endif diff --git a/08_bilateral_filter/Jason-Young123/include/utils.h b/08_bilateral_filter/Jason-Young123/include/utils.h new file mode 100644 index 0000000..e165e0b --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/include/utils.h @@ -0,0 +1,36 @@ +//调试辅助文件 +#pragma once + +#include + +#if defined(PLATFORM_NVIDIA) || defined(PLATFORM_ILUVATAR) +#include +#define RUNTIME_ERR_TYPE cudaError_t +#define RUNTIME_SUCCESS_CODE cudaSuccess +#define RUNTIME_GET_ERROR_STR cudaGetErrorString + +#elif defined(PLATFORM_MOORE) +#include +#define RUNTIME_ERR_TYPE musaError_t +#define RUNTIME_SUCCESS_CODE musaSuccess +#define RUNTIME_GET_ERROR_STR musaGetErrorString + +#elif defined(PLATFORM_METAX) +#include +#define RUNTIME_ERR_TYPE mcError_t +#define RUNTIME_SUCCESS_CODE mcSuccess +#define RUNTIME_GET_ERROR_STR mcGetErrorString + +#else +#error "Unknown PLATFORM for RUNTIME_CHECK" +#endif + +#define RUNTIME_CHECK(call) \ + do { \ + RUNTIME_ERR_TYPE err = call; \ + if (err != RUNTIME_SUCCESS_CODE) { \ + std::cerr << "Runtime error at " << __FILE__ << ":" << __LINE__ << " - " \ + << RUNTIME_GET_ERROR_STR(err) << "\n"; \ + exit(EXIT_FAILURE); \ + } \ + } while (0) diff --git "a/08_bilateral_filter/Jason-Young123/report/Bilateral Filter\350\256\276\350\256\241\346\212\245\345\221\212.pdf" "b/08_bilateral_filter/Jason-Young123/report/Bilateral Filter\350\256\276\350\256\241\346\212\245\345\221\212.pdf" new file mode 100644 index 0000000..4c28ec2 Binary files /dev/null and "b/08_bilateral_filter/Jason-Young123/report/Bilateral Filter\350\256\276\350\256\241\346\212\245\345\221\212.pdf" differ diff --git a/08_bilateral_filter/Jason-Young123/report/Report.md b/08_bilateral_filter/Jason-Young123/report/Report.md new file mode 100644 index 0000000..0423b34 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/report/Report.md @@ -0,0 +1,921 @@ +# Bilateral 
Filter设计报告 + + + +## 一.设计思路 + +### 1.1 算法 + +#### 1.1.1 基于给定半径 + +​ 通过修改`tester/config.txt`中radius、sigma_spatial、sigma_color可修改滤波参数。任何大于0的整数radius均视为合法半径,将被成功读取并参与计算: + +- **OpenCV** 方面,直接调用函数: + + ```cpp + cv::bilateralFilter(InputArray src, OutputArray dst, int d, double sigma_color, double sigma_spatial, int borderType = BORDER_DEFAULT)//d = 2 * radius + 1 + ``` + +相关源码可参考官方仓库:<[opencv/modules/imgproc/src at 4.x · opencv/opencv](https://github.com/opencv/opencv/tree/4.x/modules/imgproc/src)> + + + +- **CUDA** 方面,遵循以下公式,基于GPU硬件特征,并行运算每一个像素点的滤波结果: + $$ + I_{filtered}(x) = \frac{1}{W_p} \sum_{x_i \in \Omega} I(x_i) \exp\left( -\frac{\|I(x_i) - I(x)\|^2}{2\sigma_c^2} \right) \exp\left( -\frac{\|x_i - x\|^2}{2\sigma_s^2} \right) + $$ + 其中:$x$为窗口中心像素点,$I(·)$代表像素值,$\Omega$代表$(2r+1)*(2r+1)$的滤波窗口,$\sigma_c$即sigma_color,$\sigma_s$即sigma_spatial,$W_p$为归一化因子; + + + + 具体实现时,需注意以下细节: + + - 在OpenCV的官方实现中,滤波窗口并非半径为$r$的矩形,而是半径$r$的圆;为确保结果尽可能接近,CUDA设计时沿用OpenCV这一设定; + - 在OpenCV的官方实现中,空间域距离计算采用**向量$L_{2}$范数**,色彩域距离计算采用**向量$L_{1}$范数**;CUDA沿用相同设定; + - OpenCV的图像边界处理(borderType)默认采用**Reflect101**延拓<[BorderTypes in opencv::core - Rust](https://docs.rs/opencv/latest/opencv/core/enum.BorderTypes.html)>;CUDA沿用相同设定。 + + + +#### 1.1.2 自适应半径选取 + +​ 当`tester/config.txt`中radius设置为 <= 0时,设计将基于以下算法进行自动滤波半径选取(`src/bilateral.cu: autoSelectRadius`): + +- **$3\sigma$物理基准** + + 首先基于高斯分布的 **$3\sigma$ 原则** 设定基础半径: + $$ + Radius_{base} = 3 \times \sigma_{spatial} + $$ + 这确保了在空域权重下降到忽略不计(约 $0.01$)之前,窗口能够覆盖绝大部分有效的像素贡献区域; + +- **局部活跃度采样** + + 通过**跳跃式步进**对图像进行全局均匀采样(约2500采样点/pic),计算相邻像素的平均梯度$d_{avg}$($L_1$ 范数),从而量化图像噪声水平和平滑程度; + +- **自适应微调** + + 基于上述步骤获取的平均梯度对基础半径进行微调($factor$),对于整体平滑的图像适当收缩半径(0.6x ~ 1.0x),而对整体嘈杂的图像适当扩大半径(1.0x ~ 1.4x),并最终将半径约束在$[3, 10]$内,即: + $$ + factor = \text{clamp} \left( 0.6 + (d_{avg} - 2.0) \cdot \frac{1.4 - 0.6}{15.0 - 2.0}, \,\, 0.6, \,\, 1.4 \right)\\ + Radius_{final} = min[max(Radius_{base} \times factor, 3), 10] + $$ + + + + + +​ 例如,当sigma_spatial = 3时,对于下左图(整体平滑),自动选取半径为**radius 
= 6**;对于下右图(具有较多细致纹理),自动选取半径为**radius = 9**,符合预期。 + +
+ + + + + +### 1.2 系统框图 + +​ 整体系统框图如下所示,主要分为预处理、运算和后处理三部分: + +
+ + + + + + + +## 二.优化方法 + +### 2.1 合适的并行计算策略 + +​ 对于输入尺寸为width * height的图像,设计过程中先后尝试了3种并行化方案: + +- **方案1:每个线程(thread)处理一个滤波窗口** + +​ 设置 `blockDim = (16, 16), gridDim = (ceil(width/16), ceil(height/16))`;block沿x和y方向延伸直至铺满整个待处理图像,block内部横向和纵向各16个thread,每个thread负责计算图像上以某点为中心区域的滤波结果(通过二重循环处理窗口内每一个像素)。 + +​ 整体如下所示,其中蓝框标注为整个grid范围;红框标注为一个block范围(16 x 16),红圈代表一个thread;橙色标注为一个滤波窗口范围,同时代表一个thread的处理范围。 + +
+ + + +- **方案2:每个线程束(warp)处理一个滤波窗口** + +​ 在方案1中,每个线程需要对滤波窗口内每个像素点进行遍历,二重循环开销较大。考虑到双边滤波本质上是对数据的加权求和,因而可以考虑用一个线程束(warp)处理一个滤波窗口,并通过warp内寄存器级归约函数`__shfl_down_sync`完成最终的归约求和。 + +​ 具体而言,设置 `blockDim = (32, 4, 4), gridDim = (ceil(width/4), ceil(height/4))`;block同样沿x和y方向延伸至铺满整个待处理图像,block内部沿图像横向和纵向各4组thread,每组thread包含一个warp共32个子thread,每个warp负责计算图像上以某点为中心区域的滤波结果(此时warp内每个thread的工作量大幅减少,只需处理窗口内的个别像素点,并依赖最终的warp级归约得出最终结果)。 + +​ 整体如下所示,其中蓝框标注为整个grid范围;红框标注为一个block范围(32 x 4 x 4,图中为其平面映射),红圈代表一个warp共32个thread;橙色标注为一个滤波窗口范围,每个橙圈代表warp内一个thread。相较方案1,此时每个thread只需处理 $(2r+1)^2/32$ 个像素点,工作量为原来的1/32;最后由warp内归约得到该warp所对应中心像素点的滤波结果。 + +
+ + + +- **方案3:RGB通道级并行** + +​ 对于RGB图像,其具有三个通道,且每个通道计算流程完全相同,因此可以考虑在通道层面进行并行计算。具体而言,可以在方案1基础上修改 `gridDim = (ceil(width/16), ceil(height/4), 3)`,在不同的 `gridDim.z` 上处理不同颜色通道。 + +​ 整体如下所示,除 `gridDim` 维度增加外,其余处理思路同方案1。 + +
+ + + +​ 实验结果表明,在**常规滤波半径窗口下(~ 5)**,**方案1**效率最高。 + +​ **方案2** 虽然减少了单线程循环次数,但引入了频繁的寄存器级同步与`__shfl_down_sync`规约操作,在双边滤波这种计算密集型任务中,同步开销抵消了并行收益(尤其当滤波窗口半径较小时);且方案2中block数量剧增(是方案1的16倍),从而导致硬件调度和指令分发开销随之增加,进一步消磨了高并行度所带来的理论收益。 + +​ **方案3** 虽然增加了通道并行度,却带来了访存带宽的冗余消耗。通道级并行导致相同坐标像素被R/G/B三个独立线程块(对应不同`gridDim.z`)分3次加载,相比方案1这种单次加载、线程内寄存器级复用、原地分离并处理RGB分量的高度集成模式,方案3降级为显存级重复加载,带来了不可忽略的冗余开销。 + + + +- **结论** + +​ 在 **常规半径** 下,**方案1**更好地实现了访存效率与并行度的平衡,是本设计中首选的计算策略。 + +​ 但不可否认的是,随着滤波窗口半径增加(比如15 ~ 20的超大半径滤波),方案1的朴素双重循环开销会快速增加,此时可能需要探索更高效的并行策略,这也是后续优化探索的重要内容。 + + + +方案1的核心滤波函数`bilateralFilter`如下所示: + +```cpp +//主体函数: 基于gpu的双边滤波, T = uint8_t / myPixel +template +__global__ void bilateralFilter(T* src, T* dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq){ + int x_dst = blockIdx.x * blockDim.x + threadIdx.x;//小图dst的x坐标 + int y_dst = blockIdx.y * blockDim.y + threadIdx.y;//小图dst的y坐标 + int x_src = x_dst + radius;//大图src的x坐标 + int y_src = y_dst + radius;//大图src的y坐标 + + //一些宏定义, 计算常量和累加器 + int width_plus_radius = width + 2 * radius; + #define SRC_AT(y, x) src[(y) * width_plus_radius + (x)] + #define DST_AT(y, x) dst[(y) * width + (x)] + float weight_sum = 0; + float3 RGB_product_sum = make_float3(0.0f, 0.0f, 0.0f); + float Gray_product_sum = 0.0f; + + + //主要处理逻辑 + if(x_dst < width && y_dst < height){//如果在范围内才计算(考虑边界block里面的部分thread可能越界),注意是针对被生成对象(小图dst)而言 + T p = SRC_AT(y_src, x_src);//获取center_pixel + for(int i = - radius; i <= radius; i += 1){//行 + int bound_j = sqrtf(radius * radius - i * i);//代替if(){continue}, 消除分支分歧; + for(int j = - bound_j; j <= bound_j; j += 1){//列 + int dist_sq = i * i + j * j; + T q = SRC_AT(i + y_src, j + x_src);//获取neighbor pixel + float diff_color = q - p; + float w = __expf( - dist_sq / sigma_spatial_sq - diff_color * diff_color / sigma_color_sq);//这里合并后的直接计算性能显著高于查表 + weight_sum += w; + if constexpr(std::is_same_v){//RGB + uint32_t q_val = *(uint32_t*)(&q); + RGB_product_sum.x += (q_val & 0xff) * w; + RGB_product_sum.y += 
((q_val >> 8) & 0xFF) * w; + RGB_product_sum.z += ((q_val >> 16) & 0xFF) * w; + } + else{//灰度 + Gray_product_sum += q * w; + } + } + } + + float weight_inv = 1.0f / weight_sum; + if constexpr(std::is_same_v){ + DST_AT(y_dst, x_dst) = myPixel( + (uint8_t)__float2uint_rn(RGB_product_sum.x * weight_inv), + (uint8_t)__float2uint_rn(RGB_product_sum.y * weight_inv), + (uint8_t)__float2uint_rn(RGB_product_sum.z * weight_inv) + ); + } + else{ + DST_AT(y_dst, x_dst) = (uint8_t)__float2uint_rn(Gray_product_sum * weight_inv); + } + + } + + #undef SRC_AT + #undef DST_AT +} +``` + + + + + +### 2.2 对齐存储与向量化 + +- **基于显式对齐的结构体设计** + +​ 设计中为存储RGB图像多通道数据,引入了遵循 **4字节对齐** 准则的 `myPixel`类:除基础`_R`、`_G`、`_B`成员外,还包含冗余Alpha通道`_A`作为Padding,并使用`__align__(4)`关键字,强制使像素对象符合全局内存的对齐访问规范,有效避免跨缓存行访问,最大限度利用总线带宽: + +```cpp +class __align__(4) myPixel{ +private: + uint8_t _R; + uint8_t _G; + uint8_t _B; + uint8_t _A; +public: + //other functions +} +``` + + + +- **访存合并与读取宽度优化** + +​ 对于RGB图像,双边滤波器会单独处理R、G、B通道,因此在主循环内,需逐次加载`myPixel`的`_R`、`_G`、`_B`成员。这通常会编译生成3次独立访存指令`LDG.E.U8`(加载8位无符号整型),带来更高的指令发射、访存延迟和流水线停顿代价。为避免这一问题,设计中将`myPixel`对象强制理解为`uint32_t`类型而仅进行1次访存`LDG.E.32`(加载32位整型),此后通过更高效的位运算(移位+掩码)进行`_R`、`_G`、`_B`成员拆分,用低成本运算代替高成本访存,实现了负载均衡: + +```cpp +//核函数内 +float3 RGB_product_sum = make_float3(0.0f, 0.0f, 0.0f);//全局累加器,对应R、G、B三个通道 + +//对于滤波窗口内每个点: +for(int i = -radis; i <= radius; ++i){ + for(int j = -radius; j <= radius; ++j){ + ...//进行权重w计算等 + T q = SRC_AT(i + y_src, j + x_src);//加载neighbor pixel + uint32_t q_val = *(uint32_t*)(&q);//强制解释为uint32_t + RGB_product_sum.x += (q_val & 0xff) * w;//获取R分量 + RGB_product_sum.y += ((q_val >> 8) & 0xFF) * w;//G + RGB_product_sum.z += ((q_val >> 16) & 0xFF) * w;//B + ... 
+ } +} +``` + + + +- **采用内联向量指令计算色彩距离** + +​ 如 **[1.1.1](#target6)** 中所述,OpenCV计算色彩域距离时采用了$L_{1}$范数,这恰好对应CUDA的向量指令`vabsdiff4`——其具备SIMD特点,能够在一个时钟周期内同时计算两组4字节数据(如`char4`或者这里定义的`myPixel`)对应分量的绝对差值。基于此,本设计为`myPixel`设计了如下减法重载函数,有效提升了$L_{1}$范数计算速度: + +```cpp +__device__//注意vabsdiff4只适用于设备端 +int myPixel::operator-(const myPixel& other) const { + unsigned int p1, p2; + p1 = *(unsigned int*)this; p2 = *(unsigned int*)&other; + + unsigned int result; + asm("vabsdiff4.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(result) : "r"(p1), "r"(p2), "r"(0)); + return result; +} +``` + +​ 注:1. 理论上可以用功能相同的封装函数`__vabsdiff4u()`代替内联汇编指令,但不知为何在 **Nvidia** 平台,`__v*`等向量指令计算结果都存在问题,原因待进一步研究; + +2. **Metax** 和 **Iluvatar** 平台暂不支持向量指令,只能采用传统计算方式: + +```cpp +__host__ __device__ +int myPixel::operator-(const myPixel& other) const{ + int r_d = (int)_R - (int)other._R; + int g_d = (int)_G - (int)other._G; + int b_d = (int)_B - (int)other._B; + return abs(r_d) + abs(g_d) + abs(b_d); +} +``` + + + +- **总结** + +​ 采用上述对齐存储、高效加载和向量化运算等优化方法后(主要针对`myPixel`和RGB图),整体滤波速度能获得 **10% ~ 15%** 的提升。 + + + + + +### 2.3 循环边界收缩 + +​ 如前所述,在OpenCV实现中,滤波窗口是半径为radius的圆而非矩形,因此在CUDA核函数的主循环中,需判断每个邻域点是否落在中心像素半径为radius的圆内: + +```cpp +//核函数内,对于滤波窗口内每个点: +for(int i = -radis; i <= radius; ++i){ + for(int j = -radius; j <= radius; ++j){ + if(i * i + j * j >= radius * radius){ + continue;//如果落在圆外,则直接跳过 + } + ... + } +} +``` + +​ 但上述方案实际上会导致矩形四角处约$1- \frac{\pi}{4}≈21\%$的空转迭代,即便有`continue`语句也无法避免条件跳转与谓词掩码计算,从而增大了循环和分支预测开销。为此,设计中采用边界函数 $bound\_j = \sqrt{R^2 - i^2}$ 设置空域窗口遍历范围,替代了传统`if...continue`的逻辑判断: + +```cpp +//核函数内,对于滤波窗口内每个可能点: +for(int i = -radis; i <= radius; ++i){ + //给出边界函数,代替if(){continue},有效减少循环和分支语句 + int bound_j = sqrtf(radius * radius - i * i); + for(int j = -bound_j; j <= bound_j; ++j){ + ... 
+ } +} +``` + +​ 基于此,循环边界被主动收缩,消除隐式控制流分歧的同时减少了约$21\%$的无效指令发射,使得GPU的取指/译码单元能够保持更高的流水吞吐率,加速了滤波效率。虽然相比`if...continue`方案增加了`sqrtf`的浮点运算,但经测试,其对于控制流的优化收益远超算数指令成本。 + + + +- **总结** + + 采用上述收缩循环边界的控制流优化方案后,整体滤波速度能获得 **25% ~ 30%** 的提升; + + + + + +### 2.4 计算-访存权衡 + +- **用查表代替直接计算** + + ​ 在 **[2.3](#target2)** 中,计算循环边界$bound\_j$时涉及到浮点运算`sqrtf`、存在一定时间成本,此时一个常用的优化方案是用查找表(Look-up Table,LUT)代替直接运算,即将预计算所得结果放于常量内存(`__const__`)内,需要计算时用数组索引访存以快速获取结果,如下所示: + + ```cpp + //核函数外: 定义lut(最大支持radius = 5,仅作示例) + __constant__ int dist_lut[6][6] = { + {0, 0, 0, 0, 0, 0}, // radius = 0 + {1, 0, 0, 0, 0, 0}, // radius = 1 + {2, 1, 0, 0, 0, 0}, // radius = 2 + {3, 2, 2, 0, 0, 0}, // radius = 3 + {4, 3, 3, 2, 0, 0}, // radius = 4 + {5, 4, 4, 4, 3, 0} // radius = 5 + }; + + //核函数内: + for(int i = -radis; i <= radius; ++i){ + int bound_j = dist_lut[radius][abs(i)];//用查表替代直接计算 + for(int j = -bound_j; j <= bound_j; ++j){ + ... + } + } + ``` + + ​ 经测试,此处采用LUT后,滤波速度能进一步提升约 **5%** 。但考虑到LUT尺寸会随支持半径范围增大而显著增加,为接受任意滤波半径,最终设计中暂未采用LUT。(但对于半径受限场景,如限制 $radius \le 5$时,可以采用该方案) + + + +- **查表并非总能提升效率** + +​ 在滤波循环内也会涉及到指数、平方等算数运算,一个自然的想法是将这些运算也用LUT代替,比如通过查表直接获取空域权重和色域权重: +$$ +weight\_spatial = \exp\left( -\frac{\|x_i - x\|^2}{2\sigma_s^2} \right)\\ +weight\_color = \exp\left( -\frac{\|I(x_i) - I(x)\|^2}{2\sigma_c^2} \right) +$$ +相关实现如下: + +```cpp +//核函数外: 定义spatial_lut和color_lut,后续在main函数内进行初始化 +__constant__ float spatial_lut_data[6][6]; +__constant__ float color_lut_data[768]; + +__device__ inline float spatial_lut(int delta_y, int delta_x){ + return spatial_lut_data[delta_y][delta_x]; +} + +__device__ inline float color_lut(int L1_diff){ + return color_lut_data[L1_diff]; +} + +//核函数内: +T p = SRC_AT(y_src, x_src);//center pixel +for(int i = -radis; i <= radius; ++i){ + int bound_j = sqrtf(radius * radius - i * i); + for(int j = -bound_j; j <= bound_j; ++j){ + ... 
+ T q = SRC_AT(i + y_src, j + x_src);//neighbor pixel + float weight_spatial = spatial_lut(abs(i), abs(j));//直接查表获取weight_spatial + float weight_color = color_lut(abs(q - p));//直接查表获取weight_color + float w = weight_spatial * weight_color; + ... + } +} +``` + +​ 但实测结果表明,上述代码会使滤波效率降低 **30%** 左右。其原因在于,这里的LUT(尤其是`color_lut`)不具备warp一致性。具体而言,`color_lut`的索引取决于`diff_color`,而在本设计中,同一warp里的线程面对的是不同的邻域像素(滤波窗口),产生的`diff_color`(访存地址)几乎是随机分布的、极易产生Bank Conflict,从而导致访存序列化和常量内存控制器的阻塞。而之前的dist_lut却具备访存地址一致性:同一warp内线程执行到循环的某一步时,`i`的值是完全相同的;当warp内所有线程请求同一地址时,硬件可通过Broadcast机制进行高效访存,从而保证滤波效率。 + +​ 除此之外,CUDA具备高效的指数运算内嵌函数(Intrinsic Function)`__expf`,其可以直接映射到硬件的SFU(Special Function Unit)上,具备极高的吞吐量;且直接运算可以通过$e^a \times e^b = e^{a+b}$ 将两次指数操作合并,而采用LUT则难以实现这种操作融合。总体而言,采用如下 **合并后的直接指数运算** 可以获得更高的滤波效率: +```cpp +//核函数内: +T p = SRC_AT(y_src, x_src);//center pixel +for(int i = -radius; i <= radius; ++i){ + int bound_j = sqrtf(radius * radius - i * i); + for(int j = -bound_j; j <= bound_j; ++j){ + ... + T q = SRC_AT(i + y_src, j + x_src);//获取neighbor_pixel + float diff_color = q - p; + //为避免平方运算, sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial直接作为函数入口参数; sigma_color_sq同理 + float w = __expf( - dist_sq / sigma_spatial_sq - diff_color * diff_color / sigma_color_sq); + ... 
+ } +} +``` + + + +- **总结** + + ​ 在核函数内,通过合理权衡访存(LUT)和运算操作,如在地址一致无冲突时用查表代替浮点运算、在存在Bank Conflict时利用高效内嵌函数直接运算,可以将整体滤波效率提升约 **30%**; + + + + + +### 2.5 高效内存分配与异步传输 + +​ CUDA滤波的另一大开销在于内存分配和数据在D&H(Device & Host)间搬运,可针对二者分别进行优化。 + +- **锁页内存登记** + +​ 通常在Host端用`new`分配内存时,所得内存为 **可分页内存(Pageable Memory)**,其在传输前需要由驱动程序先拷贝至内部临时缓冲区,带来一定的时间开销。而 **锁页内存(Pinned Memory)** 则允许DMA控制器直接访问Host内存,消除这一隐式拷贝。具体而言,可以通过`cudaHostRegister`将已分配的Host内存手动提升为锁页内存(并在结束后配合`cudaHostUnregister`进行释放),确立DMA直接访问路径,显著提升PCIe总线的有效吞吐量: + +```cpp +//以RGB图为例,分配Host端内存(h_src随后填入源图像数据) +myPixel *h_src = new myPixel[width * height], *h_dst = new myPixel[width * height]; + +//将Host端内存提升为锁页内存 +cudaHostRegister(h_src, src_size, cudaHostRegisterDefault); +cudaHostRegister(h_dst, dst_size, cudaHostRegisterDefault); + +//以下进行H2D拷贝、核函数启动、D2H拷贝等 +myPixel *d_src, *d_dst; +... + +//核函数运行完毕后,进行锁页内存释放 +cudaHostUnregister(h_src); +cudaHostUnregister(h_dst); +``` + + + +- **异步数据传输** + + ​ 传统D2H和H2D内存拷贝需调用`cudaMemcpy`函数,而这是 **阻塞操作** ,会妨碍主机端指令流的后续执行,导致 CPU 必须空等数据搬运完成,引发 CPU - GPU 间的协同失步;取而代之,使用`cudaMemcpyAsync`配合cuda流类型`cudaStream_t`可以实现数据拷贝事件的 **非阻塞分发**,即向接收端提交传输任务后立刻返回、不等待传输完成。这允许发送方继续执行后续的逻辑调度,最大程度实现传输与计算重叠,从而提升滤波效率。异步封装后核函数如下所示(注意`stream`作为了函数入口参数;在调用时需首先通过`cudaStreamCreate`创建`cudaStream_t stream`并传入): + + ```cpp + template <typename T> + void runFilterPure(const T* h_src, T* d_src, T* d_dst, T* h_dst, int radius, int width, int height, float sigma_spatial_sq, float sigma_color_sq, size_t src_size, size_t dst_size, cudaStream_t stream){ + if(h_src == nullptr){ + return; + } + + //H2D异步数据拷贝 + RUNTIME_CHECK(cudaMemcpyAsync(d_src, h_src, src_size, cudaMemcpyHostToDevice, stream)); + + //启动核函数 + dim3 block_dim(16, 16);//blockDim固定为16 x 16 + dim3 grid_dim((width + 15) / 16, (height + 15) / 16);//gridDim根据img尺寸适配 + bilateralFilter<<<grid_dim, block_dim, 0, stream>>>(d_src, d_dst, radius, width, height, sigma_spatial_sq, sigma_color_sq); + + //D2H异步数据拷贝 + RUNTIME_CHECK(cudaMemcpyAsync(h_dst, d_dst, dst_size, cudaMemcpyDeviceToHost, stream)); + + cudaStreamSynchronize(stream); + } + ``` + + ​ 更重要的是,这种异步传输方式为 
**多流(Multi-Streaming)**并行提供了基础,有望在处理视频时实现“传输N+1帧的同时计算第N帧”,让预处理、数据搬运和GPU端运算充分流水,最大程度提升效率;这也是后续的重要改进方向。 + + + +- **总结** + + ​ 通过上述高效内存分配和数据传输机制,可以将整体效率提升 **15% ~ 20%**; + + + + + +### 2.6 尝试过但未采用的优化手段 + +#### 2.6.1 共享内存 + +​ 为进一步提升访存效率,设计中尝试先将像素值加载至共享内存(Shared Memory),后续在核函数内直接访问SM以快速获取数据。具体而言,在进入核函数`bilateralFilter`后,由一个block(16 x 16)协作加载所有可能被处理的像素点(由若干滤波窗口交叠而成),如下所示: + +```cpp +//核函数内: +constexpr size_t SMEM_SIZE = 32;//16 + 2 * 8, 最大支持半径可达8 (仅作示例) +__shared__ char s_buffer[32 * 32 * sizeof(T)]; +T* neighbors = (T*)s_buffer; +//cooperative load, 每个block内线程一起完成(16 + 2r) * (16 + 2r)区域像素点的加载 +for(int i = ty; i < SMEM_SIZE; i += blockDim.y){//行 + for(int j = tx; j < SMEM_SIZE; j += blockDim.x){//列 + int x_cur = blockIdx.x * blockDim.x + j; + int y_cur = blockIdx.y * blockDim.y + i; + if(x_cur < (width + 2 * radius) && y_cur < (height + 2 * radius)){ + neighbors[i * 32 + j] = SRC_AT(y_cur, x_cur); + } + } +} +__syncthreads(); + +//后续对于该block内每个thread, 访问neighbor pixel可以由SRC_AT改为访问s_buffer +``` + +​ 但测试结果表明,上述引入共享内存的方案并未达到预期加速效果,甚至在中小滤波半径场景下效率明显降低,可能原因在于:双边滤波本身为计算密集型应用,每个像素对于一个线程而言只会被访问一次,因而填充SM所带来的固有消耗需要在滤波半径足够大时才会被抵消;共享内存填充/访问时需进行额外的坐标变换与索引计算,一定程度上增加了ALU的负担;此外`__syncthreads()`所带来的强制块内线程同步也会造成流水线停顿。 + +​ 综合考虑,本设计最终未采用Shared Memory进行数据加载;在未来探索中,尤其面对大/超大半径滤波时,Shared Memory可能将成为更适合的选择。 + + + +#### 2.6.2 向量化加载 + +​ 具体参见 **四.Ncu分析:**[**相关优化尝试**](#target1)。 + + + + + +## 三. 
性能指标分析 + +### 3.0 总览 + +​ 本设计在 **Nvidia(英伟达)**、**Moore(摩尔线程)**、**Metax(沐曦)**、**Iluvatar(天数)**平台上均能实现以下功能: + +- 支持 **任意尺寸** 的灰度或RGB图像(.bin)输入; +- 支持 **任意半径** 配置,且当radius <= 0时根据输入图像特征 **自适应选取半径** ; +- 支持 **任意sigma_spatial和sigma_color** 配置; + + + +​ 在上述四大平台,基于**4K UHD(3840 * 2160)图像测试集(Gray/RGB)**,本设计可达到以下性能: + +- **计算精度:**和OpenCV的bilateralFilter标杆实现相比,各平台计算误差**MAE均小于1**,符合要求 + - **RGB图像:**全部测试半径下,**MAE <= 1e-5**; + - **Gray图像:**当测试半径 > 2时,**MAE <= 1e-5**;当半径 <= 2时,可能由于算法差异,**MAE ≈ 0.5**; + + - **吞吐量与加速比:** + - **Nvidia平台:**峰值吞吐量 **3254.41MP/s (Gray)** 和 **1650.54MP/s (RGB)**,峰值加速比 **4.27553x (Gray)** 和 **17.9033x (RGB)** + - **Moore平台:**峰值吞吐量 **12522.78MP/s (Gray)** 和 **4833.82MP/s (RGB)**,峰值加速比 **11.8315x (Gray)** 和 **30.1175x (RGB)** + - **Metax平台:**峰值吞吐量 **9551.09MP/s (Gray)** 和 **4361.43MP/s (RGB)**,峰值加速比 **2.80345x (Gray)** 和 **21.1496x (RGB)** + - **Iluvatar平台:**峰值吞吐量 **8594.56MP/s (Gray)** 和 **2662.75MP/s (RGB)**,峰值加速比 **8.35226x (Gray)** 和 **11.4367x (RGB)** + + + + + +### 3.1 误差 + +​ 基于4K UHD图像测试集,取sigma_spatial = 3.0,sigma_color = 30.0,各平台误差分析如下: + + + +#### 3.1.1 Nvidia + +​ 在Nvidia平台上(A100),各测试半径下典型计算误差(MAE)如下表所示: + +| 滤波半径 | Gray | RGB | +| :------: | :---------: | :---------: | +| 1 | *0.481447* | 8.43943e-07 | +| 2 | *0.499052* | 8.31887e-06 | +| 3 | 4.5814e-06 | 3.73746e-06 | +| 4 | 2.65239e-06 | 2.93371e-06 | +| 5 | 2.77296e-06 | 5.14403e-06 | +| 6 | 5.66647e-06 | 7.31417e-06 | +| 7 | 7.71605e-06 | 4.58140e-06 | +| 8 | 8.07774e-06 | 9.16281e-06 | +| 9 | 5.30478e-06 | 7.15342e-06 | +| 10 | 7.35436e-06 | 6.91229e-06 | + +​ 除小半径Gray图外,MAE均不超过1e-5; + + + +#### 3.1.2 Moore + +​ 在Moore平台上(S5000),各测试半径下典型计算误差(MAE)如下表所示: + +| 滤波半径 | Gray | RGB | +| :------: | :---------: | :---------: | +| 1 | *0.493864* | 2.57202e-6 | +| 2 | 0.497564 | 2.41450e-6 | +| 3 | 4.46084e-6 | 5.70666e-6 | +| 4 | 8.80112e-06 | 5.14403e-06 | +| 5 | 7.47492e-06 | 4.5814e-06 | +| 6 | 5.42535e-06 | 5.14403e-06 | +| 7 | 6.51042e-06 | 6.59079e-06 | +| 8 | 6.02816e-06 | 3.77765e-06 | +| 9 
| 6.02816e-06 | 5.30478e-06 | +| 10 | 3.49633e-06 | 3.17483e-06 | + +​ 除小半径Gray图外,MAE均不超过1e-5; + + + +#### 3.1.3 Metax + +​ 在Metax平台上(C500),各测试半径下典型计算误差(MAE)如下表所示: + +| 滤波半径 | Gray | RGB | +| :------: | :---------: | :---------: | +| 1 | *0.486975* | 1.24582e-06 | +| 2 | *0.49526* | 8.23849e-06 | +| 3 | 4.94309e-06 | 5.94779e-06 | +| 4 | 7.11323e-06 | 3.73746e-06 | +| 5 | 6.51042e-06 | 6.95248e-06 | +| 6 | 6.14873e-06 | 5.66647e-06 | +| 7 | 7.95718e-06 | 6.10854e-06 | +| 8 | 4.21971e-06 | 7.03286e-06 | +| 9 | 4.94309e-06 | 4.74216e-06 | +| 10 | 6.63098e-06 | 4.34028e-06 | + +​ 除小半径Gray图外,MAE均不超过1e-5; + + + +#### 3.1.4 Iluvatar + +​ 在Iluvatar平台上(BI100),各测试半径下典型计算误差(MAE)如下表所示: + +| 滤波半径 | Gray | RGB | +| :------: | :---------: | :---------: | +| 1 | 1.68789e-06 | 4.42065e-07 | +| 2 | 6.51042e-06 | 2.81314e-06 | +| 3 | 2.53183e-06 | 4.38047e-06 | +| 4 | 4.09915e-06 | 4.58140e-06 | +| 5 | 3.01408e-06 | 4.38047e-06 | +| 6 | 7.23380e-06 | 2.57202e-06 | +| 7 | 6.51042e-06 | 3.49633e-06 | +| 8 | 9.88619e-06 | 5.94779e-06 | +| 9 | 6.99267e-06 | 4.25990e-06 | +| 10 | 7.11323e-06 | 5.10385e-06 | + + + + + +### 3.2 吞吐量 + +​ 基于4K UHD图像测试集,各平台吞吐量分析如下;其中用红线标注出了**4K 60fps**所对应的吞吐量(~ 497.7 MP/s): + + + +#### 3.2.1 Nvidia + +
+ +​ 由图可知,吞吐量随滤波半径的增长而明显下降,从Radius = 1时的峰值 **3254.41MP/s (Gray)** 和 **1650.54MP/s (RGB)** 逐步滑落到Radius = 10的最低值 **632.35MP/s (Gray)** 和 **405.94MP/s (RGB)**; + +​ 由于通道数不同,灰度图的吞吐量维持在RGB图的**1.5x ~ 2x**左右; + +​ 当**Radius <= 8**时,本设计的吞吐量可以达到4K UHD 60fps的工业级标准; + + + +#### 3.2.2 Moore + +
+ +​ 类似Nvidia平台,吞吐量随滤波半径增长而显著下降,从Radius = 1时的峰值 **12522.78MP/s (Gray)** 和 **4833.82MP/s (RGB)** 逐步滑落到Radius = 10的最低值 **885.31MP/s (Gray)** 和 **453.54MP/s (RGB)**; + +​ 灰度图的吞吐量维持在RGB图的**2x ~ 3x**左右; + +​ 此外,**Moore平台的吞吐量整体都高于Nvidia,平均提升可达1.5x(四个平台中最高)**;相应地,当**Radius <= 9**时,吞吐量均可达到4K 60fps标准; + + + +#### 3.2.3 Metax + +
+ +​ 类似前述平台,吞吐量随滤波半径增长而显著下降,从Radius = 1时的峰值 **9551.09MP/s (Gray)** 和 **4361.43MP/s (RGB)** 逐步滑落到Radius = 10的最低值 **425.89MP/s (Gray)** 和 **330.19MP/s (RGB)**; + +​ 灰度图的吞吐量维持在RGB图的**1.1x ~ 2x**左右; + +​ 整体而言,Metax平台的吞吐量介于Nvidia和Moore之间;当**Radius <= 8**时,吞吐量可达到4K 60fps标准; + + + +#### 3.2.4 Iluvatar + +
+ +​ 吞吐量同样随滤波半径增长而显著下降,从Radius = 1时的峰值 **8594.56MP/s (Gray)** 和 **2662.75MP/s (RGB)** 逐步滑落到Radius = 10的最低值 **523.72MP/s (Gray)** 和 **291.72MP/s (RGB)**; + +​ 灰度图的吞吐量维持在RGB图的**1.5x ~ 3.5x**左右; + +​ Iluvatar平台的吞吐量接近但略低于Metax;当**Radius <= 7**时,吞吐量可达到4K 60fps标准; + + + +#### 3.2.5 对比与总结 + +​ 基于RGB图片,选取几个典型半径,横向对比四大平台的吞吐量,结果如下图所示: + +
+ +​ 结论: + +- **三大国产平台是小半径滤波的首选:** + +​ 在**小半径(Radius <= 2)**下,**三大国产平台** 展现出较强的吞吐量优势,性能可达 **Nvidia** 的 **1.5x ~ 2x** 。这表明国产GPU架构在处理中低复杂度的并行任务时,具备更优的指令发射效率或内存带宽利用率,是轻量级实时滤波的首选; + +- **性能随计算强度增加而衰减的特性:** + +​ 随滤波半径增加,所有平台的吞吐量受限于$O(N^2)$计算复杂度而下降,但 **三大国产平台** 降幅更为显著。这反映出在极限计算负载下,国产硬件架构在资源调度、执行效率等方面面临更大挑战;相较而言 **Nvidia** 则更为稳健; + +- **Nvidia平台是大规模滤波的首选:** + +​ 在**大半径(Radius >= 10)**下,**Nvidia** 平台凭借更强的鲁棒性,吞吐量最终反超多数国产平台。对于追求大规模窗口的高质量图像处理场景,**Nvidia** 具有更高的性能下限保障。 + + + + + +### 3.3 加速比 + +​ 基于4K UHD图像测试集,各平台加速比(GPU吞吐量 / CPU基于openCV吞吐量)分析如下: + + + +#### 3.3.1 Nvidia + +
+ +​ 基于 **Nvidia A100 GPU** 和 **Intel(R) Xeon(R) Processor @ 2.90GHz CPU**: + +- 对于Gray图,加速比稳定在 **3x ~ 4x** 左右; +- 对于RGB图,加速比分布在 **8x ~ 18x** 左右,随滤波半径变化而波动较大,但后期呈上升趋势; + + + +#### 3.3.2 Moore + +
+ +​ 基于 **Moore S5000 GPU** 和 **Intel(R) Xeon(R) Gold 6430 CPU**: + +- 对于Gray图,加速比分布在 **4x ~ 12x** 左右,随半径增大而逐渐上升; +- 对于RGB图,加速比分布在 **17x ~ 30x** 左右,同样随半径增大而逐渐上升; + + + +#### 3.3.3 Metax + +
+ +​ 基于 **Metax C500 GPU** 和 **Intel(R) Core(TM) i7-8550U @ 1.80GHz CPU**: + +- 对于Gray图,加速比稳定在 **2.5x** 左右; +- 对于RGB图,加速比分布在 **21x ~ 4x** 左右,随半径增大而显著下降; + + + +#### 3.3.4 Iluvatar + +
+ +​ 基于 **Iluvatar BI100 GPU** 和 **Intel(R) Xeon(R) Gold 6330 @ 2.00GHz CPU**: + +- 对于Gray图,加速比分布在 **2x ~ 8x** 左右,随半径增大而显著下降; +- 对于RGB图,加速比分布在 **4x ~ 11x** 左右,同样随半径增大而显著下降; + + + +#### 3.3.5 对比与总结 + +​ 通过对各平台加速比的分析,可以得出以下结论: + +- **计算密集度与加速效率成正相关性:** + +​ 全平台RGB图的加速比普遍高于Gray图(如 **Moore** 平台RGB加速比可高达30x,而Gray仅为12x)。这证明GPU大规模并行计算单元的算力优势在处理多通道、高计算密度的任务时更加显著,符合GPU的硬件特性,也验证了设计的有效性; + +- **不同平台对于大半径计算的抗压能力各异:** + +​ **Nvidia** 和 **Moore** 平台的加速比随半径增大呈上升或平稳趋势,展现了相应GPU硬件优秀的缓存命中率和寄存器压力管理,适合大半径、超大半径处理场景。相比之下,**Metax** 和 **Iluvatar** 平台在半径增大时加速比出现显著下滑,反映了在大尺寸窗口下,局部访存与计算特征的变化对部分国产GPU架构的处理效率提出了较大的挑战; + +- **加速比根据基准的不同存在相对性:** + +​ 上述加速比数据仅供参考,无法作为各平台GPU性能的绝对指标,因为其数值受宿主CPU性能基准影响显著。例如 **Metax** 平台的对比基准为低功耗嵌入式/网络服务器CPU Core(TM) i7-8550U ,性能和其他三个平台的Xeon系列CPU存在明显差距,因此加速比可能存在虚高现象。 + + + + + + + +## 四. Ncu分析 + +​ 由于服务器启用ncu需要sudo权限,因此只在本地(Nvidia GeForce RTX 5090 Laptop GPU)对相同代码进行ncu分析,结果如下: + +- **radius = 8,RGB图** + + 总览: + +
+ +​ 汇编细节: + +
+ + + +
+ + + +​ 由总览可知,主要待优化点在于 **非合并的全局访存**(Uncoalesced Global Access),总预期提速效果约 **19.39%**;根据汇编结果,这一瓶颈来自于代码中短字节加载指令如`LDG.E Rxx`等,而其产生根本原因在于:myPixel为4 byte结构体,而GPU硬件最多可以一次性加载16 byte;为最大限度利用带宽,编译器尝试将内层循环进行步长为4的展开、以期一次性加载更多数据(如利用`LDG.E.128`一次加载4 x 4 byte = 16 byte)以便批量处理,但实际核函数内循环的处理逻辑是针对单一像素点进行的,因此`LDG.E.128`的尝试失败了,回滚到4次分立的普通加载`LDG.E`, 也就未能最大程度利用访存带宽。 + +​ 此外根据汇编结果,`MUFU.SQRT`指令前后存在较多warp stalling,这是因为在进行循环边界 $bound\_j$ 计算时(见 **[2.3](#target2)**)涉及开方运算,其时间成本较高、对相关硬件单元的占用导致了线程阻塞。该问题可以通过 **2.4** 中所述LUT方案解决,但考虑到需支持任意滤波半径,为避免LUT过大,最终还是采用了直接开方计算。 + + + +- **radius = 8,Gray图** + + 总览: + +
+ +​ 汇编细节: + +
+ + + +
+ + + +​ 和RGB图像一样,主要待优化点仍在于 **非合并的全局访存**(Uncoalesced Global Access),预期提速效果约 **34.94%**;此外指出的待优化点 **L1 缓存/纹理缓存全局加载/存储访问模式** (L1TEX Global Load/Store Access Pattern)也和非合并访存相关,预期提速效果分别约 **22.24%** 和 **16.71%**。这里主要原因同上,在于汇编生成了一系列分立的短字节加载指令如`LDG.E.U8`等,而未能利用更高效的向量化加载;由于Gray图每个像素只占用1 byte且核函数循环针对单一像素点进行,因此访存总线带宽相较RGB图而言更低、预期提速更高。 + +​ 此外同RGB图,`MUFU.SQRT`也带来了较大的warp stalling,其原因和解决方法不再赘述。 + + + +​ 由于ncu指出的待优化点主要集中在访存,因而当滤波半径减小、计算密度降低而访存占比增大时,相关问题会更显著。实际结果也印证了这一点: + + + +- **radius = 5,RGB图** + +
+ +​ 和预期一样,此时解决非合并访存所带来的理论优化效果有所提升,达到了 **23.53%**; + + + +- **radius = 5,Gray图** + +
+ +​ 理论提速达到了 **50.12%**; + + + + + +- **相关优化尝试** + +​ 为解决非合并访存带来的弊端,本设计尝试了向量加载。具体而言,对于RGB图,每次循环中,每个线程以uint4(大小等于4个myPixel)一次性加载4个像素点并批量处理;对于Gray图,每次循环中,每个线程以uchar4(大小等于4个uint8_t)同样一次性加载4个像素点并批量处理。 + +​ 但实际测试结果表明,在常规半径下,该方案反而会导致效率下降 **15% - 20%**,如下图所示(radius = 5,Gray): + +
+ +​ 不可否认,该方案确实一定程度上优化了非合并访存(理论提速从 **50.12%** 降为 **33.11%**),却带来了更为严重的 **线程分歧**(Thread Divergence)问题。原因在于,uchar4/uint4的向量化加载必须以数据4 byte/16 byte对齐为前提,而对于整张图像而言,有3/4的像素是不满足这一要求的,因此核函数需针对这些线程设置额外的逻辑以处理非对齐的头尾像素。这就导致了该3/4线程和剩余的1/4线程在循环内无法实现完美同步——1/4线程跳过了非对齐处理逻辑、空转等待剩余的3/4线程,进而引发分歧。 + +​ 权衡利弊后,当前设计暂未采用向量化加载方案,继续沿用朴素的循环处理每个像素。 + + + + + + + + + +## 五.未来工作 + +### 5.1 超大半径滤波下的并行计算策略选择 + +​ 如 **[2.1](#target4)** 所述,当前设计采用了“一个线程处理一个滤波窗口”的计算策略,本质上是双重循环;虽然在常规滤波半径下该方案速度尚可,但随着滤波半径增大,$O(N^2)$的计算复杂度必然会成为效率瓶颈。为解决这一问题,未来可以尝试在 **算法层面**(如采用近似计算、间隔计算、下采样、快速可分离双边滤波等)降低计算并行度、减少循环次数,或在 **硬件调度层面**(如采用2.1中**[方案2](#target5)** 的warp级归约)提升并行度、减少循环层数。 + + + +### 5.2 实时视频处理 + +​ 当前设计仅针对静态图片进行双边滤波,且异步数据加载仅启用单流。在未来,可以考虑将本设计拓展为基于实时视频(4K,60fps)的动态双边滤波,采用异步多流、双缓冲池、低精度(fp16)/混合精度(fp16 + fp32)计算等优化手段实现端到端的高效数据处理。 + + + +### 5.3 继续尝试利用共享内存 + +​ 虽然本设计尝试利用Shared Memory却没带来效率提升(**[2.6.1](#target3)**),但后续将在更复杂应用场景下(如超大半径滤波)继续挖掘共享内存的潜能,或尝试不一样的加载策略。 + + + +### 5.4 继续尝试向量指令与向量加载 + +​ 本设计通过向量指令`vabsdiff4`有效加速了色域$L_{1}$范数的计算,未来可以考虑在设计中融入更多SIMD的向量指令(同时尝试解决一系列封装后的向量函数`__v*`在Nvidia等平台计算行为异常的问题)。 + +​ 此外后续将继续探索高效的向量加载方案(尤其在超大半径滤波场景下),尝试克服现存的由字节对齐导致的线程分歧问题。 + +​ + +### 5.5 消除小半径下和OpenCV存在的系统误差 + +​ 如前所述,当前设计在处理小滤波半径(radius <= 2)的灰度图时,在大多数平台上都和OpenCV标杆实现存在0.5左右的MAE,具体原因有待进一步分析。 diff --git a/08_bilateral_filter/Jason-Young123/report/data/iluvatar.dat b/08_bilateral_filter/Jason-Young123/report/data/iluvatar.dat new file mode 100644 index 0000000..13d0eb0 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/report/data/iluvatar.dat @@ -0,0 +1,79 @@ +#Platform: iluvatar +#GPU: 天数 BI100 +#CPU: Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz + + +#gray +throughput_gray: +8594.56 +6140.55 +3894.23 +2685.26 +1799.97 +1350.31 +1055.55 +817.68 +647.83 +523.72 + +speedup_gray: +8.35226 +7.02025 +6.36762 +6.03288 +4.52927 +3.66657 +2.98139 +2.46526 +2.38591 +2.20986 + +MAE_gray: +1.68789e-06 +6.51042e-06 +2.53183e-06 +4.09915e-06 +3.01408e-06 +7.2338e-06 +6.51042e-06 +9.88619e-06 +6.99267e-06 +7.11323e-06 + + +#rgb +throughput_rgb: 
+2662.75 +2254.44 +1660.98 +1256.35 +901.42 +701.81 +562.32 +444.78 +357.13 +291.72 + +speedup_rgb: +11.4367 +10.1008 +9.04011 +8.06227 +6.37318 +5.15521 +4.31887 +3.92115 +3.71837 +3.51772 + +MAE_rgb: +4.42065e-07 +2.81314e-06 +4.38047e-06 +4.5814e-06 +4.38047e-06 +2.57202e-06 +3.49633e-06 +5.94779e-06 +4.2599e-06 +5.10385e-06 diff --git a/08_bilateral_filter/Jason-Young123/report/data/metax.dat b/08_bilateral_filter/Jason-Young123/report/data/metax.dat new file mode 100644 index 0000000..3e408c6 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/report/data/metax.dat @@ -0,0 +1,79 @@ +#Platform: metax +#GPU: 沐曦 C500-16G +#CPU: Intel(R) Core(TM) i7-8550U CPU @ 1.80GHz + + +#gray +throughput_gray: +9551.09 +5931.08 +3457.88 +2271.39 +1494.44 +1102.80 +862.00 +664.94 +527.62 +425.89 + +speedup_gray: +1.98467 +1.79002 +2.67978 +2.62693 +2.80345 +2.69172 +2.73795 +2.69132 +2.73793 +2.72632 + +MAE_gray: +0.486975 +0.49526 +4.94309e-06 +7.11323e-06 +6.51042e-06 +6.14873e-06 +7.95718e-06 +4.21971e-06 +4.94309e-06 +6.63098e-06 + +#rgb +throughput_rgb: +4361.43 +3215.41 +2188.33 +1576.27 +1086.52 +829.44 +655.42 +506.24 +407.79 +330.19 + +speedup_rgb: +21.1496 +15.6313 +12.5088 +10.5937 +8.23794 +7.87266 +4.59177 +4.41521 +4.16321 +5.26927 + +MAE_rgb: +1.24582e-06 +8.23849e-06 +5.94779e-06 +3.73746e-06 +6.95248e-06 +5.66647e-06 +6.10854e-06 +7.03286e-06 +4.74216e-06 +4.34028e-06 + diff --git a/08_bilateral_filter/Jason-Young123/report/data/moore.dat b/08_bilateral_filter/Jason-Young123/report/data/moore.dat new file mode 100644 index 0000000..e154a65 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/report/data/moore.dat @@ -0,0 +1,78 @@ +#Platform: moore +#GPU: 摩尔线程 S5000 +#CPU: Intel(R) Xeon(R) Gold 6430 + + +#gray +throughput_gray: +12522.78 +9385.87 +6173.55 +4329.85 +2957.89 +2239.56 +1759.93 +1371.34 +1091.28 +885.31 + +speedup_gray: +3.61347 +4.35318 +9.09573 +10.029 +10.792 +10.9914 +11.4681 +11.5015 +11.7973 +11.8315 + +MAE_gray: +0.493864 +0.497564 
+0.00000446084 +8.80112e-06 +7.47492e-06 +5.42535e-06 +6.51042e-06 +6.02816e-06 +6.02816e-06 +3.49633e-06 + +#rgb +throughput_rgb: +4833.82 +3833.93 +2739.49 +2022.33 +1424.69 +1104.50 +880.56 +694.45 +556.19 +453.54 + +speedup_rgb: +16.8466 +18.1259 +27.2701 +28.9265 +29.0732 +29.556 +29.8733 +30.0655 +30.1244 +30.1175 + +MAE_rgb: +0.00000257202 +0.0000024145 +0.00000570666 +5.14403e-06 +4.5814e-06 +5.14403e-06 +6.59079e-06 +3.77765e-06 +5.30478e-06 +3.17483e-06 diff --git a/08_bilateral_filter/Jason-Young123/report/data/nvidia.dat b/08_bilateral_filter/Jason-Young123/report/data/nvidia.dat new file mode 100644 index 0000000..43cd4c7 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/report/data/nvidia.dat @@ -0,0 +1,80 @@ +#Platform: nvidia +#GPU: Nvidia A100 +#CPU: Intel(R) Xeon(R) Processor @ 2.90GHz + +#gray +throughput_gray: +3254.41 +3060.33 +2763.45 +2478.31 +2130.61 +1878.15 +1079.03 +972.94 +690.79 +632.35 + +speedup_gray: +4.27553 +3.99363 +3.93609 +4.14105 +3.92931 +3.99847 +3.02489 +4.03044 +2.73277 +2.94585 + +MAE_gray: +0.481447 +0.499052 +4.5814e-06 +2.65239e-06 +2.77296e-06 +5.66647e-06 +7.71605e-06 +8.07774e-06 +5.30478e-06 +7.35436e-06 + + + +#rgb +throughput_rgb: +1650.54 +1582.13 +1468.59 +1354.47 +1201.05 +806.97 +742.75 +551.99 +442.79 +405.94 + +speedup_rgb: +14.1611 +8.02206 +9.37769 +11.2139 +10.7873 +9.27599 +13.915 +16.017 +16.684 +17.9033 + +MAE_rgb: +8.43943e-07 +8.31887e-06 +3.73746e-06 +2.93371e-06 +5.14403e-06 +7.31417e-06 +4.5814e-06 +9.16281e-06 +7.15342e-06 +6.91229e-06 + diff --git a/08_bilateral_filter/Jason-Young123/report/pic/flow_chart.png b/08_bilateral_filter/Jason-Young123/report/pic/flow_chart.png new file mode 100644 index 0000000..b42eb6c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/flow_chart.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/mode1.png b/08_bilateral_filter/Jason-Young123/report/pic/mode1.png new file mode 100644 index 0000000..521533e Binary files 
/dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/mode1.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/mode2.png b/08_bilateral_filter/Jason-Young123/report/pic/mode2.png new file mode 100644 index 0000000..e49e8f4 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/mode2.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/mode3.png b/08_bilateral_filter/Jason-Young123/report/pic/mode3.png new file mode 100644 index 0000000..15409c2 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/mode3.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r5_failed.png b/08_bilateral_filter/Jason-Young123/report/pic/r5_failed.png new file mode 100644 index 0000000..6983e6d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r5_failed.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r5_gray.png b/08_bilateral_filter/Jason-Young123/report/pic/r5_gray.png new file mode 100644 index 0000000..0baa25d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r5_gray.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r5_rgb.png b/08_bilateral_filter/Jason-Young123/report/pic/r5_rgb.png new file mode 100644 index 0000000..04f9426 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r5_rgb.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r8_detail1.png b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail1.png new file mode 100644 index 0000000..969b121 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail1.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r8_detail2.png b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail2.png new file mode 100644 index 0000000..39d7156 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail2.png differ diff --git 
a/08_bilateral_filter/Jason-Young123/report/pic/r8_detail3.png b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail3.png new file mode 100644 index 0000000..51a7488 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail3.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r8_detail4.png b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail4.png new file mode 100644 index 0000000..31748a8 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r8_detail4.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r8_gray.png b/08_bilateral_filter/Jason-Young123/report/pic/r8_gray.png new file mode 100644 index 0000000..af5afaf Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r8_gray.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/r8_rgb.png b/08_bilateral_filter/Jason-Young123/report/pic/r8_rgb.png new file mode 100644 index 0000000..980d35e Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/r8_rgb.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/speedup_iluvatar.png b/08_bilateral_filter/Jason-Young123/report/pic/speedup_iluvatar.png new file mode 100644 index 0000000..1d6a6c1 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/speedup_iluvatar.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/speedup_metax.png b/08_bilateral_filter/Jason-Young123/report/pic/speedup_metax.png new file mode 100644 index 0000000..e3c3553 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/speedup_metax.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/speedup_moore.png b/08_bilateral_filter/Jason-Young123/report/pic/speedup_moore.png new file mode 100644 index 0000000..86b3e8d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/speedup_moore.png differ diff --git 
a/08_bilateral_filter/Jason-Young123/report/pic/speedup_nvidia.png b/08_bilateral_filter/Jason-Young123/report/pic/speedup_nvidia.png new file mode 100644 index 0000000..9f0d815 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/speedup_nvidia.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/throughput_comparison.png b/08_bilateral_filter/Jason-Young123/report/pic/throughput_comparison.png new file mode 100644 index 0000000..2ab278b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/throughput_comparison.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/throughput_iluvatar.png b/08_bilateral_filter/Jason-Young123/report/pic/throughput_iluvatar.png new file mode 100644 index 0000000..246a8e8 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/throughput_iluvatar.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/throughput_metax.png b/08_bilateral_filter/Jason-Young123/report/pic/throughput_metax.png new file mode 100644 index 0000000..370b31b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/throughput_metax.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/throughput_moore.png b/08_bilateral_filter/Jason-Young123/report/pic/throughput_moore.png new file mode 100644 index 0000000..47dfcfc Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/throughput_moore.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/pic/throughput_nvidia.png b/08_bilateral_filter/Jason-Young123/report/pic/throughput_nvidia.png new file mode 100644 index 0000000..d017c7d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/report/pic/throughput_nvidia.png differ diff --git a/08_bilateral_filter/Jason-Young123/report/plot.py b/08_bilateral_filter/Jason-Young123/report/plot.py new file mode 100644 index 0000000..36ae9f7 --- /dev/null +++ 
b/08_bilateral_filter/Jason-Young123/report/plot.py @@ -0,0 +1,182 @@ +import matplotlib.pyplot as plt +import numpy as np + +def plot_curve(speedup_gray, speedup_rgb): + radius = list(range(1, 11)) + + plt.rcParams['font.sans-serif'] = ['Calibri'] + plt.rcParams['axes.unicode_minus'] = False + + fig, ax = plt.subplots(figsize=(8, 5)) + + ax.plot(radius, speedup_gray, marker='o', markersize=6, linewidth=2, + color='gray', label='Gray') + ax.plot(radius, speedup_rgb, marker='s', markersize=6, linewidth=2, + color='red', label='RGB') + + + fig.text(0.5, 0.95, 'Speedup Analysis (Platform: Nvidia)', + ha='center', va='center', fontsize=18, fontweight='bold') + fig.text(0.5, 0.90, 'GPU: A100 / CPU: Intel(R) Xeon(R) Processor @ 2.90GHz', + ha='center', va='center', fontsize=14, fontweight='bold') + + ax.set_xlabel('Radius', fontsize=13, fontweight='bold') + ax.set_ylabel('Speedup x', fontsize=13, fontweight='bold') + ax.set_xticks(radius) + ax.grid(True, linestyle=':', alpha=0.6) + + #ax.set_ylim(0, 34) + + ax.legend(loc='upper center', ncol=2, frameon=True, shadow=True) + + plt.subplots_adjust(top=0.85) + + #plt.savefig('speedup_nvidia.png', dpi=300, bbox_inches='tight') + plt.show() + + + + + + +def plot_bar(thruput_gray, thruput_rgb): + radius = np.arange(1, 11) # 横坐标 1-10 + width = 0.35 # 柱子的宽度 + + plt.rcParams['font.sans-serif'] = ['Calibri'] + plt.rcParams['axes.unicode_minus'] = False + + fig, ax = plt.subplots(figsize=(8, 5)) + + # 绘制并排柱状图 + ax.bar(radius - width/2, thruput_gray, width, color='gray', label='Gray Throughput') + ax.bar(radius + width/2, thruput_rgb, width, color='lightskyblue', label='RGB Throughput') + + # --- 关键修改:红色虚线,不带 label (不会出现在图例中) --- + fps_4k_60 = 497.66 + ax.axhline(y=fps_4k_60, color='red', linestyle='--', linewidth=1.5) + + # 标注文字保持红色以匹配线条 + ax.text(11, fps_4k_60 + 60, '4K 60fps', + color='red', fontweight='bold', fontsize=13, va='bottom', ha = 'right') + + # --- 标题设置 --- + fig.text(0.5, 0.95, 'Throughput Analysis (Platform: 
Iluvatar)', + ha='center', va='center', fontsize=18, fontweight='bold') + fig.text(0.5, 0.90, 'GPU: BI100', + ha='center', va='center', fontsize=14, fontweight='bold') + + ax.set_xlabel('Radius', fontsize=13, fontweight='bold') + ax.set_ylabel('Throughput (MP/s)', fontsize=13, fontweight='bold') + ax.set_xticks(radius) + ax.grid(axis='y', linestyle=':', alpha=0.6) + + # 设置 Y 轴范围 + y_max = max(max(thruput_gray), max(thruput_rgb)) + ax.set_ylim(0, y_max * 1.05) + + # 图例设置:此时只会显示 Gray 和 RGB 两项 + ax.legend(loc='upper center', ncol=2, frameon=True, shadow=True, fontsize=11) + + plt.subplots_adjust(top=0.85) + #plt.savefig('throughput_iluvatar.png', dpi=300, bbox_inches='tight') + plt.show() + + + + +def plot_bar_comparison(nvidia, moore, metax, iluvatar): + # 横坐标为 2, 4, 6, 8, 10 + radius_labels = [2, 4, 6, 8, 10] + x = np.arange(len(radius_labels)) # 标签位置 + width = 0.2 # 每个柱子的宽度 + + # 设置字体和基础样式 + plt.rcParams['font.sans-serif'] = ['Calibri'] + plt.rcParams['axes.unicode_minus'] = False + + fig, ax = plt.subplots(figsize=(10, 6)) + + # 选取四个不同深度的蓝色 + colors = ['#B3E5FC', '#81D4FA', '#039BE5', '#01579B'] + + # 绘制四个平台的并列柱状图 + ax.bar(x - 1.5*width, nvidia, width, color=colors[0], label='Nvidia(A100)') + ax.bar(x - 0.5*width, moore, width, color=colors[1], label='Moore(S5000)') + ax.bar(x + 0.5*width, metax, width, color=colors[2], label='Metax(C500)') + ax.bar(x + 1.5*width, iluvatar, width, color=colors[3], label='Iluvatar(BI100)') + + # 绘制 4K 60fps 参考基准线 + fps_4k_60 = 497.66 + ax.axhline(y=fps_4k_60, color='red', linestyle='--', linewidth=1.5, zorder=3) + ax.text(len(x)-0.5, fps_4k_60 + 50, '4K 60fps', + color='red', fontweight='bold', fontsize=11, va='bottom', ha='right') + + # 设置标题和标签 + fig.text(0.5, 0.94, 'Throughput Comparison among 4 Platforms', + ha='center', va='center', fontsize=18, fontweight='bold') + + ax.set_xlabel('Radius', fontsize=13, fontweight='bold') + ax.set_ylabel('Throughput (MP/s)', fontsize=13, fontweight='bold') + + # 设置横坐标刻度 + ax.set_xticks(x) + 
ax.set_xticklabels(radius_labels) + + # 辅助线和背景 + ax.grid(axis='y', linestyle=':', alpha=0.6) + + # 设置 Y 轴范围(自适应并留出图例空间) + y_all = nvidia + moore + metax + iluvatar + ax.set_ylim(0, max(y_all) * 1.1) + + # 图例设置:ncol=4 放在正上方 + ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.0), + ncol=4, frameon=True, shadow=True, fontsize=10) + + plt.subplots_adjust(top=0.88) + plt.savefig('./pic/throughput_comparison.png', dpi=300, bbox_inches='tight') + plt.show() + + + +speedup_gray_nvidia = [4.27553, 3.99363, 3.93609, 4.14105, 3.92931, 3.99847, 3.02489, 4.03044, 2.73277, 2.94585] +speedup_rgb_nvidia = [14.1611, 8.02206, 9.37769, 11.2139, 10.7873, 9.27599, 13.915, 16.017, 16.684, 17.9033] + +thruput_gray_nvidia = [3254.41, 3060.33, 2763.45, 2478.31, 2130.61, 1878.15, 1079.03, 972.94, 690.79, 632.35] +thruput_rgb_nvidia = [1650.54, 1582.13, 1468.59, 1354.47, 1201.05, 806.97, 742.75, 551.99, 442.79, 405.94] + +speedup_gray_moore = [3.61347, 4.35318, 9.09573, 10.029, 10.792, 10.9914, 11.4681, 11.5015, 11.7973, 11.8315] +speedup_rgb_moore = [16.8466, 18.1259, 27.2701, 28.9265, 29.0732, 29.556, 29.8733, 30.0655, 30.1244, 30.1175] + +thruput_gray_moore = [12522.78, 9385.87, 6173.55, 4329.85, 2957.89, 2239.56, 1759.93, 1371.34, 1091.28, 885.31] +thruput_rgb_moore = [4833.82, 3833.93, 2739.49, 2022.33, 1424.69, 1104.50, 880.56, 694.45, 556.19, 453.54] + +speedup_gray_metax = [1.98467, 1.79002, 2.67978, 2.62693, 2.80345, 2.69172, 2.73795, 2.69132, 2.73793, 2.72632] +speedup_rgb_metax = [21.1496, 15.6313, 12.5088, 10.5937, 8.23794, 7.87266, 4.59177, 4.41521, 4.16321, 5.26927] + +thruput_gray_metax = [9551.09, 5931.08, 3457.88, 2271.39, 1494.44, 1102.80, 862.00, 664.94, 527.62, 425.89] +thruput_rgb_metax = [4361.43, 3215.41, 2188.33, 1576.27, 1086.52, 829.44, 655.42, 506.24, 407.79, 330.19] + + +speedup_gray_iluvatar = [8.35226, 7.02025, 6.36762, 6.03288, 4.52927, 3.66657, 2.98139, 2.46526, 2.38591, 2.20986] +speedup_rgb_iluvatar = [11.4367, 10.1008, 9.04011, 8.06227, 6.37318, 5.15521, 
4.31887, 3.92115, 3.71837, 3.51772] + +thruput_gray_iluvatar = [8594.56, 6140.55, 3894.23, 2685.26, 1799.97, 1350.31, 1055.55, 817.68, 647.83, 523.72] +thruput_rgb_iluvatar = [2662.75, 2254.44, 1660.98, 1256.35, 901.42, 701.81, 562.32, 444.78, 357.13, 291.72] + + + +thruput_rgb_nvidia1 = [1582.13, 1354.47, 806.97, 551.99, 405.94] +thruput_rgb_moore1 = [3833.93, 2022.33, 1104.50, 694.45, 453.54] +thruput_rgb_metax1 = [3215.41, 1576.27, 829.44, 506.24, 330.19] +thruput_rgb_iluvatar1 = [2254.44, 1256.35, 701.81, 444.78, 291.72] + + + +if __name__ == "__main__": + #plot_curve(speedup_gray_nvidia, speedup_rgb_nvidia) + #plot_bar(thruput_gray_iluvatar, thruput_rgb_iluvatar) + #plot_bar(thruput_gray_nvidia, thruput_rgb_nvidia) + + plot_bar_comparison(thruput_rgb_nvidia1, thruput_rgb_moore1, thruput_rgb_metax1, thruput_rgb_iluvatar1) \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g1.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g1.jpg new file mode 100644 index 0000000..9e4f663 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g1.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g10.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g10.jpg new file mode 100644 index 0000000..e544b43 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g10.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g2.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g2.jpg new file mode 100644 index 0000000..5fc3ceb Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g2.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g3.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g3.jpg new file mode 100644 index 0000000..71e2885 Binary files /dev/null and 
b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g3.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g4.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g4.jpg new file mode 100644 index 0000000..1d38283 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g4.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g5.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g5.jpg new file mode 100644 index 0000000..73ecd88 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g5.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g6.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g6.jpg new file mode 100644 index 0000000..51b22ef Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g6.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g7.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g7.jpg new file mode 100644 index 0000000..635e1bd Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g7.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g8.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g8.jpg new file mode 100644 index 0000000..04c5760 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g8.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g9.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g9.jpg new file mode 100644 index 0000000..4dc0ec1 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/4K/wallpaper_g9.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/airplane.tiff 
b/08_bilateral_filter/Jason-Young123/resource/gray/others/airplane.tiff new file mode 100644 index 0000000..451dd14 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/airplane.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/airport.tiff b/08_bilateral_filter/Jason-Young123/resource/gray/others/airport.tiff new file mode 100644 index 0000000..294e127 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/airport.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/apple.jpeg b/08_bilateral_filter/Jason-Young123/resource/gray/others/apple.jpeg new file mode 100644 index 0000000..367a5ac Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/apple.jpeg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/barbara.bmp b/08_bilateral_filter/Jason-Young123/resource/gray/others/barbara.bmp new file mode 100644 index 0000000..4ead0f9 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/barbara.bmp differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/bridge.tiff b/08_bilateral_filter/Jason-Young123/resource/gray/others/bridge.tiff new file mode 100644 index 0000000..b4afe0e Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/bridge.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/cameraman.tif b/08_bilateral_filter/Jason-Young123/resource/gray/others/cameraman.tif new file mode 100644 index 0000000..bf8495b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/cameraman.tif differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/fishingboat.tiff b/08_bilateral_filter/Jason-Young123/resource/gray/others/fishingboat.tiff new file mode 100644 index 0000000..fc1205a Binary files /dev/null and 
b/08_bilateral_filter/Jason-Young123/resource/gray/others/fishingboat.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/male.tiff b/08_bilateral_filter/Jason-Young123/resource/gray/others/male.tiff new file mode 100644 index 0000000..13a756d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/male.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/palette.jpg b/08_bilateral_filter/Jason-Young123/resource/gray/others/palette.jpg new file mode 100644 index 0000000..6665d88 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/palette.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/gray/others/ruler.tiff b/08_bilateral_filter/Jason-Young123/resource/gray/others/ruler.tiff new file mode 100644 index 0000000..6a7f5ce Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/gray/others/ruler.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper1.jpeg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper1.jpeg new file mode 100644 index 0000000..ce2cad3 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper1.jpeg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper10.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper10.jpg new file mode 100644 index 0000000..f5e2e5a Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper10.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper11.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper11.jpg new file mode 100644 index 0000000..a719f01 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper11.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper12.jpg 
b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper12.jpg new file mode 100644 index 0000000..55afd7a Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper12.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper13.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper13.jpg new file mode 100644 index 0000000..719b1f2 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper13.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper14.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper14.jpg new file mode 100644 index 0000000..d23ef99 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper14.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper15.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper15.jpg new file mode 100644 index 0000000..06db8ed Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper15.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper2.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper2.jpg new file mode 100644 index 0000000..d9fe248 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper2.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper3.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper3.jpg new file mode 100644 index 0000000..eb6fcab Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper3.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper4.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper4.jpg new file mode 100644 index 0000000..c9330e6 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper4.jpg differ diff --git 
a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper5.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper5.jpg new file mode 100644 index 0000000..5060ad6 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper5.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper6.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper6.jpg new file mode 100644 index 0000000..e355953 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper6.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper7.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper7.jpg new file mode 100644 index 0000000..23c918f Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper7.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper8.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper8.jpg new file mode 100644 index 0000000..84f905d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper8.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper9.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper9.jpg new file mode 100644 index 0000000..44507f1 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/4K/wallpaper9.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/budgies.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/others/budgies.jpg new file mode 100644 index 0000000..62f20d1 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/budgies.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/cat.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/others/cat.jpg new file mode 100644 index 0000000..a9b17cb Binary files /dev/null and 
b/08_bilateral_filter/Jason-Young123/resource/rgb/others/cat.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/flowers.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/others/flowers.jpg new file mode 100644 index 0000000..0f3ce1b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/flowers.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/house.tiff b/08_bilateral_filter/Jason-Young123/resource/rgb/others/house.tiff new file mode 100644 index 0000000..8d26d7d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/house.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/hue.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/others/hue.jpg new file mode 100644 index 0000000..c4dcb7c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/hue.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/jellybeans.tiff b/08_bilateral_filter/Jason-Young123/resource/rgb/others/jellybeans.tiff new file mode 100644 index 0000000..08def93 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/jellybeans.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/lena.png b/08_bilateral_filter/Jason-Young123/resource/rgb/others/lena.png new file mode 100644 index 0000000..2b2cf4c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/lena.png differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/mandrill.tiff b/08_bilateral_filter/Jason-Young123/resource/rgb/others/mandrill.tiff new file mode 100644 index 0000000..017ce58 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/mandrill.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/peacock.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/others/peacock.jpg new file 
mode 100644 index 0000000..3ed548c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/peacock.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/peppers.tiff b/08_bilateral_filter/Jason-Young123/resource/rgb/others/peppers.tiff new file mode 100644 index 0000000..8c956f8 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/peppers.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/sailboat.tiff b/08_bilateral_filter/Jason-Young123/resource/rgb/others/sailboat.tiff new file mode 100644 index 0000000..bd10c10 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/sailboat.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/snow.jpg b/08_bilateral_filter/Jason-Young123/resource/rgb/others/snow.jpg new file mode 100644 index 0000000..f2db51c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/snow.jpg differ diff --git a/08_bilateral_filter/Jason-Young123/resource/rgb/others/splash.tiff b/08_bilateral_filter/Jason-Young123/resource/rgb/others/splash.tiff new file mode 100644 index 0000000..f156b30 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/resource/rgb/others/splash.tiff differ diff --git a/08_bilateral_filter/Jason-Young123/scripts/iluvatar.mk b/08_bilateral_filter/Jason-Young123/scripts/iluvatar.mk new file mode 100644 index 0000000..134941f --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/scripts/iluvatar.mk @@ -0,0 +1,10 @@ +define PLATFORM_INIT_ENV +if [ "$$(id -u)" -eq 0 ]; then \ + apt update && apt install libopencv-dev -y; \ +else \ + true; \ +fi +endef + +exec: ./runTester + ./runTester diff --git a/08_bilateral_filter/Jason-Young123/scripts/metax.mk b/08_bilateral_filter/Jason-Young123/scripts/metax.mk new file mode 100644 index 0000000..c2999c2 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/scripts/metax.mk @@ -0,0 
+1,7 @@ +define PLATFORM_INIT_ENV +true +endef + +exec: ./runTester + export MACA_DEVICE_IMAGE_CHECK=1 + ./runTester diff --git a/08_bilateral_filter/Jason-Young123/scripts/moore.mk b/08_bilateral_filter/Jason-Young123/scripts/moore.mk new file mode 100644 index 0000000..5bac9f5 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/scripts/moore.mk @@ -0,0 +1,7 @@ +define PLATFORM_INIT_ENV +true +endef + +exec: ./runTester + srun --partition=mt --nodes=1 --gres=gpu:mt:2 --ntasks=1 --cpus-per-task=16 --mem=256G --time=00:20:00 ./runTester + diff --git a/08_bilateral_filter/Jason-Young123/scripts/nvidia.mk b/08_bilateral_filter/Jason-Young123/scripts/nvidia.mk new file mode 100644 index 0000000..8646f40 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/scripts/nvidia.mk @@ -0,0 +1,7 @@ +define PLATFORM_INIT_ENV +[ -f /data/shared/miniconda3/etc/profile.d/conda.sh ] && . /data/shared/miniconda3/etc/profile.d/conda.sh || true +endef + +exec: ./runTester + ./runTester + diff --git a/08_bilateral_filter/Jason-Young123/src/auxiliary.cu b/08_bilateral_filter/Jason-Young123/src/auxiliary.cu new file mode 100644 index 0000000..a1c9537 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/auxiliary.cu @@ -0,0 +1,108 @@ +#include + +//获取cfg文件 +bool getCfg(const std::string& cfgPath, int& radius, float& sigma_spatial, float& sigma_color){ + std::ifstream file(cfgPath); + if(!file){ + std::cerr << "Error: Could not open file: " << cfgPath << std::endl; + return false; + } + std::string line; + while (std::getline(file, line)) { + if (line.find("radius") != std::string::npos) + sscanf(line.c_str(), " radius = %d", &radius); + else if (line.find("sigma_spatial") != std::string::npos) + sscanf(line.c_str(), " sigma_spatial = %f", &sigma_spatial); + else if (line.find("sigma_color") != std::string::npos) + sscanf(line.c_str(), " sigma_color = %f", &sigma_color); + } + file.close(); + return true; +} + + +//获取bin文件 +bool getBin(const std::string& binPath, int& width, int& 
height, uint8_t*& src1, myPixel*& src2){ + std::ifstream file(binPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << binPath << std::endl; + return false; + } + + //step1: 获取头信息 + int channels; + file.read(reinterpret_cast(&width), sizeof(int)); + file.read(reinterpret_cast(&height), sizeof(int)); + file.read(reinterpret_cast(&channels), sizeof(int)); + + //step2: 获取像素信息 + size_t pixel_count = static_cast(width) * height; + if(channels == 3){ + src1 = nullptr; + src2 = new myPixel[pixel_count]; + + std::vector tmp_buffer(pixel_count * 3); + file.read(reinterpret_cast(tmp_buffer.data()), pixel_count * 3); + for(size_t i = 0; i < pixel_count; ++i) { + uint8_t r = tmp_buffer[i * 3 + 0]; + uint8_t g = tmp_buffer[i * 3 + 1]; + uint8_t b = tmp_buffer[i * 3 + 2]; + src2[i] = myPixel(r, g, b); // 构造函数会自动把 _A 设为 255 + } + } + else if(channels == 1){ + src1 = new uint8_t[width * height]; + src2 = nullptr; + file.read(reinterpret_cast(src1), pixel_count); + } + else{ + std::cerr << "Error: Unsupported channels = " << channels << std::endl; + return false; + } + + file.close(); + return true; +} + + + +//对比bin文件并返回MAE +float binDiff(const std::string& binPath_ref, const std::string& binPath_test){ + std::ifstream file_ref(binPath_ref, std::ios::binary); + std::ifstream file_test(binPath_test, std::ios::binary); + + if(!file_ref || !file_test){ + std::cerr << "Error: Could not open file for comparison." << std::endl; + return -1.0f; + } + + int w1, h1, c1; + int w2, h2, c2; + file_ref.read(reinterpret_cast(&w1), sizeof(int)); + file_ref.read(reinterpret_cast(&h1), sizeof(int)); + file_ref.read(reinterpret_cast(&c1), sizeof(int)); + file_test.read(reinterpret_cast(&w2), sizeof(int)); + file_test.read(reinterpret_cast(&h2), sizeof(int)); + file_test.read(reinterpret_cast(&c2), sizeof(int)); + if(w1 != w2 || h1 != h2 || c1 != c2){ + std::cerr << "Error: Image size not match." 
<< std::endl; + return -1.0f; + } + + size_t totalPixels = static_cast(w1) * h1 * c1; + std::vector data_ref(totalPixels); + std::vector data_test(totalPixels); + file_ref.read(reinterpret_cast(data_ref.data()), totalPixels); + file_test.read(reinterpret_cast(data_test.data()), totalPixels); + + file_ref.close(); + file_test.close(); + + long totalError = 0; + for (size_t i = 0; i < totalPixels; ++i) { + totalError += std::abs(int(data_ref[i]) - int(data_test[i])); + } + + float MAE = double(totalError) / double(totalPixels); + return MAE; +} diff --git a/08_bilateral_filter/Jason-Young123/src/auxiliary.maca b/08_bilateral_filter/Jason-Young123/src/auxiliary.maca new file mode 100644 index 0000000..a1c9537 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/auxiliary.maca @@ -0,0 +1,108 @@ +#include + +//获取cfg文件 +bool getCfg(const std::string& cfgPath, int& radius, float& sigma_spatial, float& sigma_color){ + std::ifstream file(cfgPath); + if(!file){ + std::cerr << "Error: Could not open file: " << cfgPath << std::endl; + return false; + } + std::string line; + while (std::getline(file, line)) { + if (line.find("radius") != std::string::npos) + sscanf(line.c_str(), " radius = %d", &radius); + else if (line.find("sigma_spatial") != std::string::npos) + sscanf(line.c_str(), " sigma_spatial = %f", &sigma_spatial); + else if (line.find("sigma_color") != std::string::npos) + sscanf(line.c_str(), " sigma_color = %f", &sigma_color); + } + file.close(); + return true; +} + + +//获取bin文件 +bool getBin(const std::string& binPath, int& width, int& height, uint8_t*& src1, myPixel*& src2){ + std::ifstream file(binPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << binPath << std::endl; + return false; + } + + //step1: 获取头信息 + int channels; + file.read(reinterpret_cast(&width), sizeof(int)); + file.read(reinterpret_cast(&height), sizeof(int)); + file.read(reinterpret_cast(&channels), sizeof(int)); + + //step2: 获取像素信息 + size_t pixel_count = 
static_cast(width) * height; + if(channels == 3){ + src1 = nullptr; + src2 = new myPixel[pixel_count]; + + std::vector tmp_buffer(pixel_count * 3); + file.read(reinterpret_cast(tmp_buffer.data()), pixel_count * 3); + for(size_t i = 0; i < pixel_count; ++i) { + uint8_t r = tmp_buffer[i * 3 + 0]; + uint8_t g = tmp_buffer[i * 3 + 1]; + uint8_t b = tmp_buffer[i * 3 + 2]; + src2[i] = myPixel(r, g, b); // 构造函数会自动把 _A 设为 255 + } + } + else if(channels == 1){ + src1 = new uint8_t[width * height]; + src2 = nullptr; + file.read(reinterpret_cast(src1), pixel_count); + } + else{ + std::cerr << "Error: Unsupported channels = " << channels << std::endl; + return false; + } + + file.close(); + return true; +} + + + +//对比bin文件并返回MAE +float binDiff(const std::string& binPath_ref, const std::string& binPath_test){ + std::ifstream file_ref(binPath_ref, std::ios::binary); + std::ifstream file_test(binPath_test, std::ios::binary); + + if(!file_ref || !file_test){ + std::cerr << "Error: Could not open file for comparison." << std::endl; + return -1.0f; + } + + int w1, h1, c1; + int w2, h2, c2; + file_ref.read(reinterpret_cast(&w1), sizeof(int)); + file_ref.read(reinterpret_cast(&h1), sizeof(int)); + file_ref.read(reinterpret_cast(&c1), sizeof(int)); + file_test.read(reinterpret_cast(&w2), sizeof(int)); + file_test.read(reinterpret_cast(&h2), sizeof(int)); + file_test.read(reinterpret_cast(&c2), sizeof(int)); + if(w1 != w2 || h1 != h2 || c1 != c2){ + std::cerr << "Error: Image size not match." 
<< std::endl; + return -1.0f; + } + + size_t totalPixels = static_cast(w1) * h1 * c1; + std::vector data_ref(totalPixels); + std::vector data_test(totalPixels); + file_ref.read(reinterpret_cast(data_ref.data()), totalPixels); + file_test.read(reinterpret_cast(data_test.data()), totalPixels); + + file_ref.close(); + file_test.close(); + + long totalError = 0; + for (size_t i = 0; i < totalPixels; ++i) { + totalError += std::abs(int(data_ref[i]) - int(data_test[i])); + } + + float MAE = double(totalError) / double(totalPixels); + return MAE; +} diff --git a/08_bilateral_filter/Jason-Young123/src/auxiliary.mu b/08_bilateral_filter/Jason-Young123/src/auxiliary.mu new file mode 100644 index 0000000..a1c9537 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/auxiliary.mu @@ -0,0 +1,108 @@ +#include + +//获取cfg文件 +bool getCfg(const std::string& cfgPath, int& radius, float& sigma_spatial, float& sigma_color){ + std::ifstream file(cfgPath); + if(!file){ + std::cerr << "Error: Could not open file: " << cfgPath << std::endl; + return false; + } + std::string line; + while (std::getline(file, line)) { + if (line.find("radius") != std::string::npos) + sscanf(line.c_str(), " radius = %d", &radius); + else if (line.find("sigma_spatial") != std::string::npos) + sscanf(line.c_str(), " sigma_spatial = %f", &sigma_spatial); + else if (line.find("sigma_color") != std::string::npos) + sscanf(line.c_str(), " sigma_color = %f", &sigma_color); + } + file.close(); + return true; +} + + +//获取bin文件 +bool getBin(const std::string& binPath, int& width, int& height, uint8_t*& src1, myPixel*& src2){ + std::ifstream file(binPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << binPath << std::endl; + return false; + } + + //step1: 获取头信息 + int channels; + file.read(reinterpret_cast(&width), sizeof(int)); + file.read(reinterpret_cast(&height), sizeof(int)); + file.read(reinterpret_cast(&channels), sizeof(int)); + + //step2: 获取像素信息 + size_t pixel_count = 
static_cast(width) * height; + if(channels == 3){ + src1 = nullptr; + src2 = new myPixel[pixel_count]; + + std::vector tmp_buffer(pixel_count * 3); + file.read(reinterpret_cast(tmp_buffer.data()), pixel_count * 3); + for(size_t i = 0; i < pixel_count; ++i) { + uint8_t r = tmp_buffer[i * 3 + 0]; + uint8_t g = tmp_buffer[i * 3 + 1]; + uint8_t b = tmp_buffer[i * 3 + 2]; + src2[i] = myPixel(r, g, b); // 构造函数会自动把 _A 设为 255 + } + } + else if(channels == 1){ + src1 = new uint8_t[width * height]; + src2 = nullptr; + file.read(reinterpret_cast(src1), pixel_count); + } + else{ + std::cerr << "Error: Unsupported channels = " << channels << std::endl; + return false; + } + + file.close(); + return true; +} + + + +//对比bin文件并返回MAE +float binDiff(const std::string& binPath_ref, const std::string& binPath_test){ + std::ifstream file_ref(binPath_ref, std::ios::binary); + std::ifstream file_test(binPath_test, std::ios::binary); + + if(!file_ref || !file_test){ + std::cerr << "Error: Could not open file for comparison." << std::endl; + return -1.0f; + } + + int w1, h1, c1; + int w2, h2, c2; + file_ref.read(reinterpret_cast(&w1), sizeof(int)); + file_ref.read(reinterpret_cast(&h1), sizeof(int)); + file_ref.read(reinterpret_cast(&c1), sizeof(int)); + file_test.read(reinterpret_cast(&w2), sizeof(int)); + file_test.read(reinterpret_cast(&h2), sizeof(int)); + file_test.read(reinterpret_cast(&c2), sizeof(int)); + if(w1 != w2 || h1 != h2 || c1 != c2){ + std::cerr << "Error: Image size not match." 
<< std::endl; + return -1.0f; + } + + size_t totalPixels = static_cast(w1) * h1 * c1; + std::vector data_ref(totalPixels); + std::vector data_test(totalPixels); + file_ref.read(reinterpret_cast(data_ref.data()), totalPixels); + file_test.read(reinterpret_cast(data_test.data()), totalPixels); + + file_ref.close(); + file_test.close(); + + long totalError = 0; + for (size_t i = 0; i < totalPixels; ++i) { + totalError += std::abs(int(data_ref[i]) - int(data_test[i])); + } + + float MAE = double(totalError) / double(totalPixels); + return MAE; +} diff --git a/08_bilateral_filter/Jason-Young123/src/bilateral.cu b/08_bilateral_filter/Jason-Young123/src/bilateral.cu new file mode 100644 index 0000000..4c61891 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/bilateral.cu @@ -0,0 +1,262 @@ +#include +#include +#include + + + + +//掐头去尾求平均用时 +float ave_time(const std::vector& time_ms, size_t size) { + if (size < 3 || time_ms.size() < size) { + float sum = std::accumulate(time_ms.begin(), time_ms.begin() + std::min(size, time_ms.size()), 0.0f); + return sum / std::min(size, time_ms.size()); + } + + auto it_begin = time_ms.begin(); + auto it_end = time_ms.begin() + size; + + //遍历寻找最大最小值 + auto [min_it, max_it] = std::minmax_element(it_begin, it_end); + float min_val = *min_it; + float max_val = *max_it; + + float sum = std::accumulate(it_begin, it_end, 0.0f); + + return (sum - min_val - max_val) / (float)(size - 2); +} + + + + +int autoSelectRadius(const uint8_t* src_gray, const myPixel* src_rgb, int channels, int width, int height, int radius, float sigma_spatial){ + if(radius > 0){//指定合法半径,直接使用 + return radius; + } + + float base_r = 3.0f * sigma_spatial; + + long long total_diff = 0; + int samples = 0; + int stride_x = std::max(1, width / 50); + int stride_y = std::max(1, height / 50); + + for(int i = 0; i < height - 1; i += stride_y){ + for(int j = 0; j < width - 1; j += stride_x){ + int idx = i * width + j; + int idx_right = idx + 1; + int idx_down = std::min(i + 
1, height - 1) * width + j; + if(channels == 1 && src_gray){ + total_diff += std::abs(int(src_gray[idx]) - int(src_gray[idx_right])); + total_diff += std::abs(int(src_gray[idx]) - int(src_gray[idx_down])); + } + else if(channels == 3 && src_rgb){ + total_diff += std::abs(int(src_rgb[idx].R()) - int(src_rgb[idx_right].R())); + total_diff += std::abs(int(src_rgb[idx].G()) - int(src_rgb[idx_right].G())); + total_diff += std::abs(int(src_rgb[idx].B()) - int(src_rgb[idx_right].B())); + total_diff += std::abs(int(src_rgb[idx].R()) - int(src_rgb[idx_down].R())); + total_diff += std::abs(int(src_rgb[idx].G()) - int(src_rgb[idx_down].G())); + total_diff += std::abs(int(src_rgb[idx].B()) - int(src_rgb[idx_down].B())); + } + else{ + + } + samples++; + } + } + + double divisor = (channels == 1) ? 2.0 : 6.0; + float avg_diff = (samples > 0) ? (double)total_diff / (samples * divisor) : 0; + + float factor = 0.6f + (avg_diff - 2.0f) * (1.4f - 0.6f) / (15.0f - 2.0f); + factor = std::max(0.6f, std::min(1.4f, factor)); + + int final_r = (int)(base_r * factor + 0.5f); + if (final_r < 3) final_r = 3; + if (final_r > 10) final_r = 10; + + return final_r; + +} + + + + + +void runSingleCase(const std::filesystem::path& casePath, int radius, float sigma_spatial, float sigma_color, int warmup_round, int test_round){ + std::string fileName = casePath.filename().string();//文件名,如cameraman.bin + std::string filePath = casePath.string();//文件路径 + //std::string outPath = "result/gpu/" + fileName; + //std::string refPath = "result/opencv/" + fileName; + + int width, height; + uint8_t* src_gray = nullptr; + myPixel* src_rgb = nullptr; + getBin(filePath, width, height, src_gray, src_rgb); + int channels = src_gray ? 1 : 3; + float sigma_color_sq = 2 * sigma_color * sigma_color; + float sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial; + + + std::string outPath = (channels == 1) ? ("result/gpu/gray/" + fileName) : ("result/gpu/rgb/" + fileName); + std::string refPath = (channels == 1) ? 
("result/opencv/gray/" + fileName) : ("result/opencv/rgb/" + fileName); + + + std::cout << BLUE << " [ " << casePath.stem().string() << " ] " << ":"; + if (width == 3840 && height == 2160) { + std::cout << " 4K(3840 * 2160 * " << channels << ")" << RESET << std::endl; + } else { + std::cout << " " << width << " * " << height << " * " << channels << RESET << std::endl; + } + int radius1 = autoSelectRadius(src_gray, src_rgb, channels, width, height, radius, sigma_spatial);//自适应调整半径 + std::string msg = (radius <= 0) ? "(Auto)" : ""; + std::cout << "Param: " << "radius = " << radius1 << msg << " sigma_s = " << sigma_spatial << " sigma_c = " << sigma_color << std::endl; + + auto runCase = [&](auto* h_src){ + using Type = std::remove_pointer_t; + size_t src_size = static_cast(width + 2 * radius1) * (height + 2 * radius1) * sizeof(Type); + size_t dst_size = static_cast(width) * height * sizeof(Type); + Type* h_src_r101 = nullptr, *h_dst = new Type[width * height]; + h_src_r101 = Reflect101(h_src, width, height, radius1); + + //gpu版本 + RUNTIME_CHECK(cudaHostRegister(h_src_r101, src_size, cudaHostRegisterDefault)); + RUNTIME_CHECK(cudaHostRegister(h_dst, dst_size, cudaHostRegisterDefault)); + + //预分配显存, 只需进行一次 + Type *d_src, *d_dst; + cudaStream_t stream; + RUNTIME_CHECK(cudaMalloc(&d_src, src_size)); + RUNTIME_CHECK(cudaMalloc(&d_dst, dst_size)); + RUNTIME_CHECK(cudaStreamCreate(&stream)); + + for(int i = 0; i < warmup_round; ++i){//warmup + runFilterPure(h_src_r101, d_src, d_dst, h_dst, radius1, width, height, sigma_spatial_sq, sigma_color_sq, src_size, dst_size, stream); + } + std::vector time_ms; + for(int i = 0; i < test_round; ++i){//test + auto start = std::chrono::high_resolution_clock::now(); + runFilterPure(h_src_r101, d_src, d_dst, h_dst, radius1, width, height, sigma_spatial_sq, sigma_color_sq, src_size, dst_size, stream); + auto end = std::chrono::high_resolution_clock::now(); + float duration = std::chrono::duration(end - start).count(); + 
time_ms.push_back(duration); + } + + //清理资源, 只需进行一次 + RUNTIME_CHECK(cudaHostUnregister(h_src_r101)); + RUNTIME_CHECK(cudaHostUnregister(h_dst)); + RUNTIME_CHECK(cudaFree(d_src)); + RUNTIME_CHECK(cudaFree(d_dst)); + RUNTIME_CHECK(cudaStreamDestroy(stream)); + + float time_ms_gpu = ave_time(time_ms, test_round); + float throughput_gpu = (width * height / 1000000.0f) / (time_ms_gpu / 1000.0f); + genBin(outPath, h_dst, width, height); + printf("gpu( cuda ): Time: %8.3f ms | Throughput: %8.2f MP/s \n", time_ms_gpu, throughput_gpu); + +#ifdef HAS_CV + std::vector time_ms1; + for(int i = 0; i < test_round; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + runFilter_cv(h_src, radius1, width, height, sigma_spatial, sigma_color, h_dst); + auto end = std::chrono::high_resolution_clock::now(); + float duration = std::chrono::duration(end - start).count(); + time_ms1.push_back(duration); + } + float time_ms_cv = ave_time(time_ms1, test_round); + float throughput_cv = (width * height / 1000000.0f) / (time_ms_cv / 1000.0f); + genBin(refPath, h_dst, width, height); + + printf("cpu(opencv): Time: %8.3f ms | Throughput: %8.2f MP/s \n", time_ms_cv, throughput_cv); + + std::cout << "Acceleration Ratio: " << throughput_gpu / throughput_cv << std::endl; + + float MAE = binDiff(refPath, outPath); + if (MAE >= 1) { + std::cout << "MAE: " << RED << MAE << " ( failed )" << RESET << std::endl << std::endl; + } else { + std::cout << "MAE: " << GREEN << MAE << " ( passed )" << RESET << std::endl << std::endl; + } +#endif + + delete[] h_src_r101; delete[] h_dst; + }; + + if (channels == 1) { + runCase(src_gray); + delete[] src_gray; + } else { + runCase(src_rgb); + delete[] src_rgb; + } + +} + + + + + +void runAll(const std::string testerPath, int warmup_round, int test_round){ + //step0: preparation, get configuration info + std::string cfgPath = testerPath + "/config.txt";//tester/config.txt + int radius = 5; + float sigma_color = 30.0f; + float sigma_spatial = 3.0f; + 
getCfg(cfgPath, radius, sigma_spatial, sigma_color); + if(sigma_color <= 0.0f || sigma_spatial <= 0.0f){ + std::cout << "Invalid argument: sigma_spatial/sigma_color, Quit..." << std::endl; + return; + } + + float sigma_color_sq = 2 * sigma_color * sigma_color; + float sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial; + int ret = std::system("mkdir -p result/gpu/gray result/gpu/rgb result/opencv/gray result/opencv/rgb");//存储结果 + (void)ret; + + //init spatial_lut + //float h_spatial_lut_data[6][6]; + //for(int i = 0; i < 6; ++i){ + // for(int j = 0; j < 6; ++j){ + // h_spatial_lut_data[i][j] = expf(-(float)(i * i + j * j) / (2.0f * sigma_spatial * sigma_spatial)); + // } + //} + //cudaMemcpyToSymbol(spatial_lut_data, h_spatial_lut_data, sizeof(h_spatial_lut_data)); + + //init color_lut + //float h_color_lut_data[768]; + //for(int i = 0; i < 768; ++i){ + // h_color_lut_data[i] = expf(-(float)(i * i) / (2.0f * sigma_color * sigma_color)); + //} + //cudaMemcpyToSymbol(color_lut_data, h_color_lut_data, sizeof(h_color_lut_data)); + + + //step1: 灰度测试 + std::cout << "\n" << BOLD << BLUE << " ### Tester of gray images: ### " << RESET << std::endl << std::endl; + std::string grayPath = testerPath + "/gray/4K"; + for(const auto& entry : std::filesystem::recursive_directory_iterator(grayPath)){ + if(entry.is_regular_file()){ + runSingleCase(entry.path(), radius, sigma_spatial, sigma_color, warmup_round, test_round); + } + } + + + //step2: RGB测试 + std::cout << "\n" << BOLD << BLUE << " ### Tester of RGB images: ### " << RESET << std::endl << std::endl; + std::string rgbPath = testerPath + "/rgb/4K"; + for(const auto& entry : std::filesystem::recursive_directory_iterator(rgbPath)){ + if(entry.is_regular_file()){ + runSingleCase(entry.path(), radius, sigma_spatial, sigma_color, warmup_round, test_round); + } + } + +} + + + + + + + + + + + diff --git a/08_bilateral_filter/Jason-Young123/src/bilateral.maca b/08_bilateral_filter/Jason-Young123/src/bilateral.maca new file mode 
100644 index 0000000..5723df7 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/bilateral.maca @@ -0,0 +1,262 @@ +#include +#include +#include + + + + +//掐头去尾求平均用时 +float ave_time(const std::vector& time_ms, size_t size) { + if (size < 3 || time_ms.size() < size) { + float sum = std::accumulate(time_ms.begin(), time_ms.begin() + std::min(size, time_ms.size()), 0.0f); + return sum / std::min(size, time_ms.size()); + } + + auto it_begin = time_ms.begin(); + auto it_end = time_ms.begin() + size; + + //遍历寻找最大最小值 + auto [min_it, max_it] = std::minmax_element(it_begin, it_end); + float min_val = *min_it; + float max_val = *max_it; + + float sum = std::accumulate(it_begin, it_end, 0.0f); + + return (sum - min_val - max_val) / (float)(size - 2); +} + + + + +int autoSelectRadius(const uint8_t* src_gray, const myPixel* src_rgb, int channels, int width, int height, int radius, float sigma_spatial){ + if(radius > 0){//指定合法半径,直接使用 + return radius; + } + + float base_r = 3.0f * sigma_spatial; + + long long total_diff = 0; + int samples = 0; + int stride_x = std::max(1, width / 50); + int stride_y = std::max(1, height / 50); + + for(int i = 0; i < height - 1; i += stride_y){ + for(int j = 0; j < width - 1; j += stride_x){ + int idx = i * width + j; + int idx_right = idx + 1; + int idx_down = std::min(i + 1, height - 1) * width + j; + if(channels == 1 && src_gray){ + total_diff += std::abs(int(src_gray[idx]) - int(src_gray[idx_right])); + total_diff += std::abs(int(src_gray[idx]) - int(src_gray[idx_down])); + } + else if(channels == 3 && src_rgb){ + total_diff += std::abs(int(src_rgb[idx].R()) - int(src_rgb[idx_right].R())); + total_diff += std::abs(int(src_rgb[idx].G()) - int(src_rgb[idx_right].G())); + total_diff += std::abs(int(src_rgb[idx].B()) - int(src_rgb[idx_right].B())); + total_diff += std::abs(int(src_rgb[idx].R()) - int(src_rgb[idx_down].R())); + total_diff += std::abs(int(src_rgb[idx].G()) - int(src_rgb[idx_down].G())); + total_diff += 
std::abs(int(src_rgb[idx].B()) - int(src_rgb[idx_down].B())); + } + else{ + + } + samples++; + } + } + + double divisor = (channels == 1) ? 2.0 : 6.0; + float avg_diff = (samples > 0) ? (double)total_diff / (samples * divisor) : 0; + + float factor = 0.6f + (avg_diff - 2.0f) * (1.4f - 0.6f) / (15.0f - 2.0f); + factor = std::max(0.6f, std::min(1.4f, factor)); + + int final_r = (int)(base_r * factor + 0.5f); + if (final_r < 3) final_r = 3; + if (final_r > 10) final_r = 10; + + return final_r; + +} + + + + + +void runSingleCase(const std::filesystem::path& casePath, int radius, float sigma_spatial, float sigma_color, int warmup_round, int test_round){ + std::string fileName = casePath.filename().string();//文件名,如cameraman.bin + std::string filePath = casePath.string();//文件路径 + //std::string outPath = "result/gpu/" + fileName; + //std::string refPath = "result/opencv/" + fileName; + + int width, height; + uint8_t* src_gray = nullptr; + myPixel* src_rgb = nullptr; + getBin(filePath, width, height, src_gray, src_rgb); + int channels = src_gray ? 1 : 3; + float sigma_color_sq = 2 * sigma_color * sigma_color; + float sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial; + + + std::string outPath = (channels == 1) ? ("result/gpu/gray/" + fileName) : ("result/gpu/rgb/" + fileName); + std::string refPath = (channels == 1) ? ("result/opencv/gray/" + fileName) : ("result/opencv/rgb/" + fileName); + + + std::cout << BLUE << " [ " << casePath.stem().string() << " ] " << ":"; + if (width == 3840 && height == 2160) { + std::cout << " 4K(3840 * 2160 * " << channels << ")" << RESET << std::endl; + } else { + std::cout << " " << width << " * " << height << " * " << channels << RESET << std::endl; + } + int radius1 = autoSelectRadius(src_gray, src_rgb, channels, width, height, radius, sigma_spatial);//自适应调整半径 + std::string msg = (radius <= 0) ? 
"(Auto)" : ""; + std::cout << "Param: " << "radius = " << radius1 << msg << " sigma_s = " << sigma_spatial << " sigma_c = " << sigma_color << std::endl; + + auto runCase = [&](auto* h_src){ + using Type = std::remove_pointer_t; + size_t src_size = static_cast(width + 2 * radius1) * (height + 2 * radius1) * sizeof(Type); + size_t dst_size = static_cast(width) * height * sizeof(Type); + Type* h_src_r101 = nullptr, *h_dst = new Type[width * height]; + h_src_r101 = Reflect101(h_src, width, height, radius1); + + //gpu版本 + RUNTIME_CHECK(mcHostRegister(h_src_r101, src_size, mcHostRegisterDefault)); + RUNTIME_CHECK(mcHostRegister(h_dst, dst_size, mcHostRegisterDefault)); + + //预分配显存, 只需进行一次 + Type *d_src, *d_dst; + mcStream_t stream; + RUNTIME_CHECK(mcMalloc(&d_src, src_size)); + RUNTIME_CHECK(mcMalloc(&d_dst, dst_size)); + RUNTIME_CHECK(mcStreamCreate(&stream)); + + for(int i = 0; i < warmup_round; ++i){//warmup + runFilterPure(h_src_r101, d_src, d_dst, h_dst, radius1, width, height, sigma_spatial_sq, sigma_color_sq, src_size, dst_size, stream); + } + std::vector time_ms; + for(int i = 0; i < test_round; ++i){//test + auto start = std::chrono::high_resolution_clock::now(); + runFilterPure(h_src_r101, d_src, d_dst, h_dst, radius1, width, height, sigma_spatial_sq, sigma_color_sq, src_size, dst_size, stream); + auto end = std::chrono::high_resolution_clock::now(); + float duration = std::chrono::duration(end - start).count(); + time_ms.push_back(duration); + } + + //清理资源, 只需进行一次 + RUNTIME_CHECK(mcHostUnregister(h_src_r101)); + RUNTIME_CHECK(mcHostUnregister(h_dst)); + RUNTIME_CHECK(mcFree(d_src)); + RUNTIME_CHECK(mcFree(d_dst)); + RUNTIME_CHECK(mcStreamDestroy(stream)); + + float time_ms_gpu = ave_time(time_ms, test_round); + float throughput_gpu = (width * height / 1000000.0f) / (time_ms_gpu / 1000.0f); + genBin(outPath, h_dst, width, height); + printf("gpu( mc ): Time: %8.3f ms | Throughput: %8.2f MP/s \n", time_ms_gpu, throughput_gpu); + +#ifdef HAS_CV + std::vector 
time_ms1; + for(int i = 0; i < test_round; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + runFilter_cv(h_src, radius1, width, height, sigma_spatial, sigma_color, h_dst); + auto end = std::chrono::high_resolution_clock::now(); + float duration = std::chrono::duration(end - start).count(); + time_ms1.push_back(duration); + } + float time_ms_cv = ave_time(time_ms1, test_round); + float throughput_cv = (width * height / 1000000.0f) / (time_ms_cv / 1000.0f); + genBin(refPath, h_dst, width, height); + + printf("cpu(opencv): Time: %8.3f ms | Throughput: %8.2f MP/s \n", time_ms_cv, throughput_cv); + + std::cout << "Acceleration Ratio: " << throughput_gpu / throughput_cv << std::endl; + + float MAE = binDiff(refPath, outPath); + if (MAE >= 1) { + std::cout << "MAE: " << RED << MAE << " ( failed )" << RESET << std::endl << std::endl; + } else { + std::cout << "MAE: " << GREEN << MAE << " ( passed )" << RESET << std::endl << std::endl; + } +#endif + + delete[] h_src_r101; delete[] h_dst; + }; + + if (channels == 1) { + runCase(src_gray); + delete[] src_gray; + } else { + runCase(src_rgb); + delete[] src_rgb; + } + +} + + + + + +void runAll(const std::string testerPath, int warmup_round, int test_round){ + //step0: preparation, get configuration info + std::string cfgPath = testerPath + "/config.txt";//tester/config.txt + int radius = 5; + float sigma_color = 30.0f; + float sigma_spatial = 3.0f; + getCfg(cfgPath, radius, sigma_spatial, sigma_color); + if(sigma_color <= 0.0f || sigma_spatial <= 0.0f){ + std::cout << "Invalid argument: sigma_spatial/sigma_color, Quit..." 
<< std::endl; + return; + } + + float sigma_color_sq = 2 * sigma_color * sigma_color; + float sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial; + int ret = std::system("mkdir -p result/gpu/gray result/gpu/rgb result/opencv/gray result/opencv/rgb");//存储结果 + (void)ret; + + //init spatial_lut + //float h_spatial_lut_data[6][6]; + //for(int i = 0; i < 6; ++i){ + // for(int j = 0; j < 6; ++j){ + // h_spatial_lut_data[i][j] = expf(-(float)(i * i + j * j) / (2.0f * sigma_spatial * sigma_spatial)); + // } + //} + //mcMemcpyToSymbol(spatial_lut_data, h_spatial_lut_data, sizeof(h_spatial_lut_data)); + + //init color_lut + //float h_color_lut_data[768]; + //for(int i = 0; i < 768; ++i){ + // h_color_lut_data[i] = expf(-(float)(i * i) / (2.0f * sigma_color * sigma_color)); + //} + //mcMemcpyToSymbol(color_lut_data, h_color_lut_data, sizeof(h_color_lut_data)); + + + //step1: 灰度测试 + std::cout << "\n" << BOLD << BLUE << " ### Tester of gray images: ### " << RESET << std::endl << std::endl; + std::string grayPath = testerPath + "/gray/4K"; + for(const auto& entry : std::filesystem::recursive_directory_iterator(grayPath)){ + if(entry.is_regular_file()){ + runSingleCase(entry.path(), radius, sigma_spatial, sigma_color, warmup_round, test_round); + } + } + + + //step2: RGB测试 + std::cout << "\n" << BOLD << BLUE << " ### Tester of RGB images: ### " << RESET << std::endl << std::endl; + std::string rgbPath = testerPath + "/rgb/4K"; + for(const auto& entry : std::filesystem::recursive_directory_iterator(rgbPath)){ + if(entry.is_regular_file()){ + runSingleCase(entry.path(), radius, sigma_spatial, sigma_color, warmup_round, test_round); + } + } + +} + + + + + + + + + + + diff --git a/08_bilateral_filter/Jason-Young123/src/bilateral.mu b/08_bilateral_filter/Jason-Young123/src/bilateral.mu new file mode 100644 index 0000000..ae6904f --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/bilateral.mu @@ -0,0 +1,262 @@ +#include +#include +#include + + + + +//掐头去尾求平均用时 +float 
ave_time(const std::vector& time_ms, size_t size) { + if (size < 3 || time_ms.size() < size) { + float sum = std::accumulate(time_ms.begin(), time_ms.begin() + std::min(size, time_ms.size()), 0.0f); + return sum / std::min(size, time_ms.size()); + } + + auto it_begin = time_ms.begin(); + auto it_end = time_ms.begin() + size; + + //遍历寻找最大最小值 + auto [min_it, max_it] = std::minmax_element(it_begin, it_end); + float min_val = *min_it; + float max_val = *max_it; + + float sum = std::accumulate(it_begin, it_end, 0.0f); + + return (sum - min_val - max_val) / (float)(size - 2); +} + + + + +int autoSelectRadius(const uint8_t* src_gray, const myPixel* src_rgb, int channels, int width, int height, int radius, float sigma_spatial){ + if(radius > 0){//指定合法半径,直接使用 + return radius; + } + + float base_r = 3.0f * sigma_spatial; + + long long total_diff = 0; + int samples = 0; + int stride_x = std::max(1, width / 50); + int stride_y = std::max(1, height / 50); + + for(int i = 0; i < height - 1; i += stride_y){ + for(int j = 0; j < width - 1; j += stride_x){ + int idx = i * width + j; + int idx_right = idx + 1; + int idx_down = std::min(i + 1, height - 1) * width + j; + if(channels == 1 && src_gray){ + total_diff += std::abs(int(src_gray[idx]) - int(src_gray[idx_right])); + total_diff += std::abs(int(src_gray[idx]) - int(src_gray[idx_down])); + } + else if(channels == 3 && src_rgb){ + total_diff += std::abs(int(src_rgb[idx].R()) - int(src_rgb[idx_right].R())); + total_diff += std::abs(int(src_rgb[idx].G()) - int(src_rgb[idx_right].G())); + total_diff += std::abs(int(src_rgb[idx].B()) - int(src_rgb[idx_right].B())); + total_diff += std::abs(int(src_rgb[idx].R()) - int(src_rgb[idx_down].R())); + total_diff += std::abs(int(src_rgb[idx].G()) - int(src_rgb[idx_down].G())); + total_diff += std::abs(int(src_rgb[idx].B()) - int(src_rgb[idx_down].B())); + } + else{ + + } + samples++; + } + } + + double divisor = (channels == 1) ? 2.0 : 6.0; + float avg_diff = (samples > 0) ? 
(double)total_diff / (samples * divisor) : 0; + + float factor = 0.6f + (avg_diff - 2.0f) * (1.4f - 0.6f) / (15.0f - 2.0f); + factor = std::max(0.6f, std::min(1.4f, factor)); + + int final_r = (int)(base_r * factor + 0.5f); + if (final_r < 3) final_r = 3; + if (final_r > 10) final_r = 10; + + return final_r; + +} + + + + + +void runSingleCase(const std::filesystem::path& casePath, int radius, float sigma_spatial, float sigma_color, int warmup_round, int test_round){ + std::string fileName = casePath.filename().string();//文件名,如cameraman.bin + std::string filePath = casePath.string();//文件路径 + //std::string outPath = "result/gpu/" + fileName; + //std::string refPath = "result/opencv/" + fileName; + + int width, height; + uint8_t* src_gray = nullptr; + myPixel* src_rgb = nullptr; + getBin(filePath, width, height, src_gray, src_rgb); + int channels = src_gray ? 1 : 3; + float sigma_color_sq = 2 * sigma_color * sigma_color; + float sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial; + + + std::string outPath = (channels == 1) ? ("result/gpu/gray/" + fileName) : ("result/gpu/rgb/" + fileName); + std::string refPath = (channels == 1) ? ("result/opencv/gray/" + fileName) : ("result/opencv/rgb/" + fileName); + + + std::cout << BLUE << " [ " << casePath.stem().string() << " ] " << ":"; + if (width == 3840 && height == 2160) { + std::cout << " 4K(3840 * 2160 * " << channels << ")" << RESET << std::endl; + } else { + std::cout << " " << width << " * " << height << " * " << channels << RESET << std::endl; + } + int radius1 = autoSelectRadius(src_gray, src_rgb, channels, width, height, radius, sigma_spatial);//自适应调整半径 + std::string msg = (radius <= 0) ? 
"(Auto)" : ""; + std::cout << "Param: " << "radius = " << radius1 << msg << " sigma_s = " << sigma_spatial << " sigma_c = " << sigma_color << std::endl; + + auto runCase = [&](auto* h_src){ + using Type = std::remove_pointer_t; + size_t src_size = static_cast(width + 2 * radius1) * (height + 2 * radius1) * sizeof(Type); + size_t dst_size = static_cast(width) * height * sizeof(Type); + Type* h_src_r101 = nullptr, *h_dst = new Type[width * height]; + h_src_r101 = Reflect101(h_src, width, height, radius1); + + //gpu版本 + RUNTIME_CHECK(musaHostRegister(h_src_r101, src_size, musaHostRegisterDefault)); + RUNTIME_CHECK(musaHostRegister(h_dst, dst_size, musaHostRegisterDefault)); + + //预分配显存, 只需进行一次 + Type *d_src, *d_dst; + musaStream_t stream; + RUNTIME_CHECK(musaMalloc(&d_src, src_size)); + RUNTIME_CHECK(musaMalloc(&d_dst, dst_size)); + RUNTIME_CHECK(musaStreamCreate(&stream)); + + for(int i = 0; i < warmup_round; ++i){//warmup + runFilterPure(h_src_r101, d_src, d_dst, h_dst, radius1, width, height, sigma_spatial_sq, sigma_color_sq, src_size, dst_size, stream); + } + std::vector time_ms; + for(int i = 0; i < test_round; ++i){//test + auto start = std::chrono::high_resolution_clock::now(); + runFilterPure(h_src_r101, d_src, d_dst, h_dst, radius1, width, height, sigma_spatial_sq, sigma_color_sq, src_size, dst_size, stream); + auto end = std::chrono::high_resolution_clock::now(); + float duration = std::chrono::duration(end - start).count(); + time_ms.push_back(duration); + } + + //清理资源, 只需进行一次 + RUNTIME_CHECK(musaHostUnregister(h_src_r101)); + RUNTIME_CHECK(musaHostUnregister(h_dst)); + RUNTIME_CHECK(musaFree(d_src)); + RUNTIME_CHECK(musaFree(d_dst)); + RUNTIME_CHECK(musaStreamDestroy(stream)); + + float time_ms_gpu = ave_time(time_ms, test_round); + float throughput_gpu = (width * height / 1000000.0f) / (time_ms_gpu / 1000.0f); + genBin(outPath, h_dst, width, height); + printf("gpu( musa ): Time: %8.3f ms | Throughput: %8.2f MP/s \n", time_ms_gpu, throughput_gpu); + 
+#ifdef HAS_CV + std::vector time_ms1; + for(int i = 0; i < test_round; ++i) { + auto start = std::chrono::high_resolution_clock::now(); + runFilter_cv(h_src, radius1, width, height, sigma_spatial, sigma_color, h_dst); + auto end = std::chrono::high_resolution_clock::now(); + float duration = std::chrono::duration(end - start).count(); + time_ms1.push_back(duration); + } + float time_ms_cv = ave_time(time_ms1, test_round); + float throughput_cv = (width * height / 1000000.0f) / (time_ms_cv / 1000.0f); + genBin(refPath, h_dst, width, height); + + printf("cpu(opencv): Time: %8.3f ms | Throughput: %8.2f MP/s \n", time_ms_cv, throughput_cv); + + std::cout << "Acceleration Ratio: " << throughput_gpu / throughput_cv << std::endl; + + float MAE = binDiff(refPath, outPath); + if (MAE >= 1) { + std::cout << "MAE: " << RED << MAE << " ( failed )" << RESET << std::endl << std::endl; + } else { + std::cout << "MAE: " << GREEN << MAE << " ( passed )" << RESET << std::endl << std::endl; + } +#endif + + delete[] h_src_r101; delete[] h_dst; + }; + + if (channels == 1) { + runCase(src_gray); + delete[] src_gray; + } else { + runCase(src_rgb); + delete[] src_rgb; + } + +} + + + + + +void runAll(const std::string testerPath, int warmup_round, int test_round){ + //step0: preparation, get configuration info + std::string cfgPath = testerPath + "/config.txt";//tester/config.txt + int radius = 5; + float sigma_color = 30.0f; + float sigma_spatial = 3.0f; + getCfg(cfgPath, radius, sigma_spatial, sigma_color); + if(sigma_color <= 0.0f || sigma_spatial <= 0.0f){ + std::cout << "Invalid argument: sigma_spatial/sigma_color, Quit..." 
<< std::endl; + return; + } + + float sigma_color_sq = 2 * sigma_color * sigma_color; + float sigma_spatial_sq = 2 * sigma_spatial * sigma_spatial; + int ret = std::system("mkdir -p result/gpu/gray result/gpu/rgb result/opencv/gray result/opencv/rgb");//存储结果 + (void)ret; + + //init spatial_lut + //float h_spatial_lut_data[6][6]; + //for(int i = 0; i < 6; ++i){ + // for(int j = 0; j < 6; ++j){ + // h_spatial_lut_data[i][j] = expf(-(float)(i * i + j * j) / (2.0f * sigma_spatial * sigma_spatial)); + // } + //} + //musaMemcpyToSymbol(spatial_lut_data, h_spatial_lut_data, sizeof(h_spatial_lut_data)); + + //init color_lut + //float h_color_lut_data[768]; + //for(int i = 0; i < 768; ++i){ + // h_color_lut_data[i] = expf(-(float)(i * i) / (2.0f * sigma_color * sigma_color)); + //} + //musaMemcpyToSymbol(color_lut_data, h_color_lut_data, sizeof(h_color_lut_data)); + + + //step1: 灰度测试 + std::cout << "\n" << BOLD << BLUE << " ### Tester of gray images: ### " << RESET << std::endl << std::endl; + std::string grayPath = testerPath + "/gray/4K"; + for(const auto& entry : std::filesystem::recursive_directory_iterator(grayPath)){ + if(entry.is_regular_file()){ + runSingleCase(entry.path(), radius, sigma_spatial, sigma_color, warmup_round, test_round); + } + } + + + //step2: RGB测试 + std::cout << "\n" << BOLD << BLUE << " ### Tester of RGB images: ### " << RESET << std::endl << std::endl; + std::string rgbPath = testerPath + "/rgb/4K"; + for(const auto& entry : std::filesystem::recursive_directory_iterator(rgbPath)){ + if(entry.is_regular_file()){ + runSingleCase(entry.path(), radius, sigma_spatial, sigma_color, warmup_round, test_round); + } + } + +} + + + + + + + + + + + diff --git a/08_bilateral_filter/Jason-Young123/src/main.cu b/08_bilateral_filter/Jason-Young123/src/main.cu new file mode 100644 index 0000000..a3d44ce --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/main.cu @@ -0,0 +1,25 @@ +#include +#include + +int main(){ + +#ifndef HAS_CV + std::cout << "\n" << 
BOLD << RED + << "******************************************************************\n" + << " [WARNING] OpenCV is NOT detected on this server!\n" + << " [NOTICE ] CPU Reference will be SKIPPED!\n" + << "******************************************************************" + << RESET << std::endl; +#endif + + //genTester(); + //return 0; + int warmup_round = 3; + int test_round = 15; + //bool only4K = false; + //runTester("tester", warmup_round, test_round, only4K); + + runAll("tester", warmup_round, test_round); + + return 0; +} \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/src/main.maca b/08_bilateral_filter/Jason-Young123/src/main.maca new file mode 100644 index 0000000..a3d44ce --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/main.maca @@ -0,0 +1,25 @@ +#include +#include + +int main(){ + +#ifndef HAS_CV + std::cout << "\n" << BOLD << RED + << "******************************************************************\n" + << " [WARNING] OpenCV is NOT detected on this server!\n" + << " [NOTICE ] CPU Reference will be SKIPPED!\n" + << "******************************************************************" + << RESET << std::endl; +#endif + + //genTester(); + //return 0; + int warmup_round = 3; + int test_round = 15; + //bool only4K = false; + //runTester("tester", warmup_round, test_round, only4K); + + runAll("tester", warmup_round, test_round); + + return 0; +} \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/src/main.mu b/08_bilateral_filter/Jason-Young123/src/main.mu new file mode 100644 index 0000000..a3d44ce --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/main.mu @@ -0,0 +1,25 @@ +#include +#include + +int main(){ + +#ifndef HAS_CV + std::cout << "\n" << BOLD << RED + << "******************************************************************\n" + << " [WARNING] OpenCV is NOT detected on this server!\n" + << " [NOTICE ] CPU Reference will be SKIPPED!\n" + << 
"******************************************************************" + << RESET << std::endl; +#endif + + //genTester(); + //return 0; + int warmup_round = 3; + int test_round = 15; + //bool only4K = false; + //runTester("tester", warmup_round, test_round, only4K); + + runAll("tester", warmup_round, test_round); + + return 0; +} \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/src/myopencv.cu b/08_bilateral_filter/Jason-Young123/src/myopencv.cu new file mode 100644 index 0000000..30c2692 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/myopencv.cu @@ -0,0 +1,96 @@ +#include +#include + +#ifdef HAS_CV + +//基于cv::Mat生成bin文件 +bool genBin_cv(const std::string& outBinPath, const cv::Mat& img){ + std::ofstream file(outBinPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << outBinPath << std::endl; + return false; + } + int width = img.cols; + int height = img.rows; + int channels = img.channels(); + file.write(reinterpret_cast(&width), sizeof(int)); + file.write(reinterpret_cast(&height), sizeof(int)); + file.write(reinterpret_cast(&channels), sizeof(int)); + file.write(reinterpret_cast(img.data), size_t(width * height * channels)); + + file.close(); + std::cout << "Successfully genBin: " << outBinPath << std::endl; + return true; +} + + +//生成测试用例 +void genTester(){ + std::string baseResource = "resource"; + std::string baseTester = "tester"; + + struct Config { + std::string subType; + bool isRGB; + }; + + Config configs[] = { + {"gray", false}, + {"rgb", true} + }; + + for (const auto& cfg : configs) { + std::filesystem::path srcRoot = std::filesystem::path(baseResource) / cfg.subType; + + if (!std::filesystem::exists(srcRoot)) { + std::cout << "Directory not found: " << srcRoot.string() << std::endl; + continue; + } + + //遍历: gray/4K, gray/others, rgb/4K, rgb/others + for (const auto& entry : std::filesystem::recursive_directory_iterator(srcRoot)) { + if (!entry.is_regular_file()) { + continue; + } 
+ + std::filesystem::path filePath = entry.path(); + + cv::Mat img = cv::imread(filePath.string(), cfg.isRGB ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE); + if (img.empty()) continue; + + if (cfg.isRGB) { + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + } + + std::filesystem::path relativePath = std::filesystem::relative(filePath, srcRoot); + std::filesystem::path outPath = std::filesystem::path(baseTester) / cfg.subType / relativePath; + outPath.replace_extension(".bin");//.bin raw文件 + + std::filesystem::create_directories(outPath.parent_path()); + + if (genBin_cv(outPath.string(), img)) { + std::cout << "[Tester] Converted: " << cfg.subType << "/" << relativePath.string() + << " -> " << outPath.string() << std::endl; + } + } + } +} + + +//基于cv::Mat生成图片 +bool genImg_cv(const std::string& outImgPath, const cv::Mat& img, bool isRGB){ + cv::Mat finalImg; + if(img.channels() == 3 && isRGB){ + cv::cvtColor(img, finalImg, cv::COLOR_RGB2BGR); + } + else{ + finalImg = img; + } + cv::imwrite(outImgPath, finalImg); + + std::cout << "Successfully genImg: " << outImgPath << std::endl; + return true; +} + + +#endif \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/src/myopencv.maca b/08_bilateral_filter/Jason-Young123/src/myopencv.maca new file mode 100644 index 0000000..30c2692 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/myopencv.maca @@ -0,0 +1,96 @@ +#include +#include + +#ifdef HAS_CV + +//基于cv::Mat生成bin文件 +bool genBin_cv(const std::string& outBinPath, const cv::Mat& img){ + std::ofstream file(outBinPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << outBinPath << std::endl; + return false; + } + int width = img.cols; + int height = img.rows; + int channels = img.channels(); + file.write(reinterpret_cast(&width), sizeof(int)); + file.write(reinterpret_cast(&height), sizeof(int)); + file.write(reinterpret_cast(&channels), sizeof(int)); + file.write(reinterpret_cast(img.data), size_t(width * height * 
channels)); + + file.close(); + std::cout << "Successfully genBin: " << outBinPath << std::endl; + return true; +} + + +//生成测试用例 +void genTester(){ + std::string baseResource = "resource"; + std::string baseTester = "tester"; + + struct Config { + std::string subType; + bool isRGB; + }; + + Config configs[] = { + {"gray", false}, + {"rgb", true} + }; + + for (const auto& cfg : configs) { + std::filesystem::path srcRoot = std::filesystem::path(baseResource) / cfg.subType; + + if (!std::filesystem::exists(srcRoot)) { + std::cout << "Directory not found: " << srcRoot.string() << std::endl; + continue; + } + + //遍历: gray/4K, gray/others, rgb/4K, rgb/others + for (const auto& entry : std::filesystem::recursive_directory_iterator(srcRoot)) { + if (!entry.is_regular_file()) { + continue; + } + + std::filesystem::path filePath = entry.path(); + + cv::Mat img = cv::imread(filePath.string(), cfg.isRGB ? cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE); + if (img.empty()) continue; + + if (cfg.isRGB) { + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + } + + std::filesystem::path relativePath = std::filesystem::relative(filePath, srcRoot); + std::filesystem::path outPath = std::filesystem::path(baseTester) / cfg.subType / relativePath; + outPath.replace_extension(".bin");//.bin raw文件 + + std::filesystem::create_directories(outPath.parent_path()); + + if (genBin_cv(outPath.string(), img)) { + std::cout << "[Tester] Converted: " << cfg.subType << "/" << relativePath.string() + << " -> " << outPath.string() << std::endl; + } + } + } +} + + +//基于cv::Mat生成图片 +bool genImg_cv(const std::string& outImgPath, const cv::Mat& img, bool isRGB){ + cv::Mat finalImg; + if(img.channels() == 3 && isRGB){ + cv::cvtColor(img, finalImg, cv::COLOR_RGB2BGR); + } + else{ + finalImg = img; + } + cv::imwrite(outImgPath, finalImg); + + std::cout << "Successfully genImg: " << outImgPath << std::endl; + return true; +} + + +#endif \ No newline at end of file diff --git 
a/08_bilateral_filter/Jason-Young123/src/myopencv.mu b/08_bilateral_filter/Jason-Young123/src/myopencv.mu new file mode 100644 index 0000000..30c2692 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/src/myopencv.mu @@ -0,0 +1,96 @@ +#include +#include + +#ifdef HAS_CV + +//基于cv::Mat生成bin文件 +bool genBin_cv(const std::string& outBinPath, const cv::Mat& img){ + std::ofstream file(outBinPath, std::ios::binary); + if(!file){ + std::cerr << "Error: Could not open file: " << outBinPath << std::endl; + return false; + } + int width = img.cols; + int height = img.rows; + int channels = img.channels(); + file.write(reinterpret_cast(&width), sizeof(int)); + file.write(reinterpret_cast(&height), sizeof(int)); + file.write(reinterpret_cast(&channels), sizeof(int)); + file.write(reinterpret_cast(img.data), size_t(width * height * channels)); + + file.close(); + std::cout << "Successfully genBin: " << outBinPath << std::endl; + return true; +} + + +//生成测试用例 +void genTester(){ + std::string baseResource = "resource"; + std::string baseTester = "tester"; + + struct Config { + std::string subType; + bool isRGB; + }; + + Config configs[] = { + {"gray", false}, + {"rgb", true} + }; + + for (const auto& cfg : configs) { + std::filesystem::path srcRoot = std::filesystem::path(baseResource) / cfg.subType; + + if (!std::filesystem::exists(srcRoot)) { + std::cout << "Directory not found: " << srcRoot.string() << std::endl; + continue; + } + + //遍历: gray/4K, gray/others, rgb/4K, rgb/others + for (const auto& entry : std::filesystem::recursive_directory_iterator(srcRoot)) { + if (!entry.is_regular_file()) { + continue; + } + + std::filesystem::path filePath = entry.path(); + + cv::Mat img = cv::imread(filePath.string(), cfg.isRGB ? 
cv::IMREAD_COLOR : cv::IMREAD_GRAYSCALE); + if (img.empty()) continue; + + if (cfg.isRGB) { + cv::cvtColor(img, img, cv::COLOR_BGR2RGB); + } + + std::filesystem::path relativePath = std::filesystem::relative(filePath, srcRoot); + std::filesystem::path outPath = std::filesystem::path(baseTester) / cfg.subType / relativePath; + outPath.replace_extension(".bin");//.bin raw文件 + + std::filesystem::create_directories(outPath.parent_path()); + + if (genBin_cv(outPath.string(), img)) { + std::cout << "[Tester] Converted: " << cfg.subType << "/" << relativePath.string() + << " -> " << outPath.string() << std::endl; + } + } + } +} + + +//基于cv::Mat生成图片 +bool genImg_cv(const std::string& outImgPath, const cv::Mat& img, bool isRGB){ + cv::Mat finalImg; + if(img.channels() == 3 && isRGB){ + cv::cvtColor(img, finalImg, cv::COLOR_RGB2BGR); + } + else{ + finalImg = img; + } + cv::imwrite(outImgPath, finalImg); + + std::cout << "Successfully genImg: " << outImgPath << std::endl; + return true; +} + + +#endif \ No newline at end of file diff --git a/08_bilateral_filter/Jason-Young123/tester/config.txt b/08_bilateral_filter/Jason-Young123/tester/config.txt new file mode 100644 index 0000000..6fa2360 --- /dev/null +++ b/08_bilateral_filter/Jason-Young123/tester/config.txt @@ -0,0 +1,3 @@ +radius = 5 +sigma_spatial = 3.0 +sigma_color = 30.0 diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g1.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g1.bin new file mode 100644 index 0000000..d3de1dc Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g1.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g10.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g10.bin new file mode 100644 index 0000000..0120312 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g10.bin differ diff --git 
a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g2.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g2.bin new file mode 100644 index 0000000..2cf5acd Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g2.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g3.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g3.bin new file mode 100644 index 0000000..a8964e2 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g3.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g4.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g4.bin new file mode 100644 index 0000000..5ac65e1 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g4.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g5.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g5.bin new file mode 100644 index 0000000..2c037d6 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g5.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g6.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g6.bin new file mode 100644 index 0000000..50adb2b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g6.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g7.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g7.bin new file mode 100644 index 0000000..3fd4d58 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g7.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g8.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g8.bin new file mode 100644 index 0000000..a7acdfd Binary files /dev/null and 
b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g8.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g9.bin b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g9.bin new file mode 100644 index 0000000..904928b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/4K/wallpaper_g9.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/airplane.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/airplane.bin new file mode 100644 index 0000000..4ae6cea Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/airplane.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/airport.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/airport.bin new file mode 100644 index 0000000..4f5f265 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/airport.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/apple.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/apple.bin new file mode 100644 index 0000000..039064f Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/apple.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/barbara.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/barbara.bin new file mode 100644 index 0000000..738746c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/barbara.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/bridge.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/bridge.bin new file mode 100644 index 0000000..aa0f49a Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/bridge.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/cameraman.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/cameraman.bin new file mode 
100644 index 0000000..87b5176 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/cameraman.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/fishingboat.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/fishingboat.bin new file mode 100644 index 0000000..3a9ef9c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/fishingboat.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/male.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/male.bin new file mode 100644 index 0000000..eef8802 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/male.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/palette.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/palette.bin new file mode 100644 index 0000000..05ac3d8 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/palette.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/gray/others/ruler.bin b/08_bilateral_filter/Jason-Young123/tester/gray/others/ruler.bin new file mode 100644 index 0000000..284b338 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/gray/others/ruler.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper1.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper1.bin new file mode 100644 index 0000000..b03bce8 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper1.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper10.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper10.bin new file mode 100644 index 0000000..0bf697a Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper10.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper11.bin 
b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper11.bin new file mode 100644 index 0000000..177d8f9 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper11.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper12.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper12.bin new file mode 100644 index 0000000..7d61326 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper12.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper13.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper13.bin new file mode 100644 index 0000000..ea24586 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper13.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper14.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper14.bin new file mode 100644 index 0000000..4bc18c8 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper14.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper15.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper15.bin new file mode 100644 index 0000000..822e47d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper15.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper2.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper2.bin new file mode 100644 index 0000000..0d383df Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper2.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper3.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper3.bin new file mode 100644 index 0000000..15f1136 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper3.bin differ diff --git 
a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper4.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper4.bin new file mode 100644 index 0000000..352568d Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper4.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper5.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper5.bin new file mode 100644 index 0000000..90c4587 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper5.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper6.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper6.bin new file mode 100644 index 0000000..c13e4b3 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper6.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper7.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper7.bin new file mode 100644 index 0000000..d62f0ee Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper7.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper8.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper8.bin new file mode 100644 index 0000000..044e35a Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper8.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper9.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper9.bin new file mode 100644 index 0000000..9987ca1 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/4K/wallpaper9.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/budgies.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/budgies.bin new file mode 100644 index 0000000..9b8f462 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/budgies.bin 
differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/cat.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/cat.bin new file mode 100644 index 0000000..4ac7fb6 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/cat.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/flowers.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/flowers.bin new file mode 100644 index 0000000..fbe57f7 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/flowers.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/house.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/house.bin new file mode 100644 index 0000000..8529212 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/house.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/hue.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/hue.bin new file mode 100644 index 0000000..8788d89 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/hue.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/jellybeans.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/jellybeans.bin new file mode 100644 index 0000000..c8a919b Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/jellybeans.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/lena.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/lena.bin new file mode 100644 index 0000000..2d2a5fd Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/lena.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/mandrill.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/mandrill.bin new file mode 100644 index 0000000..b52e98b Binary files /dev/null and 
b/08_bilateral_filter/Jason-Young123/tester/rgb/others/mandrill.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/peacock.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/peacock.bin new file mode 100644 index 0000000..aa7314e Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/peacock.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/peppers.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/peppers.bin new file mode 100644 index 0000000..60a46e3 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/peppers.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/sailboat.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/sailboat.bin new file mode 100644 index 0000000..83fc92c Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/sailboat.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/snow.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/snow.bin new file mode 100644 index 0000000..c0879df Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/snow.bin differ diff --git a/08_bilateral_filter/Jason-Young123/tester/rgb/others/splash.bin b/08_bilateral_filter/Jason-Young123/tester/rgb/others/splash.bin new file mode 100644 index 0000000..b8c5ca2 Binary files /dev/null and b/08_bilateral_filter/Jason-Young123/tester/rgb/others/splash.bin differ