1. 简介
在智能电表、工业振动监测等计量应用中,FIR 滤波与 FFT 是最核心的底层算法。STM32U3C5 内置的 HSP硬件信号处理单元专为此类场景设计。本阶段评测旨在通过性能测试,验证 HSP 在 FIR 滤波任务中的真实加速效果,并与 Cortex-M33 CPU 上运行的标准 CMSIS-DSP 库进行对比。
2. 测试指标与环境
2.1 测试指标
HSP Kernel 耗时:HSP Accelerator 模式下,纯硬件 FIR 引擎的计算耗时。
HSP 端到端耗时 (End-to-End, E2E):模拟真实应用场景,包含数据拷贝 + 计算 + 状态重置 的总耗时。
CPU 对照组耗时:纯 CPU 运行 arm_fir_f32() 的 Kernel 与 E2E 耗时。
加速比:HSP 相较于 CPU 的性能提升倍数。
计算精度:对比 HSP 硬件输出与 CPU 浮点输出的 RMS error(均方根误差)与 Max abs error(最大绝对误差)。
2.1 测试环境
| 项目 |
配置 |
| 开发板 |
STM32U3C5 / NUCLEO-U3C5ZI-Q |
| 系统频率 |
96 MHz |
| 编译与构建 |
CMake + GCC / Release (-O2) |
| 日志与调试 |
SEGGER RTT |
| 计时方式 |
DWT CYCCNT |
| 测试策略 |
Repeat: 20次 / Warmup: 1次 |
| HSP 工作模式 |
Accelerator mode |
| CPU 对照 |
CMSIS-DSParm_fir_f32() |
DWT 是 Arm Cortex-M 内核里的 Data Watchpoint and Trace 单元,其中 CYCCNT 是一个硬件 cycle counter,用来统计 CPU clock cycles
3. 实现
3.1 CubeMX设置
使用CudeMX快速的搭建测试环境
- 首先配置Clock

- 启用HSP

- 配置HSP为
Accelerator模式

- 指定生成CMake工程

3.2 核心代码
DWT Benchmark
#include "bench_timer.h"
void BenchTimer_Init(void)
{
CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
#if defined(DWT_LAR)
DWT->LAR = 0xC5ACCE55;
#endif
DWT->CYCCNT = 0;
DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;
__DSB();
__ISB();
}
uint32_t BenchTimer_Now(void)
{
return DWT->CYCCNT;
}
uint32_t BenchTimer_Elapsed(uint32_t start, uint32_t end)
{
return end - start;
}
uint32_t BenchTimer_CyclesToUs(uint32_t cycles)
{
return (uint32_t)(((uint64_t)cycles * 1000000ULL) / SystemCoreClock);
}
uint32_t APP_HSP_Engine_BenchFIRCase(fir_bench_result_t *out, uint32_t case_index)
{
hsp_bram_resources_t *p_bram_rsrc = MX_HSP_BRAM_GetResources();
uint32_t t0, t1, t2, t3, t4;
fir_mode_sum_t hsp_sum = {0};
fir_mode_sum_t sw_sum = {0};
fir_mode_sum_t cmsis_sum = {0};
float32_t rms = 0.0f;
float32_t max_abs = 0.0f;
const uint32_t total_iterations = FIR_BENCH_WARMUP_COUNT + FIR_BENCH_REPEAT_COUNT;
const float32_t *input = NULL;
const float32_t *coef = NULL;
const float32_t *reference = NULL;
const fir_case_config_t *cfg = NULL;
arm_fir_instance_f32 cmsis_fir;
hsp_filter_state_identifier_t fir_state_id = 0U;
if ((out == NULL) || (case_index >= FIR_BENCH_MATRIX_CASES))
{
return 1U;
}
cfg = &g_fir_cases[case_index];
fir_state_id = APP_SelectFirState(p_bram_rsrc, cfg->taps);
memset(out, 0, sizeof(*out));
out->case_id = cfg->case_id;
out->n = cfg->n;
out->taps = cfg->taps;
out->repeats = FIR_BENCH_REPEAT_COUNT;
out->warmups = FIR_BENCH_WARMUP_COUNT;
if ((fir_state_id == 0U) || (APP_GetFirCaseData(cfg, &input, &coef, &reference) != 0U))
{
out->status = 1U;
return out->status;
}
APP_ModeStatsInit(&out->hsp_acc);
APP_ModeStatsInit(&out->sw_scalar);
APP_ModeStatsInit(&out->cpu_cmsis);
APP_PrepareCmsisFirCoefficients(cmsis_coef_rev, coef, cfg->taps);
cmsis_fir.numTaps = (uint16_t)cfg->taps;
cmsis_fir.pState = cmsis_fir_state;
cmsis_fir.pCoeffs = cmsis_coef_rev;
for (uint32_t iter = 0; iter < total_iterations; iter++)
{
const uint32_t collect = (iter >= FIR_BENCH_WARMUP_COUNT) ? 1U : 0U;
/* HSP path: SRAM -> BRAM copy + HSP command + HSP state reset + validation. */
t0 = BenchTimer_Now();
memcpy(p_bram_rsrc->p_buff_in, input, cfg->n * sizeof(float32_t));
memcpy(p_bram_rsrc->p_coef, coef, cfg->taps * sizeof(float32_t));
t1 = BenchTimer_Now();
HSP_ACC_Fir_f32(&hmw,
p_bram_rsrc->p_buff_in,
p_bram_rsrc->p_coef,
fir_state_id,
p_bram_rsrc->p_buff_out,
cfg->n);
t2 = BenchTimer_Now();
HSP_ACC_FirResetStateBuffer(&hmw, fir_state_id);
t3 = BenchTimer_Now();
APP_ValidateFirOutput(p_bram_rsrc->p_buff_out,
reference,
cfg->n,
fir_results_error,
&rms,
&max_abs);
t4 = BenchTimer_Now();
if (collect != 0U)
{
APP_ModeStatsUpdate(&out->hsp_acc, &hsp_sum,
BenchTimer_Elapsed(t0, t1),
BenchTimer_Elapsed(t1, t2),
BenchTimer_Elapsed(t2, t3),
BenchTimer_Elapsed(t3, t4),
BenchTimer_Elapsed(t0, t3),
BenchTimer_Elapsed(t0, t4));
out->hsp_acc.rms_x1e9 = APP_FloatAbsScaled1e9(rms);
out->hsp_acc.max_abs_x1e9 = APP_FloatAbsScaled1e9(max_abs);
if (rms > RMS_REF)
{
out->hsp_acc.status = 1U;
}
}
/* Pure software scalar path in CPU SRAM. */
t0 = BenchTimer_Now();
APP_SW_Fir_f32(input, coef, sw_fir_output, cfg->n, cfg->taps);
t1 = BenchTimer_Now();
APP_ValidateFirOutput(sw_fir_output,
reference,
cfg->n,
fir_results_error,
&rms,
&max_abs);
t2 = BenchTimer_Now();
if (collect != 0U)
{
APP_ModeStatsUpdate(&out->sw_scalar, &sw_sum,
0U,
BenchTimer_Elapsed(t0, t1),
0U,
BenchTimer_Elapsed(t1, t2),
BenchTimer_Elapsed(t0, t1),
BenchTimer_Elapsed(t0, t2));
out->sw_scalar.rms_x1e9 = APP_FloatAbsScaled1e9(rms);
out->sw_scalar.max_abs_x1e9 = APP_FloatAbsScaled1e9(max_abs);
if (rms > RMS_REF)
{
out->sw_scalar.status = 1U;
}
}
/* CPU baseline using CMSIS-DSP arm_fir_f32(). */
t0 = BenchTimer_Now();
memset(cmsis_fir_state, 0, ((cfg->n + cfg->taps + 4U) * sizeof(float32_t)));
t1 = BenchTimer_Now();
arm_fir_f32(&cmsis_fir, input, cmsis_fir_output, cfg->n);
t2 = BenchTimer_Now();
APP_ValidateFirOutput(cmsis_fir_output,
reference,
cfg->n,
fir_results_error,
&rms,
&max_abs);
t3 = BenchTimer_Now();
if (collect != 0U)
{
APP_ModeStatsUpdate(&out->cpu_cmsis, &cmsis_sum,
0U,
BenchTimer_Elapsed(t1, t2),
BenchTimer_Elapsed(t0, t1),
BenchTimer_Elapsed(t2, t3),
BenchTimer_Elapsed(t0, t2),
BenchTimer_Elapsed(t0, t3));
out->cpu_cmsis.rms_x1e9 = APP_FloatAbsScaled1e9(rms);
out->cpu_cmsis.max_abs_x1e9 = APP_FloatAbsScaled1e9(max_abs);
if (rms > RMS_REF)
{
out->cpu_cmsis.status = 1U;
}
}
}
APP_ModeStatsFinish(&out->hsp_acc, &hsp_sum, FIR_BENCH_REPEAT_COUNT);
APP_ModeStatsFinish(&out->sw_scalar, &sw_sum, FIR_BENCH_REPEAT_COUNT);
APP_ModeStatsFinish(&out->cpu_cmsis, &cmsis_sum, FIR_BENCH_REPEAT_COUNT);
out->status = out->hsp_acc.status | out->sw_scalar.status | out->cpu_cmsis.status;
return out->status;
}
pyocd安装cmsis-pack
PS: > pyocd pack -i stm32u3c5
4. 测试结果
pyocd烧录
PS: > pyocd flash build/Release/fir_acc_bench.elf --target stm32u3c5zitxq
0001748 I Loading D:\Projects\STM32_U3C5\workspace\stm32_eval\fir_acc\build\Release\fir_acc_bench.elf [load_cmd]
0001750 I Erasing... [loader]
[==================================================] 100%
0002220 I Programming... [loader]
[==================================================] 100%
RTT结果
PS: > pyocd rtt --target stm32u3c5zitxq
0001023 I Target type is stm32u3c5zitxq [board]
0001516 I DP IDR = 0x0be12477 (v2 MINDP rev0) [dap]
0001605 I AHB5-AP#0 IDR = 0x14770015 (AHB5-AP var1 rev1) [discovery]
0001607 I AHB5-AP#0 Class 0x1 ROM table #0 @ 0xe00fe000 (designer=020:ST part=42a) [rom_table]
0001609 I [0]<e00ff000:ROM class=1 designer=43b:Arm part=4c9> [rom_table]
0001609 I AHB5-AP#0 Class 0x1 ROM table #1 @ 0xe00ff000 (designer=43b:Arm part=4c9) [rom_table]
0001612 I [0]<e000e000:SCS M33 class=9 designer=43b:Arm part=d21 devtype=00 archid=2a04 devid=0:0:0> [rom_table]
0001613 I [1]<e0001000:DWT M33 class=9 designer=43b:Arm part=d21 devtype=00 archid=1a02 devid=0:0:0> [rom_table]
0001615 I [2]<e0002000:BPU M33 class=9 designer=43b:Arm part=d21 devtype=00 archid=1a03 devid=0:0:0> [rom_table]
0001616 I [3]<e0000000:ITM M33 class=9 designer=43b:Arm part=d21 devtype=43 archid=1a01 devid=0:0:0> [rom_table]
0001618 I [5]<e0041000:ETM M33 class=9 designer=43b:Arm part=d21 devtype=13 archid=4a13 devid=0:0:0> [rom_table]
0001620 I [6]<e0042000:CTI M33 class=9 designer=43b:Arm part=d21 devtype=14 archid=1a14 devid=40800:0:0> [rom_table]
0001622 I [1]<e0040000:TPIU M33 class=9 designer=43b:Arm part=d21 devtype=11 archid=0000 devid=ca1:0:0> [rom_table]
0001623 I [2]<e0044000:DBGMCU class=15 designer=020:ST part=000> [rom_table]
0001629 I CPU core #0: Cortex-M33 r0p4, v8.0-M architecture [cortex_m]
0001629 I Extensions: [DSP, FPU, FPU_V5, MPU] [cortex_m]
0001630 I FPU present: FPv5-SP-D16-M [cortex_m]
0001631 I 4 hardware watchpoints [dwt]
0001634 I 8 hardware breakpoints, 1 literal comparators [fpb]
0001877 I 3 up channels and 3 down channels found [rtt_cmd]
0001877 I Reading from up channel 0 ("Terminal") [rtt_cmd]
0001878 I Writing to down channel 0 ("Terminal") [rtt_cmd]
# fir_matrix_bench v3.1 RTT/CMake/GCC start, SystemCoreClock=96000000 Hz, repeat=20, warmup=1, cases=9/9, hsp=1, sw_scalar=0, cpu_cmsis=1, validate_each_repeat=0
test,case,mode,n,taps,repeat,warmup,copy_min,copy_avg,copy_max,kernel_min,kernel_avg,kernel_max,reset_min,reset_avg,reset_max,validate_min,validate_avg,validate_max,e2e_no_validate_min,e2e_no_validate_avg,e2e_no_validate_max,bench_total_min,bench_total_avg,bench_total_max,kernel_us_avg,e2e_no_validate_us_avg,bench_total_us_avg,rms_x1e9,max_abs_x1e9,status
fir,0,hsp_acc,250,51,20,1,8043,8049,8078,7724,7724,7725,87,87,88,5116,5116,5116,15854,15860,15889,20970,20976,21005,80,165,218,49,149,0
fir,0,cpu_cmsis,250,51,20,1,0,0,0,45548,45561,45583,8578,8583,8613,4772,4772,4772,54126,54144,54187,58898,58916,58959,474,564,613,28,76,0
fir,1,hsp_acc,128,16,20,1,2925,2926,2939,1130,1130,1131,69,69,70,2528,2528,2528,4124,4125,4140,6652,6653,6668,11,42,69,12,45,0
fir,1,cpu_cmsis,128,16,20,1,0,0,0,7791,7791,7791,4162,4166,4198,2318,2318,2318,11953,11957,11989,14271,14275,14307,81,124,148,12,45,0
fir,2,hsp_acc,128,32,20,1,3245,3249,3302,1640,1640,1643,75,75,76,2528,2528,2528,4960,4964,5017,7488,7492,7545,17,51,78,10,37,0
fir,2,cpu_cmsis,128,32,20,1,0,0,0,14567,14573,14597,4610,4611,4616,2318,2318,2318,19177,19184,19207,21495,21502,21525,151,199,223,9,30,0
4.1 耗时与加速比
| Case |
数据量 (N) |
抽头数 (taps) |
HSP Kernel |
CPU Kernel |
Kernel 加速比 |
HSP E2E |
CPU E2E |
E2E 加速比 |
| 0 |
250 |
51 |
80.46 µs |
474.59 µs |
5.90x |
165.21 µs |
564.00 µs |
3.41x |
| 1 |
128 |
16 |
11.77 µs |
81.16 µs |
6.89x |
42.97 µs |
124.55 µs |
2.90x |
| 2 |
128 |
32 |
17.08 µs |
151.80 µs |
8.89x |
51.71 µs |
199.83 µs |
3.86x |
(附:原始 Cycle 计数)
- Case 0: HSP 7,724 cycles vs CPU 45,561 cycles
- Case 1: HSP 1,130 cycles vs CPU 7,791 cycles
- Case 2: HSP 1,640 cycles vs CPU 14,573 cycles
4.2 计算精度
| Case |
测试模式 |
RMS error |
Max abs error |
| 0 (N=250) |
HSP Accelerator |
4.9e-8 |
1.49e-7 |
| 1 (N=128) |
HSP Accelerator |
1.2e-8 |
4.5e-8 |
| 2 (N=128) |
HSP Accelerator |
1.0e-8 |
3.7e-8 |
5. 结果分析
- 从加速结果来看,
在N=128时,随着滤波器抽头数(taps)从 16 增加到 32,HSP 的 Kernel 加速比从 6.89 倍升到了 8.89 倍,运算负载越重、MAC操作越密集,HSP 专属的硬件并行架构优势就越明显。
- 尽管Kernel很快,但是数据搬运成本较高,以 Case 1 (N=128/Taps=16) 为例,HSP 纯算仅花了 1,130 个周期,但前置的数据搬运却消耗了近 3,000 个周期。