with one click
with one click
矩阵乘法 A[M, K] @ B[K, N] = C[M, N] 中大 K 维度场景(K >> M, N)的优化:针对 M/N 较小但 K 极大(如 M=N=256,K=131072)的情况,通过 Split-K 沿 K 维度切分并行化,并用 Workspace + Reduce 替代全局同步,实现显著性能提升。
Triton Ascend hard API restrictions and forbidden syntax. MUST-follow rules that apply to every kernel: forbidden control flow (return/break/continue/lambda/while), tensor slice/index restrictions, scalar conversion rules, BLOCK_SIZE upper bound. Violating any of these produces a compile or runtime error on Ascend.
Triton Ascend 性能优化通用策略: BLOCK_SIZE 选择 (1024-2048 for elementwise, must be <65536), grid configuration (use VEC_CORE_NUM / CUBE_CORE_NUM, 2D/3D grid for matmul / conv / reduce, 1D grid + inner loop for elementwise / pointwise), 256B alignment for memory transfers, autotune block-size patterns, fp16 / fp32 precision conversion. Bind via keywords like matmul, elementwise, reduce, block_size, grid, autotune, alignment, fp16, fp32, tile, interleaved-loop, cube-core, vec-core.
通过 adaptive_search 或 evolve 搜索式 workflow 生成优化算子。 后台 silent mode 执行,轮询监控进度。
适用于归约(reduce)类算子和含归约子步骤的复合算子(如归一化)的优化指南。典型算子包括:sum, mean, max, min, prod, argmax, argmin, cumsum, cumprod, softmax, logsoftmax, layernorm, rmsnorm, groupnorm, instancenorm, batchnorm, l1norm, l2norm, frobeniusnorm, var, std, average_pooling, sum_pooling 等。特别重要:当归约维度不是最后一维(如 dim=1 归约 shape=[B,F,D1,D2]),需要正确处理多维索引和两阶段归约。包含 PyTorch normalized_shape 多轴归一化语义说明。不适用于纯逐元素运算或矩阵乘法。如果算子是损失函数(先逐元素计算再全局归约),应选择 elementwise-reduce-fused 指南。
CPU C++ 算子核心概念、标准结构模式、KernelBench 代码规范和内嵌扩展方法
| name | cpu-optimization-x64 |
| description | x64 CPU 架构性能优化技巧、SIMD/AVX 向量化、数值稳定性和调试策略 |
| category | method |
| version | 1.0.0 |
| metadata | {"backend":"cpu","dsl":"cpp","architecture":"x86_64","optimization_techniques":"SIMD, AVX, AVX2, AVX-512, cache optimization, loop unrolling"} |
AVX (Advanced Vector Extensions) 是 x86-64 的 SIMD 指令集扩展:
推荐方式: 让编译器自动向量化,通过编译选项启用:
# Pass vectorization-related compiler flags to load_inline.
# NOTE(review): cpp_source is not defined in this snippet — presumably the C++
# source string built elsewhere; confirm against the surrounding code.
op_module = load_inline(
name="custom_op",
cpp_sources=cpp_source,
extra_cflags=[
"-O3", # highest optimization level
"-march=native", # optimize for the current CPU architecture
"-ftree-vectorize", # enable auto-vectorization
],
verbose=True
)
简单方式(未优化):
/// Element-wise sum of two tensors using a plain scalar loop (baseline version).
///
/// Both inputs are made contiguous and are read through data_ptr<float>().
/// The result is allocated with zeros_like(a).
///
/// @param a left operand
/// @param b right operand; must have the same shape as `a`
/// @return tensor holding a[i] + b[i] for every flat index i
/// @note Fails via TORCH_CHECK when the shapes of `a` and `b` differ.
torch::Tensor elementwise_add(torch::Tensor a, torch::Tensor b) {
    // The loop indexes b with a's element count, so a smaller b would be
    // read past its end. Reject mismatched shapes up front.
    TORCH_CHECK(a.sizes() == b.sizes(),
                "elementwise_add: shape mismatch, a is ", a.sizes(),
                " but b is ", b.sizes());

    // Flat pointer indexing below is only valid on contiguous storage.
    if (!a.is_contiguous()) a = a.contiguous();
    if (!b.is_contiguous()) b = b.contiguous();

    torch::Tensor output = torch::zeros_like(a);
    const float* a_ptr = a.data_ptr<float>();
    const float* b_ptr = b.data_ptr<float>();
    float* out_ptr = output.data_ptr<float>();
    const int64_t numel = a.numel();

    // Straightforward one-element-per-iteration loop.
    for (int64_t i = 0; i < numel; ++i) {
        out_ptr[i] = a_ptr[i] + b_ptr[i];
    }
    return output;
}
优化方式(循环展开,便于向量化):
/// Element-wise sum of two tensors with the main loop manually unrolled 8x.
///
/// Both inputs are made contiguous and are read through data_ptr<float>().
/// The result is allocated with zeros_like(a).
///
/// @param a left operand
/// @param b right operand; must have the same shape as `a`
/// @return tensor holding a[i] + b[i] for every flat index i
/// @note Fails via TORCH_CHECK when the shapes of `a` and `b` differ.
torch::Tensor elementwise_add_optimized(torch::Tensor a, torch::Tensor b) {
    // The loops index b with a's element count, so a smaller b would be
    // read past its end. Reject mismatched shapes up front.
    TORCH_CHECK(a.sizes() == b.sizes(),
                "elementwise_add_optimized: shape mismatch, a is ", a.sizes(),
                " but b is ", b.sizes());

    // Flat pointer indexing below is only valid on contiguous storage.
    if (!a.is_contiguous()) a = a.contiguous();
    if (!b.is_contiguous()) b = b.contiguous();

    torch::Tensor output = torch::zeros_like(a);
    const float* a_ptr = a.data_ptr<float>();
    const float* b_ptr = b.data_ptr<float>();
    float* out_ptr = output.data_ptr<float>();
    const int64_t numel = a.numel();

    // Main loop: 8 independent additions per iteration.
    constexpr int64_t step = 8;
    int64_t i = 0;
    for (; i + step <= numel; i += step) {
        out_ptr[i] = a_ptr[i] + b_ptr[i];
        out_ptr[i + 1] = a_ptr[i + 1] + b_ptr[i + 1];
        out_ptr[i + 2] = a_ptr[i + 2] + b_ptr[i + 2];
        out_ptr[i + 3] = a_ptr[i + 3] + b_ptr[i + 3];
        out_ptr[i + 4] = a_ptr[i + 4] + b_ptr[i + 4];
        out_ptr[i + 5] = a_ptr[i + 5] + b_ptr[i + 5];
        out_ptr[i + 6] = a_ptr[i + 6] + b_ptr[i + 6];
        out_ptr[i + 7] = a_ptr[i + 7] + b_ptr[i + 7];
    }

    // Tail: the fewer-than-8 elements left after the unrolled loop.
    for (; i < numel; ++i) {
        out_ptr[i] = a_ptr[i] + b_ptr[i];
    }
    return output;
}
优化效果: 循环展开后,编译器更容易识别并生成 AVX 向量化指令,性能提升 4-8 倍。
简单方式:
/// Adds up `size` floats starting at `data`, front to back, in a single
/// float accumulator.
///
/// @param data pointer to the first element
/// @param size number of elements to add; nothing is read when size <= 0
/// @return the accumulated total (0.0f when no element is read)
float sum_simple(const float* data, int64_t size) {
    float total = 0.0f;
    int64_t idx = 0;
    while (idx < size) {
        total += data[idx];
        ++idx;
    }
    return total;
}
优化方式(分块累加):
/// Sums `size` floats using eight independent partial sums.
///
/// Element i of each full group of eight is added to partial sum (i % 8).
/// The partial sums are then combined left to right, and any leftover
/// elements (fewer than eight) are added to that combined value.
///
/// @param data pointer to the first element
/// @param size number of elements to add; nothing is read when size <= 0
/// @return the accumulated total
float sum_optimized(const float* data, int64_t size) {
    // One partial sum per lane, so consecutive additions do not depend on
    // each other.
    float lanes[8] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

    int64_t pos = 0;
    while (pos + 8 <= size) {
        for (int lane = 0; lane < 8; ++lane) {
            lanes[lane] += data[pos + lane];
        }
        pos += 8;
    }

    // Combine the lanes in order: ((lane0 + lane1) + lane2) + ...
    float total = lanes[0];
    for (int lane = 1; lane < 8; ++lane) {
        total += lanes[lane];
    }

    // Leftover elements that did not fill a whole group of eight.
    while (pos < size) {
        total += data[pos];
        ++pos;
    }
    return total;
}
关键优化: 使用多个累加器避免循环携带依赖,允许指令级并行和向量化。
原则: 按行优先访问,提高空间局部性
/// Tiled transpose of a 2-D tensor.
///
/// The input is made contiguous, read through data_ptr<float>() as a
/// row-major M x N buffer, and written to a newly allocated N x M tensor
/// created with the input's options. The copy proceeds in square tiles of
/// BLOCK_SIZE x BLOCK_SIZE elements, clipped at the matrix edges.
///
/// @param input tensor to transpose; must have exactly two dimensions
/// @return tensor of shape {N, M} with out[j][i] = in[i][j]
/// @note Fails via TORCH_CHECK when `input` is not 2-D.
torch::Tensor transpose_optimized(torch::Tensor input) {
    // sizes[0] and sizes[1] are read below; any other rank would index the
    // shape out of range or silently ignore trailing dimensions.
    TORCH_CHECK(input.dim() == 2,
                "transpose_optimized: expected a 2-D tensor, got ",
                input.dim(), " dimension(s)");

    // Flat row-major indexing below is only valid on contiguous storage.
    if (!input.is_contiguous()) input = input.contiguous();

    auto sizes = input.sizes();
    const int64_t M = sizes[0];
    const int64_t N = sizes[1];

    torch::Tensor output = torch::zeros({N, M}, input.options());
    const float* in_ptr = input.data_ptr<float>();
    float* out_ptr = output.data_ptr<float>();

    // Tile edge length, in elements.
    const int64_t BLOCK_SIZE = 64;
    for (int64_t i = 0; i < M; i += BLOCK_SIZE) {
        for (int64_t j = 0; j < N; j += BLOCK_SIZE) {
            // Clip the tile so edge tiles do not run past the matrix.
            const int64_t i_max = std::min(i + BLOCK_SIZE, M);
            const int64_t j_max = std::min(j + BLOCK_SIZE, N);
            for (int64_t ii = i; ii < i_max; ++ii) {
                for (int64_t jj = j; jj < j_max; ++jj) {
                    out_ptr[jj * M + ii] = in_ptr[ii * N + jj];
                }
            }
        }
    }
    return output;
}
/// Softmax computed over ALL elements of `x` (the tensor is treated as one
/// flat array), with the maximum subtracted before exponentiation.
///
/// The input is made contiguous and read through data_ptr<float>(). The
/// result is allocated with zeros_like(x).
///
/// @param x input tensor
/// @return tensor with exp(x[i] - max) / sum_j exp(x[j] - max) at every flat
///         index; an empty input yields the (empty) zeros_like result
torch::Tensor softmax_stable(torch::Tensor x) {
    // Flat pointer indexing below is only valid on contiguous storage.
    if (!x.is_contiguous()) x = x.contiguous();

    torch::Tensor output = torch::zeros_like(x);
    const float* x_ptr = x.data_ptr<float>();
    float* out_ptr = output.data_ptr<float>();
    const int64_t numel = x.numel();

    // An empty tensor has no x_ptr[0] to seed the maximum with; return the
    // empty result instead of reading out of bounds.
    if (numel == 0) return output;

    // Pass 1: find the maximum so that every exponent below is <= 0.
    float max_val = x_ptr[0];
    for (int64_t i = 1; i < numel; ++i) {
        max_val = std::max(max_val, x_ptr[i]);
    }

    // Pass 2: shifted exponentials and their running sum.
    float sum = 0.0f;
    for (int64_t i = 0; i < numel; ++i) {
        const float exp_val = std::exp(x_ptr[i] - max_val);
        out_ptr[i] = exp_val;
        sum += exp_val;
    }

    // Pass 3: normalize by the sum.
    for (int64_t i = 0; i < numel; ++i) {
        out_ptr[i] /= sum;
    }
    return output;
}
/// Compensated (Kahan) summation of `size` floats.
///
/// Alongside the running total, a correction term tracks the low-order part
/// lost by each addition and feeds it back into the next element.
///
/// @param data pointer to the first element
/// @param size number of elements to add; nothing is read when size <= 0
/// @return the compensated total
float kahan_sum(const float* data, int64_t size) {
    float total = 0.0f;
    float compensation = 0.0f;  // rounding error carried over from the previous step

    int64_t idx = 0;
    while (idx < size) {
        // Apply the previously lost low-order part to the incoming value.
        const float corrected = data[idx] - compensation;
        const float next_total = total + corrected;
        // (next_total - total) is what was actually added; the difference
        // from `corrected` is the error of this step.
        compensation = (next_total - total) - corrected;
        total = next_total;
        ++idx;
    }
    return total;
}
使用场景: 处理大量浮点数累加时,减少精度损失。
/// ReLU (max(0, x)) over every element of `x`.
///
/// float32 and float64 inputs are processed in their own type. Any other
/// dtype is converted to float32 for the computation and the result is
/// converted back to the original dtype before returning.
///
/// @param x input tensor; made contiguous if it is not already
/// @return tensor with max(0, x[i]) at every flat index, in x's dtype
torch::Tensor relu_optimized(torch::Tensor x) {
    // Flat pointer indexing below is only valid on contiguous storage.
    if (!x.is_contiguous()) {
        x = x.contiguous();
    }

    // Decide whether the data has to take a detour through float32.
    const torch::ScalarType original_dtype = x.scalar_type();
    const bool needs_cast =
        !(original_dtype == torch::kFloat32 || original_dtype == torch::kFloat64);
    torch::Tensor work = needs_cast ? x.to(torch::kFloat32) : x;

    torch::Tensor result = torch::zeros_like(work);
    const int64_t count = work.numel();

    if (work.scalar_type() == torch::kFloat32) {
        const float* src = work.data_ptr<float>();
        float* dst = result.data_ptr<float>();

        // Main part: groups of 8 floats per outer iteration.
        int64_t pos = 0;
        while (pos + 8 <= count) {
            for (int64_t lane = 0; lane < 8; ++lane) {
                dst[pos + lane] = std::max(0.0f, src[pos + lane]);
            }
            pos += 8;
        }
        // Leftover elements that did not fill a whole group.
        while (pos < count) {
            dst[pos] = std::max(0.0f, src[pos]);
            ++pos;
        }
    } else if (work.scalar_type() == torch::kFloat64) {
        const double* src = work.data_ptr<double>();
        double* dst = result.data_ptr<double>();

        // Main part: groups of 4 doubles per outer iteration.
        int64_t pos = 0;
        while (pos + 4 <= count) {
            for (int64_t lane = 0; lane < 4; ++lane) {
                dst[pos + lane] = std::max(0.0, src[pos + lane]);
            }
            pos += 4;
        }
        // Leftover elements that did not fill a whole group.
        while (pos < count) {
            dst[pos] = std::max(0.0, src[pos]);
            ++pos;
        }
    }

    // Restore the caller's dtype when the computation ran in float32.
    if (needs_cast) {
        result = result.to(original_dtype);
    }
    return result;
}
编译选项检查:是否启用 -O3 优化?是否使用 -march=native?
extra_cflags = [
"-O3", # 最高优化级别
"-march=native", # 针对当前 CPU
"-ftree-vectorize", # 自动向量化
"-ffast-math", # 快速数学(牺牲部分精度)
"-funroll-loops", # 循环展开
]
注意: -ffast-math 可能影响数值精度,谨慎使用。
| 误区 | 说明 | 建议 |
|---|---|---|
| 过度手动向量化 | 手写 AVX intrinsics 代码复杂且易错 | 优先让编译器自动向量化 |
| 循环展开太多 | 过度展开增加代码体积,降低 I-Cache 命中率 | Float32 展开 8 倍,Float64 展开 4 倍 |
| 忽略数据对齐 | 未对齐访问降低性能 | 使用 torch::zeros_like 等自动对齐 |
| 不合理的精度提升 | 内部计算无需强制使用 double | Float32 已足够,避免不必要转换 |
-O3 -march=native -ftree-vectorize