一键导入
一键导入
矩阵乘法矩阵乘法 A[M, K] @ B[K, N] = C[M, N]中,大K维度矩阵乘法(K>>M,N)优化:针对M/N较小但K极大(如M=N=256,K=131072)的场景,Split-K切分K维度并行化、Workspace+Reduce替代全局同步,实现显著性能提升
Triton Ascend hard API restrictions and forbidden syntax. MUST-follow rules that apply to every kernel: forbidden control flow (return/break/continue/lambda/while), tensor slice/index restrictions, scalar conversion rules, BLOCK_SIZE upper bound. Violating any of these produces a compile or runtime error on Ascend.
Triton Ascend 性能优化通用策略: BLOCK_SIZE 选择 (1024-2048 for elementwise, must be <65536), grid configuration (use VEC_CORE_NUM / CUBE_CORE_NUM, 2D/3D grid for matmul / conv / reduce, 1D grid + inner loop for elementwise / pointwise), 256B alignment for memory transfers, autotune block-size patterns, fp16 / fp32 precision conversion. Bind via keywords like matmul, elementwise, reduce, block_size, grid, autotune, alignment, fp16, fp32, tile, interleaved-loop, cube-core, vec-core.
通过 adaptive_search 或 evolve 搜索式 workflow 生成优化算子。 后台 silent mode 执行,轮询监控进度。
适用于归约(reduce)类算子和含归约子步骤的复合算子(如归一化)的优化指南。典型算子包括:sum, mean, max, min, prod, argmax, argmin, cumsum, cumprod, softmax, logsoftmax, layernorm, rmsnorm, groupnorm, instancenorm, batchnorm, l1norm, l2norm, frobeniusnorm, var, std, average_pooling, sum_pooling 等。特别重要:当归约维度不是最后一维(如 dim=1 归约 shape=[B,F,D1,D2]),需要正确处理多维索引和两阶段归约。包含 PyTorch normalized_shape 多轴归一化语义说明。不适用于纯逐元素运算或矩阵乘法。如果算子是损失函数(先逐元素计算再全局归约),应选择 elementwise-reduce-fused 指南。
ARM CPU 架构性能优化技巧、NEON SIMD 向量化、数值稳定性和调试策略
| name | cpu-basics |
| description | CPU C++ 算子核心概念、标准结构模式、KernelBench 代码规范和内嵌扩展方法 |
| category | fundamental |
| version | 1.0.0 |
| metadata | {"backend":"cpu","dsl":"cpp","operator_patterns":"all","architecture":"x86_64, aarch64"} |
PYBIND11_MODULE 注册的 C++ 函数,编译后在 CPU 上执行load_inline 动态编译加载所有 CPU C++ 内核都遵循相同的五步结构模式:
torch::Tensor standard_kernel(torch::Tensor x) {
// 1. 确保输入张量是连续的
if (!x.is_contiguous()) {
x = x.contiguous();
}
// 2. 检查数据类型,支持多种类型
torch::ScalarType dtype = x.scalar_type();
bool need_convert = (dtype != torch::kFloat32 && dtype != torch::kFloat64 &&
dtype != torch::kInt32 && dtype != torch::kInt64);
torch::Tensor input = need_convert ? x.to(torch::kFloat32) : x;
// 3. 创建输出张量
torch::Tensor output = torch::zeros_like(input);
// 4. 根据数据类型分发计算
if (input.scalar_type() == torch::kFloat32) {
auto x_ptr = input.data_ptr<float>();
auto out_ptr = output.data_ptr<float>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0.0f, x_ptr[i]); // ReLU: max(0, x)
}
} else if (input.scalar_type() == torch::kFloat64) {
auto x_ptr = input.data_ptr<double>();
auto out_ptr = output.data_ptr<double>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0.0, x_ptr[i]);
}
} else if (input.scalar_type() == torch::kInt32) {
auto x_ptr = input.data_ptr<int32_t>();
auto out_ptr = output.data_ptr<int32_t>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0, x_ptr[i]);
}
} else if (input.scalar_type() == torch::kInt64) {
auto x_ptr = input.data_ptr<int64_t>();
auto out_ptr = output.data_ptr<int64_t>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0L, x_ptr[i]);
}
}
// 5. 转换回原类型
if (need_convert) {
output = output.to(dtype);
}
return output;
}
重要: 生成的代码必须遵循 KernelBench 格式规范,使用 Python 模块内嵌 C++ 代码 的方式。
参考示例位置: akg_agents/python/akg_agents/op/resources/docs/cpu_docs/examples/torch_xxx_kernel.py
import torch
from torch.utils.cpp_extension import load_inline
# 内联C++扩展代码
cpp_source = """
#include <torch/extension.h>
torch::Tensor op_name_kernel(torch::Tensor x) {
// 1. 确保输入张量是连续的
if (!x.is_contiguous()) {
x = x.contiguous();
}
// 2. 检查数据类型,支持多种类型
torch::ScalarType dtype = x.scalar_type();
bool need_convert = (dtype != torch::kFloat32 && dtype != torch::kFloat64 &&
dtype != torch::kInt32 && dtype != torch::kInt64);
torch::Tensor input = need_convert ? x.to(torch::kFloat32) : x;
// 3. 创建输出张量
torch::Tensor output = torch::zeros_like(input);
// 4. 根据数据类型分发计算
if (input.scalar_type() == torch::kFloat32) {
auto x_ptr = input.data_ptr<float>();
auto out_ptr = output.data_ptr<float>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
// 具体的算子计算逻辑
out_ptr[i] = compute_logic(x_ptr[i]);
}
} else if (input.scalar_type() == torch::kFloat64) {
// 同样的逻辑,但使用 double 类型
auto x_ptr = input.data_ptr<double>();
auto out_ptr = output.data_ptr<double>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = compute_logic(x_ptr[i]);
}
} else if (input.scalar_type() == torch::kInt32) {
auto x_ptr = input.data_ptr<int32_t>();
auto out_ptr = output.data_ptr<int32_t>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = compute_logic(x_ptr[i]);
}
} else if (input.scalar_type() == torch::kInt64) {
auto x_ptr = input.data_ptr<int64_t>();
auto out_ptr = output.data_ptr<int64_t>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = compute_logic(x_ptr[i]);
}
}
// 5. 转换回原类型
if (need_convert) {
output = output.to(dtype);
}
return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("op_name_kernel", &op_name_kernel, "CPU op_name operator");
}
"""
# 动态加载C++扩展
op_name_module = load_inline(
name="custom_op_name",
cpp_sources=cpp_source,
extra_cflags=["-O3"],
verbose=True
)
# Python接口函数
def op_name(x: torch.Tensor) -> torch.Tensor:
if x.device.type != "cpu":
x = x.cpu()
return op_name_module.op_name_kernel(x)
load_inline 动态编译并加载扩展PYBIND11_MODULE 宏注册算子适用于激活函数、逐元素运算等简单操作。
// ReLU: max(0, x)
torch::Tensor relu_kernel(torch::Tensor x) {
if (!x.is_contiguous()) x = x.contiguous();
torch::ScalarType dtype = x.scalar_type();
bool need_convert = (dtype != torch::kFloat32 && dtype != torch::kFloat64);
torch::Tensor input = need_convert ? x.to(torch::kFloat32) : x;
torch::Tensor output = torch::zeros_like(input);
if (input.scalar_type() == torch::kFloat32) {
auto x_ptr = input.data_ptr<float>();
auto out_ptr = output.data_ptr<float>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0.0f, x_ptr[i]);
}
} else if (input.scalar_type() == torch::kFloat64) {
auto x_ptr = input.data_ptr<double>();
auto out_ptr = output.data_ptr<double>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0.0, x_ptr[i]);
}
}
if (need_convert) output = output.to(dtype);
return output;
}
适用于求和、最大值、最小值等聚合操作。
// Sum reduction: 沿指定维度求和
torch::Tensor sum_reduction_kernel(torch::Tensor x) {
if (!x.is_contiguous()) x = x.contiguous();
torch::ScalarType dtype = x.scalar_type();
bool need_convert = (dtype != torch::kFloat32 && dtype != torch::kFloat64);
torch::Tensor input = need_convert ? x.to(torch::kFloat32) : x;
int64_t numel = input.numel();
torch::Tensor output;
if (input.scalar_type() == torch::kFloat32) {
auto x_ptr = input.data_ptr<float>();
float result = 0.0f;
for (int64_t i = 0; i < numel; ++i) {
result += x_ptr[i]; // 求和归约
}
output = torch::tensor({result}, torch::kFloat32);
} else if (input.scalar_type() == torch::kFloat64) {
auto x_ptr = input.data_ptr<double>();
double result = 0.0;
for (int64_t i = 0; i < numel; ++i) {
result += x_ptr[i];
}
output = torch::tensor({result}, torch::kFloat64);
}
if (need_convert) output = output.to(dtype);
return output;
}
确保所有操作都有适当的边界检查和错误处理。
torch::Tensor safe_operation_kernel(torch::Tensor x) {
// 1. 检查张量有效性
TORCH_CHECK(x.numel() > 0, "Input tensor cannot be empty");
TORCH_CHECK(x.dim() > 0, "Input tensor must have at least one dimension");
// 2. 确保张量连续性
if (!x.is_contiguous()) {
x = x.contiguous();
}
// 3. 类型检查和转换
torch::ScalarType dtype = x.scalar_type();
bool need_convert = (dtype != torch::kFloat32 && dtype != torch::kFloat64);
torch::Tensor input = need_convert ? x.to(torch::kFloat32) : x;
torch::Tensor output = torch::zeros_like(input);
// 4. 安全的数据处理
if (input.scalar_type() == torch::kFloat32) {
auto x_ptr = input.data_ptr<float>();
auto out_ptr = output.data_ptr<float>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0.0f, x_ptr[i]);
}
} else if (input.scalar_type() == torch::kFloat64) {
auto x_ptr = input.data_ptr<double>();
auto out_ptr = output.data_ptr<double>();
int64_t numel = input.numel();
for (int64_t i = 0; i < numel; ++i) {
out_ptr[i] = std::max(0.0, x_ptr[i]);
}
}
if (need_convert) output = output.to(dtype);
return output;
}
// 类型检查
torch::ScalarType dtype = x.scalar_type();
bool is_float32 = (dtype == torch::kFloat32);
bool is_float64 = (dtype == torch::kFloat64);
bool is_int32 = (dtype == torch::kInt32);
bool is_int64 = (dtype == torch::kInt64);
// 类型转换
torch::Tensor input = x.to(torch::kFloat32); // 转换为 float32
torch::Tensor output = result.to(dtype); // 转换回原类型
if (!x.is_contiguous()) {
x = x.contiguous();
}
// float32 指针
auto x_ptr = input.data_ptr<float>();
auto out_ptr = output.data_ptr<float>();
// float64 指针
auto x_ptr = input.data_ptr<double>();
auto out_ptr = output.data_ptr<double>();
// int32 指针
auto x_ptr = input.data_ptr<int32_t>();
auto out_ptr = output.data_ptr<int32_t>();
// int64 指针
auto x_ptr = input.data_ptr<int64_t>();
auto out_ptr = output.data_ptr<int64_t>();
// 创建输出张量
torch::Tensor output = torch::zeros_like(input); // 同形状零张量
torch::Tensor output = torch::ones_like(input); // 同形状单位张量
torch::Tensor output = input.clone(); // 克隆张量
// 张量属性
int64_t numel = input.numel(); // 元素总数
int64_t dim = input.dim(); // 维度数
torch::IntArrayRef shape = input.sizes(); // 形状
TORCH_CHECK(x.numel() > 0, "Input tensor cannot be empty");
TORCH_CHECK(x.dim() > 0, "Input tensor must have at least one dimension");
omp_get_thread_num()等OpenMP运行时API// ❌ 错误:在受限上下文中调用OpenMP API
std::mt19937 gen(seed + omp_get_thread_num()); // 编译错误!
// ✅ 正确:在并行区域内正常调用
#pragma omp parallel
{
int tid = omp_get_thread_num(); // 正确
std::mt19937 gen(seed + tid);
}
更多完整的算子实现示例,请参考:
akg_agents/python/akg_agents/op/resources/docs/cpu_docs/basic_docs.mdakg_agents/python/akg_agents/op/resources/docs/cpu_docs/suggestion_docs.mdakg_agents/python/akg_agents/op/resources/docs/cpu_docs/api/api.mdakg_agents/python/akg_agents/op/resources/docs/cpu_docs/examples/torch_xxx_kernel.py这些文档提供了完整的实现指南和参考模板。