更多样例
【免费下载链接】asc-devkit本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。项目地址: https://gitcode.com/cann/asc-devkit
矢量计算基础API使用TensorTrait样例
#include "kernel_operator.h" class KernelBinaryScalarTrait { public: __aicore__ inline KernelBinaryScalarTrait() {} __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm) { srcGlobal.SetGlobalBuffer((__gm__ int16_t*)src); dstGlobal.SetGlobalBuffer((__gm__ int16_t*)dstGm); pipe.InitBuffer(inQueueSrc, 1, 512 * sizeof(int16_t)); pipe.InitBuffer(outQueueDst, 1, 512 * sizeof(int16_t)); } __aicore__ inline void Process() { CopyIn(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.AllocTensor<AscendC::TensorTrait<int16_t>>(); AscendC::DataCopy(srcLocal, srcGlobal, 512); inQueueSrc.EnQue(srcLocal); } __aicore__ inline void Compute() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal = inQueueSrc.DeQue<AscendC::TensorTrait<int16_t>>(); AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.AllocTensor<AscendC::TensorTrait<int16_t>>(); uint64_t mask = 128; int16_t scalar = 2; // repeatTime = 4, 128 elements one repeat, 512 elements total // dstBlkStride, srcBlkStride = 1, no gap between blocks in one repeat // dstRepStride, srcRepStride =8, no gap between repeats AscendC::Adds(dstLocal, srcLocal, scalar, mask, 4, {1, 1, 8, 8}); outQueueDst.EnQue(dstLocal); inQueueSrc.FreeTensor(srcLocal); } __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal = outQueueDst.DeQue<AscendC::TensorTrait<int16_t>>(); AscendC::DataCopy(dstGlobal, dstLocal, 512); outQueueDst.FreeTensor(dstLocal); } private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc; AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst; AscendC::GlobalTensor<AscendC::TensorTrait<int16_t>> srcGlobal, dstGlobal; }; extern "C" __global__ __aicore__ void binary_scalar_trait_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm) { KernelBinaryScalarTrait op; op.Init(src, dstGm); op.Process(); }矩阵计算基础API使用TensorTrait样例
#include "kernel_operator.h" template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T, typename bias_T> class KernelMatmul { public: __aicore__ inline KernelMatmul(uint16_t mIn, uint8_t kIn, uint8_t nIn, bool initl1In, bool initl0In) { m = mIn; k = kIn; n = nIn; aSize = m * k; bSize = k * n; cSize = m * n; initl0 = initl0In; initl1 = initl1In; } __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { aGM.SetGlobalBuffer((__gm__ fmap_T *)a); bGM.SetGlobalBuffer((__gm__ weight_T *)b); cGM.SetGlobalBuffer((__gm__ dstCO1_T *)c); pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(fmap_T)); pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(fmap_T)); pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(weight_T)); pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(weight_T)); pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(dstCO1_T)); } __aicore__ inline void Process() { CopyIn(); SplitA(); SplitB(); Compute(); CopyOut(); } private: __aicore__ inline void CopyIn() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.AllocTensor<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.AllocTensor<AscendC::TensorTrait<weight_T>>(); if(initl1 == true) { AscendC::Fill(a1Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 32), 1, 0, 1}); AscendC::Fill(b1Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 32), 1, 0, 1}); } else { AscendC::DataCopy(a1Local, aGM, aSize); AscendC::DataCopy(b1Local, bGM, bSize); } inQueueA1.EnQue(a1Local); inQueueB1.EnQue(b1Local); } __aicore__ inline void SplitA() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local = inQueueA1.DeQue<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.AllocTensor<AscendC::TensorTrait<fmap_T>>(); // 1、load2d L1->L0A AscendC::LoadData2dParams loadL0AParams; loadL0AParams.repeatTimes = m * k * sizeof(fmap_T) / 512; loadL0AParams.srcStride = 1; loadL0AParams.dstGap = 0; if (initl0 == true) { Fill(a2Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 512), 1, 0, 1}); } else{ LoadData(a2Local, a1Local, loadL0AParams); } inQueueA2.EnQue<AscendC::TensorTrait<fmap_T>>(a2Local); inQueueA1.FreeTensor(a1Local); } __aicore__ inline void SplitB() { AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local = inQueueB1.DeQue<AscendC::TensorTrait<weight_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.AllocTensor<AscendC::TensorTrait<weight_T>>(); // 2、load2d L1->L0B AscendC::LoadData2dParams loadL0BParams; loadL0BParams.repeatTimes = k * n * sizeof(weight_T) / 512; loadL0BParams.srcStride = 1; loadL0BParams.dstGap = 0; if (initl0 == true) { AscendC::Fill(b2Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 512), 1, 0, 1}); } else{ AscendC::LoadData(b2Local, b1Local, loadL0BParams); } inQueueB1.FreeTensor(b1Local); inQueueB2.EnQue<AscendC::TensorTrait<weight_T>>(b2Local); } __aicore__ inline void Compute() { AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local = inQueueA2.DeQue<AscendC::TensorTrait<fmap_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local = inQueueB2.DeQue<AscendC::TensorTrait<weight_T>>(); AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.AllocTensor<AscendC::TensorTrait<dstCO1_T>>(); mmadParams.isBias = false; mmadParams.m = m; mmadParams.n = n; mmadParams.k = k; AscendC::Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n outQueueCO1.EnQue<AscendC::TensorTrait<dstCO1_T>>(c1Local); inQueueA2.FreeTensor(a2Local); inQueueB2.FreeTensor(b2Local); } #if __NPU_ARCH__ <= 2002 __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>(); uint16_t M_ = Ceil(m, 16) * 16; AscendC::LocalTensor<AscendC::TensorTrait<dst_T>> ublocal; AscendC::TBuffAddr tbufublocal; tbufublocal.logicPos = (uint8_t)AscendC::TPosition::C1; ublocal.SetAddr(tbufublocal); ublocal.InitBuffer(0, M_ * n); DataCopyParams dataCopyParams; dataCopyParams.blockCount = 1; dataCopyParams.blockLen = Ceil(M_ * n * 4, 1024); DataCopyEnhancedParams enhancedParams; enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX; AscendC::DataCopy(ublocal, c1Local, dataCopyParams, enhancedParams); PipeBarrier<PIPE_ALL>(); outQueueCO1.FreeTensor(c1Local); dataCopyParams.blockCount = 1; dataCopyParams.blockLen = m * n *sizeof(dstCO1_T) / ONE_BLK_SIZE; dataCopyParams.srcStride = 0; dataCopyParams.dstStride = 0; AscendC::DataCopy(cGM, ublocal, dataCopyParams); } #else __aicore__ inline void CopyOut() { AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local = outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>(); AscendC::FixpipeParamsV220 fixpipeParams; fixpipeParams.nSize = n; fixpipeParams.mSize = m; fixpipeParams.srcStride = m; fixpipeParams.dstStride = n; fixpipeParams.ndNum = 1; fixpipeParams.srcNdStride = 0; fixpipeParams.dstNdStride = 0; AscendC::Fixpipe(cGM, c1Local, fixpipeParams); outQueueCO1.FreeTensor(c1Local); } #endif private: AscendC::TPipe pipe; AscendC::TQue<AscendC::TPosition::A1, 1> inQueueA1; AscendC::TQue<AscendC::TPosition::A2, 1> inQueueA2; AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1; AscendC::TQue<AscendC::TPosition::B2, 1> inQueueB2; // dst queue AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1; AscendC::GlobalTensor<AscendC::TensorTrait<fmap_T>> aGM; AscendC::GlobalTensor<AscendC::TensorTrait<weight_T>> bGM; AscendC::GlobalTensor<AscendC::TensorTrait<dst_T>> cGM; uint16_t m, k, n; bool initl0, initl1; uint16_t aSize, bSize, cSize, b2Size; AscendC::MmadParams mmadParams; }; extern "C" __global__ __aicore__ void cube_initconstvalue_simple_operator_half_16_32_16_true_false( __gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c) { if ASCEND_IS_AIV { return; } KernelMatmul<float, half, half, float, half> op(16, 32, 16, true, false); op.Init(a, b, c); op.Process(); }
【免费下载链接】asc-devkit本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。项目地址: https://gitcode.com/cann/asc-devkit
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考