news 2026/5/20 12:23:07

CANN/asc-devkit TensorTrait样例

作者头像

张小明

前端开发工程师

1.2k 24
文章封面图
CANN/asc-devkit TensorTrait样例

更多样例

【免费下载链接】asc-devkit本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。项目地址: https://gitcode.com/cann/asc-devkit

  • 矢量计算基础API使用TensorTrait样例

    #include "kernel_operator.h"

    /**
     * Sample vector kernel: adds a scalar to an int16_t vector using
     * TensorTrait-typed tensors and the mask/repeat form of Adds.
     *
     * Data flow: CopyIn (global -> VECIN queue) -> Compute (Adds) ->
     * CopyOut (VECOUT queue -> global). A fixed ELEMENT_COUNT elements are
     * processed in a single pass.
     */
    class KernelBinaryScalarTrait {
    public:
        __aicore__ inline KernelBinaryScalarTrait() {}

        /**
         * Binds the global buffers and reserves one buffer for each queue.
         *
         * @param src    global-memory address of the input, read as int16_t
         * @param dstGm  global-memory address of the output, written as int16_t
         */
        __aicore__ inline void Init(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
        {
            srcGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t*>(src));
            dstGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int16_t*>(dstGm));
            pipe.InitBuffer(inQueueSrc, 1, ELEMENT_COUNT * sizeof(int16_t));
            pipe.InitBuffer(outQueueDst, 1, ELEMENT_COUNT * sizeof(int16_t));
        }

        /** Runs the three pipeline stages once, in order. */
        __aicore__ inline void Process()
        {
            CopyIn();
            Compute();
            CopyOut();
        }

    private:
        // Total number of int16_t elements handled by the sample.
        static constexpr uint32_t ELEMENT_COUNT = 512;
        // Elements covered by one repeat of the Adds instruction (the mask value).
        static constexpr uint64_t ELEMENTS_PER_REPEAT = 128;
        // Number of repeats issued by the single Adds call.
        static constexpr uint8_t REPEAT_TIMES = 4;
        static_assert(ELEMENTS_PER_REPEAT * REPEAT_TIMES == ELEMENT_COUNT,
                      "mask * repeatTime must cover exactly ELEMENT_COUNT elements");

        /** Copies the input from global memory into a freshly allocated VECIN tensor. */
        __aicore__ inline void CopyIn()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal =
                inQueueSrc.AllocTensor<AscendC::TensorTrait<int16_t>>();
            AscendC::DataCopy(srcLocal, srcGlobal, ELEMENT_COUNT);
            inQueueSrc.EnQue(srcLocal);
        }

        /** Adds the scalar 2 to every input element and queues the result. */
        __aicore__ inline void Compute()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> srcLocal =
                inQueueSrc.DeQue<AscendC::TensorTrait<int16_t>>();
            AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal =
                outQueueDst.AllocTensor<AscendC::TensorTrait<int16_t>>();

            uint64_t mask = ELEMENTS_PER_REPEAT;
            int16_t scalar = 2;
            // repeatTime = 4, 128 elements per repeat, 512 elements in total.
            // dstBlkStride, srcBlkStride = 1: no gap between blocks within one repeat.
            // dstRepStride, srcRepStride = 8: no gap between repeats.
            AscendC::Adds(dstLocal, srcLocal, scalar, mask, REPEAT_TIMES, {1, 1, 8, 8});

            outQueueDst.EnQue(dstLocal);
            inQueueSrc.FreeTensor(srcLocal);
        }

        /** Copies the result tensor back to global memory and releases it. */
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<int16_t>> dstLocal =
                outQueueDst.DeQue<AscendC::TensorTrait<int16_t>>();
            AscendC::DataCopy(dstGlobal, dstLocal, ELEMENT_COUNT);
            outQueueDst.FreeTensor(dstLocal);
        }

    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueueSrc;
        AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueueDst;
        AscendC::GlobalTensor<AscendC::TensorTrait<int16_t>> srcGlobal, dstGlobal;
    };

    /**
     * Kernel entry point: builds the operator, binds the buffers and runs it once.
     *
     * @param src    global-memory address of the input vector
     * @param dstGm  global-memory address of the output vector
     */
    extern "C" __global__ __aicore__ void binary_scalar_trait_kernel(__gm__ uint8_t* src, __gm__ uint8_t* dstGm)
    {
        KernelBinaryScalarTrait op;
        op.Init(src, dstGm);
        op.Process();
    }
  • 矩阵计算基础API使用TensorTrait样例

    #include "kernel_operator.h"

    /**
     * Sample cube (matrix-multiply) kernel built on TensorTrait-typed tensors.
     *
     * Pipeline:
     *   CopyIn  - fills A1/B1 from global memory, or with Fill when initl1 is set
     *   SplitA  - A1 -> A2 via LoadData, or Fill on A2 when initl0 is set
     *   SplitB  - B1 -> B2 via LoadData, or Fill on B2 when initl0 is set
     *   Compute - Mmad(A2, B2) into CO1
     *   CopyOut - CO1 -> global memory (path selected by __NPU_ARCH__)
     *
     * @tparam dst_T     element type of the output tensor in global memory
     * @tparam fmap_T    element type of the left matrix (A)
     * @tparam weight_T  element type of the right matrix (B)
     * @tparam dstCO1_T  element type of the Mmad result held in CO1
     * @tparam bias_T    not referenced anywhere in this sample
     */
    template <typename dst_T, typename fmap_T, typename weight_T, typename dstCO1_T, typename bias_T>
    class KernelMatmul {
    public:
        /**
         * @param mIn       rows of A / rows of the result
         * @param kIn       columns of A / rows of B
         * @param nIn       columns of B / columns of the result
         * @param initl1In  when true, CopyIn fills A1/B1 with Fill instead of reading global memory
         * @param initl0In  when true, SplitA/SplitB fill A2/B2 with Fill instead of LoadData
         */
        __aicore__ inline KernelMatmul(uint16_t mIn, uint8_t kIn, uint8_t nIn, bool initl1In, bool initl0In)
            : m(mIn),
              k(kIn),
              n(nIn),
              initl0(initl0In),
              initl1(initl1In),
              aSize(static_cast<uint16_t>(m * k)),
              bSize(static_cast<uint16_t>(k * n)),
              cSize(static_cast<uint16_t>(m * n))
        {}

        /**
         * Binds the global buffers and reserves the queue buffers.
         *
         * @param a  global-memory address of matrix A (fmap_T elements)
         * @param b  global-memory address of matrix B (weight_T elements)
         * @param c  global-memory address of the result (dst_T elements)
         */
        __aicore__ inline void Init(__gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c)
        {
            aGM.SetGlobalBuffer(reinterpret_cast<__gm__ fmap_T *>(a));
            bGM.SetGlobalBuffer(reinterpret_cast<__gm__ weight_T *>(b));
            // cGM is declared with dst_T, so the pointer must be dst_T as well
            // (the original cast to dstCO1_T only worked when both types matched).
            cGM.SetGlobalBuffer(reinterpret_cast<__gm__ dst_T *>(c));

            pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(fmap_T));
            pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(fmap_T));
            pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(weight_T));
            // NOTE(review): two buffers are requested here although inQueueB2 is
            // declared with depth 1 and every other queue uses one - confirm intent.
            pipe.InitBuffer(inQueueB2, 2, bSize * sizeof(weight_T));
            pipe.InitBuffer(outQueueCO1, 1, cSize * sizeof(dstCO1_T));
        }

        /** Runs every pipeline stage once, in order. */
        __aicore__ inline void Process()
        {
            CopyIn();
            SplitA();
            SplitB();
            Compute();
            CopyOut();
        }

    private:
        /** Produces the A1/B1 tensors, either from global memory or via Fill. */
        __aicore__ inline void CopyIn()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local =
                inQueueA1.AllocTensor<AscendC::TensorTrait<fmap_T>>();
            AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local =
                inQueueB1.AllocTensor<AscendC::TensorTrait<weight_T>>();

            if (initl1) {
                // The first field is the tensor size in bytes divided by 32.
                // NOTE(review): remaining fields are presumably {blockNum, dstGap,
                // initValue} - confirm against the Fill parameter documentation.
                AscendC::Fill(a1Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 32), 1, 0, 1});
                AscendC::Fill(b1Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 32), 1, 0, 1});
            } else {
                AscendC::DataCopy(a1Local, aGM, aSize);
                AscendC::DataCopy(b1Local, bGM, bSize);
            }

            inQueueA1.EnQue(a1Local);
            inQueueB1.EnQue(b1Local);
        }

        /** Moves matrix A from the A1 queue to the A2 queue (LoadData), or fills A2. */
        __aicore__ inline void SplitA()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a1Local =
                inQueueA1.DeQue<AscendC::TensorTrait<fmap_T>>();
            AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local =
                inQueueA2.AllocTensor<AscendC::TensorTrait<fmap_T>>();

            // 1. load2d L1->L0A
            AscendC::LoadData2dParams loadL0AParams;
            loadL0AParams.repeatTimes = m * k * sizeof(fmap_T) / 512;
            loadL0AParams.srcStride = 1;
            loadL0AParams.dstGap = 0;

            if (initl0) {
                AscendC::Fill(a2Local, {static_cast<uint16_t>(m * k * sizeof(fmap_T) / 512), 1, 0, 1});
            } else {
                AscendC::LoadData(a2Local, a1Local, loadL0AParams);
            }

            inQueueA2.EnQue<AscendC::TensorTrait<fmap_T>>(a2Local);
            inQueueA1.FreeTensor(a1Local);
        }

        /** Moves matrix B from the B1 queue to the B2 queue (LoadData), or fills B2. */
        __aicore__ inline void SplitB()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b1Local =
                inQueueB1.DeQue<AscendC::TensorTrait<weight_T>>();
            AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local =
                inQueueB2.AllocTensor<AscendC::TensorTrait<weight_T>>();

            // 2. load2d L1->L0B
            AscendC::LoadData2dParams loadL0BParams;
            loadL0BParams.repeatTimes = k * n * sizeof(weight_T) / 512;
            loadL0BParams.srcStride = 1;
            loadL0BParams.dstGap = 0;

            if (initl0) {
                AscendC::Fill(b2Local, {static_cast<uint16_t>(k * n * sizeof(weight_T) / 512), 1, 0, 1});
            } else {
                AscendC::LoadData(b2Local, b1Local, loadL0BParams);
            }

            inQueueB1.FreeTensor(b1Local);
            inQueueB2.EnQue<AscendC::TensorTrait<weight_T>>(b2Local);
        }

        /** Multiplies A2 by B2 with Mmad (bias disabled) and queues the CO1 result. */
        __aicore__ inline void Compute()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<fmap_T>> a2Local =
                inQueueA2.DeQue<AscendC::TensorTrait<fmap_T>>();
            AscendC::LocalTensor<AscendC::TensorTrait<weight_T>> b2Local =
                inQueueB2.DeQue<AscendC::TensorTrait<weight_T>>();
            AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local =
                outQueueCO1.AllocTensor<AscendC::TensorTrait<dstCO1_T>>();

            mmadParams.isBias = false;
            mmadParams.m = m;
            mmadParams.n = n;
            mmadParams.k = k;
            AscendC::Mmad(c1Local, a2Local, b2Local, mmadParams); // m*n

            outQueueCO1.EnQue<AscendC::TensorTrait<dstCO1_T>>(c1Local);
            inQueueA2.FreeTensor(a2Local);
            inQueueB2.FreeTensor(b2Local);
        }

    #if __NPU_ARCH__ <= 2002
        /**
         * Writes the result to global memory in two DataCopy steps:
         * CO1 -> intermediate local tensor (matrix block mode), then
         * intermediate tensor -> global memory.
         */
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local =
                outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>();

            // NOTE(review): Ceil is left unqualified exactly as in the original;
            // confirm which namespace provides it when no using-directive is active.
            uint16_t M_ = Ceil(m, 16) * 16;

            // Intermediate tensor set up by hand at offset 0 instead of through a queue.
            AscendC::LocalTensor<AscendC::TensorTrait<dst_T>> ublocal;
            AscendC::TBuffAddr tbufublocal;
            tbufublocal.logicPos = static_cast<uint8_t>(AscendC::TPosition::C1);
            ublocal.SetAddr(tbufublocal);
            ublocal.InitBuffer(0, M_ * n);

            AscendC::DataCopyParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = Ceil(M_ * n * 4, 1024);
            AscendC::DataCopyEnhancedParams enhancedParams;
            enhancedParams.blockMode = AscendC::BlockMode::BLOCK_MODE_MATRIX;
            AscendC::DataCopy(ublocal, c1Local, dataCopyParams, enhancedParams);
            AscendC::PipeBarrier<PIPE_ALL>();
            outQueueCO1.FreeTensor(c1Local);

            // ublocal and cGM both hold dst_T elements, so the length is sized with dst_T.
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = m * n * sizeof(dst_T) / AscendC::ONE_BLK_SIZE;
            dataCopyParams.srcStride = 0;
            dataCopyParams.dstStride = 0;
            AscendC::DataCopy(cGM, ublocal, dataCopyParams);
        }
    #else
        /** Writes the CO1 result straight to global memory with Fixpipe. */
        __aicore__ inline void CopyOut()
        {
            AscendC::LocalTensor<AscendC::TensorTrait<dstCO1_T>> c1Local =
                outQueueCO1.DeQue<AscendC::TensorTrait<dstCO1_T>>();

            AscendC::FixpipeParamsV220 fixpipeParams;
            fixpipeParams.nSize = n;
            fixpipeParams.mSize = m;
            fixpipeParams.srcStride = m;
            fixpipeParams.dstStride = n;
            fixpipeParams.ndNum = 1;
            fixpipeParams.srcNdStride = 0;
            fixpipeParams.dstNdStride = 0;
            AscendC::Fixpipe(cGM, c1Local, fixpipeParams);

            outQueueCO1.FreeTensor(c1Local);
        }
    #endif

    private:
        AscendC::TPipe pipe;
        AscendC::TQue<AscendC::TPosition::A1, 1> inQueueA1;
        AscendC::TQue<AscendC::TPosition::A2, 1> inQueueA2;
        AscendC::TQue<AscendC::TPosition::B1, 1> inQueueB1;
        AscendC::TQue<AscendC::TPosition::B2, 1> inQueueB2;
        // dst queue
        AscendC::TQue<AscendC::TPosition::CO1, 1> outQueueCO1;

        AscendC::GlobalTensor<AscendC::TensorTrait<fmap_T>> aGM;
        AscendC::GlobalTensor<AscendC::TensorTrait<weight_T>> bGM;
        AscendC::GlobalTensor<AscendC::TensorTrait<dst_T>> cGM;

        uint16_t m, k, n;
        bool initl0, initl1;
        // Element counts of A (m*k), B (k*n) and the result (m*n).
        uint16_t aSize, bSize, cSize;
        AscendC::MmadParams mmadParams;
    };

    /**
     * Kernel entry point for the m=16, k=32, n=16 half-precision sample with
     * initl1 = true and initl0 = false, so A1/B1 are produced by Fill and the
     * contents of a and b are not read.
     *
     * @param a  global-memory address of matrix A
     * @param b  global-memory address of matrix B
     * @param c  global-memory address of the float result
     */
    extern "C" __global__ __aicore__ void cube_initconstvalue_simple_operator_half_16_32_16_true_false(
        __gm__ uint8_t *a, __gm__ uint8_t *b, __gm__ uint8_t *c)
    {
        // Nothing to do when ASCEND_IS_AIV holds (presumably the vector core - confirm).
        if ASCEND_IS_AIV {
            return;
        }
        KernelMatmul<float, half, half, float, half> op(16, 32, 16, true, false);
        op.Init(a, b, c);
        op.Process();
    }

【免费下载链接】asc-devkit本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。项目地址: https://gitcode.com/cann/asc-devkit

创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考

版权声明: 本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若内容造成侵权/违法违规/事实不符,请联系邮箱:809451989@qq.com进行投诉反馈,一经查实,立即删除!
网站建设 2026/5/20 12:22:01

Ascend C SIMD转置API文档

Transpose 【免费下载链接】asc-devkit 本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言,原生支持C和C++标准规范,主要由类库和语言扩展层构成,提供多层级API,满足多维场景算子开发诉求。 项目地址: https://gitcode.com/…

作者头像 李华
网站建设 2026/5/20 12:18:12

终极解决方案:IPXWrapper让经典游戏在现代Windows系统重获联机能力

终极解决方案:IPXWrapper让经典游戏在现代Windows系统重获联机能力 【免费下载链接】ipxwrapper 项目地址: https://gitcode.com/gh_mirrors/ip/ipxwrapper 还在为《星际争霸》《帝国时代》《暗黑破坏神》等经典游戏无法在现代Windows系统上联机而烦恼吗?…

作者头像 李华
网站建设 2026/5/20 12:17:02

如何快速掌握云音乐歌词批量下载工具:面向初学者的完整教程

如何快速掌握云音乐歌词批量下载工具:面向初学者的完整教程 【免费下载链接】163MusicLyrics 云音乐歌词获取处理工具【网易云、QQ音乐】 项目地址: https://gitcode.com/GitHub_Trending/16/163MusicLyrics 还在为找不到心爱歌曲的歌词而烦恼吗?…

作者头像 李华
网站建设 2026/5/20 12:16:02

终极指南:如何用YOLOv8 AI自瞄系统快速提升游戏瞄准精度

终极指南:如何用YOLOv8 AI自瞄系统快速提升游戏瞄准精度 【免费下载链接】RookieAI_yolov8 基于yolov8实现的AI自瞄项目 AI self-aiming project based on yolov8 项目地址: https://gitcode.com/gh_mirrors/ro/RookieAI_yolov8 RookieAI_yolov8是一款基于YO…

作者头像 李华
网站建设 2026/5/20 12:12:04

第9章:安全与权限管理

第9章:安全与权限管理 9.1 X-Pack安全功能概述 为什么需要安全 Elasticsearch默认不启用安全,任何人都可以访问集群、读取数据、删除索引。在生产环境中,必须启用安全: 安全风险: 未授权访问: 任何人可访问集群 数据泄露: 敏感数据可被读取 数据破坏: 索引可被删除 恶意攻击…

作者头像 李华