roctracer 的应用示例

1,不用 roctracer 的普通场景

mt.cpp

/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

// Standard headers for names this file uses directly
// (std::abs, uint32_t, fprintf, malloc/free/abort).
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// hip header file
#include <hip/hip_runtime.h>

// Evaluates a HIP runtime call once; if it does not return hipSuccess, prints
// the HIP error string to stderr and aborts the process.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define HIP_CALL(call)                                                                             \
  do {                                                                                             \
    hipError_t err = (call);                                                                       \
    if (err != hipSuccess) {                                                                       \
      fprintf(stderr, "%s\n", hipGetErrorString(err));                                             \
      abort();                                                                                     \
    }                                                                                              \
  } while (0)

// Matrix edge length; the matrices are WIDTH x WIDTH floats.
#define WIDTH 1024

// Total number of matrix elements.
#define NUM (WIDTH * WIDTH)

// Thread-block shape used for the kernel launch.
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
// Transposes a row-major width x width matrix: the thread with global index
// (x, y) copies in[x * width + y] to out[y * width + x].
//   out   - destination matrix, width * width floats
//   in    - source matrix, width * width floats
//   width - matrix edge length in elements
__global__ void matrixTranspose(float* out, float* in, const int width) {
  const int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  const int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

  // Threads outside the matrix must do nothing; this matters whenever the
  // launch grid is rounded up because width is not a multiple of the block size.
  if (x < width && y < width) {
    out[y * width + x] = in[x * width + y];
  }
}

// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {for (unsigned int j = 0; j < width; j++) {for (unsigned int i = 0; i < width; i++) {output[i * width + j] = input[j * width + i];}}
}int main() {float* Matrix;float* TransposeMatrix;float* cpuTransposeMatrix;float* gpuMatrix;float* gpuTransposeMatrix;hipDeviceProp_t devProp;HIP_CALL(hipGetDeviceProperties(&devProp, 0));std::cerr << "Device name " << devProp.name << std::endl;int i;int errors;Matrix = (float*)malloc(NUM * sizeof(float));TransposeMatrix = (float*)malloc(NUM * sizeof(float));cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));// initialize the input datafor (i = 0; i < NUM; i++) {Matrix[i] = (float)i * 10.0f;}// allocate the memory on the device sideHIP_CALL(hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)));HIP_CALL(hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)));uint32_t iterations = 100;while (iterations-- > 0) {std::cerr << "## Iteration (" << iterations << ") #################" << std::endl;// Memory transfer from host to deviceHIP_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice));// Lauching kernel from hosthipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);HIP_CALL(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost));// CPU MatrixTranspose computationmatrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);// verify the resultserrors = 0;double eps = 1.0E-6;for (i = 0; i < NUM; i++) {if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {errors++;}}if (errors != 0) {fprintf(stderr, "FAILED: %d errors\n", errors);} else {fprintf(stderr, "PASSED!\n");}}// free the resources on device sideHIP_CALL(hipFree(gpuMatrix));HIP_CALL(hipFree(gpuTransposeMatrix));// free the resources on host sidefree(Matrix);free(TransposeMatrix);free(cpuTransposeMatrix);return errors;
}

编译:

 $ hipcc mt.cpp -o mt

$ ./mt xxx

运行后不会产生任何跟踪（trace）文件；

2,加入roctracer的源文件

MatrixTranspose.cpp:

/* Copyright (c) 2018-2022 Advanced Micro Devices, Inc.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE. */

// Standard headers for names this file uses directly
// (std::abs, uint32_t, fprintf, malloc/free/abort).
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>

// hip header file
#include <hip/hip_runtime.h>
#include "roctracer_ext.h"
// roctx header file
#include <roctx.h>

// Evaluates a HIP runtime call once; if it does not return hipSuccess, prints
// the HIP error string to stderr and aborts the process.
// Wrapped in do { } while (0) so it behaves as a single statement.
#define HIP_CALL(call)                                                                             \
  do {                                                                                             \
    hipError_t err = (call);                                                                       \
    if (err != hipSuccess) {                                                                       \
      fprintf(stderr, "%s\n", hipGetErrorString(err));                                             \
      abort();                                                                                     \
    }                                                                                              \
  } while (0)

// Matrix edge length; the matrices are WIDTH x WIDTH floats.
#define WIDTH 1024

// Total number of matrix elements.
#define NUM (WIDTH * WIDTH)

// Thread-block shape used for the kernel launch.
#define THREADS_PER_BLOCK_X 4
#define THREADS_PER_BLOCK_Y 4
#define THREADS_PER_BLOCK_Z 1

// Device (Kernel) function, it must be void
// Transposes a row-major width x width matrix: the thread with global index
// (x, y) copies in[x * width + y] to out[y * width + x].
//   out   - destination matrix, width * width floats
//   in    - source matrix, width * width floats
//   width - matrix edge length in elements
__global__ void matrixTranspose(float* out, float* in, const int width) {
  const int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  const int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;

  // Threads outside the matrix must do nothing; this matters whenever the
  // launch grid is rounded up because width is not a multiple of the block size.
  if (x < width && y < width) {
    out[y * width + x] = in[x * width + y];
  }
}

// CPU implementation of matrix transpose
void matrixTransposeCPUReference(float* output, float* input, const unsigned int width) {for (unsigned int j = 0; j < width; j++) {for (unsigned int i = 0; i < width; i++) {output[i * width + j] = input[j * width + i];}}
}int main() {float* Matrix;float* TransposeMatrix;float* cpuTransposeMatrix;float* gpuMatrix;float* gpuTransposeMatrix;hipDeviceProp_t devProp;HIP_CALL(hipGetDeviceProperties(&devProp, 0));std::cerr << "Device name " << devProp.name << std::endl;int i;int errors;Matrix = (float*)malloc(NUM * sizeof(float));TransposeMatrix = (float*)malloc(NUM * sizeof(float));cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float));// initialize the input datafor (i = 0; i < NUM; i++) {Matrix[i] = (float)i * 10.0f;}// allocate the memory on the device sideHIP_CALL(hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)));HIP_CALL(hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)));uint32_t iterations = 100;while (iterations-- > 0) {std::cerr << "## Iteration (" << iterations << ") #################" << std::endl;// Memory transfer from host to deviceHIP_CALL(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(float), hipMemcpyHostToDevice));roctxMark("before hipLaunchKernel");int rangeId = roctxRangeStart("hipLaunchKernel range");roctxRangePush("hipLaunchKernel");// Lauching kernel from hosthipLaunchKernelGGL(matrixTranspose, dim3(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y),dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, gpuTransposeMatrix, gpuMatrix, WIDTH);roctxMark("after hipLaunchKernel");// Memory transfer from device to hostroctxRangePush("hipMemcpy");HIP_CALL(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost));roctxRangePop();  // for "hipMemcpy"roctxRangePop();  // for "hipLaunchKernel"roctxRangeStop(rangeId);// CPU MatrixTranspose computationmatrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);// verify the resultserrors = 0;double eps = 1.0E-6;for (i = 0; i < NUM; i++) {if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps) {errors++;}}if (errors != 0) {fprintf(stderr, "FAILED: %d errors\n", errors);} else {fprintf(stderr, "PASSED!\n");}}// free the resources on device 
sideHIP_CALL(hipFree(gpuMatrix));HIP_CALL(hipFree(gpuTransposeMatrix));// free the resources on host sidefree(Matrix);free(TransposeMatrix);free(cpuTransposeMatrix);return errors;
}

编译:

只使用hipcc无法直接编译这个源文件

需要指定include 目录和链接库:

$ hipcc ./MatrixTranspose.cpp  -I /opt/rocm/include/roctracer/ -lroctx64

运行:

./a.out

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.xdnf.cn/news/1535112.html

如若内容造成侵权/违法违规/事实不符,请联系一条长河网进行投诉反馈,一经查实,立即删除!

相关文章

✨机器学习笔记(四)—— 逻辑回归、决策边界、过拟合、正则化

Course1-Week3: https://github.com/kaieye/2022-Machine-Learning-Specialization/tree/main/Supervised%20Machine%20Learning%20Regression%20and%20Classification/week3 机器学习笔记（四） 1️⃣逻辑回归（logistic regression）…

element-plus表单使用show-overflow-tooltip,避免占满屏幕,需要设置宽度

在表单中，<el-table-column>中添加show-overflow-tooltip，可以解决表格内容过多的问题。 属性官方解释：是否隐藏额外内容并在单元格悬停时使用 Tooltip 显示它们。 出现的问题： 使用了该属性之后，弹出的详细内…

反射动态代理

1. 反射 1.1 反射的概述： **专业的解释（了解一下）：**是在运行状态中，对于任意一个类，都能够知道这个类的所有属性和方法；对于任意一个对象，都能够调用它的任意属性和方法…

【Linux实践】实验二:LINUX操作基础

【Linux实践】实验二：LINUX操作基础 实验目的实验内容实验步骤及结果1. 打开终端2. 关闭计算机命令3. 查看帮助文档4. 修改计算机主机名5. 显示月历和时间6. 统计行数、字符数、单词数 这章开始要涉及到命令了，其他关于命令的内容可以看我 2021年写的笔记…

非金属失效与典型案例分析培训

随着生产和科学技术的发展，人们不断对高分子材料提出各种各样的新要求。因为技术的全新要求和产品的高要求化，而客户对产品的高要求及工艺理解不一，于是高分子材料断裂、开裂、腐蚀、变色等之类失效频繁出现，常引起供应商与用户间…

无人机几种常见的避障系统!!!

1. 视觉避障系统 工作原理： 视觉避障系统通过安装在无人机上的摄像头捕捉周围环境的图像，利用计算机视觉技术对图像进行处理和分析，提取出障碍物的信息。 通过对障碍物的识别和分类，无人机可以判断出障碍物的性质和危险程度…

人工智能+数字孪生技术在智慧型项目中的应用研究(Word原件)

1 基于BIM的智慧社区运维管理信息系统构建 1.1 数据存储 1.2 数据交换 1.3 BIM模型的数据整合及轻量化 1.运维BIM模型 2.BIM模型的数据整合 3.BIM模型的轻量化处理 2 GIS与BIM融合数字孪生技术应用 2.1 BIM模型在实景三维GIS平台上分析 2.2 BIM与GIS数据交互 …

汽车租赁系统1.0版本

汽车租赁系统1.0版本比较简陋，以后还会有2.0、3.0……就像《我爱发明》里面的一代机器二代机器，三代机器一样，是一个迭代更新的过程（最近比较忙，可能会很久），这个1.0版本很简陋，也请…

电阻、电容、电感的封装大小分别与什么参数有关?

电阻封装大小与电阻值、额定功率有关； 电容封装大小与电容值、额定电压有关； 电感封装大小与电感量、额定电流有关。

7. qml按键最优解

目录 qml自带按键状态按键长按按键延时按键防抖按键 qml自带按键 官网列出了他扩展的按键派生与AbstractButton Button CheckBox DelayButton ItemDelegate MenuBarItem MenuItem RadioButton Switch TabButton 一般开发的过程中根据业务的不同进行选择 AbstractButton 状态按…

Linux云计算 |【第三阶段】PROJECT1-DAY3

主要内容： Keepalived高可用、部署Ceph分布式存储 一、网站架构进阶项目案例 案例1：Keepalived高可用 延续 PROJECT1-DAY2 案例，部署两台代理服务器，实现如下效果： 1）利用keepalived实现两台代理服务器的…

测试通用面试题大全

24年软件测试的发展如何？ 1、IT行业还会继续升温，高质量人才需求相对还是短缺。 2、要求变高之后，很难再下降了，学历和经验。 3、功能测试之外的东西，接口、性能和自动化要掌握一点。 4、长远来看，软件…

Idea springboot项目热部署

使用 spring-boot-devtools spring-boot-devtools 是 Spring Boot 提供的开发工具模块，它可以自动检测到代码的变化并重启应用，实现热部署。 配置步骤： 添加依赖： 在项目的 pom.xml 中加入 spring-boot-devtools 依赖…

做海外问卷渠道查,你必备的几个答题工具(缺一不可,建议收藏)

大家好，我是金言问卷。 近几年，随着我国经济的下行，越来越多的人开始寻求互联网上的创收机会。在这个背景下，海外问卷渠道查也成为一个备受关注的小众项目。 本文将为你揭秘，入局海外问卷渠道查必备的几个工具…

电学基础概念详解及三相电公式汇总

本文全面介绍了电路的基本组成、电学核心概念以及三相电的常用公式。首先，通过水力学中的现象类比，生动解释了电路中电池、开关、电阻和灯泡等元素的功能，帮助读者更好地理解电压、电流和电阻之间的关系。随后，详…

Sparse4D v1

Sparse4D: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion Abstract 基于鸟瞰图 (BEV) 的方法最近在多视图 3D 检测任务方面取得了重大进展。与基于 BEV 的方法相比，基于稀疏的方法在性能上落后，但仍然有很多不可忽略的优点。为了…

红日靶场通关

初始准备 首先是网络配置，看教程来的，我配置完的效果如下 windows7：(内：192.168.52.143 / 外：192.168.154.136) windows2003：(内：192.168.52.141)windows2008:（内：192.…

计算机毕业设计 智慧物业服务系统的设计与实现 Java+SpringBoot+Vue 前后端分离 文档报告 代码讲解 安装调试

🍊作者：计算机编程-吉哥 🍊简介：专业从事JavaWeb程序开发，微信小程序开发，定制化项目、 源码、代码讲解、文档撰写、ppt制作。做自己喜欢的事，生活就是快乐的。 🍊心愿：点…

几种mfc140u.dll常见错误情况,以及mfc140u.dll文件修复的方法

如果你遇到与mfc140u.dll 文件相关的错误，这通常指的是该mfc140u.dll文件可能丢失、损坏或与您的应用程序不兼容。详细分析关于mfc140u.dll文件错误会对系统有什么影响，mfc140u.dll文件处于什么样的位置？以下是几种常见的错误情况及其修复方法…

[今日Arxiv] LLM推理中的错误自迭代改进,DeepMind

最近思维链微调范式很火，翻出来DeepMind的文章分享出来。 大致讲了推理过程中的自改进迭代思路，注重推理过程。 来自 1UC Berkeley, 2Google DeepMind, ♦Work done during an internship at Google DeepMind 的文章 文章标题： Scaling L…