漳州正规网站建设公司,凡科可以做淘宝客网站吗,什么是网站建设整体策划方案,平台网站建设协议 背景：想要了解CUDA并行计算原理，同时针对深度学习中出现的一些“不支持算子”可能需要手写的需求，配置一个简单的CUDA编译环境，探索CUDA编程的范式【注：CUDA环境配置略】。结果展示 示例代码
#include cuda_runtime.h
…背景：想要了解CUDA并行计算原理，同时针对深度学习中出现的一些“不支持算子”可能需要手写的需求，配置一个简单的CUDA编译环境，探索CUDA编程的范式【注：CUDA环境配置略】。结果展示 示例代码
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <cstdio>
#include <iostream>

/**
 * Element-wise integer vector addition: C[i] = A[i] + B[i].
 *
 * Launch layout: the element index is taken from threadIdx.x only, so the
 * kernel must be launched with a single block holding one thread per element.
 * There is no bounds check; blockDim.x must not exceed the length of the
 * three arrays.
 *
 * @param A first input vector (device pointer)
 * @param B second input vector (device pointer)
 * @param C output vector (device pointer)
 */
__global__ void VecAdd(int* A, int* B, int* C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param status result of the CUDA call
 * @param what   short description printed in front of the CUDA error string
 * @return true when status is cudaSuccess, false otherwise
 */
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Adds two 3-element host vectors on the GPU and prints the result.
 *
 * Steps: select device 0, allocate three device buffers, copy both inputs to
 * the device, launch VecAdd with one block of `size` threads, synchronize,
 * copy the result back and print it. The first failing step is reported on
 * stderr and the remaining steps are skipped; device buffers are always
 * released before returning.
 */
void test_cuda()
{
    // define data
    const int size = 3;
    const size_t bytes = size * sizeof(int);
    int a[size] = { 1, 2, 3 };
    int b[size] = { 10, 20, 30 };
    int c[size] = { 0 };

    // device buffers; kept null until allocated so cleanup is always safe
    int* dev_a = nullptr;
    int* dev_b = nullptr;
    int* dev_c = nullptr;

    // select cuda device
    if (!cudaOk(cudaSetDevice(0), "GPU device error")) {
        return;
    }

    // allocate device memory and upload the inputs; && short-circuits so
    // nothing runs on a pointer whose allocation failed
    bool ok = cudaOk(cudaMalloc((void**)&dev_c, bytes), "device_c allocate error")
           && cudaOk(cudaMalloc((void**)&dev_a, bytes), "device_a allocate error")
           && cudaOk(cudaMalloc((void**)&dev_b, bytes), "device_b allocate error")
           && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "device_a copy error")
           && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "device_b copy error");

    if (ok) {
        // core gpu operation: one block, one thread per element
        VecAdd<<<1, size>>>(dev_a, dev_b, dev_c);

        // cudaGetLastError reports launch failures; the synchronize call
        // reports errors raised while the kernel was executing
        ok = cudaOk(cudaGetLastError(), "VecAdd call error")
          && cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize not sucess")
          && cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "copy result to host error");
    }

    if (ok) {
        printf("{1,2,3} + {10,20,30} = {%d,%d,%d}\n", c[0], c[1], c[2]);
    }

    // cudaFree accepts a null pointer, so this is valid on every path
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
}

int main()
{
    test_cuda();
    return 0;
}

小结
NVCC编译CUDA的命令与g++编译C++较为相似，从而借鉴引入对应的include，实现Windows下cmake编译CUDA代码；示例代码展示了从CPU读取数据、在GPU端进行计算、最终传输给CPU的过程，与深度学习数据加载过程类似，是较为通用的过程；理解C++到CUDA的过渡、预加载过程，进一步从底层了解CUDA。