CUDA流与多GPU编程技术
在高性能CUDA应用中,仅依靠单GPU的计算能力往往不够。本文将介绍CUDA流(Streams)和多GPU编程技术,帮助你充分利用系统资源。
CUDA流 (CUDA Streams)
CUDA流是GPU上一系列按顺序执行的操作序列。通过使用多个流,可以实现操作间的重叠,提高整体吞吐量。
基本流操作
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
| #include <cuda_runtime.h>
// Demonstrates overlapping host<->device transfers and kernel execution with
// two CUDA streams, each independently processing one half of the data.
//
// Fixes vs. the naive version:
//  * Host buffers use cudaMallocHost (pinned memory). cudaMemcpyAsync on
//    pageable malloc() memory silently degrades to a synchronous copy.
//  * Each stream copies back only the half of d_c that its own kernel wrote.
//    The original issued one full-size D2H copy on stream1, which raced with
//    stream2's kernel writing the second half of d_c.
//  * Grid size uses ceil-division so any chunk size is fully covered.
void basic_stream_example() {
    const int N = 1024 * 1024;
    const size_t size = N * sizeof(float);

    // Pinned host memory is required for truly asynchronous copies.
    float *h_a, *h_b, *h_c;
    cudaMallocHost(&h_a, size);
    cudaMallocHost(&h_b, size);
    cudaMallocHost(&h_c, size);

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    const int chunk_size = N / 2;
    const int threads = 256;
    const int blocks = (chunk_size + threads - 1) / threads;  // ceil-div

    // Stream 1: first half -- copy in, compute, copy out.
    // NOTE(review): my_kernel is assumed to take (input, output, offset,
    // count) -- confirm against its definition.
    cudaMemcpyAsync(d_a, h_a, chunk_size * sizeof(float),
                    cudaMemcpyHostToDevice, stream1);
    my_kernel<<<blocks, threads, 0, stream1>>>(d_a, d_c, 0, chunk_size);
    cudaMemcpyAsync(h_c, d_c, chunk_size * sizeof(float),
                    cudaMemcpyDeviceToHost, stream1);

    // Stream 2: second half, fully independent of stream 1.
    cudaMemcpyAsync(d_b, h_b, chunk_size * sizeof(float),
                    cudaMemcpyHostToDevice, stream2);
    my_kernel<<<blocks, threads, 0, stream2>>>(d_b, d_c + chunk_size, 0,
                                               chunk_size);
    cudaMemcpyAsync(h_c + chunk_size, d_c + chunk_size,
                    chunk_size * sizeof(float),
                    cudaMemcpyDeviceToHost, stream2);

    // Catch launch-configuration errors before blocking on the streams.
    cudaGetLastError();

    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
}
|
事件 (Events) 与流同步
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
// Times a single kernel launch with CUDA events recorded into the same
// stream. Because the events bracket the kernel inside the stream, the
// elapsed time reflects GPU execution only, not host-side launch latency.
void stream_with_events() {
    cudaStream_t timing_stream;
    cudaStreamCreate(&timing_stream);

    cudaEvent_t ev_start, ev_stop;
    cudaEventCreate(&ev_start);
    cudaEventCreate(&ev_stop);

    cudaEventRecord(ev_start, timing_stream);
    kernel<<<blocks, threads, 0, timing_stream>>>(data);
    cudaEventRecord(ev_stop, timing_stream);

    // Block the host until the stop event -- and thus the kernel -- is done.
    cudaEventSynchronize(ev_stop);

    float elapsed_ms = 0;
    cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop);
    printf("Kernel execution time: %f ms\n", elapsed_ms);

    cudaEventDestroy(ev_start);
    cudaEventDestroy(ev_stop);
    cudaStreamDestroy(timing_stream);
}
|
多GPU编程
现代系统通常配备多个GPU,CUDA提供了多GPU编程接口来利用这些资源。
查询和设置GPU设备
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
// Enumerates every CUDA device in the system and prints its name, compute
// capability, and total global memory.
void multi_gpu_setup() {
    int device_count;
    cudaGetDeviceCount(&device_count);
    printf("Number of CUDA devices: %d\n", device_count);

    for (int dev = 0; dev < device_count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);

        const float mem_gb = (float)prop.totalGlobalMem / (1024 * 1024 * 1024);
        printf("Device %d: %s\n", dev, prop.name);
        printf("  Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global memory: %.2f GB\n", mem_gb);
    }
}
|
多GPU数据并行处理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
// Per-device resources for splitting one workload across several GPUs.
// All arrays are indexed by device ordinal and have num_devices entries.
struct MultiGPUContext {
    int num_devices;        // number of GPUs actually in use
    cudaStream_t* streams;  // one stream per device
    float** d_data;         // per-device input buffer (device memory)
    float** d_result;       // per-device output buffer (device memory)
};
// Splits h_input (total_size floats) across up to four GPUs, runs
// processing_kernel on each chunk in its own stream, and gathers the
// per-device results back into h_output.
//
// Fixes vs. the naive version:
//  * When total_size is not divisible by num_devices, the last device now
//    absorbs the remainder -- previously the tail elements were dropped.
//  * Grid size uses ceil-division so a chunk that is not a multiple of the
//    block size is still fully covered.
//  * Guards against zero devices, non-positive sizes, and zero-block launches.
void multi_gpu_processing(float* h_input, float* h_output, int total_size) {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices == 0 || total_size <= 0) return;  // nothing to do
    if (num_devices > 4) num_devices = 4;

    MultiGPUContext ctx;
    ctx.num_devices = num_devices;
    ctx.streams  = new cudaStream_t[num_devices];
    ctx.d_data   = new float*[num_devices];
    ctx.d_result = new float*[num_devices];

    // Base chunk size; the last device also takes the remainder.
    const int chunk_size = total_size / num_devices;
    auto count_for = [&](int i) {
        return (i == num_devices - 1) ? total_size - i * chunk_size
                                      : chunk_size;
    };

    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaStreamCreate(&ctx.streams[i]);
        const int n = count_for(i);
        cudaMalloc(&ctx.d_data[i], n * sizeof(float));
        cudaMalloc(&ctx.d_result[i], n * sizeof(float));
    }

    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        const int n = count_for(i);
        if (n == 0) continue;  // more devices than elements

        // NOTE(review): for these copies to be truly asynchronous, h_input
        // and h_output must be pinned (cudaMallocHost) -- confirm callers.
        cudaMemcpyAsync(ctx.d_data[i], h_input + i * chunk_size,
                        n * sizeof(float), cudaMemcpyHostToDevice,
                        ctx.streams[i]);

        const int threads = 256;
        const int blocks = (n + threads - 1) / threads;  // ceil-div
        processing_kernel<<<blocks, threads, 0, ctx.streams[i]>>>(
            ctx.d_data[i], ctx.d_result[i], n);

        cudaMemcpyAsync(h_output + i * chunk_size, ctx.d_result[i],
                        n * sizeof(float), cudaMemcpyDeviceToHost,
                        ctx.streams[i]);
    }

    // Wait for every device's pipeline to drain before tearing down.
    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(ctx.streams[i]);
    }

    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaStreamDestroy(ctx.streams[i]);
        cudaFree(ctx.d_data[i]);
        cudaFree(ctx.d_result[i]);
    }
    delete[] ctx.streams;
    delete[] ctx.d_data;
    delete[] ctx.d_result;
}
|
GPU间通信
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
// Demonstrates peer-to-peer (P2P) transfer between device 0 and device 1.
//
// Fixes vs. the naive version:
//  * Peer capability is directional, so both 0->1 and 1->0 are checked
//    before enabling access in both directions.
//  * d_src and d_dst are freed (the original leaked both buffers).
void gpu_to_gpu_communication() {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    if (device_count < 2) {
        printf("Need at least 2 GPUs for peer-to-peer communication\n");
        return;
    }

    int peer_0_to_1 = 0, peer_1_to_0 = 0;
    cudaDeviceCanAccessPeer(&peer_0_to_1, 0, 1);
    cudaDeviceCanAccessPeer(&peer_1_to_0, 1, 0);
    if (!peer_0_to_1 || !peer_1_to_0) return;

    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);

    float *d_src = nullptr, *d_dst = nullptr;
    cudaSetDevice(0);
    cudaMalloc(&d_src, 1024 * sizeof(float));
    cudaSetDevice(1);
    cudaMalloc(&d_dst, 1024 * sizeof(float));

    // cudaMemcpyPeer(dst, dstDevice, src, srcDevice, bytes).
    cudaSetDevice(0);
    cudaMemcpyPeer(d_dst, 1, d_src, 0, 1024 * sizeof(float));

    // Free each buffer on the device that owns it.
    cudaSetDevice(0);
    cudaFree(d_src);
    cudaSetDevice(1);
    cudaFree(d_dst);
}
|
统一内存 (Unified Memory) 与多GPU
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
// Uses unified (managed) memory across multiple GPUs: each device is given a
// preference for -- and launches a kernel over -- its own contiguous slice.
//
// Fixes vs. the naive version:
//  * chunk_size was undefined; it is now derived from the element count.
//  * cudaMemAdvise was applied to the WHOLE range on every loop iteration,
//    so only the last device's preference survived; it now advises only each
//    device's own slice.
//  * All devices are synchronized before prefetching pages back to the host.
void unified_memory_multigpu() {
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices == 0) return;

    const size_t num_elems = 1024 * 1024;
    const size_t size = num_elems * sizeof(float);
    float *managed_data = nullptr;
    cudaMallocManaged(&managed_data, size);

    const size_t chunk_size = num_elems / num_devices;
    for (int i = 0; i < num_devices; i++) {
        // Advise only this device's slice, not the full allocation.
        cudaMemAdvise(managed_data + i * chunk_size,
                      chunk_size * sizeof(float),
                      cudaMemAdviseSetPreferredLocation, i);
        cudaSetDevice(i);
        kernel_on_device_i<<<blocks, threads>>>(managed_data + i * chunk_size);
    }

    // Wait for every device before migrating pages back to the host.
    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }
    cudaMemPrefetchAsync(managed_data, size, cudaCpuDeviceId);

    cudaFree(managed_data);
}
|
性能最佳实践
1. 重叠数据传输与计算
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
// Pipelines H2D copy, kernel, and D2H copy across num_streams streams so
// that transfers for one chunk overlap computation on another.
//
// Fix vs. the naive version: the kernel grid is sized from the chunk itself
// with ceil-division. The original launched with an unrelated global
// `blocks`, which could under-cover (silently skipping elements) or
// over-cover the chunk.
//
// NOTE(review): N, d_data, h_input, and h_output are assumed to be
// file-scope; h_input/h_output must be pinned for true async overlap.
void overlap_compute_and_transfer() {
    const int num_streams = 4;
    const int chunk_size = N / num_streams;  // assumes N % num_streams == 0

    cudaStream_t streams[num_streams];
    for (int i = 0; i < num_streams; i++) {
        cudaStreamCreate(&streams[i]);
    }

    const int threads_per_block = 256;
    const int blocks_per_chunk =
        (chunk_size + threads_per_block - 1) / threads_per_block;  // ceil-div

    for (int i = 0; i < num_streams; i++) {
        float* d_chunk = d_data + i * chunk_size;

        cudaMemcpyAsync(d_chunk, h_input + i * chunk_size,
                        chunk_size * sizeof(float),
                        cudaMemcpyHostToDevice, streams[i]);
        kernel<<<blocks_per_chunk, threads_per_block, 0, streams[i]>>>(
            d_chunk, chunk_size);
        cudaMemcpyAsync(h_output + i * chunk_size, d_chunk,
                        chunk_size * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[i]);
    }

    for (int i = 0; i < num_streams; i++) {
        cudaStreamSynchronize(streams[i]);
        cudaStreamDestroy(streams[i]);
    }
}
|
2. 使用CUDA流回调
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// Host function enqueued into a stream via cudaLaunchHostFunc.
//
// Fix: cudaLaunchHostFunc takes a cudaHostFn_t -- void (*)(void*) -- NOT the
// three-argument cudaStreamCallback_t of the legacy cudaStreamAddCallback
// API. The original mixed the two signatures (and used the nonexistent
// CUDA_CB macro; the runtime's calling-convention macro is CUDART_CB).
void CUDART_CB my_callback(void *data) {
    printf("Stream operation completed\n");
}

void stream_callback_example() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // Enqueue the kernel FIRST: work in a stream executes in issue order, so
    // the host function fires only after the kernel completes. The original
    // enqueued the callback before the kernel, printing "completed" early.
    kernel<<<blocks, threads, 0, stream>>>(data);
    cudaLaunchHostFunc(stream, my_callback, nullptr);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}
|
总结
CUDA流和多GPU编程是构建高性能应用的关键技术。通过合理使用流,可以实现计算与数据传输的重叠;通过多GPU编程,可以充分利用系统中的多个GPU资源。在实际应用中,需要根据具体的工作负载选择合适的并行策略,并使用Nsight等工具进行性能分析和优化。