CUDA流与多GPU编程技术

在高性能CUDA应用中,仅依靠单GPU的计算能力往往不够。本文将介绍CUDA流(Streams)和多GPU编程技术,帮助你充分利用系统资源。

CUDA流 (CUDA Streams)

CUDA流是GPU上一系列按顺序执行的操作序列。通过使用多个流,可以实现操作间的重叠,提高整体吞吐量。

基本流操作

#include <cuda_runtime.h>

// Splits a vector across two CUDA streams so that the H2D copy, kernel, and
// D2H copy of one half can overlap with the other half's work.
// NOTE(review): assumes my_kernel(in, out, offset, count) bounds-checks
// against `count` — confirm against its definition.
void basic_stream_example()
{
    const int N = 1024 * 1024;
    size_t size = N * sizeof(float);

    // BUG FIX: cudaMemcpyAsync only overlaps with pinned (page-locked) host
    // memory; the original's malloc'd buffers silently serialize the copies.
    float *h_a, *h_b, *h_c;
    cudaMallocHost(&h_a, size);
    cudaMallocHost(&h_b, size);
    cudaMallocHost(&h_c, size);

    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    // Two independent streams; work in different streams may run concurrently.
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    const int chunk = N / 2;                       // each stream owns one half
    const int threads = 256;
    const int blocks = (chunk + threads - 1) / threads;  // ceil-div covers the tail

    // Stream 1 handles the first half, stream 2 the second half.
    cudaMemcpyAsync(d_a, h_a, chunk * sizeof(float),
                    cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(d_b, h_b, chunk * sizeof(float),
                    cudaMemcpyHostToDevice, stream2);

    my_kernel<<<blocks, threads, 0, stream1>>>(d_a, d_c, 0, chunk);
    my_kernel<<<blocks, threads, 0, stream2>>>(d_b, d_c + chunk, chunk, chunk);

    // BUG FIX: the original copied ALL of d_c back on stream1 without waiting
    // for stream2's kernel — a race on the second half. Copy each half back
    // on the stream that produced it, so stream ordering guarantees safety.
    cudaMemcpyAsync(h_c, d_c, chunk * sizeof(float),
                    cudaMemcpyDeviceToHost, stream1);
    cudaMemcpyAsync(h_c + chunk, d_c + chunk, chunk * sizeof(float),
                    cudaMemcpyDeviceToHost, stream2);

    // Block the host until both pipelines drain.
    cudaStreamSynchronize(stream1);
    cudaStreamSynchronize(stream2);

    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    cudaFreeHost(h_a); cudaFreeHost(h_b); cudaFreeHost(h_c);  // pinned => cudaFreeHost
}

事件 (Events) 与流同步

// Times a single kernel launch on a dedicated stream using CUDA events.
// NOTE(review): `kernel`, `blocks`, `threads` and `data` are assumed to be
// defined elsewhere in the surrounding file — confirm before compiling.
void stream_with_events()
{
cudaStream_t stream;
cudaStreamCreate(&stream);

// Create timing events (default flags enable elapsed-time queries)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Record the start marker into the stream, before the work
cudaEventRecord(start, stream);

// The operation being timed
kernel<<<blocks, threads, 0, stream>>>(data);

cudaEventRecord(stop, stream);

// Block the host until `stop` (and therefore the kernel) has completed
cudaEventSynchronize(stop);

// GPU-side elapsed time between the two events, in milliseconds
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);

printf("Kernel execution time: %f ms\n", milliseconds);

// Release the events and the stream
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaStreamDestroy(stream);
}

多GPU编程

现代系统通常配备多个GPU,CUDA提供了多GPU编程接口来利用这些资源。

查询和设置GPU设备

// Enumerates every CUDA device visible to the process and prints its name,
// compute capability, and total global memory.
void multi_gpu_setup()
{
    int device_count = 0;
    cudaGetDeviceCount(&device_count);

    printf("Number of CUDA devices: %d\n", device_count);

    for (int dev = 0; dev < device_count; ++dev) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);

        const float gigabytes = (float)prop.totalGlobalMem / (1024 * 1024 * 1024);
        printf("Device %d: %s\n", dev, prop.name);
        printf(" Compute capability: %d.%d\n", prop.major, prop.minor);
        printf(" Global memory: %.2f GB\n", gigabytes);
    }
}

多GPU数据并行处理

// Per-GPU resources for data-parallel multi-GPU processing; every array
// below is indexed by device ordinal and has num_devices entries.
struct MultiGPUContext {
int num_devices;          // number of GPUs actually used
cudaStream_t* streams;    // one stream per device
float** d_data;           // per-device input buffer
float** d_result;         // per-device output buffer
};

// Splits `total_size` floats across up to 4 GPUs; each device gets its own
// stream so transfers and kernels on different GPUs run concurrently.
// NOTE(review): cudaMemcpyAsync only overlaps if h_input/h_output are pinned
// (cudaMallocHost) — confirm at the call site.
// NOTE(review): assumes processing_kernel(in, out, n) bounds-checks against n.
void multi_gpu_processing(float* h_input, float* h_output, int total_size)
{
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 1 || total_size <= 0) return;  // no GPU / nothing to do

    // Cap GPU usage, and never use more devices than there are elements.
    if (num_devices > 4) num_devices = 4;
    if (num_devices > total_size) num_devices = total_size;

    MultiGPUContext ctx;
    ctx.num_devices = num_devices;
    ctx.streams = new cudaStream_t[num_devices];
    ctx.d_data = new float*[num_devices];
    ctx.d_result = new float*[num_devices];

    // BUG FIX: the original used total_size / num_devices for every chunk and
    // silently dropped the remainder; fold the remainder into the last chunk.
    const int base = total_size / num_devices;
    const int rem  = total_size % num_devices;

    // Allocate a stream and device buffers sized to each device's chunk.
    for (int i = 0; i < num_devices; i++) {
        const int n = base + (i == num_devices - 1 ? rem : 0);
        cudaSetDevice(i);
        cudaStreamCreate(&ctx.streams[i]);
        cudaMalloc(&ctx.d_data[i], n * sizeof(float));
        cudaMalloc(&ctx.d_result[i], n * sizeof(float));
    }

    // Enqueue copy-in, kernel, copy-out per device; streams keep them ordered
    // within a device while devices proceed in parallel.
    for (int i = 0; i < num_devices; i++) {
        const int offset = i * base;
        const int n = base + (i == num_devices - 1 ? rem : 0);
        cudaSetDevice(i);
        cudaMemcpyAsync(ctx.d_data[i], h_input + offset, n * sizeof(float),
                        cudaMemcpyHostToDevice, ctx.streams[i]);

        // BUG FIX: ceil-div so the tail is covered (original truncated n/256).
        const int blocks = (n + 255) / 256;
        processing_kernel<<<blocks, 256, 0, ctx.streams[i]>>>(
            ctx.d_data[i], ctx.d_result[i], n);

        cudaMemcpyAsync(h_output + offset, ctx.d_result[i], n * sizeof(float),
                        cudaMemcpyDeviceToHost, ctx.streams[i]);
    }

    // Wait for every device's stream before tearing anything down.
    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(ctx.streams[i]);
    }

    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaStreamDestroy(ctx.streams[i]);
        cudaFree(ctx.d_data[i]);
        cudaFree(ctx.d_result[i]);
    }
    delete[] ctx.streams;
    delete[] ctx.d_data;
    delete[] ctx.d_result;
}

GPU间通信

// Demonstrates direct device-to-device transfer between GPU 0 and GPU 1
// using peer-to-peer (P2P) access.
void gpu_to_gpu_communication()
{
    int device_count = 0;
    cudaGetDeviceCount(&device_count);

    if (device_count < 2) {
        printf("Need at least 2 GPUs for peer-to-peer communication\n");
        return;
    }

    // BUG FIX: the original checked P2P capability only for 0 -> 1 yet
    // enabled access in BOTH directions; check both before enabling either.
    int can_0_to_1 = 0, can_1_to_0 = 0;
    cudaDeviceCanAccessPeer(&can_0_to_1, 0, 1);
    cudaDeviceCanAccessPeer(&can_1_to_0, 1, 0);

    if (can_0_to_1 && can_1_to_0) {
        // Enable peer access from each device to the other (flags must be 0).
        cudaSetDevice(0);
        cudaDeviceEnablePeerAccess(1, 0);
        cudaSetDevice(1);
        cudaDeviceEnablePeerAccess(0, 0);

        float *d_src = nullptr, *d_dst = nullptr;

        cudaSetDevice(0);
        cudaMalloc(&d_src, 1024 * sizeof(float));
        cudaSetDevice(1);
        cudaMalloc(&d_dst, 1024 * sizeof(float));

        // Direct GPU-to-GPU copy: cudaMemcpyPeer(dst, dstDev, src, srcDev, bytes)
        cudaSetDevice(0);
        cudaMemcpyPeer(d_dst, 1, d_src, 0, 1024 * sizeof(float));

        // BUG FIX: the original leaked both buffers and left peer access
        // enabled; free on the owning device and disable access symmetrically.
        cudaSetDevice(1);
        cudaFree(d_dst);
        cudaDeviceDisablePeerAccess(0);

        cudaSetDevice(0);
        cudaFree(d_src);
        cudaDeviceDisablePeerAccess(1);
    }
}

统一内存 (Unified Memory) 与多GPU

// Uses managed (unified) memory across all GPUs: each device is advised to
// keep its own slice resident, runs a kernel on that slice, and the buffer
// is finally migrated back to the host.
// NOTE(review): `kernel_on_device_i`, `blocks` and `threads` are assumed to
// be defined elsewhere in the surrounding file — confirm before compiling.
void unified_memory_multigpu()
{
    int num_devices = 0;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 1) return;   // avoid division by zero below

    // Allocate unified memory: 1M floats accessible from host and all devices.
    float *managed_data;
    const size_t count = 1024 * 1024;
    size_t size = count * sizeof(float);
    cudaMallocManaged(&managed_data, size);

    // BUG FIX: the original advised the WHOLE range to every device in turn
    // (each iteration overwrote the previous preference) and referenced an
    // undefined chunk_size. Advise each device's slice to that device only.
    const size_t chunk_size = count / num_devices;
    for (int i = 0; i < num_devices; i++) {
        cudaMemAdvise(managed_data + i * chunk_size, chunk_size * sizeof(float),
                      cudaMemAdviseSetPreferredLocation, i);
        cudaSetDevice(i);
        kernel_on_device_i<<<blocks, threads>>>(managed_data + i * chunk_size);
    }

    // BUG FIX: wait for all kernels before migrating or freeing the buffer.
    for (int i = 0; i < num_devices; i++) {
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }

    // Migrate all pages back to host memory, then wait for the migration.
    cudaMemPrefetchAsync(managed_data, size, cudaCpuDeviceId);
    cudaDeviceSynchronize();

    cudaFree(managed_data);
}

性能最佳实践

1. 重叠数据传输与计算

// Pipelines H2D copy, kernel, and D2H copy across num_streams streams so
// chunk i+1's transfers can overlap chunk i's compute.
// NOTE(review): N, d_data, h_input, h_output, blocks and threads are assumed
// to be defined elsewhere; true copy/compute overlap also requires
// h_input/h_output to be pinned (cudaMallocHost) — confirm at the call site.
void overlap_compute_and_transfer()
{
const int num_streams = 4;
const int chunk_size = N / num_streams;  // assumes N % num_streams == 0

cudaStream_t streams[num_streams];
for (int i = 0; i < num_streams; i++) {
cudaStreamCreate(&streams[i]);
}

for (int i = 0; i < num_streams; i++) {
// Stage 1: asynchronously copy this chunk to the device
cudaMemcpyAsync(d_data + i * chunk_size,
h_input + i * chunk_size,
chunk_size * sizeof(float),
cudaMemcpyHostToDevice,
streams[i]);

// Stage 2: process the chunk in the same stream (ordered after the copy)
kernel<<<blocks, threads, 0, streams[i]>>>(
d_data + i * chunk_size, chunk_size);

// Stage 3: asynchronously copy the result back in-stream
cudaMemcpyAsync(h_output + i * chunk_size,
d_data + i * chunk_size,
chunk_size * sizeof(float),
cudaMemcpyDeviceToHost,
streams[i]);
}

// Drain and destroy every stream
for (int i = 0; i < num_streams; i++) {
cudaStreamSynchronize(streams[i]);
cudaStreamDestroy(streams[i]);
}
}

2. 使用CUDA流回调

// Host callback run by the CUDA runtime once all preceding work in the
// stream has completed. Must not call CUDA API functions.
// BUG FIX: cudaLaunchHostFunc takes a cudaHostFn_t — void (CUDART_CB *)(void*)
// — not the legacy cudaStreamAddCallback signature (stream, status, userData)
// the original declared; passing the mismatched pointer type is undefined
// behavior. (CUDART_CB is the macro cuda_runtime.h actually defines.)
void CUDART_CB my_callback(void *data)
{
    printf("Stream operation completed\n");
    // Follow-up host-side work goes here.
}

// Launches a kernel and registers a host callback that fires after it.
// NOTE(review): `kernel`, `blocks`, `threads` and `data` are assumed to be
// defined elsewhere in the surrounding file.
void stream_callback_example()
{
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    kernel<<<blocks, threads, 0, stream>>>(data);

    // BUG FIX: enqueue the callback AFTER the kernel — streams execute in
    // FIFO order, so the original would have reported "completed" before
    // the kernel had even started.
    cudaLaunchHostFunc(stream, my_callback, nullptr);

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
}

总结

CUDA流和多GPU编程是构建高性能应用的关键技术。通过合理使用流,可以实现计算与数据传输的重叠;通过多GPU编程,可以充分利用系统中的多个GPU资源。在实际应用中,需要根据具体的工作负载选择合适的并行策略,并使用Nsight等工具进行性能分析和优化。