// Tile edge length; blockDim must be (TILE, TILE) and
// gridDim = (ceil(N/TILE), ceil(N/TILE)) when launching the kernel below.
constexpr int TILE = 16;

// Tiled matrix multiply: C = A * B for square N x N row-major matrices.
// Each thread block computes one TILE x TILE tile of C, staging the
// corresponding tiles of A and B in shared memory. N need not be a
// multiple of TILE: out-of-range elements are zero-filled on load and
// out-of-range threads skip the final store.
__global__ void matrix_multiply_optimized(const float* __restrict__ A,
                                          const float* __restrict__ B,
                                          float* __restrict__ C, int N) {
    // +1 padding on the inner dimension so the column access Bs[k][tx] /
    // As[ty][k] in the inner product does not cause shared-memory bank
    // conflicts within a warp.
    __shared__ float As[TILE][TILE + 1];
    __shared__ float Bs[TILE][TILE + 1];

    const int tx = threadIdx.x, ty = threadIdx.y;
    const int row = blockIdx.y * TILE + ty;  // output row this thread computes
    const int col = blockIdx.x * TILE + tx;  // output column this thread computes

    float sum = 0.0f;

    // Walk the K dimension one tile at a time.
    for (int m = 0; m < (N + TILE - 1) / TILE; ++m) {
        // Cooperative, coalesced loads; zero-fill past the matrix edge so
        // the inner product below needs no bounds checks.
        As[ty][tx] = (row < N && m * TILE + tx < N)
                         ? A[row * N + m * TILE + tx] : 0.0f;
        Bs[ty][tx] = (col < N && m * TILE + ty < N)
                         ? B[(m * TILE + ty) * N + col] : 0.0f;
        __syncthreads();  // tile fully staged before any thread reads it

        #pragma unroll
        for (int k = 0; k < TILE; ++k)
            sum += As[ty][k] * Bs[k][tx];
        __syncthreads();  // all reads done before the next iteration overwrites
    }

    if (row < N && col < N)
        C[row * N + col] = sum;
}
2. Using Unified Memory (统一内存, cudaMallocManaged)
// Demonstrates CUDA Unified Memory: a single cudaMallocManaged allocation
// is addressable from both host and device, so no explicit cudaMemcpy is
// needed. The host initializes the buffer, a kernel processes it, and
// cudaDeviceSynchronize() makes the results visible to the host again
// before the buffer is freed.
void unified_memory_example(int N) {
    float* d_data = nullptr;
    // Guard against allocation failure; there is nothing to clean up on error.
    if (cudaMallocManaged(&d_data, (size_t)N * sizeof(float)) != cudaSuccess)
        return;

    // Initialize on the host — legal because the memory is managed.
    for (int i = 0; i < N; i++) {
        d_data[i] = (float)i;
    }

    // Launch configuration: 256 threads per block, ceil-div grid so every
    // element of the N-sized buffer is covered.
    const int threads = 256;
    const int blocks = (N + threads - 1) / threads;
    kernel<<<blocks, threads>>>(d_data, N);
    cudaGetLastError();  // clear any launch-configuration error

    // Kernel launches are asynchronous: synchronize before the host touches
    // managed memory again and before freeing it.
    cudaDeviceSynchronize();
    cudaFree(d_data);
}