namespace SingleGPU {
// Thread block size. For current GPUs, 128, 256 and 512 are all reasonable
// choices, but 64 should be avoided: the number of blocks that would have to
// be resident on one SM to keep it fully loaded may exceed the hardware limit.
// Used only by the non-heuristic configuration.
constexpr int BLOCK_SIZE = 512;
// Non-heuristic configuration: the block size is fixed to BLOCK_SIZE, and the
// grid size is the one that just saturates the whole GPU, i.e. the maximum
// number of resident blocks per SM (derived from the maximum resident thread
// count) multiplied by the number of SMs.
inline void LaunchConfig(int *grid_size, int *block_size) {
*block_size = BLOCK_SIZE;
  int device_id = 0;
  CHECK(cudaGetDevice(&device_id));
int max_threads_SM = 0;
int num_SM = 0;
CHECK(cudaDeviceGetAttribute(
&max_threads_SM, cudaDevAttrMaxThreadsPerMultiProcessor, device_id
));
CHECK(cudaDeviceGetAttribute(
&num_SM, cudaDevAttrMultiProcessorCount, device_id
));
  *grid_size = max_threads_SM / (*block_size) * num_SM;
}
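// The comments above call this the "non-heuristic" configuration. A minimal
// sketch of what a per-kernel heuristic configuration could look like, based
// on the CUDA occupancy API (LaunchConfigHeuristic is a hypothetical name,
// not part of the original code):
template<typename KernelFunction>
inline void LaunchConfigHeuristic(
  KernelFunction kernel, int *grid_size, int *block_size
) {
  int min_grid_size = 0;
  // Ask the runtime for the block size that maximizes occupancy for this
  // kernel, and the smallest grid size that can reach that occupancy.
  CHECK(cudaOccupancyMaxPotentialBlockSize(
    &min_grid_size, block_size, kernel, 0, 0
  ));
  *grid_size = min_grid_size;
}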
// Launch a kernel with a grid configuration that also takes the task size into
// account: when the task is small, do not launch more blocks than needed, only
// ceil(num_threads / block_size) of them; when the task is large, fall back to
// the grid size returned by LaunchConfig.
template<typename KernelFunction, typename ...Args>
void LaunchKernelWithNumThreads(
KernelFunction kernel,
size_t num_threads,
size_t shared_memory,
cudaStream_t stream,
Args ...args
) {
int grid_size = 0;
int block_size = 0;
LaunchConfig(&grid_size, &block_size);
int N_grid_size = static_cast<int>(
(num_threads + block_size - 1) / block_size
);
if (N_grid_size < grid_size)
grid_size = N_grid_size;
kernel<<<grid_size, block_size, shared_memory, stream>>>(args...);
}
__global__ void AddKernel1(
ValueType *result,
const ValueType *v1,
const ValueType *v2,
const IndexType N,
const ValueType alpha
) {
unsigned int i = threadIdx.x + blockIdx.x * blockDim.x;
unsigned int stride = gridDim.x * blockDim.x;
for (; i < N; i += stride)
result[i] = v1[i] + alpha * v2[i];
}
// result = v1 + alpha * v2
void Add(
Vector *result,
const Vector &v1,
const Vector &v2,
const ValueType alpha = 1.0
) {
if (v1.size != v2.size)
    throw std::runtime_error{"SingleGPU::Add: the two input vectors have different sizes!"};
result->resize(v1.size);
LaunchKernelWithNumThreads(
AddKernel1, v1.size, 0, 0,
result->data(), v1.data(), v2.data(), v1.size, alpha
);
}
template<int BlockSize = BLOCK_SIZE>
__global__ void DotKernel(
ValueType *__restrict__ partial,
const ValueType *__restrict__ vec1,
const ValueType *__restrict__ vec2,
const IndexType N
) {
__shared__ volatile ValueType cache[BlockSize];
int tid = threadIdx.x;
ValueType t = 0;
for (IndexType i = threadIdx.x + blockIdx.x * blockDim.x;
i < N; i += gridDim.x * blockDim.x)
t += vec1[i] * vec2[i];
cache[tid] = t;
__syncthreads();
  // Block-level reduction in shared memory
if (BlockSize >= 1024 && tid < 512)
cache[tid] += cache[tid + 512];
__syncthreads();
if (BlockSize >= 512 && tid < 256)
cache[tid] += cache[tid + 256];
__syncthreads();
if (BlockSize >= 256 && tid < 128)
cache[tid] += cache[tid + 128];
__syncthreads();
if (BlockSize >= 128 && tid < 64)
cache[tid] += cache[tid + 64];
__syncthreads();
  // Warp-level reduction: halve the number of active lanes at each step and
  // synchronize the warp explicitly, so the code is also correct on GPUs with
  // independent thread scheduling (Volta and newer).
  if (tid < 32) cache[tid] += cache[tid + 32];
  __syncwarp();
  if (tid < 16) cache[tid] += cache[tid + 16];
  __syncwarp();
  if (tid < 8) cache[tid] += cache[tid + 8];
  __syncwarp();
  if (tid < 4) cache[tid] += cache[tid + 4];
  __syncwarp();
  if (tid < 2) cache[tid] += cache[tid + 2];
  __syncwarp();
  if (tid < 1) cache[tid] += cache[tid + 1];
if (tid == 0)
partial[blockIdx.x] = cache[0];
}
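// A sketch (not used anywhere in this file) of how the warp-level step of the
// reduction could instead be written with shuffle intrinsics, avoiding shared
// memory and the volatile trick entirely; WarpReduceSum is a hypothetical
// helper name:
__device__ inline ValueType WarpReduceSum(ValueType val) {
  // Each step folds the values held by the upper half of the warp onto the
  // lower half; after the loop, lane 0 holds the sum of all 32 lanes.
  for (int offset = 16; offset > 0; offset >>= 1)
    val += __shfl_down_sync(0xffffffff, val, offset);
  return val;
}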
ValueType Dot(const Vector &v1, const Vector &v2) {
if (v1.size != v2.size)
    throw std::runtime_error{"SingleGPU::Dot: the two input vectors have different sizes!"};
int grid_size = 0;
int block_size = 0;
LaunchConfig(&grid_size, &block_size);
int N_grid_size = (v1.size + block_size - 1) / block_size;
if (N_grid_size < grid_size)
grid_size = N_grid_size;
Vector dev_partial(grid_size);
  DotKernel<BLOCK_SIZE><<<grid_size, block_size>>>(
dev_partial.data(), v1.data(), v2.data(), v1.size
);
Host::Vector partial;
dev_partial.CopyToHost(partial);
ValueType sum = 0;
for (int i = 0; i < grid_size; i++)
sum += partial[i];
return sum;
}
// L2 norm: ||v||_2 = sqrt(<v, v>)
ValueType Norm2(const Vector &v) {
return sqrt(Dot(v, v));
}
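// Example of how the vector routines above might be used (a sketch only; it
// assumes Vector can be constructed with a length, as the dev_partial
// constructor call in Dot suggests, and that its contents are filled
// elsewhere):
//   Vector x(n), y(n), z;
//   Add(&z, x, y, 2.0);        // z = x + 2.0 * y
//   ValueType d  = Dot(x, y);  // d  = <x, y>
//   ValueType nx = Norm2(x);   // nx = ||x||_2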
/*
Kernel computing the product of a CSR sparse matrix and a dense vector:
result = Av. Each half-warp (16 threads) computes one component of the result.
*/
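// For reference, the CSR layout assumed by the kernel for the example matrix
//   [ 10  0  0 20 ]
//   [  0 30  0  0 ]
//   [  0 40 50 60 ]
// would be
//   values         = { 10, 20, 30, 40, 50, 60 }
//   column_indices = {  0,  3,  1,  1,  2,  3 }
//   row_offsets    = {  0,  2,  3,  6 }
// i.e. row r owns the entries in the half-open range
// [row_offsets[r], row_offsets[r + 1]).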
template<int BlockSize = BLOCK_SIZE>
__global__ void SpMvKernel(
ValueType *__restrict__ result,
const ValueType *__restrict__ values,
const IndexType *__restrict__ row_offsets,
const IndexType *__restrict__ column_indices,
const IndexType num_rows,
const ValueType *__restrict__ vec
) {
__shared__ volatile ValueType cache[BlockSize];
const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const int lane = thread_id & 15;  // lane index within the half-warp
  // One half-warp per row, so the row stride is the total number of
  // half-warps in the grid.
  const int row_stride = gridDim.x * blockDim.x / 16;
  for (IndexType row = thread_id / 16; row < num_rows; row += row_stride) {
ValueType t = 0;
for (IndexType j = row_offsets[row] + lane; j < row_offsets[row + 1]; j += 16)
t += values[j] * vec[column_indices[j]];
cache[threadIdx.x] = t;
// parallel reduction in shared memory
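    // Note: this step relies on volatile shared memory plus the pre-Volta
    // assumption that a (half-)warp executes in lockstep; on Volta and newer
    // GPUs, a shuffle-based reduction within 16-lane segments (cf. the
    // WarpReduceSum sketch above) would be the safer idiom.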
if (lane < 8) {
cache[threadIdx.x] = t = t + cache[threadIdx.x + 8];
cache[threadIdx.x] = t = t + cache[threadIdx.x + 4];
cache[threadIdx.x] = t = t + cache[threadIdx.x + 2];
t = t + cache[threadIdx.x + 1];
}
// first thread writes the result
if (lane == 0)
result[row] = t;
}
}
void SpMv(Vector *result, const CsrMatrix &A, const Vector &v) {
if (A.num_cols != v.size)
    throw std::runtime_error{"SingleGPU::SpMv: the number of matrix columns does not match the vector size!"};
result->resize(A.num_rows);
  LaunchKernelWithNumThreads(
SpMvKernel<>, A.num_rows * 16ull, 0, 0,
result->data(),
A.values, A.row_offsets, A.column_indices, A.num_rows,
v.data()
);
}
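// Example of how SpMv might be called (a sketch; it assumes the CsrMatrix
// fields values / row_offsets / column_indices already reside in device
// memory, as their direct use in the kernel launch above suggests):
//   CsrMatrix A = ...;          // assembled elsewhere
//   Vector x(A.num_cols), y;
//   SpMv(&y, A, x);             // y = A * x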
}  // namespace SingleGPU