1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
// 计算L和U的各行非零元素个数
__global__ void GetSplitMatrixSizeKernel(
IndexType *__restrict__ lower_row_size,
IndexType *__restrict__ upper_row_size,
const IndexType *__restrict__ row_offsets,
const IndexType *__restrict__ column_indices,
const IndexType num_rows
) {
IndexType row = threadIdx.x + blockIdx.x * blockDim.x;
IndexType stride = blockDim.x * gridDim.x;
for (; row < num_rows; row += stride) {
IndexType start = row_offsets[row];
IndexType end = row_offsets[row + 1];
// 跳过严格下三角,此后j指向对角元素
IndexType j = start;
while (column_indices[j] < row)
++j;
// 下三角
lower_row_size[row] = j - start + 1;
// 上三角
upper_row_size[row] = end - j;
}
}
// 填充L和U
__global__ void SplitMatrixKernel(
ValueType *__restrict__ lower_values,
IndexType *__restrict__ lower_column_indices,
ValueType *__restrict__ upper_values,
IndexType *__restrict__ upper_column_indices,
ValueType *__restrict__ diag,
const IndexType *__restrict__ lower_row_offsets,
const IndexType *__restrict__ upper_row_offsets,
const ValueType *__restrict__ values,
const IndexType *__restrict__ row_offsets,
const IndexType *__restrict__ column_indices,
const IndexType num_rows
) {
IndexType row = threadIdx.x + blockIdx.x * blockDim.x;
IndexType stride = blockDim.x * gridDim.x;
for (; row < num_rows; row += stride) {
// 下三角,对角元素的位置填入1
IndexType n = lower_row_offsets[row];
IndexType j = row_offsets[row];
while (column_indices[j] < row) {
lower_values[n] = values[j];
lower_column_indices[n] = column_indices[j];
++n;
++j;
}
lower_values[n] = 1;
lower_column_indices[n] = row;
// 对角
if (column_indices[j] == row)
diag[row] = values[j++];
// 上三角,对角元素的位置填入1
n = upper_row_offsets[row];
upper_values[n] = 1;
upper_column_indices[n] = row;
++n;
while (j < row_offsets[row + 1]) {
upper_values[n] = values[j];
upper_column_indices[n] = column_indices[j];
++n;
++j;
}
}
}
// 接口继承的基类
class Preconditioner {
public:
virtual void precondition(Vector *out, const Vector &in) const = 0;
virtual ~Preconditioner() {}
};
// 实现precondition
class DILUPreconditioner : public Preconditioner {
public:
DILUPreconditioner(const CsrMatrix &A);
void precondition(Vector *out, const Vector &in) const override;
~DILUPreconditioner() override = default;
private:
Vector invE;
CsrMatrix lower; // I + invE * L
CsrMatrix upper; // I + invE * U
LowerCsrMatrixColor1 lower_colors;
UpperCsrMatrixColor1 upper_colors;
};
DILUPreconditioner::DILUPreconditioner(const CsrMatrix &A) {
// 取出下三角和上三角
invE.resize(A.num_rows);
lower.resize(A.num_rows, A.num_cols);
upper.resize(A.num_rows, A.num_cols);
LaunchKernelWithNumThreads(
GetSplitMatrixSizeKernel, A.num_rows, 0, 0,
lower.row_offsets,
upper.row_offsets,
A.row_offsets,
A.column_indices,
A.num_rows
);
thrust::exclusive_scan(
thrust::device_ptr<IndexType>(lower.row_offsets),
thrust::device_ptr<IndexType>(lower.row_offsets + A.num_rows + 1),
thrust::device_ptr<IndexType>(lower.row_offsets)
);
thrust::exclusive_scan(
thrust::device_ptr<IndexType>(upper.row_offsets),
thrust::device_ptr<IndexType>(upper.row_offsets + A.num_rows + 1),
thrust::device_ptr<IndexType>(upper.row_offsets)
);
// 分配空间
CHECK(cudaMemcpy(&lower.num_entries, lower.row_offsets + lower.num_rows,
sizeof(IndexType), cudaMemcpyDeviceToHost));
CHECK(cudaMemcpy(&upper.num_entries, upper.row_offsets + upper.num_rows,
sizeof(IndexType), cudaMemcpyDeviceToHost));
CHECK(cudaMalloc(&lower.values, lower.num_entries * sizeof(ValueType)));
CHECK(cudaMalloc(&lower.column_indices, lower.num_entries * sizeof(IndexType)));
CHECK(cudaMalloc(&upper.values, upper.num_entries * sizeof(ValueType)));
CHECK(cudaMalloc(&upper.column_indices, upper.num_entries * sizeof(IndexType)));
// 取出
LaunchKernelWithNumThreads(
SplitMatrixKernel, A.num_rows, 0, 0,
lower.values, lower.column_indices,
upper.values, upper.column_indices,
invE.data(),
lower.row_offsets, upper.row_offsets,
A.values, A.row_offsets, A.column_indices, A.num_rows
);
// 着色
lower_colors = LowerCsrMatrixColor1(lower);
upper_colors = UpperCsrMatrixColor1(upper);
// ...
}
|