-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathconvolution1d_pack1to4.comp
139 lines (119 loc) · 4.87 KB
/
convolution1d_pack1to4.comp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
//#extension GL_EXT_debug_printf : enable
#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"
// Specialization constants (GLSL `constant_id`): their values are supplied when the
// pipeline is created; the initializers below are only the defaults.
//
// Convolution parameters, as consumed by main():
//   kernel_w            - number of taps iterated per input row
//   dilation_w          - input-index step between consecutive taps
//   stride_w            - input-index step between consecutive output positions
//   bias_term           - when equal to 1, accumulators start from the bias values
//   activation_type,
//   activation_param_0,
//   activation_param_1  - forwarded unchanged to activation_afpvec4()
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int dilation_w = 1;
layout (constant_id = 2) const int stride_w = 1;
layout (constant_id = 3) const int bias_term = 0;
layout (constant_id = 4) const int activation_type = 0;
layout (constant_id = 5) const float activation_param_0 = 0;
layout (constant_id = 6) const float activation_param_1 = 0;
// Shape constants use the ids that follow the seven parameters above.
// They all default to 0 and are read in main() through psc(); main() only reads
// w, h, outw, outh and outc.
// NOTE(review): psc() is not defined in this file - presumably it falls back to the
// same-named field of the push-constant block `p` when the constant is 0; confirm.
#define shape_constant_id_offset 7
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
// Resource bindings. The same four binding slots are declared either as images
// (NCNN_image_shader) or as storage buffers:
//   0 - bottom_blob : input, read as scalars (sfp elements / image3d_ld1 in upstream style)
//   1 - top_blob    : output, written as 4-component vectors
//   2 - weight_blob : weights, read as 4-component vectors
//   3 - bias_blob   : bias, read as 4-component vectors
// NOTE(review): unfp, imfmtc4, sfp and sfpvec4 are not defined in this file; they are
// presumably precision-dependent type macros injected by the build - confirm.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler3D bias_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif
// Push-constant block carrying the input shape (dims, w, h, c, cstep) and the
// output shape (outdims, outw, outh, outc, outcstep). The field names mirror the
// shape specialization constants declared earlier in this file.
// NOTE(review): main() never reads `p` directly; it goes through psc(), which is
// defined elsewhere - presumably that macro selects between the specialization
// constant and the field here. Field order fixes the push-constant layout, so it
// must match whatever the host side uploads.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;
int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;
// Entry point: 1-D convolution from a scalar-element input to a 4-packed output.
//
// Each invocation computes a 2x2 tile of output vectors - two adjacent positions
// along x for two adjacent output rows:
//
//     sum0 -> (gx,     gy)        sum1 -> (gx + 1, gy)
//     sum2 -> (gx,     gy + 1)    sum3 -> (gx + 1, gy + 1)
//
// Bias, weights and the output row are all selected by gy; the input position is
// selected by gx * stride_w. Every input row (0 .. h-1) contributes kernel_w taps.
//
// Fixes relative to the previous revision:
//   * the image-shader bias load wrote to an undeclared `sum4` (now `sum2`);
//   * the image-shader bias load was indexed by gz while every other access to
//     per-output-row data (buffer bias, weights, stores) is indexed by gy;
//   * all four buffer stores were guarded by the "both neighbours exist" test, so
//     the last column / last row was never written when outw / outh was odd.
void main()
{
    int gx = int(gl_GlobalInvocationID.x) * 2;
    int gy = int(gl_GlobalInvocationID.y) * 2;
    int gz = int(gl_GlobalInvocationID.z) * 2;

    // Invocations whose tile origin lies outside the output have nothing to do.
    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
        return;

    // .x is the tile origin, .y is its +1 neighbour along the respective axis.
    const ivec2 gx2 = gx + ivec2(0, 1);
    const ivec2 gy2 = gy + ivec2(0, 1);
    const ivec2 gz2 = gz + ivec2(0, 1);

    afpvec4 sum0 = afpvec4(0.0f);
    afpvec4 sum1 = afpvec4(0.0f);
    afpvec4 sum2 = afpvec4(0.0f);
    afpvec4 sum3 = afpvec4(0.0f);

    if (bias_term == 1)
    {
        // One bias vector per output row; both x positions of a row share it.
#if NCNN_image_shader
        sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0));
        sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0));
#else
        sum0 = buffer_ld4(bias_data, gy2.x);
        sum2 = buffer_ld4(bias_data, gy2.y);
#endif
        sum1 = sum0;
        sum3 = sum2;
    }

#if NCNN_image_shader
    // NOTE(review): the image-shader variant has no accumulation loop, so only
    // bias + activation reach the output on this path. Left unimplemented here
    // because the weight image layout is not visible from this file.
#else
    // Weights for one output row span psc(h) * kernel_w consecutive vec4 elements.
    ivec2 w_offsetv = kernel_w * psc(h) * gy2;

    // NOTE(review): when gx + 1 == outw or gy + 1 == outh the ".y" loads below (and
    // the bias load above) address the neighbour that does not exist; their results
    // are discarded by the store guards, but confirm the buffers tolerate the read.
    for (int iny = 0; iny < psc(h); iny++)
    {
        // Start of the receptive field in input row `iny` for both x positions.
        ivec2 v_offsetv = iny * psc(w) + gx2 * stride_w;

        for (int x = 0; x < kernel_w; x++)
        {
            afp v0 = buffer_ld1(bottom_blob_data, v_offsetv.x + x * dilation_w);
            afp v1 = buffer_ld1(bottom_blob_data, v_offsetv.y + x * dilation_w);

            afpvec4 k0 = buffer_ld4(weight_data, w_offsetv.x + x);
            afpvec4 k1 = buffer_ld4(weight_data, w_offsetv.y + x);

            sum0 += v0 * k0;
            sum1 += v1 * k0;
            sum2 += v0 * k1;
            sum3 += v1 * k1;
        }

        // Advance to the weights of the next input row.
        w_offsetv += kernel_w;
    }
#endif

    sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
    sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
    sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
    sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
    image3d_st4(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);
    image3d_st4(top_blob, ivec3(gx2.y, gy2.x, gz2.x), sum1);
    image3d_st4(top_blob, ivec3(gx2.x, gy2.y, gz2.x), sum2);
    image3d_st4(top_blob, ivec3(gx2.y, gy2.y, gz2.x), sum3);
#else
    // The tile origin passed the early-out above, so sum0 is always in range.
    // Each neighbour is written only if it exists, so odd outw / outh are handled.
    const bool has_next_x = gx + 1 < psc(outw);
    const bool has_next_y = gy + 1 < psc(outh);

    buffer_st4(top_blob_data, gy2.x * psc(outw) + gx2.x, sum0);
    if (has_next_x) buffer_st4(top_blob_data, gy2.x * psc(outw) + gx2.y, sum1);
    if (has_next_y) buffer_st4(top_blob_data, gy2.y * psc(outw) + gx2.x, sum2);
    if (has_next_y && has_next_x) buffer_st4(top_blob_data, gy2.y * psc(outw) + gx2.y, sum3);
#endif
}