Convolution1D and Deconvolution1D layers #4811
Comments
currently, no vulkan conv1d / deconv1d
Thank you @nihui.
Additionally, this holds true for the vocoders of both VITS and DiffSinger; in short, all TTS synthesis relies on these layers.
I had to create
Convolution1D_vulkan.h
Main.cpp
Everything compiled well, but I don't know how to implement this in my custom layer without layer_shader_type.h and layer_shader_type_enum.h.
I found out how to do this.
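For anyone hitting the same wall: the built-in shader enums in layer_shader_type.h are only needed for layers compiled into ncnn itself; a layer living outside the ncnn tree is usually hooked in through register_custom_layer. A rough sketch of that route, assuming a MyConvolution1D class derived from ncnn::Layer (the class name and the custom type name in the .param file are placeholders, not the exact code used here):

```cpp
#include "layer.h"
#include "net.h"

// Placeholder custom layer; a real implementation would create its own
// Vulkan pipeline from the compiled convolution1d.comp SPIR-V in
// create_pipeline() and run it in forward().
class MyConvolution1D : public ncnn::Layer
{
public:
    MyConvolution1D() { support_vulkan = true; }
    // load_param / load_model / create_pipeline / destroy_pipeline / forward ...
};

DEFINE_LAYER_CREATOR(MyConvolution1D)

int main()
{
    ncnn::Net net;
    net.opt.use_vulkan_compute = true;

    // Register under the custom type name the .param file uses for these layers.
    net.register_custom_layer("MyConvolution1D", MyConvolution1D_layer_creator);

    // net.load_param(...); net.load_model(...);
    return 0;
}
```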
I have only one question, about the GLSL data type / C data type mapping (the "GLSL data type | C data type | Description" table).
I found the declarations here: gpu.cpp
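For reference, the correspondence that matters when filling specialization constants and push constants is the usual GLSL/C layout; a rough summary (vector alignment follows the std430 rules):

```cpp
// Rough GLSL <-> C correspondence for constants passed to the shader:
//   GLSL int    <->  int32_t      (4 bytes)
//   GLSL float  <->  float        (4 bytes)
//   GLSL ivec4  <->  int32_t[4]   (16 bytes, 16-byte aligned under std430)
//   GLSL vec4   <->  float[4]     (16 bytes, 16-byte aligned under std430)
//
// ncnn routes each 4-byte scalar through a small union (declared in gpu.h),
// roughly like this:
union vk_constant_type
{
    int i;
    float f;
};
```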
under construction ... |
Hi @nihui, thank you for the link and for helping.
Hi, you can join the ncnn QQ group if you use QQ (see the ncnn readme), through which I can provide more help in a timely manner.
Work-in-progress convolution1d.comp for kernel_w > 1 and elempack 1:
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
#extension GL_EXT_debug_printf : enable
#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int dilation_w = 1;
layout (constant_id = 2) const int stride_w = 1;
layout (constant_id = 3) const int bias_term = 0;
layout (constant_id = 4) const int activation_type = 0;
layout (constant_id = 5) const float activation_param_0 = 0;
layout (constant_id = 6) const float activation_param_1 = 0;
#define shape_constant_id_offset 7
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler2D bottom_blob;
layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler3D bias_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#endif
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;
int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;
void print_bottom_blob()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= 1 || gy >= 1)
return;
debugPrintfEXT("Hello %i, %i\n", gx, gy);
for (int i = 0; i < psc(w); ++i) {
for (int j = 0; j < psc(h); ++j) {
debugPrintfEXT("Elem %d %d: %f ", i, j, bottom_blob_data[i*psc(h)+j]);
}
debugPrintfEXT("\n");
}
}
void main()
{
int gx = int(gl_GlobalInvocationID.x) * 2;
int gy = int(gl_GlobalInvocationID.y) * 2;
int gz = int(gl_GlobalInvocationID.z) * 2;
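// each invocation computes a 2x2 output tile: two output positions (gx, gx+1)
// and two output channels / rows (gy, gy+1)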
//print_bottom_blob();
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;
const ivec2 gx2 = gx + ivec2(0, 1);
const ivec2 gy2 = gy + ivec2(0, 1);
afp sum0 = afp(0.0f);
afp sum1 = afp(0.0f);
afp sum2 = afp(0.0f);
afp sum3 = afp(0.0f);
if (bias_term == 1)
{
#if NCNN_image_shader
//sum = image2d_ld1(bias_blob, ivec2(gx, 0));
#else
sum0 = buffer_ld1(bias_data, gy2.x);
sum2 = buffer_ld1(bias_data, gy2.y);
sum1 = sum0;
sum3 = sum2;
#endif
}
#if NCNN_image_shader
//
#else
ivec2 w_offsetv = kernel_w * psc(h) * gy2; // weight offset
for (int iny = 0; iny < psc(h); iny++)
{
ivec2 v_offsetv = iny * psc(w) + gx2 * stride_w; // value offset
for (int x = 0; x < kernel_w; x++)
{
afp v0 = buffer_ld1(bottom_blob_data, v_offsetv.x + x * dilation_w); // Load the value +0
afp v1 = buffer_ld1(bottom_blob_data, v_offsetv.y + x * dilation_w); // Load the value +1
afp k0 = buffer_ld1(weight_data, w_offsetv.x + x); // Load the weight value +0
afp k1 = buffer_ld1(weight_data, w_offsetv.y + x); // Load the weight value +1
sum0 += v0 * k0;
sum1 += v1 * k0;
sum2 += v0 * k1;
sum3 += v1 * k1;
}
w_offsetv += kernel_w; // Move to the next set of weights
}
#endif
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);
#if NCNN_image_shader
//image2d_st1(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);
//image2d_st1(top_blob, ivec3(gx2.y, gy2.x, gz2.x), sum1);
//image2d_st1(top_blob, ivec3(gx2.x, gy2.y, gz2.x), sum2);
//image2d_st1(top_blob, ivec3(gx2.y, gy2.y, gz2.x), sum3);
#else
// guard each store separately so odd output sizes still get the last column / row
buffer_st1(top_blob_data, gy2.x * psc(outw) + gx2.x, sum0);
if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gy2.x * psc(outw) + gx2.y, sum1);
if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gy2.y * psc(outw) + gx2.x, sum2);
if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gy2.y * psc(outw) + gx2.y, sum3);
#endif
}
My convolution1d.comp for kernel_w > 1 and elempack 1 (unpacked float32) works correctly and produces correct results.
Output:
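For reference, a minimal way to sanity-check the Vulkan path is to run the same input through the CPU and GPU layers and compare the two output blobs element-wise; a sketch (the helper name and tolerance are just illustrative), assuming both outputs are unpacked float32:

```cpp
#include <cmath>
#include <cstdio>
#include "mat.h"

// Compare two dims<=2 float32 ncnn::Mat blobs element-wise.
static bool nearly_equal(const ncnn::Mat& a, const ncnn::Mat& b, float tol = 1e-4f)
{
    if (a.w != b.w || a.h != b.h || a.c != b.c)
        return false;

    for (int y = 0; y < a.h; y++)
    {
        const float* pa = a.row(y);
        const float* pb = b.row(y);
        for (int x = 0; x < a.w; x++)
        {
            if (std::fabs(pa[x] - pb[x]) > tol)
            {
                fprintf(stderr, "mismatch at (%d, %d): %f vs %f\n", x, y, pa[x], pb[x]);
                return false;
            }
        }
    }
    return true;
}
```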
convolution1d_pack4.comp (float32 pack4 blobs and unpacked input weights):
#version 450
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif
//#extension GL_EXT_debug_printf : enable
#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"
layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int dilation_w = 1;
layout (constant_id = 2) const int stride_w = 1;
layout (constant_id = 3) const int bias_term = 0;
layout (constant_id = 4) const int activation_type = 0;
layout (constant_id = 5) const float activation_param_0 = 0;
layout (constant_id = 6) const float activation_param_1 = 0;
#define shape_constant_id_offset 7
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
layout (binding = 2) uniform unfp sampler3D weight_blob;
layout (binding = 3) uniform unfp sampler3D bias_blob;
#else
//layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
//layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
//layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
//layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
//layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
//layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;
int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;
/*
void print_bottblob()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= 1 || gy >= 1 || gz >= 1)
return;
//debugPrintfEXT("Hello %i, %i\n", gx, gy);
for (int i = 0; i < psc(h)/4; ++i) {
for (int j = 0; j < psc(w); ++j) {
//for (int j = 0; j < psc(h); ++j) {
//afp v = buffer_ld1(bottom_blob_data, 3);
//debugPrintfEXT("Elem %d %d: %f ", i, j, v);
//debugPrintfEXT("Bot_Blob %d %d: %f ", i, j, bottom_blob_data[i*psc(h)+j]);
afpvec4 test = buffer_ld4(bottom_blob_data, i*psc(w)+j);
debugPrintfEXT(" Top_Blob %d %d: %v4f ", i, j, test);
//afpvec4 value;
//value = buffer_ld4(bottom_blob_data, i*psc(h)+j );
//debugPrintfEXT("Bot_Blob %d %d: %f ", i, j, value);
}
debugPrintfEXT("\n");
}
}
void print_weight()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gx >= 1 || gy >= 1 || gz >= 1)
return;
debugPrintfEXT("Hello %i, %i\n", gx, gy);
for (int i = 0; i < psc(outh)*4; ++i) {
for (int j = 0; j < psc(outw)*kernel_w; ++j) {
//afp v = buffer_ld1(bottom_blob_data, 3);
//debugPrintfEXT("Elem %d %d: %f ", i, j, v);
debugPrintfEXT("Weight %d %d: %f ", i, j, weight_data[i*psc(outw)*kernel_w+j]);
//afpvec4 test = buffer_ld4(weight_data, i*psc(outw)+j);
//debugPrintfEXT(" Weight %d %d: %v4f ", i, j, test);
}
debugPrintfEXT("\n");
}
}
*/
void main()
{
int gx = int(gl_GlobalInvocationID.x) * 2;
int gy = int(gl_GlobalInvocationID.y) * 2;
int gz = int(gl_GlobalInvocationID.z) * 2;
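// each invocation computes two output positions (gx, gx+1) for two pack4 output
// rows (gy, gy+1), i.e. 2 positions x 8 output channels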
//print_bottblob();
//print_weight();
if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
return;
const ivec2 gx2 = gx + ivec2(0, 1);
const ivec2 gy2 = gy + ivec2(0, 1);
const ivec2 gy4 = gy*4 + ivec2(0, 4);
const ivec2 gz2 = gz + ivec2(0, 1);
afpvec4 sum0 = afpvec4(0.0f);
afpvec4 sum1 = afpvec4(0.0f);
afpvec4 sum2 = afpvec4(0.0f);
afpvec4 sum3 = afpvec4(0.0f);
afpvec4 sum4 = afpvec4(0.0f);
afpvec4 sum5 = afpvec4(0.0f);
afpvec4 sum6 = afpvec4(0.0f);
afpvec4 sum7 = afpvec4(0.0f);
afpvec4 sum8 = afpvec4(0.0f);
afpvec4 sum9 = afpvec4(0.0f);
afpvec4 sum10 = afpvec4(0.0f);
afpvec4 sum11 = afpvec4(0.0f);
afpvec4 sum12 = afpvec4(0.0f);
afpvec4 sum13 = afpvec4(0.0f);
afpvec4 sum14 = afpvec4(0.0f);
afpvec4 sum15 = afpvec4(0.0f);
afpvec4 sum16 = afpvec4(0.0f);
afpvec4 sum17 = afpvec4(0.0f);
afpvec4 sum18 = afpvec4(0.0f);
afpvec4 sum19 = afpvec4(0.0f);
if (bias_term == 1)
{
#if NCNN_image_shader
//sum = image2d_ld1(bias_blob, ivec2(gx, 0)); // image shader path not implemented yet
#else
sum4 = buffer_ld4(bias_data, gy2.x);
sum5 = sum4;
sum14 = buffer_ld4(bias_data, gy2.y);
sum15 = sum14;
#endif
}
#if NCNN_image_shader
//
#else
ivec4 gy4_0 = gy4.x + ivec4(0, 1, 2, 3);
ivec4 gy4_1 = gy4.y + ivec4(0, 1, 2, 3);
ivec4 w_offsetv4_0;
ivec4 w_offsetv4_1;
w_offsetv4_0 = kernel_w * psc(h) * 4 * gy4_0;
w_offsetv4_1 = kernel_w * psc(h) * 4 * gy4_1;
for (int iny = 0; iny < psc(h); iny++)
{
ivec2 v_offsetv = iny * psc(w) + gx2 * stride_w;
for (int x = 0; x < kernel_w; x++)
{
afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offsetv.x + x * dilation_w);
afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offsetv.y + x * dilation_w);
afp k0 = buffer_ld1(weight_data, (w_offsetv4_0.x + x) + kernel_w * 0); // Load the weight value
afp k1 = buffer_ld1(weight_data, (w_offsetv4_0.x + x) + kernel_w * 1); // Load the weight value
afp k2 = buffer_ld1(weight_data, (w_offsetv4_0.x + x) + kernel_w * 2); // Load the weight value
afp k3 = buffer_ld1(weight_data, (w_offsetv4_0.x + x) + kernel_w * 3); // Load the weight value
afp k4 = buffer_ld1(weight_data, (w_offsetv4_0.y + x) + kernel_w * 0); // Load the weight value
afp k5 = buffer_ld1(weight_data, (w_offsetv4_0.y + x) + kernel_w * 1); // Load the weight value
afp k6 = buffer_ld1(weight_data, (w_offsetv4_0.y + x) + kernel_w * 2); // Load the weight value
afp k7 = buffer_ld1(weight_data, (w_offsetv4_0.y + x) + kernel_w * 3); // Load the weight value
afp k8 = buffer_ld1(weight_data, (w_offsetv4_0.z + x) + kernel_w * 0); // Load the weight value
afp k9 = buffer_ld1(weight_data, (w_offsetv4_0.z + x) + kernel_w * 1); // Load the weight value
afp k10 = buffer_ld1(weight_data, (w_offsetv4_0.z + x) + kernel_w * 2); // Load the weight value
afp k11 = buffer_ld1(weight_data, (w_offsetv4_0.z + x) + kernel_w * 3); // Load the weight value
afp k12 = buffer_ld1(weight_data, (w_offsetv4_0.w + x) + kernel_w * 0); // Load the weight value
afp k13 = buffer_ld1(weight_data, (w_offsetv4_0.w + x) + kernel_w * 1); // Load the weight value
afp k14 = buffer_ld1(weight_data, (w_offsetv4_0.w + x) + kernel_w * 2); // Load the weight value
afp k15 = buffer_ld1(weight_data, (w_offsetv4_0.w + x) + kernel_w * 3); // Load the weight value
afp k16 = buffer_ld1(weight_data, (w_offsetv4_1.x + x) + kernel_w * 0); // Load the weight value
afp k17 = buffer_ld1(weight_data, (w_offsetv4_1.x + x) + kernel_w * 1); // Load the weight value
afp k18 = buffer_ld1(weight_data, (w_offsetv4_1.x + x) + kernel_w * 2); // Load the weight value
afp k19 = buffer_ld1(weight_data, (w_offsetv4_1.x + x) + kernel_w * 3); // Load the weight value
afp k20 = buffer_ld1(weight_data, (w_offsetv4_1.y + x) + kernel_w * 0); // Load the weight value
afp k21 = buffer_ld1(weight_data, (w_offsetv4_1.y + x) + kernel_w * 1); // Load the weight value
afp k22 = buffer_ld1(weight_data, (w_offsetv4_1.y + x) + kernel_w * 2); // Load the weight value
afp k23 = buffer_ld1(weight_data, (w_offsetv4_1.y + x) + kernel_w * 3); // Load the weight value
afp k24 = buffer_ld1(weight_data, (w_offsetv4_1.z + x) + kernel_w * 0); // Load the weight value
afp k25 = buffer_ld1(weight_data, (w_offsetv4_1.z + x) + kernel_w * 1); // Load the weight value
afp k26 = buffer_ld1(weight_data, (w_offsetv4_1.z + x) + kernel_w * 2); // Load the weight value
afp k27 = buffer_ld1(weight_data, (w_offsetv4_1.z + x) + kernel_w * 3); // Load the weight value
afp k28 = buffer_ld1(weight_data, (w_offsetv4_1.w + x) + kernel_w * 0); // Load the weight value
afp k29 = buffer_ld1(weight_data, (w_offsetv4_1.w + x) + kernel_w * 1); // Load the weight value
afp k30 = buffer_ld1(weight_data, (w_offsetv4_1.w + x) + kernel_w * 2); // Load the weight value
afp k31 = buffer_ld1(weight_data, (w_offsetv4_1.w + x) + kernel_w * 3); // Load the weight value
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
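// NOTE: this fp16 branch appears to be leftover from the pack4 convolution shader and
// is not functional here (k0/k1 collide with the scalar loads above); fp32 only for now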
afpmat4 k0 = afpmat4(
buffer_ld4(weight_data, (w_offsetv.x + x) * 4 + 0),
buffer_ld4(weight_data, (w_offsetv.x + x) * 4 + 1),
buffer_ld4(weight_data, (w_offsetv.x + x) * 4 + 2),
buffer_ld4(weight_data, (w_offsetv.x + x) * 4 + 3)
);
afpmat4 k1 = afpmat4(
buffer_ld4(weight_data, (w_offsetv.y + x) * 4 + 0),
buffer_ld4(weight_data, (w_offsetv.y + x) * 4 + 1),
buffer_ld4(weight_data, (w_offsetv.y + x) * 4 + 2),
buffer_ld4(weight_data, (w_offsetv.y + x) * 4 + 3)
);
#else
#endif
//debugPrintfEXT(" k0, k1, k2, k3 %f, %f, %f, %f \n", k0, k1, k2, k3);
//debugPrintfEXT(" k4, k5, k6, k7 %f, %f, %f, %f \n", k4, k5, k6, k7);
sum0 += v0 * afpvec4(k0, k1, k2, k3); //* k0;
sum1 += v1 * afpvec4(k0, k1, k2, k3); //* k0;
sum2 += v0 * afpvec4(k4, k5, k6, k7); //* k1;
sum3 += v1 * afpvec4(k4, k5, k6, k7); //* k1;
sum6 += v0 * afpvec4(k8, k9, k10, k11); //* k0;
sum7 += v1 * afpvec4(k8, k9, k10, k11); //* k0;
sum8 += v0 * afpvec4(k12, k13, k14, k15); //* k1;
sum9 += v1 * afpvec4(k12, k13, k14, k15); //* k1;
sum10 += v0 * afpvec4(k16, k17, k18, k19); //* k0;
sum11 += v1 * afpvec4(k16, k17, k18, k19); //* k0;
sum12 += v0 * afpvec4(k20, k21, k22, k23); //* k1;
sum13 += v1 * afpvec4(k20, k21, k22, k23); //* k1;
sum16 += v0 * afpvec4(k24, k25, k26, k27); //* k0;
sum17 += v1 * afpvec4(k24, k25, k26, k27); //* k0;
sum18 += v0 * afpvec4(k28, k29, k30, k31); //* k1;
sum19 += v1 * afpvec4(k28, k29, k30, k31); //* k1;
}
w_offsetv4_0 += kernel_w*4;
w_offsetv4_1 += kernel_w*4;
}
sum4.x += sum0.x + sum0.y + sum0.z + sum0.w;
sum4.y += sum2.x + sum2.y + sum2.z + sum2.w;
sum4.z += sum6.x + sum6.y + sum6.z + sum6.w;
sum4.w += sum8.x + sum8.y + sum8.z + sum8.w;
sum5.x += sum1.x + sum1.y + sum1.z + sum1.w;
sum5.y += sum3.x + sum3.y + sum3.z + sum3.w;
sum5.z += sum7.x + sum7.y + sum7.z + sum7.w;
sum5.w += sum9.x + sum9.y + sum9.z + sum9.w;
sum14.x += sum10.x + sum10.y + sum10.z + sum10.w;
sum14.y += sum12.x + sum12.y + sum12.z + sum12.w;
sum14.z += sum16.x + sum16.y + sum16.z + sum16.w;
sum14.w += sum18.x + sum18.y + sum18.z + sum18.w;
sum15.x += sum11.x + sum11.y + sum11.z + sum11.w;
sum15.y += sum13.x + sum13.y + sum13.z + sum13.w;
sum15.z += sum17.x + sum17.y + sum17.z + sum17.w;
sum15.w += sum19.x + sum19.y + sum19.z + sum19.w;
#endif
sum4 = activation_afpvec4(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec4(sum5, activation_type, activation_param_0, activation_param_1);
sum14 = activation_afpvec4(sum14, activation_type, activation_param_0, activation_param_1);
sum15 = activation_afpvec4(sum15, activation_type, activation_param_0, activation_param_1);
#if NCNN_image_shader
image2d_st1(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);
image2d_st1(top_blob, ivec3(gx2.y, gy2.x, gz2.x), sum1);
image2d_st1(top_blob, ivec3(gx2.x, gy2.y, gz2.x), sum2);
image2d_st1(top_blob, ivec3(gx2.y, gy2.y, gz2.x), sum3);
#else
// guard each store separately so odd output sizes still get the last column / row
buffer_st4(top_blob_data, gy2.x * psc(outw) + gx2.x, sum4);
if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gy2.x * psc(outw) + gx2.y, sum5);
if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gy2.y * psc(outw) + gx2.x, sum14);
if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gy2.y * psc(outw) + gx2.y, sum15);
#endif
}
I have finished creating a working convolution1d_vulkan for fp32.
convolution1d.comp
Inference duration for this mel spectrogram: 5 seconds
output.mp4
vulkan conv1d #5060
hi @nihui
try disabling fp16
The following test prints the same result on CPU and GPU:
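For reference, a minimal sketch of turning the fp16 paths off on the net options before loading the model (these are the standard ncnn::Option fields):

```cpp
#include "net.h"

int main()
{
    ncnn::Net net;
    net.opt.use_vulkan_compute = true;

    // Disable the fp16 fast paths so the GPU computes in fp32,
    // which makes the result directly comparable to the CPU path.
    net.opt.use_fp16_packed = false;
    net.opt.use_fp16_storage = false;
    net.opt.use_fp16_arithmetic = false;

    // load_param / load_model and extraction as usual ...
    return 0;
}
```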
Hi @nihui, thank you for your work. Now ncnn is open to new directions such as sound synthesis, voice conversion, music synthesis and TTS.
I also found what the problem was: Convolution1D expects an input with dims=2, but I passed an ncnn::Mat with dims=3. Convolution1D with vulkan=false correctly treats a dims=3 ncnn::Mat as dims=2, but with vulkan=true it produces the wrong result,
so I was getting an erroneous result with vulkan=true because dims=3.
Now I have changed the code (a sketch of the kind of adjustment is shown after the output below).
Output:
and I get the correct result with Convolution1D vulkan=true.
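A minimal sketch of the kind of fix, assuming the dims=3 blob has a single channel so it can be collapsed to the (w, h) shape Convolution1D expects; the actual change in the code above may differ. ncnn::Mat::reshape is the standard API for this:

```cpp
#include "mat.h"

// Collapse a dims=3 blob with c == 1 into the dims=2 shape (w, h)
// that Convolution1D expects, before running the layer.
static ncnn::Mat conv1d_input(const ncnn::Mat& blob)
{
    if (blob.dims == 3 && blob.c == 1)
        return blob.reshape(blob.w, blob.h); // dims=2: w x h

    return blob;
}
```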
Simple question.
My model has many Convolution1D and Deconvolution1D layers. The execution time on CPU and Vulkan is about the same. I just wanted to know: does ncnn support Vulkan acceleration for Convolution1D and Deconvolution1D layers?
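For context, this is how Vulkan compute is requested on the net in my setup (standard ncnn API; the model file names are placeholders):

```cpp
#include "net.h"

int main()
{
    ncnn::Net net;
    net.opt.use_vulkan_compute = true; // request the Vulkan path for supported layers

    net.load_param("model.param"); // placeholder file names
    net.load_model("model.bin");

    // ... create an Extractor, feed the input blob, extract the output as usual
    return 0;
}
```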