Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[TTS]Add slim for TTS #2729

Merged
merged 5 commits into from
Dec 9, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/csmsc/tts2/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/tts2/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,8 @@ fi
# Stage 8: executed when stage <= 8 <= stop_stage.
# Runs ./local/lite_predict.sh with CUDA_VISIBLE_DEVICES set to ${gpus};
# a non-zero exit status of that script aborts the whole recipe.
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_static
# Stage 9: executed when stage <= 9 <= stop_stage.
# Passes ${train_output_path} and the model name speedyspeech_csmsc to
# ./local/PTQ_static.sh (GPUs restricted to ${gpus}); a failure of that
# script aborts the whole recipe.
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi
8 changes: 8 additions & 0 deletions examples/csmsc/tts3/local/PTQ_dynamic.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Run PTQ_dynamic.py (post-training dynamic quantization) on an exported
# inference model.
#
# Usage: PTQ_dynamic.sh <train_output_path> <model_name> [weight_bits]
#   train_output_path  directory whose inference/ sub-directory is passed as
#                      --inference_dir
#   model_name         forwarded unchanged as --model_name
#   weight_bits        forwarded as --weight_bits; defaults to 8 when omitted
#
# BIN_DIR must already be set in the environment by the caller.
train_output_path=$1
model_name=$2
# Fall back to 8 bits so a missing third argument does not produce a bare
# "--weight_bits" with no value.
weight_bits=${3:-8}

# Expansions are quoted so paths containing spaces stay a single argument.
python3 "${BIN_DIR}/../PTQ_dynamic.py" \
    --inference_dir "${train_output_path}/inference" \
    --model_name "${model_name}" \
    --weight_bits "${weight_bits}"
8 changes: 8 additions & 0 deletions examples/csmsc/tts3/local/PTQ_static.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Run PTQ_static.py (post-training static quantization) on an exported
# acoustic-model inference model.
#
# Usage: PTQ_static.sh <train_output_path> <model_name>
#   train_output_path  directory whose inference/ sub-directory is passed as
#                      --inference_dir
#   model_name         forwarded unchanged as --model_name
#
# BIN_DIR must already be set in the environment by the caller, and the
# script must be started from a directory that contains
# dump/dev/norm/metadata.jsonl.
train_output_path=$1
model_name=$2

# Expansions are quoted so paths containing spaces stay a single argument.
# NOTE: the option is spelled --onnx_format (it was previously misspelled
# as --onnx_forma).
python3 "${BIN_DIR}/../PTQ_static.py" \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --inference_dir "${train_output_path}/inference" \
    --model_name "${model_name}" \
    --onnx_format=True
13 changes: 13 additions & 0 deletions examples/csmsc/tts3/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,16 @@ fi
# Stage 8: executed when stage <= 8 <= stop_stage.
# Runs ./local/lite_predict.sh with CUDA_VISIBLE_DEVICES set to ${gpus};
# a non-zero exit status of that script aborts the whole recipe.
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_dynamic
# Stage 9: executed when stage <= 9 <= stop_stage.
# Calls ./local/PTQ_dynamic.sh with ${train_output_path}, the model name and
# the weight bit-width (8). Like every other stage of this recipe, a failure
# now aborts the script instead of silently continuing with the next stage.
# The commented-out lines are the equivalent calls for the vocoder models.
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
    ./local/PTQ_dynamic.sh ${train_output_path} fastspeech2_csmsc 8 || exit -1
    # ./local/PTQ_dynamic.sh ${train_output_path} pwgan_csmsc 8
    # ./local/PTQ_dynamic.sh ${train_output_path} mb_melgan_csmsc 8
    # ./local/PTQ_dynamic.sh ${train_output_path} hifigan_csmsc 8
fi

# PTQ_static
# Stage 10: executed when stage <= 10 <= stop_stage.
# Passes ${train_output_path} and the model name fastspeech2_csmsc to
# ./local/PTQ_static.sh (GPUs restricted to ${gpus}); a failure of that
# script aborts the whole recipe.
if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi
8 changes: 8 additions & 0 deletions examples/csmsc/voc1/local/PTQ_static.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Run PTQ_static.py (post-training static quantization) on an exported
# vocoder inference model.
#
# Usage: PTQ_static.sh <train_output_path> <model_name>
#   train_output_path  directory whose inference/ sub-directory is passed as
#                      --inference_dir
#   model_name         forwarded unchanged as --model_name
#
# BIN_DIR must already be set in the environment by the caller, and the
# script must be started from a directory that contains
# dump/dev/norm/metadata.jsonl.
train_output_path=$1
model_name=$2

# Expansions are quoted so paths containing spaces stay a single argument.
python3 "${BIN_DIR}/../../PTQ_static.py" \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --inference_dir "${train_output_path}/inference" \
    --model_name "${model_name}" \
    --onnx_format=True
5 changes: 5 additions & 0 deletions examples/csmsc/voc1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
# Stage 3: executed when stage <= 3 <= stop_stage.
# Passes ${train_output_path} and the model name pwgan_csmsc to
# ./local/PTQ_static.sh (GPUs restricted to ${gpus}); a failure of that
# script aborts the whole recipe.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi
1 change: 1 addition & 0 deletions examples/csmsc/voc3/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/voc3/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
# Stage 3: executed when stage <= 3 <= stop_stage.
# Passes ${train_output_path} and the model name mb_melgan_csmsc to
# ./local/PTQ_static.sh (GPUs restricted to ${gpus}); a failure of that
# script aborts the whole recipe.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} mb_melgan_csmsc || exit -1
fi
1 change: 1 addition & 0 deletions examples/csmsc/voc5/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/voc5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
# Stage 3: executed when stage <= 3 <= stop_stage.
# Passes ${train_output_path} and the model name hifigan_csmsc to
# ./local/PTQ_static.sh (GPUs restricted to ${gpus}); a failure of that
# script aborts the whole recipe.
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_csmsc || exit -1
fi
67 changes: 67 additions & 0 deletions paddlespeech/t2s/datasets/am_batch_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,3 +538,70 @@ def vits_multi_spk_batch_fn(examples):
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch


# for PaddleSlim
def fastspeech2_single_spk_batch_fn_static(examples):
    """Collate samples into the feed dict of a single-speaker FastSpeech2 model.

    Every sample's "text" entry is converted to an int64 array and the
    results are stacked; only the first row is kept, so the returned array
    carries no batch axis.

    Args:
        examples (list): samples, each a mapping with a "text" entry.

    Returns:
        dict: {"text": int64 array taken from the first sample}.
    """
    stacked_text = np.array(
        [np.array(sample["text"], dtype=np.int64) for sample in examples])
    # the inference input has no batch axis, so hand back the first row only
    return {"text": stacked_text[0]}


def fastspeech2_multi_spk_batch_fn_static(examples):
    """Collate samples into the feed dict of a multi-speaker FastSpeech2 model.

    Each field is converted per sample, stacked, and reduced to its first
    row, so the returned values carry no batch axis. The optional fields
    "spk_id" and "spk_emb" are included only when the first sample has them.

    Args:
        examples (list): samples, each a mapping with a "text" entry and
            optionally "spk_id" and/or "spk_emb".

    Returns:
        dict: "text" (int64), plus "spk_id" (int64) and "spk_emb" (float32)
            when present, all taken from the first sample.
    """
    text = np.array(
        [np.array(item["text"], dtype=np.int64) for item in examples])
    # do not need batch axis in infer
    batch = {"text": text[0]}
    if "spk_id" in examples[0]:
        spk_id = np.array(
            [np.array(item["spk_id"], dtype=np.int64) for item in examples])
        batch["spk_id"] = spk_id[0]
    if "spk_emb" in examples[0]:
        spk_emb = np.array(
            [np.array(item["spk_emb"], dtype=np.float32) for item in examples])
        # Keep the first embedding itself. It must not be used as an index
        # into spk_id, which may not even exist for this sample.
        batch["spk_emb"] = spk_emb[0]
    return batch


def speedyspeech_single_spk_batch_fn_static(examples):
    """Collate samples into the feed dict of a single-speaker SpeedySpeech model.

    The "phones" and "tones" entries of every sample are converted to int64
    arrays and stacked; only the first row of each stack is returned, so the
    values carry no batch axis.

    Args:
        examples (list): samples, each a mapping with "phones" and "tones".

    Returns:
        dict: {"phones": ..., "tones": ...} taken from the first sample.
    """
    fields = ("phones", "tones")
    # convert every sample first, then stack, then index (same order for both fields)
    per_field = {
        name: [np.array(sample[name], dtype=np.int64) for sample in examples]
        for name in fields
    }
    stacked = {name: np.array(seqs) for name, seqs in per_field.items()}
    # the inference input has no batch axis, so keep row 0 of each field
    return {name: rows[0] for name, rows in stacked.items()}


def speedyspeech_multi_spk_batch_fn_static(examples):
    """Collate samples into the feed dict of a multi-speaker SpeedySpeech model.

    "phones" and "tones" (and "spk_id" when the first sample provides it)
    are converted to int64 per sample and stacked; only the first row of
    each stack is returned, so the values carry no batch axis.

    Args:
        examples (list): samples, each a mapping with "phones" and "tones"
            and optionally "spk_id".

    Returns:
        dict: "phones" and "tones", plus "spk_id" when present.
    """

    def to_int64(key):
        # one int64 array per sample for the requested field
        return [np.array(sample[key], dtype=np.int64) for sample in examples]

    phone_seqs = to_int64("phones")
    tone_seqs = to_int64("tones")
    phone_rows = np.array(phone_seqs)
    tone_rows = np.array(tone_seqs)
    # the inference input has no batch axis, so keep row 0 of each field
    feed = {"phones": phone_rows[0], "tones": tone_rows[0]}
    if "spk_id" not in examples[0]:
        return feed
    speaker_rows = np.array(to_int64("spk_id"))
    feed["spk_id"] = speaker_rows[0]
    return feed
55 changes: 47 additions & 8 deletions paddlespeech/t2s/datasets/vocoder_batch_fn.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,12 @@ def __call__(self, batch):
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).

Returns:
Returns:
Tensor:
Target signal batch (B, 1, T).
Tensor:
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Tensor:
Target signal batch (B, 1, T).

"""
# check length
batch = [
Expand Down Expand Up @@ -106,11 +105,7 @@ def _adjust_length(self, x, c):
if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size:
# print(
# f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
# )
x = x[:c.shape[0] * self.hop_size]

# check the length is valid
assert len(x) == c.shape[
0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
Expand Down Expand Up @@ -218,3 +213,47 @@ def __call__(self, batch):
y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)

return x, y, mels


# for paddleslim


class Clip_static(Clip):
    """Collate functor that produces one randomly cut feature clip.

    Variant of Clip used for PaddleSlim: instead of batched tensors it
    returns a dict holding a single clip without a batch axis.
    """

    def __call__(self, batch):
        """Cut a random window from every usable sample and return the first.

        Args:
            batch (list): samples providing 'wave' and 'feats' entries.
                Audio shape (T, ), features shape (T', C).

        Returns:
            Dict[str, np.array]:
                {"logmel": clip}, where clip is the first randomly cut
                feature window; its axes stay (T', C) and no batch axis
                is present.
        """
        # length-adjust only the samples that are longer than the threshold
        adjusted = [
            self._adjust_length(sample['wave'], sample['feats'])
            for sample in batch
            if sample['feats'].shape[0] > self.mel_threshold
        ]
        # only the features are needed here; the waveforms are not used
        feats = [pair[1] for pair in adjusted]

        # draw one random start frame per remaining sample
        frame_counts = [feat.shape[0] for feat in feats]
        start_frames = np.array([
            np.random.randint(self.start_offset, count + self.end_offset)
            for count in frame_counts
        ])

        # widen every window by the auxiliary context on both sides
        window_begins = start_frames - self.aux_context_window
        window_ends = (
            start_frames + self.batch_max_frames + self.aux_context_window)
        clips = np.stack([
            feat[begin:end]
            for feat, begin, end in zip(feats, window_begins, window_ends)
        ])
        # The axes are left as (T', C): unlike the training collate there is
        # no transpose to (B, C, T'), and the batch axis is dropped by
        # keeping the first clip only.
        return {"logmel": clips[0]}
80 changes: 80 additions & 0 deletions paddlespeech/t2s/exps/PTQ_dynamic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import paddle
from paddleslim.quant import quant_post_dynamic


def parse_args():
    """Parse the command-line options of the dynamic quantization script.

    Options that the parser does not know are ignored rather than rejected.

    Returns:
        argparse.Namespace: with the attributes ``model_name``,
            ``inference_dir`` and ``weight_bits``.
    """
    # the accepted model names, acoustic models first, then vocoders
    acoustic_models = [
        'speedyspeech_csmsc',
        'fastspeech2_csmsc',
        'fastspeech2_aishell3',
        'fastspeech2_ljspeech',
        'fastspeech2_vctk',
        'tacotron2_csmsc',
        'fastspeech2_mix',
    ]
    vocoders = [
        'pwgan_csmsc',
        'pwgan_aishell3',
        'pwgan_ljspeech',
        'pwgan_vctk',
        'mb_melgan_csmsc',
        'hifigan_csmsc',
        'hifigan_aishell3',
        'hifigan_ljspeech',
        'hifigan_vctk',
        'wavernn_csmsc',
    ]

    cli = argparse.ArgumentParser(
        description="Paddle Slim Dynamic with acoustic model & vocoder.")
    cli.add_argument(
        '--model_name',
        type=str,
        default='fastspeech2_csmsc',
        choices=acoustic_models + vocoders,
        help='Choose model type of tts task.')
    cli.add_argument(
        "--inference_dir", type=str, help="dir to save inference models")
    cli.add_argument(
        "--weight_bits",
        type=int,
        default=8,
        choices=[8, 16],
        help="The bits for the quantized weight, and it should be 8 or 16. "
        "Default is 8.")

    # unknown options are collected separately and dropped
    known, _unknown = cli.parse_known_args()
    return known


# only inference for models trained with csmsc now
def main():
    """Quantize the weights of an exported inference model with PaddleSlim.

    Reads ``<model_name>.pdmodel`` / ``<model_name>.pdiparams`` from the
    inference directory and writes the result next to them as
    ``<model_name>_<weight_bits>bits.pdmodel`` / ``.pdiparams``.
    """
    args = parse_args()
    # switch Paddle to static-graph mode before calling the quantizer
    paddle.enable_static()

    source_stem = args.model_name
    target_stem = f"{args.model_name}_{args.weight_bits}bits"
    quant_post_dynamic(
        model_dir=args.inference_dir,
        # the quantized files are written into the same directory
        save_model_dir=args.inference_dir,
        model_filename=f"{source_stem}.pdmodel",
        params_filename=f"{source_stem}.pdiparams",
        save_model_filename=f"{target_stem}.pdmodel",
        save_params_filename=f"{target_stem}.pdiparams",
        weight_bits=args.weight_bits, )


# Run the quantization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Loading