From 3a5c61686eb12a99bf02a00413ca3193a01ef441 Mon Sep 17 00:00:00 2001 From: Sting <loic.allegre@ensiie.fr> Date: Mon, 24 Jul 2023 19:22:12 +0200 Subject: [PATCH] Option to select language for phoneme transcription --- autokara.py | 3 ++- autosyl/LyricsAlignment/utils.py | 8 ++++++-- autosyl/LyricsAlignment/wrapper.py | 8 ++++---- autosyl/segment.py | 5 +++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/autokara.py b/autokara.py index a84a051..8fe6f2e 100644 --- a/autokara.py +++ b/autokara.py @@ -15,6 +15,7 @@ parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time parser.add_argument("--vocals", action="store_true", help="Treat the input as vocals file, i.e. do not perform vocals extraction") parser.add_argument("-o", "--output", help="Write output to specified file. If absent, overwrite source file") parser.add_argument("-v","--verbose", action="store_true", help="Increased verbosity") +parser.add_argument("-l","--lang", default="jp", help="Select language to use (default is Japanese Romaji)") args = parser.parse_args() @@ -53,7 +54,7 @@ reference_syls, line_meta = getSyls(ass_file) if verbose: print("Starting syl detection...") -syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose) +syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose, language=args.lang) print(syls) print(line_meta) diff --git a/autosyl/LyricsAlignment/utils.py b/autosyl/LyricsAlignment/utils.py index 9b73869..ec8aab8 100644 --- a/autosyl/LyricsAlignment/utils.py +++ b/autosyl/LyricsAlignment/utils.py @@ -32,7 +32,7 @@ class G2p_Wrapper(): else: return self.transducer(word).output_string.split() -g2p = G2p_Wrapper(language="jp") +#g2p = G2p_Wrapper(language="jp") @@ -110,7 +110,11 @@ def load_lyrics(lyrics_file): def write_wav(path, audio, sr): soundfile.write(path, audio.T, sr, "PCM_16") -def gen_phone_gt(words, raw_lines): +def gen_phone_gt(words, raw_lines, language="jp"): + + print(f"Translating lyrics to phonemes, 
language chosen : {language:s}") + g2p = G2p_Wrapper(language=language) + # helper function def getsubidx(x, y): # find y in x diff --git a/autosyl/LyricsAlignment/wrapper.py b/autosyl/LyricsAlignment/wrapper.py index 120fcfe..825ec7c 100644 --- a/autosyl/LyricsAlignment/wrapper.py +++ b/autosyl/LyricsAlignment/wrapper.py @@ -10,10 +10,10 @@ from autosyl.LyricsAlignment.model import train_audio_transforms, AcousticModel, np.random.seed(7) -def preprocess_from_file(audio_file, lyrics_file, word_file=None): +def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp"): y, sr = preprocess_audio(audio_file) - words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file) + words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language) return y, words, lyrics_p, idx_word_p, idx_line_p @@ -143,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050): return y, curr_sr -def preprocess_lyrics(lyrics_lines, word_file=None): +def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"): from string import ascii_lowercase d = {ascii_lowercase[i]: i for i in range(26)} d["'"] = 26 @@ -166,7 +166,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None): else: words_lines = full_lyrics.split() - lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines) + lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=language) return words_lines, lyrics_p, idx_word_p, idx_line_p diff --git a/autosyl/segment.py b/autosyl/segment.py index e9c9c22..adacae0 100644 --- a/autosyl/segment.py +++ b/autosyl/segment.py @@ -11,7 +11,7 @@ from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file -def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False): +def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False, language="jp"): delay = -4 backtrack = False @@ 
-23,6 +23,7 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, v method = "MTL_BDR" # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" cuda=False # set True if you have access to a GPU checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints" + language = language lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls] @@ -36,7 +37,7 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, v # lyrics_p: phoneme sequence of the target lyrics # idx_word_p: indices of word start in lyrics_p # idx_line_p: indices of line start in lyrics_p - audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file) + audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language) if verbose: print(lyrics_p) -- GitLab