From 3a5c61686eb12a99bf02a00413ca3193a01ef441 Mon Sep 17 00:00:00 2001
From: Sting <loic.allegre@ensiie.fr>
Date: Mon, 24 Jul 2023 19:22:12 +0200
Subject: [PATCH] Add option to select language for phoneme transcription

---
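Note: the new language option is threaded from the command line (--lang) down to
G2p_Wrapper through segment() -> preprocess_from_file() -> preprocess_lyrics() ->
gen_phone_gt(). A minimal sketch of calling segment() with the new keyword; the
file name, timing values, and the (timing, text) shape of reference_syls are
assumptions (the diff only shows that syl[1] holds the syllable text):

    from autosyl.segment import segment

    # one inner list per lyrics line; the second element of each pair is the
    # syllable text read by segment(), the first is assumed timing metadata
    reference_syls = [[(0, "ko"), (0, "n"), (0, "ni"), (0, "chi"), (0, "wa")]]

    syls = segment("vocals.wav",                  # placeholder: isolated vocals track
                   reference_syls=reference_syls,
                   verbose=True,
                   language="jp")                 # "jp" selects the Japanese Romaji g2p
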
 autokara.py                        | 3 ++-
 autosyl/LyricsAlignment/utils.py   | 8 ++++++--
 autosyl/LyricsAlignment/wrapper.py | 8 ++++----
 autosyl/segment.py                 | 5 +++--
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/autokara.py b/autokara.py
index a84a051..8fe6f2e 100644
--- a/autokara.py
+++ b/autokara.py
@@ -15,6 +15,7 @@ parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time
 parser.add_argument("--vocals", action="store_true", help="Treat the input as vocals file, i.e. do not perform vocals extraction")
 parser.add_argument("-o", "--output", help="Write output to specified file. If absent, overwrite source file")
 parser.add_argument("-v","--verbose", action="store_true", help="Increased verbosity")
+parser.add_argument("-l","--lang", help="Select language to use (default is Japanese Romaji)")
 
 args = parser.parse_args()
 
@@ -53,7 +54,7 @@ reference_syls, line_meta = getSyls(ass_file)
 
 if verbose:
     print("Starting syl detection...")
-syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose)
+syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose, language=args.lang)
 print(syls)
 print(line_meta)
 
diff --git a/autosyl/LyricsAlignment/utils.py b/autosyl/LyricsAlignment/utils.py
index 9b73869..ec8aab8 100644
--- a/autosyl/LyricsAlignment/utils.py
+++ b/autosyl/LyricsAlignment/utils.py
@@ -32,7 +32,7 @@ class G2p_Wrapper():
         else:
             return self.transducer(word).output_string.split()
 
-g2p = G2p_Wrapper(language="jp")
+# G2p_Wrapper is instantiated in gen_phone_gt(), once the requested language is known
 
 
 
@@ -110,7 +110,11 @@ def load_lyrics(lyrics_file):
 def write_wav(path, audio, sr):
     soundfile.write(path, audio.T, sr, "PCM_16")
 
-def gen_phone_gt(words, raw_lines):
+def gen_phone_gt(words, raw_lines, language="jp"):
+
+    print(f"Translating lyrics to phonemes, language chosen : {language:s}")
+    g2p = G2p_Wrapper(language=language)
+
 
     # helper function
     def getsubidx(x, y):  # find y in x
diff --git a/autosyl/LyricsAlignment/wrapper.py b/autosyl/LyricsAlignment/wrapper.py
index 120fcfe..825ec7c 100644
--- a/autosyl/LyricsAlignment/wrapper.py
+++ b/autosyl/LyricsAlignment/wrapper.py
@@ -10,10 +10,10 @@ from autosyl.LyricsAlignment.model import train_audio_transforms, AcousticModel,
 
 np.random.seed(7)
 
-def preprocess_from_file(audio_file, lyrics_file, word_file=None):
+def preprocess_from_file(audio_file, lyrics_file, word_file=None, language="jp"):
     y, sr = preprocess_audio(audio_file)
 
-    words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file)
+    words, lyrics_p, idx_word_p, idx_line_p = preprocess_lyrics(lyrics_file, word_file, language=language)
 
     return y, words, lyrics_p, idx_word_p, idx_line_p
 
@@ -143,7 +143,7 @@ def preprocess_audio(audio_file, sr=22050):
 
     return y, curr_sr
 
-def preprocess_lyrics(lyrics_lines, word_file=None):
+def preprocess_lyrics(lyrics_lines, word_file=None, language="jp"):
     from string import ascii_lowercase
     d = {ascii_lowercase[i]: i for i in range(26)}
     d["'"] = 26
@@ -166,7 +166,7 @@ def preprocess_lyrics(lyrics_lines, word_file=None):
     else:
         words_lines = full_lyrics.split()
 
-    lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines)
+    lyrics_p, words_p, idx_word_p, idx_line_p = utils.gen_phone_gt(words_lines, raw_lines, language=language)
 
     return words_lines, lyrics_p, idx_word_p, idx_line_p
 
diff --git a/autosyl/segment.py b/autosyl/segment.py
index e9c9c22..adacae0 100644
--- a/autosyl/segment.py
+++ b/autosyl/segment.py
@@ -11,7 +11,7 @@ from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
 
 
 
-def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False):
+def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False, language="jp"):
 
     delay = -4
     backtrack = False
@@ -23,6 +23,7 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, v
     method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
     cuda=False                                 # set True if you have access to a GPU
     checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
+    # language ("jp" = Japanese Romaji by default) is forwarded to preprocess_from_file below
 
 
     lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
@@ -36,7 +37,7 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, v
     # lyrics_p:     phoneme sequence of the target lyrics
     # idx_word_p:   indices of word start in lyrics_p
     # idx_line_p:   indices of line start in lyrics_p
-    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file, language)
     if verbose:
         print(lyrics_p)
 
-- 
GitLab