From eaaa83d011b6a881854d13a3e2bb9bcd68d481c9 Mon Sep 17 00:00:00 2001 From: Sting <loic.allegre@ensiie.fr> Date: Mon, 24 Jul 2023 12:33:15 +0200 Subject: [PATCH] Add verbose option --- autokara.py | 9 ++++++++- autosyl/segment.py | 12 +++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/autokara.py b/autokara.py index c77abb4..a84a051 100644 --- a/autokara.py +++ b/autokara.py @@ -14,10 +14,12 @@ parser.add_argument("source_file", type=str, help="The video/audio file to time" parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time") parser.add_argument("--vocals", action="store_true", help="Treat the input as vocals file, i.e. do not perform vocals extraction") parser.add_argument("-o", "--output", help="Write output to specified file. If absent, overwrite source file") +parser.add_argument("-v","--verbose", action="store_true", help="Increased verbosity") args = parser.parse_args() ass_file = args.ass_file +verbose = args.verbose if not args.vocals : print("Extracting audio from video file...") @@ -45,8 +47,13 @@ else: print("Identifying syl starts...") +if verbose: + print("Retrieving syls from lyrics...") reference_syls, line_meta = getSyls(ass_file) -syls = segment(vocals_file, reference_syls=reference_syls) + +if verbose: + print("Starting syl detection...") +syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose) print(syls) print(line_meta) diff --git a/autosyl/segment.py b/autosyl/segment.py index cc8e3fe..cc1ae1e 100644 --- a/autosyl/segment.py +++ b/autosyl/segment.py @@ -11,7 +11,7 @@ from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file -def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500): +def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False): delay = -4 backtrack = False @@ -21,16 +21,16 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500): audio_file = songfile # 
pre-computed source-separated vocals; These models do not work with mixture input. word_file = None # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it method = "MTL_BDR" # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR" - cuda=True # set True if you have access to a GPU + cuda=False # set True if you have access to a GPU checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints" - pred_file = "./MTL.csv" # saved alignment results, "(float) start_time, (float) end_time, (string) word" - lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls] #print(lyrics_lines) + if verbose: + print("Preprocessing audio and lyrics...") # load audio and lyrics # words: a list of words # lyrics_p: phoneme sequence of the target lyrics @@ -38,10 +38,12 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500): # idx_line_p: indices of line start in lyrics_p audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file) + if verbose: + print("Computing word alignment...") # compute alignment # word_align: a list of frame indices aligned to each word # words: a list of words - word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=False, checkpoint_folder=checkpoint_folder) + word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder) words_onsets = np.array([word_align[i][0] for i in range(len(word_align))]) -- GitLab