Add verbose option

eaaa83d0 · Sting · 698ac7e7 · eaaa83d0 · eaaa83d0
--- a/autokara.py
+++ b/autokara.py
@@ -14,10 +14,12 @@ parser.add_argument("source_file", type=str, help="The video/audio file to time"
 parser.add_argument("ass_file", type=str, help="The ASS file with lyrics to time")
 parser.add_argument("--vocals", action="store_true", help="Treat the input as vocals file, i.e. do not perform vocals extraction")
 parser.add_argument("-o", "--output", help="Write output to specified file. If absent, overwrite source file")
+parser.add_argument("-v","--verbose", action="store_true", help="Increased verbosity")
 args = parser.parse_args()
 ass_file = args.ass_file
+verbose = args.verbose
 if not args.vocals :
    print("Extracting audio from video file...")
@@ -45,8 +47,13 @@ else:
 print("Identifying syl starts...")
+if verbose:
+    print("Retrieving syls from lyrics...")
 reference_syls, line_meta = getSyls(ass_file)
-syls = segment(vocals_file, reference_syls=reference_syls)
+if verbose:
+    print("Starting syl detection...")
+syls = segment(vocals_file, reference_syls=reference_syls, verbose=verbose)
 print(syls)
 print(line_meta)

--- a/autosyl/segment.py
+++ b/autosyl/segment.py
@@ -11,7 +11,7 @@ from autosyl.LyricsAlignment.wrapper import align, preprocess_from_file
-def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
+def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500, verbose=False):
    delay = -4
    backtrack = False
@@ -21,16 +21,16 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
    audio_file = songfile                      # pre-computed source-separated vocals; These models do not work with mixture input.
    word_file = None                           # example: jamendolyrics/lyrics/*.words.txt"; Set to None if you don't have it
    method = "MTL_BDR"                             # "Baseline", "MTL", "Baseline_BDR", "MTL_BDR"
-    cuda=True                                 # set True if you have access to a GPU
+    cuda=False                                 # set True if you have access to a GPU
    checkpoint_folder = "./autosyl/LyricsAlignment/checkpoints"
-    pred_file = "./MTL.csv"                    # saved alignment results, "(float) start_time, (float) end_time, (string) word"
    lyrics_lines = [" ".join([syl[1] for syl in line]) for line in reference_syls]
    #print(lyrics_lines)
+    if verbose:
+        print("Preprocessing audio and lyrics...")
    # load audio and lyrics
    # words:        a list of words
    # lyrics_p:     phoneme sequence of the target lyrics
@@ -38,10 +38,12 @@ def segment(songfile, reference_syls=None, syls_per_line=10, last_syl_dur=500):
    # idx_line_p:   indices of line start in lyrics_p
    audio, words, lyrics_p, idx_word_p, idx_line_p = preprocess_from_file(audio_file, lyrics_lines, word_file)
+    if verbose:
+        print("Computing alignment...")
    # compute alignment
    # word_align:   a list of frame indices aligned to each word
    # words:        a list of words
-    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=False, checkpoint_folder=checkpoint_folder)
+    word_align, words = align(audio, words, lyrics_p, idx_word_p, idx_line_p, method=method, cuda=cuda, checkpoint_folder=checkpoint_folder)
    words_onsets = np.array([word_align[i][0] for i in range(len(word_align))])