diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..9d6f1d0
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,6 @@
+# Git Attributes (https://git-scm.com/docs/gitattributes)
+# Default git attributes
+* text=auto
+
+# Overrides
+*.png -text
diff --git a/describealign.py b/describealign.py
index 9fdc621..717baea 100644
--- a/describealign.py
+++ b/describealign.py
@@ -1,1250 +1,1250 @@
-# combines videos with matching audio files (e.g. audio descriptions)
-# input: video or folder of videos and an audio file or folder of audio files
-# output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
-# this script aligns the new audio to the video using the video's old audio
-# first, the video's sound and the audio file are both converted to spectrograms
-# second, the two spectrograms are roughly aligned by finding their longest common subsequence
-# third, the rough alignment is denoised through L1-Minimization
-# fourth, the spectrogram alignments determine where the new audio replaces the old
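-# (when --stretch_audio is not set, the video is instead retimed losslessly to fit the new audio)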
-
-'''
-Copyright (C) 2023 Julian Brown
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
-'''
-
-# Nuitka build options:
-# nuitka-project-if: {OS} != "Windows":
-# nuitka-project: --enable-plugins=pyside2
-#
-# Compilation mode: standalone everywhere, except on macOS, where an app bundle is created.
-# nuitka-project-if: {OS} == "Darwin":
-# nuitka-project: --standalone
-# nuitka-project: --macos-create-app-bundle
-# Mac apparently needs onefile too, because the pyside2 plugin requires it.
-# All other platforms need it too, so set it universally.
-# nuitka-project: --onefile
-#
-# Debugging options, controlled via environment variable at compile time.
-# nuitka-project-if: os.getenv("DEBUG_COMPILATION", "no") == "yes":
-# nuitka-project: --enable-console
-# nuitka-project-else:
-# nuitka-project: --disable-console
-
-# Set app icon
-# nuitka-project-if: {OS} == "Windows":
-# nuitka-project: --windows-icon-from-ico=describealign.png
-# nuitka-project-else:
-# nuitka-project-if: {OS} == "Darwin":
-# nuitka-project: --macos-app-icon=describealign.png
-# nuitka-project-else:
-# nuitka-project: --linux-icon=describealign.png
-# End Nuitka build options
-
-VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob'])
-AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
-PLOT_ALIGNMENT_TO_FILE = True
-
-TIMESTEP_SIZE_SECONDS = .16
-TIMESTEP_OVERLAP_RATIO = .5
-AUDIO_SAMPLE_RATE = 44100
-MEL_COEFFS_PER_TIMESTEP = 25
-DITHER_PERIOD_STEPS = 60
-MIN_CORR_FOR_TOKEN_MATCH = .6
-GAP_START_COST = 1.0
-GAP_EXTEND_COST = -.01
-GAP_EXTEND_DIAG_BONUS = -.01
-SKIP_MATCH_COST = .1
-MAX_RATE_RATIO_DIFF_ALIGN = .1
-PREF_CUT_AT_GAPS_FACTOR = 5
-MIN_DURATION_TO_REPLACE_SECONDS = 2
-MIN_START_END_SYNC_TIME_SECONDS = 2
-MAX_START_END_SYNC_ERR_SECONDS = .2
-MAX_RATE_RATIO_DIFF_BOOST = .003
-MIN_DESC_DURATION = .5
-MAX_GAP_IN_DESC_SEC = 1.5
-JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
-CATCHUP_RATE = 5
-
-if PLOT_ALIGNMENT_TO_FILE:
- import matplotlib.pyplot as plt
-import argparse
-import os
-import glob
-import itertools
-import datetime
-import numpy as np
-import ffmpeg
-import static_ffmpeg
-import python_speech_features as psf
-import scipy.signal
-import scipy.optimize
-import scipy.interpolate
-import scipy.ndimage as nd
-import scipy.sparse
-import pytsmod
-import configparser
-import traceback
-import multiprocessing
-import platform
-
-IS_RUNNING_WINDOWS = platform.system() == 'Windows'
-if IS_RUNNING_WINDOWS:
- import PySimpleGUIWx as sg
- default_output_dir = 'videos_with_ad'
- default_alignment_dir = 'alignment_plots'
-else:
- import PySimpleGUIQt as sg
- default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
- default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
-
-def display(text, func=None):
- if func:
- func(text)
- print(text)
-
-def throw_runtime_error(text, func=None):
- if func:
- func(text)
- raise RuntimeError(text)
-
-def ensure_folders_exist(dirs, display_func=None):
- for dir in dirs:
- if not os.path.isdir(dir):
- display("Directory not found, creating it: " + dir, display_func)
- os.makedirs(dir)
-
-def get_sorted_filenames(path, extensions, alt_extensions=set([])):
- # path could be three different things: a file, a directory, a list of files
- if type(path) is list:
- files = [os.path.abspath(file) for file in path]
- for file in files:
- if not os.path.isfile(file):
- raise RuntimeError(f"No file found at input path:\n {file}")
- else:
- path = os.path.abspath(path)
- if os.path.isdir(path):
- files = glob.glob(glob.escape(path) + "/*")
- if len(files) == 0:
- raise RuntimeError(f"Empty input directory:\n {path}")
- else:
- if not os.path.isfile(path):
- raise RuntimeError(f"No file or directory found at input path:\n {path}")
- files = [path]
- files = [file for file in files if os.path.splitext(file)[1][1:] in extensions | alt_extensions]
- if len(files) == 0:
- error_msg = [f"No files with valid extensions found at input path:\n {path}",
- "Did you accidentally put the audio filepath before the video filepath?",
- "The video path should be the first positional input, audio second.",
- "Or maybe you need to add a new extension to this script's list of valid extensions?",
- f"valid extensions for this input are:\n {extensions}"]
- raise RuntimeError("\n".join(error_msg))
- files = sorted(files)
- file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
- return files, file_types
-
-# read audio from file with ffmpeg and convert to numpy array
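-# returns a float32 array of shape (2, num_samples): stereo samples at AUDIO_SAMPLE_RATE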
-def parse_audio_from_file(media_file):
- media_stream, _ = (ffmpeg
- .input(media_file)
- .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
- .run(capture_stdout=True, cmd=get_ffmpeg())
- )
- media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
- return media_arr
-
-# tokenize audio by transforming with a mel-frequency cepstrum (MFC)
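-# returns (media_spec, timings_seconds): media_spec holds MEL_COEFFS_PER_TIMESTEP
-# coefficients per timestep, timings_seconds gives each analysis window's center time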
-def tokenize_audio(media_arr, rate=1):
- step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
- window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
- window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
- fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
- get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
- samplerate=AUDIO_SAMPLE_RATE,
- winlen=window_size_seconds,
- winstep=TIMESTEP_SIZE_SECONDS * rate,
- numcep=MEL_COEFFS_PER_TIMESTEP,
- nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
- nfft=fft_size_samples,
- winfunc=scipy.signal.windows.hann)
- num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
- media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
- chunk_size = 1000
- for chunk_index in np.arange(0, num_timesteps, chunk_size):
- chunk_bounds_samples = ((chunk_index ) * step_size_samples,
- (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
- media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
- '''
- # alternate python library's MFC implementation
- import librosa
- media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
- sr=AUDIO_SAMPLE_RATE,
- n_mfcc=MEL_COEFFS_PER_TIMESTEP,
- lifter=22,
- n_fft=fft_size_samples,
- hop_length=step_size_samples,
- win_length=window_size_samples,
- window=scipy.signal.windows.hann).T
- num_timesteps = media_spec.shape[0]
- '''
- timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
- timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
- return media_spec, timings_seconds
-
-# same as tokenize_audio, but dithering the MFC window timings
-# this allows for finer alignment by ameliorating discretization error
-def tokenize_audio_dither(media_arr, slow_timings):
- # choose a relative step size slightly less than 1 to ameliorate quantization error
- # maximize alignment accuracy by using least approximable number with desired period
- # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
- fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
- fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
-
- # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
- # by approximately equalizing the number of tokens per unit time between dithered and undithered
- # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
- # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
- fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
- fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
- return fast_spec, fast_timings
-
-# normalize along both time and frequency axes to allow comparing tokens by correlation
-def normalize_spec(media_spec_raw, axes=(0,1)):
- media_spec = media_spec_raw.copy()
- for axis in axes:
- norm_func = np.std if axis == 0 else np.linalg.norm
- media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
- media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
- return media_spec
-
-# vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
-# modified to include affine gap penalties and skip+match options (i.e. knight's moves)
-# gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
-# or when the audio description includes a commercial break or an extra scene
-# the skip+match option allows for micro-adjustments without eating the full gap penalty
-# skip+match is primarily useful in maintaining alignment when the rates differ slightly
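-# implementation notes: pred_matrix layer 0 is the match state, layer 1 the gap state;
-# token correlations are computed one audio-description row at a time against all video
-# tokens, and only a rolling window of three rows of path scores is kept in memory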
-def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
- pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
- 1:lambda node: (0, node[1]-2, node[2]-1),
- 2:lambda node: (0, node[1]-1, node[2]-2),
- 3:lambda node: (1, node[1]-1, node[2]-1),
- 4:lambda node: (0, node[1] , node[2] ),
- 5:lambda node: (1, node[1]-1, node[2] ),
- 6:lambda node: (1, node[1]-1, node[2]-1),
- 7:lambda node: (1, node[1] , node[2]-1)}
- pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
- pred_matrix[0,1:,:2] = 0
- pred_matrix[1,1:,:2] = 4
- pred_matrix[:,0,:2] = [0,5]
- path_corrs_match = np.zeros((3, video_spec.shape[0]))
- path_corrs_gap = np.zeros((3, video_spec.shape[0]))
- corrs = np.zeros((3, video_spec.shape[0]))
- corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
- for i in range(audio_desc_spec.shape[0]):
- i_mod = i % 3
- match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
- path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
- path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
- path_corrs_gap[ i_mod-1][1:-1][:,None]])
- pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
- path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
- corrs = np.roll(corrs, -1, axis=1)
- corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
- fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
- fisher_infos[fisher_infos < 0] = 0
- fisher_infos[fisher_infos > 10] = 10
- row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
- path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
- gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2: ][:,None] - GAP_START_COST,
- path_corrs_gap[i_mod-1][2: ][:,None],
- path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
- GAP_EXTEND_COST])
- pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
- path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
- pred_matrix[1][i][2:] += 4
- path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
- GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
- GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
- pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
- path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
-
- # reconstruct optimal path by following predecessors backwards through the table
- end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
- path_corrs_gap[ i_mod,-1]])
- cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
- get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
- path = []
- visited = set()
- while min(cur_node[1:]) >= 0:
- cur_node, last_node = get_predecessor(cur_node), cur_node
- # failsafe to prevent an infinite loop that should never happen anyway
- if cur_node in visited:
- break
- visited.add(cur_node)
- if last_node[0] == 0:
- path.append(last_node[1:])
- path = path[::-1]
-
- # determine how much information this node gives about the alignment
- # a larger double derivative means more precise timing information
- # sudden noises give more timing information than droning sounds
- def get_fisher_info(node):
- i,j = node
- if node[0] >= audio_desc_spec.shape[0]-1 or \
- node[1] >= video_spec.shape[0]-1 or \
- min(node) <= 0:
- return 0
- info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
- np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
- np.dot(audio_desc_spec[i+1],video_spec[j-1])
- info /= min(.2, TIMESTEP_SIZE_SECONDS)
- return info
-
- # the quality of a node combines the correlation of its tokens
- # with how precisely the match is localized in time
- def get_match_quality(node):
- # correlations are between -1 and 1, as all tokens have unit norm
- token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
- fisher_info = min(max(0, get_fisher_info(node)), 10)
- return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
-
- # filter out low match quality nodes from LCS path
- quals = [get_match_quality(node) for node in path]
- if len(quals) == 0 or max(quals) <= 0:
- raise RuntimeError("Rough alignment failed, are the input files mismatched?")
- path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
-
- # convert units of path nodes from timesteps to seconds
- path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
-
- return path, quals
-
-# chunk path segments of similar slope into clips
-# a clip has the form: (start_index, end_index)
-def chunk_path(smooth_path, tol):
- x,y = zip(*smooth_path)
- slopes = np.diff(y) / np.diff(x)
- median_slope = np.median(slopes)
- slope_changes = np.diff(slopes)
- breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
- breaks = [0] + list(breaks) + [len(x)-1]
- clips = list(zip(breaks[:-1], breaks[1:]))
- return clips, median_slope, slopes
-
-# find piece-wise linear alignment that minimizes the weighted combination of
-# total absolute error at each node and total absolute slope change of the fit
-# distance between nodes and the fit (i.e. errors) are weighted by node quality
-# absolute slope changes are differences between the slopes of adjacent fit lines
-# slope changes are weighted much more than node errors to smooth out noise
-# the main source of noise is rough alignment drift while the describer is speaking
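-# in effect this minimizes sum_i qual_i * |fit_err_i| + sum_i cost_i * |slope_change_i|,
-# where cost_i scales with the smoothness parameter and is reduced at likely gap locations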
-def smooth_align(path, quals, smoothness):
- # rotate basis to make vertical and horizontal slopes "cost" the same
- # the new horizontal axis is x+y and the new vertical is -x+y
- # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
- # after this transformation, we instead have -1 <= slope < 1
- # perfectly matching audio has pre-transformation slope = 1
- # after this transformation, it instead has slope = 0
- rotated_path = [(x+y,-x+y) for x,y in path]
-
- # stretch the x axis to make all slopes "cost" nearly the same
- # without this, small changes to the slope at slope = +/-1
- # cost sqrt(2) times as much as small changes at slope = 0
- # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
- # the small angle approximation means these slopes all cost roughly the same
- x_stretch_factor = 10.
- rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
-
- # L1-Minimization to solve the alignment problem using a linear program
- # the absolute value functions needed for "absolute error" can be represented
- # in a linear program by splitting variables into positive and negative pieces
- # and constraining each to be positive (done by default in scipy's linprog)
- # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
- # fit_err[i] = path[i][1] - y_fit[i]
- # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
- # (y_fit[i+1] - y_fit[i ])/(path[i+1][0] - path[i ][0])
- # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
- # y_fit[i] = path[i][1] - fit_err[i]
- # this gives:
- # slope_change[i] = path_half[i] - fit_err_half[i]
- # where each half is just the original equation but y_fit is swapped out
- # the slope_change variables can then be set using equality constraints
- num_fit_points = len(rotated_stretched_path)
- x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
- x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
- y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
- slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
- slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
- slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
- slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
- slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
- c = np.hstack([quals,
- quals,
- slope_change_costs * x_stretch_factor,
- slope_change_costs * x_stretch_factor])
- fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
- -1. / x_diffs[:-1] - 1. / x_diffs[1:],
- 1. / x_diffs[1:]],
- offsets=[0,1,2],
- shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
- A_eq = scipy.sparse.hstack([ fit_err_coeffs,
- -fit_err_coeffs,
- scipy.sparse.eye(num_fit_points),
- -scipy.sparse.eye(num_fit_points)])
- b_eq = y_diffs[1: ] / x_diffs[1: ] - \
- y_diffs[ :-1] / x_diffs[ :-1]
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
- if not fit.success:
- print(fit)
- raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
-
- # combine fit_err_pos and fit_err_neg
- fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
- # subtract fit errors from nodes to retrieve the smooth fit's coordinates
- # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
- smooth_path = [(((x / x_stretch_factor) - y) / 2.,
- ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
-
- # clip off start/end of replacement audio if it doesn't match or isn't aligned
- # without this, describer intro/outro skips can cause mismatches at the start/end
- # the problem would be localized and just means audio might not match video at the start/end
- # instead we just keep the original video's audio in those segments if mismatches are detected
- # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
- # during audio replacement, synced edges will be extended backwards/forwards as far as possible
- # this is useful when the describer begins talking immediately (or before any alignable audio)
- # or when the describer continues speaking until the end (or no more alignable audio remains)
- # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
- max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
- smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
- smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
- smooth_err_path = zip(smoothed_fit_err, smooth_path)
- old_length = num_fit_points
- smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
- is_synced_at_start = len(smooth_err_path) == old_length
- old_length = len(smooth_err_path)
- smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
- is_synced_at_end = len(smooth_err_path) == old_length
- _, smooth_path = zip(*smooth_err_path)
- smooth_path = list(smooth_path)
- if is_synced_at_start:
- slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
- smooth_path.insert(0, (-10e10, -10e10 * slope))
- if is_synced_at_end:
- slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
- smooth_path.append((10e10, 10e10 * slope))
-
- clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
-
- # assemble clips with slopes within the rate tolerance into runs
- runs, run = [], []
- bad_clips = []
- for clip in clips:
- if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
- if len(run) > 0:
- runs.append(run)
- run = []
- bad_clips.append(clip)
- continue
- run.append(clip)
- if len(run) > 0:
- runs.append(run)
-
- return smooth_path, runs, bad_clips, clips
-
-# if the start or end were marked as synced during smooth alignment then
-# extend that alignment to the edge (i.e. to the start/end of the audio)
-def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
- if smooth_path[0][0] < -10e9:
- slope = smooth_path[0][1] / smooth_path[0][0]
- new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
- if new_start_point[1] < 0:
- new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
- smooth_path[0] = new_start_point
- if smooth_path[-1][0] > 10e9:
- video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
- audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
- slope = smooth_path[-1][1] / smooth_path[-1][0]
- new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
- if new_end_point[1] > video_runtime:
- new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
- smooth_path[-1] = new_end_point
-
-# visualize both the rough and smooth alignments
-def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings):
- scatter_color = [.2,.4,.8]
- lcs_rgba = np.zeros((len(quals),4))
- lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
- lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
- audio_times, video_times = np.array(path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- def expand_limits(start, end, ratio=.01):
- average = (end + start) / 2.
- half_diff = (end - start) / 2.
- half_diff *= (1 + ratio)
- return (average - half_diff, average + half_diff)
- plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
- plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
- np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
- plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
- audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- if ad_timings is None:
- plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
- bad_path = []
- for clip in bad_clips:
- bad_path.extend(smooth_path[clip[0]:clip[1]+1])
- bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
- audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- if len(audio_offsets) > 0:
- plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
- else:
- interp = scipy.interpolate.interp1d(video_times, audio_offsets,
- fill_value = np.inf,
- bounds_error = False, assume_sorted = True)
- plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
- video_times = ad_timings
- audio_offsets = interp(ad_timings)
- if len(audio_offsets) > 0:
- plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
- plt.xlabel('Video Time (minutes)')
- plt.ylabel('Audio Description Offset (seconds)')
- plt.title('Alignment')
- plt.legend().legendHandles[0].set_color(scatter_color)
- plt.tight_layout()
- plt.savefig(plot_filename_no_ext + '.png', dpi=400)
- plt.clf()
-
- with open(plot_filename_no_ext + '.txt', 'w') as file:
- rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
- video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
- print("Main changes needed to video to align it to audio input:", file=file)
- print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
- print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
- for clip_start, clip_end in rough_clips:
- audio_desc_start, video_start = smooth_path[clip_start]
- audio_desc_end, video_end = smooth_path[clip_end]
- slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
- def str_from_time(seconds):
- minutes, seconds = divmod(seconds, 60)
- hours, minutes = divmod(minutes, 60)
- return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
- print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
- f"{str_from_time(video_end)} aligning with audio from " + \
- f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
-
-# use the smooth alignment to replace runs of video sound with corresponding described audio
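-# each clip is either resampled via quadratic interpolation (when the rate difference is
-# imperceptible or pitch correction is disabled) or time-stretched with pytsmod's WSOLA,
-# which preserves pitch at the cost of extra computation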
-def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
- # perform quadratic interpolation of the audio description's waveform
- # this allows it to be stretched to match the corresponding video segment
- def audio_desc_arr_interp(samples):
- chunk_size = 10**7
- interpolated_chunks = []
- for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
- interp_bounds = (max(int(chunk[0]-2), 0),
- min(int(chunk[-1]+2), audio_desc_arr.shape[1]))
- interp = scipy.interpolate.interp1d(np.arange(*interp_bounds),
- audio_desc_arr[:,slice(*interp_bounds)],
- copy=False, bounds_error=False, fill_value=0,
- kind='quadratic', assume_sorted=True)
- interpolated_chunks.append(interp(chunk).astype(np.float32))
- return np.hstack(interpolated_chunks)
-
- # construct a stretched audio description waveform using the quadratic interpolator
- def get_interped_segment(run, interp):
- segment = []
- for clip in run:
- num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
- int(y[clip[0]] * AUDIO_SAMPLE_RATE)
- clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
- sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
- segment.append(interp(sample_points))
- segment = np.hstack(segment)
- return segment
-
- x,y = zip(*smooth_path)
- for run in runs:
- run_length_seconds = y[run[-1][1]] - y[run[0][0]]
- if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
- continue
- anchor_point_path_indices = [clip[0] for clip in run]
- anchor_point_path_indices.append(run[-1][1])
- anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
- np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
- slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
- for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
- # only apply pitch correction if the difference would be noticeable
- if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
- stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
- else:
- anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
- # account for quirks of pytsmod's wsola anchor point implementation
- anchor_point_pair[1][-1] -= 1
- anchor_y_offset = anchor_point_pair[1][0]
- anchor_point_pair[1,:] -= anchor_y_offset
- stretched_audio = pytsmod.wsola(audio_desc_arr, anchor_point_pair)
- video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
-
-# identify which segments of the replaced audio actually have the describer speaking
-# uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
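-# returns (speech_sample_mask, boost_sample_mask, ad_timings): the first two are per-sample
-# float masks built from Hann bumps around detected description tokens, while ad_timings
-# copies video_timings with non-description tokens set to infinity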
-def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
- smooth_path, detect_sensitivity, boost_sensitivity):
- # retokenize the audio description, which has been stretched to match the video
- audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
- # avoid boosting or training on mismatched segments, like those close to skips
- # assumes matching segments all have the same, constant play rate
- # could be modified to handle a multi-modal distribution of rates
- aligned_audio_times, aligned_video_times = zip(*smooth_path)
- interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
- fill_value = 'extrapolate',
- bounds_error = False, assume_sorted = True)
- slopes = (interp(video_timings + 1e-5) - \
- interp(video_timings - 1e-5)) / 2e-5
- median_slope = np.median(slopes)
- aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
- well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
-
- # first pass identification by assuming poorly matched tokens are describer speech
- # also assumes the describer doesn't speak very quietly
- corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
- smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
- audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
- speech_mask = (corrs < .2) * audio_desc_loud
-
- # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
- audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
- audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
- video_spec = normalize_spec(video_spec_raw, axes=(0,))
- video_spec = np.clip(video_spec / 6., -1, 1)
-
- # convert sampled features (e.g. spectrogram) to probability densities of each feature
- # when given a spectrogram, finds the distributions of the MFC coefficients
- def make_log_pdfs(arr):
- resolution = 100
- bins_per_spot = 4
- num_bins = int(resolution * bins_per_spot)
- uniform_prior_strength_per_spot = 1
- uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
- bin_range = (-1 - 1e-10, 1 + 1e-10)
- get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
- pdfs = np.apply_along_axis(get_hist, 1, arr.T)
- pdfs = pdfs + uniform_prior_strength_per_bin
- smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
- pdfs = np.apply_along_axis(smooth, 1, pdfs)
- pdfs = pdfs / np.sum(pdfs[0,:])
- log_pdfs = np.log(pdfs)
- bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
- return log_pdfs, bin_edges
-
- diff_spec = audio_desc_spec - video_spec
- diff_spec = np.clip(diff_spec, -1, 1)
-
- # Naive Bayes classifier to roughly estimate whether each token is describer speech
- desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
- nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
- lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
- lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
- np.digitize(diff_spec, bin_edges, right=True)-1]
- ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
- (np.sum((~speech_mask) * well_aligned_mask) + 1.)
- relative_probs = np.sum(lratios, axis=1)
- relative_probs /= np.std(relative_probs)
- relative_probs -= np.mean(relative_probs)
-
- # L1-Minimization to smoothly identify audio descriptions using a linear program
- # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
- # fit_err[i] = relative_probs[i] - y_fit[i]
- # delta_fit[i] = y_fit[i] - y_fit[i-1]
- # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
- # y_fit[i] = relative_probs[i] - fit_err[i]
- # this gives:
- # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
- # (fit_err[i] - fit_err[i-1])
- # the delta_fit variables can then be set using equality constraints
- num_fit_points = len(relative_probs)
- y_diffs = np.diff(relative_probs)
- pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
- neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
- c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
- np.ones(num_fit_points) / neg_err_cost_factor,
- np.ones(num_fit_points - 1) / 2.,
- np.ones(num_fit_points - 1) / 2.])
- fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
- np.ones(num_fit_points)],
- offsets=[0,1],
- shape=(num_fit_points - 1, num_fit_points)).tocsc()
- A_eq = scipy.sparse.hstack([ fit_err_coeffs,
- -fit_err_coeffs,
- scipy.sparse.eye(num_fit_points-1),
- -scipy.sparse.eye(num_fit_points-1)])
- b_eq = y_diffs
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
- if not fit.success:
- print(fit)
- raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
-
- # combine fit_err_pos and fit_err_neg
- fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
- # subtract fit errors from nodes to retrieve the smoothed fit
- smooth_desc_locations = relative_probs - fit_err
-
- # hard threshold to classify each token as describer speech or not
- speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
- speech_mask *= aligned_mask
-
- # a separate mask is created for describer volume boosting
- # as losing the describer's voice entirely is usually worse than it just being quiet
- # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
- boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
- boost_mask *= well_aligned_mask
-
- # convert a token classification into a mask that can be applied directly to samples
- # unlike the input, the output isn't a boolean array but an array of floats
- def token_mask_to_sample_mask(token_mask):
- description_timings = video_timings[1:-1][token_mask[1:-1]]
- sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
- window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
- window_size_samples = 2 * window_radius + 1
- bump = scipy.signal.windows.hann(window_size_samples)
- for description_timing in description_timings:
- window_center = int(description_timing * AUDIO_SAMPLE_RATE)
- sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
- return sample_mask
-
- speech_sample_mask = token_mask_to_sample_mask(speech_mask)
- boost_sample_mask = token_mask_to_sample_mask(boost_mask)
- ad_timings = video_timings.copy()
- ad_timings[~speech_mask] = np.inf
-
- return speech_sample_mask, boost_sample_mask, ad_timings
-
-# Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
-def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
- # PTS is the input frame's presentation timestamp, which is when frames are displayed
- # TB is the timebase, which is how many seconds each unit of PTS corresponds to
- # the output value of the expression will be the frame's new PTS
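- # illustrative shape of the result (made-up numbers), for a single segment with no start skip:
- #   TS+(0+clip(TS-STARTPTS-0.0000/TB,0,3600.0000/TB)*0.001234567)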
- setts_cmd = ['TS']
- start_skip = max(0, video_offset - start_key_frame)
- if start_skip > 0:
- # lossless cutting can only happen at key frames, so we cut the video before the audio starts
- # but that means the video is behind the audio and needs to catch up by playing quicker
- # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
- catchup_spread = 1./CATCHUP_RATE
- setts_cmd.append(f'+clip(TS-STARTPTS,0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
- elif video_offset < 0:
- # if the audio starts before the video, stretch the first frame of the video back to meet it
- setts_cmd.append(f'+clip(TS-STARTPTS,0,{-video_offset/10000.}/TB)*10000')
- # each segment of the linear fit can be encoded as a single clip function
- setts_cmd.append('+(0')
- for clip_start, clip_end in clips:
- audio_desc_start, video_start = smooth_path[clip_start]
- audio_desc_end, video_end = smooth_path[clip_end]
- video_start -= start_key_frame
- video_end -= start_key_frame
- audio_desc_length = audio_desc_end - audio_desc_start
- video_length = video_end - video_start
- slope = audio_desc_length / video_length
- setts_cmd.append(f'+clip(TS-STARTPTS-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
- setts_cmd.append(')')
- setts_cmd = ''.join(setts_cmd)
- return setts_cmd
-
-def get_ffmpeg():
- return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[0]
-
-def get_ffprobe():
- return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
-
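-# returns the time of the latest key frame at or before the given time (0 if none), via ffprobe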
-def get_closest_key_frame_time(video_file, time):
- if time <= 0:
- return 0
- key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='v',
- show_frames=None, skip_frame='nokey')['frames']
- key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
- return np.max(key_frame_times[key_frame_times <= time])
-
-# outputs a new media file with the replaced audio (which includes audio descriptions)
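-# two modes: when audio_desc_file is None, the replacement audio is piped in as raw samples
-# and muxed with the original video's streams; otherwise the original video is retimed with
-# ffmpeg's setts bitstream filter and muxed with the audio description file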
-def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
- setts_cmd=None, start_key_frame=None):
- if audio_desc_file is None:
- media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
- ac=2, ar=AUDIO_SAMPLE_RATE)
- if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
- write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
- else:
- original_video = ffmpeg.input(video_file)
- # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
- # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
- # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
- write_command = ffmpeg.output(media_input, original_video, output_filename,
- acodec='copy', vcodec='copy', scodec='copy',
- max_interleave_delta='0', loglevel='fatal',
- **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
- ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
- ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
- ffmpeg_caller.stdin.close()
- ffmpeg_caller.wait()
- else:
- media_input = ffmpeg.input(audio_desc_file)
- audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
- show_entries='format=duration')['streams']
- audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
- original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
- if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
- # wav files don't have codecs compatible with most video containers, so we convert to aac
- audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
- write_command = ffmpeg.output(media_input, original_video, output_filename,
- acodec=audio_codec, vcodec='copy', scodec='copy',
- max_interleave_delta='0', loglevel='fatal',
- **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
- 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
- write_command.run(cmd=get_ffmpeg())
- else:
- # workaround for a bug that sometimes breaks setts when output and input formats differ
- # the trick is separating the input and output by piping from one ffmpeg process into another
- # mkv files break if 'nut' is used, while other files break when 'matroska' is used
- format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
- write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
- c='copy', loglevel='fatal')
- ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
- pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
- write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
- max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
- **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
- 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
- ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
- while True:
- in_bytes = ffmpeg_caller.stdout.read(100000)
- if not in_bytes:
- break
- ffmpeg_caller2.stdin.write(in_bytes)
- ffmpeg_caller2.stdin.close()
- ffmpeg_caller.wait()
- ffmpeg_caller2.wait()
-
-
-# check whether static_ffmpeg has already installed ffmpeg and ffprobe
-def is_ffmpeg_installed():
- ffmpeg_dir = static_ffmpeg.run.get_platform_dir()
- indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
- return os.path.exists(indicator_file)
-
-# combines videos with matching audio files (e.g. audio descriptions)
-# this is the main function of this script; it calls the other functions in order
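-# example (hypothetical filenames): combine('movie.mkv', 'movie_ad.mp3', stretch_audio=True)
-# would write an aligned "ad_movie.mkv" into output_dir and an alignment plot into alignment_dir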
-def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
- boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
- prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
- alignment_dir=default_alignment_dir, extension="copy", display_func=None):
- video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
-
- if not yes and sum(video_file_types) > 0:
- print("")
- print("One or more audio files found in video input. Was this intentional?")
- print("If not, press ctrl+c to kill this script.")
- input("If this was intended, press Enter to continue...")
- print("")
- audio_desc_files, _ = get_sorted_filenames(audio, AUDIO_EXTENSIONS)
- if len(video_files) != len(audio_desc_files):
- error_msg = ["Number of valid files in input paths are not the same.",
- f"The video path has {len(video_files)} files",
- f"The audio path has {len(audio_desc_files)} files"]
- raise RuntimeError("\n".join(error_msg))
-
- ensure_folders_exist([output_dir], display_func)
- if PLOT_ALIGNMENT_TO_FILE:
- ensure_folders_exist([alignment_dir], display_func)
-
- display("", display_func)
- for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
- display(os.path.split(video_file)[1], display_func)
- display(os.path.split(audio_desc_file)[1], display_func)
- display("", display_func)
- if not yes:
- print("Are the above input file pairings correct?")
- print("If not, press ctrl+c to kill this script.")
- input("If they are correct, press Enter to continue...")
- print("")
-
- # if ffmpeg isn't installed, install it
- if not is_ffmpeg_installed():
- display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
- get_ffmpeg()
- if not is_ffmpeg_installed():
- raise RuntimeError("Failed to install ffmpeg.")
- display("Successfully installed ffmpeg.", display_func)
-
- display("Processing files:", display_func)
-
- for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
- video_file_types):
- # Default is to use the input video's extension for the output video
- if extension is None or extension in ["", "copy"]:
- ext = os.path.splitext(video_file)[1]
- else:
- # add a dot to the extension if it's missing
- ext = ('' if extension[0] == '.' else '.') + extension
- output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
- output_filename = os.path.join(output_dir, output_filename)
- display(" " + output_filename, display_func)
-
- if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
- display(" output file already exists, skipping...", display_func)
- continue
-
- video_arr = parse_audio_from_file(video_file)
- audio_desc_arr = parse_audio_from_file(audio_desc_file)
- video_spec_raw, video_timings = tokenize_audio(video_arr)
- video_spec = normalize_spec(video_spec_raw)
- audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
- # rescale RMS intensity of audio to match video
- audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
-
- path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
-
- smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
-
- cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
-
- ad_timings = None
- if stretch_audio:
- if keep_non_ad:
- video_arr_original = video_arr.copy()
-
- replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
- del audio_desc_arr
-
- if keep_non_ad or boost != 0:
- outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
- smooth_path, ad_detect_sensitivity, boost_sensitivity)
- speech_sample_mask, boost_sample_mask, ad_timings = outputs
- if keep_non_ad:
- video_arr *= speech_sample_mask
- video_arr += video_arr_original * (1 - speech_sample_mask)
- del video_arr_original
- del speech_sample_mask
- else:
- ad_timings = None
- if boost != 0:
- video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
- del boost_sample_mask
-
- # prevent peaking by rescaling to within +/- 32,766
- video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
-
- if video_filetype == 0:
- write_replaced_media_to_disk(output_filename, video_arr, video_file)
- else:
- write_replaced_media_to_disk(output_filename, video_arr)
- else:
- if video_filetype == 1:
- raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
- if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
- raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
- video_offset = np.diff(smooth_path[clips[0][0]])[0]
- start_key_frame = get_closest_key_frame_time(video_file, video_offset)
- setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
- write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
- setts_cmd, start_key_frame)
-
- del video_arr
- if PLOT_ALIGNMENT_TO_FILE:
- plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
- plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings)
- display("All files processed.", display_func)
-
-def write_config_file(config_path, settings):
- config = configparser.ConfigParser()
- config.add_section('alignment')
- config['alignment'] = {}
- for key, value in settings.items():
- config['alignment'][key] = str(value)
- with open(config_path, 'w') as f:
- config.write(f)
-
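-# the config file has a single [alignment] section of key = value pairs, e.g. (illustrative):
-#   [alignment]
-#   smoothness = 50
-#   stretch_audio = False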
-def read_config_file(config_path):
- config = configparser.ConfigParser()
- config.read(config_path)
- settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
- 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
- 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
- 'boost': config.getfloat('alignment', 'boost', fallback=0),
- 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
- 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
- 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
- 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
- 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
- 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
- 'extension': config.get('alignment', 'extension', fallback='copy')}
- if not config.has_section('alignment'):
- write_config_file(config_path, settings)
- return settings
-
-def settings_gui(config_path):
- settings = read_config_file(config_path)
- layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
- [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
- tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
- 'file type of the corresponding input video. Default is "copy".')]])],
- [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
- tooltip='Output file name prepend text. Default is "ad_"')]])],
- [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
- tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
- sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
- [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
- tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
- sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
- [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
- sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
- tooltip='Lower values make the alignment more accurate when there are skips ' + \
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
- 'Default is 50.')]])],
- [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
- tooltip='Stretches the input audio to fit the input video. ' + \
- 'Default is to stretch the video to fit the audio.')],
- [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
- disabled=not settings['stretch_audio'],
- tooltip='Tries to only replace segments with audio description. Useful if ' + \
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
- [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
- sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
- key='boost', disabled=not settings['stretch_audio'],
- tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
- [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
- sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
- key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
- tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
- [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
- key='boost_sensitivity', disabled=not settings['stretch_audio'],
- tooltip='Higher values make --boost less likely to miss a description, but ' + \
- 'also make it more likely to boost non-description audio. Default is 0.4')]])],
- [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
- disabled=not settings['stretch_audio'],
- tooltip='Skips pitch correction step when stretching audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
- [sg.Column([[sg.Submit('Save', pad=(40,3)),
- sg.Button('Cancel')]], pad=((135,3),10))]]
- settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
- settings_window['extension'].set_focus()
- while True:
- event, values = settings_window.read()
- if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
- break
- if event == 'stretch_audio':
- # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
- if IS_RUNNING_WINDOWS:
- settings_window['boost'].Update(disabled = values['stretch_audio'])
- settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
- settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
- else:
- settings_window['boost'].Update(disabled = not values['stretch_audio'])
- settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
- settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
- settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
- settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
- if event == 'Save':
- settings = values.copy()
- del settings['output_browse']
- del settings['alignment_browse']
- write_config_file(config_path, settings)
- break
- settings_window.close()
-
-def combine_print_exceptions(print_queue, *args, **kwargs):
- try:
- combine(*args, **kwargs)
- except Exception:
- print_queue.put(traceback.format_exc())
- # raise
-
-def combine_gui(video_files, audio_files, config_path):
- output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
- layout = [[output_textbox],
- [sg.Button('Close', pad=(360,5))]]
- combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
- disable_close=True, finalize=True)
- output_textbox.update('Combining media files:', append=True)
- print_queue = multiprocessing.Queue()
-
- settings = read_config_file(config_path)
- settings.update({'display_func':print_queue.put, 'yes':True})
- proc = multiprocessing.Process(target=combine_print_exceptions,
- args=(print_queue, video_files, audio_files),
- kwargs=settings, daemon=True)
- proc.start()
- while True:
- # if the script isn't running anymore, re-enable the default close window button
- if not proc.is_alive():
- combine_window.DisableClose = False
- if not print_queue.empty():
- if IS_RUNNING_WINDOWS:
- cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
- output_textbox.update('\n' + print_queue.get(), append=True)
- if IS_RUNNING_WINDOWS:
- output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
- event, values = combine_window.read(timeout=100)
- # window closed event isn't always emitted, so also manually check window status
- if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
- if proc.is_alive():
- proc.terminate()
- break
- if event == 'Close':
- if not proc.is_alive():
- combine_window.DisableClose = False
- break
- selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
- if selection != 'Yes':
- continue
- proc.terminate()
- combine_window.DisableClose = False
- break
- combine_window.close()
-
-def main_gui():
- config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')
- sg.theme('Light Blue 2')
-
- filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
- all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
- all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
- all_video_and_audio_file_types = [('All Video and Audio File Types',
- '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
- audio_file_types = [(ext, "*." + ext) for ext in AUDIO_EXTENSIONS]
- video_and_audio_file_types = [(ext, "*." + ext) for ext in VIDEO_EXTENSIONS] + audio_file_types
- audio_file_types = all_audio_file_types + audio_file_types
- video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
- # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
- if IS_RUNNING_WINDOWS:
- file_fix = lambda file_types: file_types[:1] + [('|' + type[0], type[1]) for type in file_types[1:]]
- audio_file_types = file_fix(audio_file_types)
- video_and_audio_file_types = file_fix(video_and_audio_file_types)
-
- layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
- [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
- sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
- tooltip='List video filenames here, in order, separated by semicolons'),
- sg.FilesBrowse(button_text="Browse Video",
- file_types=video_and_audio_file_types,
- tooltip='Select one or more video files')]], pad=(2,7))],
- [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
- sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
- tooltip='List audio filenames here, in order, separated by semicolons'),
- sg.FilesBrowse(button_text="Browse Audio",
- file_types=audio_file_types,
- tooltip='Select one or more audio files')]], pad=(2,7))],
- [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
- sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
- pad=((135,3),10))]]
- window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
- window['-VIDEO_FILES-'].set_focus()
- while True:
- event, values = window.read()
- if event == 'Combine':
- if len(values['-VIDEO_FILES-']) == 0 or \
- len(values['-AUDIO_FILES-']) == 0:
- window.disable()
- sg.Popup('Error: empty input field.', font=('Arial', 20))
- window.enable()
- continue
- video_files = values['-VIDEO_FILES-'].split(';')
- audio_files = values['-AUDIO_FILES-'].split(';')
- combine_gui(video_files, audio_files, config_path)
- if event == 'Settings':
- window.disable()
- settings_gui(config_path)
- window.enable()
- if event == sg.WIN_CLOSED:
- break
- window.close()
-
-# Entry point for command line interaction, for example:
-# > describealign video.mp4 audio_desc.mp3
-def command_line_interface():
- # override command line argument parser's error handler to make it pause before exiting
- # this allows users to see the error message when accidentally not running from command line
- class ArgumentParser(argparse.ArgumentParser):
- def error(self, message):
- if 'required: video, audio' in message:
- print('No input arguments detected, starting GUI...')
- main_gui()
- self.exit()
- else:
- self.exit(2, f'{self.prog}: error: {message}\n')
- parser = ArgumentParser(description="Replaces a video's sound with an audio description.",
- usage="describealign video_file.mp4 audio_file.mp3")
- parser.add_argument("video", help='A video file or directory containing video files.')
- parser.add_argument("audio", help='An audio file or directory containing audio files.')
- parser.add_argument('--smoothness', type=float, default=50,
- help='Lower values make the alignment more accurate when there are skips ' + \
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
- 'Default is 50.')
- parser.add_argument('--stretch_audio', action='store_true',
- help='Stretches the input audio to fit the input video. ' + \
- 'Default is to stretch the video to fit the audio.')
- parser.add_argument('--keep_non_ad', action='store_true',
- help='Tries to only replace segments with audio description. Useful if ' + \
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')
- parser.add_argument('--boost', type=float, default=0,
- help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')
- parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
- help='Audio description detection sensitivity ratio. Higher values make ' + \
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
- parser.add_argument('--boost_sensitivity', type=float, default=.4,
- help='Higher values make --boost less likely to miss a description, but ' + \
- 'also make it more likely to boost non-description audio. Default is 0.4')
- parser.add_argument('--yes', action='store_true',
- help='Auto-skips user prompts asking to verify information.')
- parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
- parser.add_argument('--no_pitch_correction', action='store_true',
- help='Skips pitch correction step when stretching audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')
- parser.add_argument("--output_dir", default=default_output_dir,
- help='Directory combined output media is saved to. Default is "videos_with_ad"')
- parser.add_argument("--alignment_dir", default=default_alignment_dir,
- help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
- parser.add_argument("--extension", default="copy",
- help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
- 'file type of the corresponding input video. Default is "copy".')
- args = parser.parse_args()
-
- combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
- args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
- args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
- args.extension)
-
-# allows the script to be run on its own, rather than through the package, for example:
-# python3 describealign.py video.mp4 audio_desc.mp3
-if __name__ == "__main__":
- multiprocessing.freeze_support()
- command_line_interface()
-
-
-
-
+# combines videos with matching audio files (e.g. audio descriptions)
+# input: video or folder of videos and an audio file or folder of audio files
+# output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
+# this script aligns the new audio to the video using the video's old audio
+# first, the video's sound and the audio file are both converted to spectrograms
+# second, the two spectrograms are roughly aligned by finding their longest common subsequence
+# third, the rough alignment is denoised through L1-Minimization
+# fourth, the spectrogram alignments determine where the new audio replaces the old
+
+'''
+Copyright (C) 2023 Julian Brown
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+'''
+
+# Nuitka build options:
+# nuitka-project-if: {OS} != "Windows":
+# nuitka-project: --enable-plugins=pyside2
+#
+# Compilation mode: standalone everywhere, except on macOS, where an app bundle is used
+# nuitka-project-if: {OS} == "Darwin":
+# nuitka-project: --standalone
+# nuitka-project: --macos-create-app-bundle
+# Mac apparently needs onefile too, because the pyside2 plugin requires it.
+# All other platforms need it too, so set it universally.
+# nuitka-project: --onefile
+#
+# Debugging options, controlled via environment variable at compile time.
+# nuitka-project-if: os.getenv("DEBUG_COMPILATION", "no") == "yes":
+# nuitka-project: --enable-console
+# nuitka-project-else:
+# nuitka-project: --disable-console
+
+# Set app icon
+# nuitka-project-if: {OS} == "Windows":
+# nuitka-project: --windows-icon-from-ico=describealign.png
+# nuitka-project-else:
+# nuitka-project-if: {OS} == "Darwin":
+# nuitka-project: --macos-app-icon=describealign.png
+# nuitka-project-else:
+# nuitka-project: --linux-icon=describealign.png
+# End Nuitka build options
+
+VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob'])
+AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
+PLOT_ALIGNMENT_TO_FILE = True
+
+TIMESTEP_SIZE_SECONDS = .16
+TIMESTEP_OVERLAP_RATIO = .5
+AUDIO_SAMPLE_RATE = 44100
+MEL_COEFFS_PER_TIMESTEP = 25
+DITHER_PERIOD_STEPS = 60
+MIN_CORR_FOR_TOKEN_MATCH = .6
+GAP_START_COST = 1.0
+GAP_EXTEND_COST = -.01
+GAP_EXTEND_DIAG_BONUS = -.01
+SKIP_MATCH_COST = .1
+MAX_RATE_RATIO_DIFF_ALIGN = .1
+PREF_CUT_AT_GAPS_FACTOR = 5
+MIN_DURATION_TO_REPLACE_SECONDS = 2
+MIN_START_END_SYNC_TIME_SECONDS = 2
+MAX_START_END_SYNC_ERR_SECONDS = .2
+MAX_RATE_RATIO_DIFF_BOOST = .003
+MIN_DESC_DURATION = .5
+MAX_GAP_IN_DESC_SEC = 1.5
+JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
+CATCHUP_RATE = 5
+
+if PLOT_ALIGNMENT_TO_FILE:
+ import matplotlib.pyplot as plt
+import argparse
+import os
+import glob
+import itertools
+import datetime
+import numpy as np
+import ffmpeg
+import static_ffmpeg
+import python_speech_features as psf
+import scipy.signal
+import scipy.optimize
+import scipy.interpolate
+import scipy.ndimage as nd
+import scipy.sparse
+import pytsmod
+import configparser
+import traceback
+import multiprocessing
+import platform
+
+IS_RUNNING_WINDOWS = platform.system() == 'Windows'
+if IS_RUNNING_WINDOWS:
+ import PySimpleGUIWx as sg
+ default_output_dir = 'videos_with_ad'
+ default_alignment_dir = 'alignment_plots'
+else:
+ import PySimpleGUIQt as sg
+ default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
+ default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
+
+def display(text, func=None):
+ if func:
+ func(text)
+ print(text)
+
+def throw_runtime_error(text, func=None):
+ if func:
+ func(text)
+ raise RuntimeError(text)
+
+def ensure_folders_exist(dirs, display_func=None):
+ for dir in dirs:
+ if not os.path.isdir(dir):
+ display("Directory not found, creating it: " + dir, display_func)
+ os.makedirs(dir)
+
+def get_sorted_filenames(path, extensions, alt_extensions=set([])):
+ # path could be three different things: a file, a directory, a list of files
+ if type(path) is list:
+ files = [os.path.abspath(file) for file in path]
+ for file in files:
+ if not os.path.isfile(file):
+ raise RuntimeError(f"No file found at input path:\n {file}")
+ else:
+ path = os.path.abspath(path)
+ if os.path.isdir(path):
+ files = glob.glob(glob.escape(path) + "/*")
+ if len(files) == 0:
+ raise RuntimeError(f"Empty input directory:\n {path}")
+ else:
+ if not os.path.isfile(path):
+ raise RuntimeError(f"No file or directory found at input path:\n {path}")
+ files = [path]
+ files = [file for file in files if os.path.splitext(file)[1][1:] in extensions | alt_extensions]
+ if len(files) == 0:
+ error_msg = [f"No files with valid extensions found at input path:\n {path}",
+ "Did you accidentally put the audio filepath before the video filepath?",
+ "The video path should be the first positional input, audio second.",
+                 "Or maybe you need to add a new extension to this script's list of recognized extensions?",
+ f"valid extensions for this input are:\n {extensions}"]
+ raise RuntimeError("\n".join(error_msg))
+ files = sorted(files)
+ file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
+ return files, file_types
+
+# read audio from file with ffmpeg and convert to numpy array
+def parse_audio_from_file(media_file):
+ media_stream, _ = (ffmpeg
+ .input(media_file)
+ .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
+ .run(capture_stdout=True, cmd=get_ffmpeg())
+ )
+ media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
+ return media_arr
+
+# tokenize audio by transforming with a mel-frequency cepstrum (MFC)
+def tokenize_audio(media_arr, rate=1):
+ step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
+ window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
+ window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
+ fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
+ get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
+ samplerate=AUDIO_SAMPLE_RATE,
+ winlen=window_size_seconds,
+ winstep=TIMESTEP_SIZE_SECONDS * rate,
+ numcep=MEL_COEFFS_PER_TIMESTEP,
+ nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
+ nfft=fft_size_samples,
+ winfunc=scipy.signal.windows.hann)
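+  # number of MFCC frames covering the signal; python_speech_features zero-pads any final partial window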
+ num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
+ media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
+ chunk_size = 1000
+ for chunk_index in np.arange(0, num_timesteps, chunk_size):
+ chunk_bounds_samples = ((chunk_index ) * step_size_samples,
+ (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
+ media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
+ '''
+ # alternate python library's MFC implementation
+ import librosa
+ media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
+ sr=AUDIO_SAMPLE_RATE,
+ n_mfcc=MEL_COEFFS_PER_TIMESTEP,
+ lifter=22,
+ n_fft=fft_size_samples,
+ hop_length=step_size_samples,
+ win_length=window_size_samples,
+ window=scipy.signal.windows.hann).T
+ num_timesteps = media_spec.shape[0]
+ '''
+ timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
+ timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
+ return media_spec, timings_seconds
+
+# same as tokenize_audio, but dithering the MFC window timings
+# this allows for finer alignment by ameliorating discretization error
+def tokenize_audio_dither(media_arr, slow_timings):
+ # choose a relative step size slightly less than 1 to ameliorate quantization error
+ # maximize alignment accuracy by using least approximable number with desired period
+ # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
+ fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
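+  # with DITHER_PERIOD_STEPS = 60 this gives fast_rate ~= 0.9835, i.e. roughly one extra token per 60 undithered steps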
+ fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
+
+ # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
+ # by approximately equalizing the number of tokens per unit time between dithered and undithered
+ # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
+ # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
+ fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
+ fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
+ return fast_spec, fast_timings
+
+# normalize along both time and frequency axes to allow comparing tokens by correlation
+def normalize_spec(media_spec_raw, axes=(0,1)):
+ media_spec = media_spec_raw.copy()
+ for axis in axes:
+ norm_func = np.std if axis == 0 else np.linalg.norm
+ media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
+ media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
+ return media_spec
+
+# vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
+# modified to include affine gap penalties and skip+match options (i.e. knight's moves)
+# gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
+# or when the audio description includes a commercial break or an extra scene
+# the skip+match option allows for micro-adjustments without eating the full gap penalty
+# skip+match is primarily useful in maintaining alignment when the rates differ slightly
+def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
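+  # DP nodes are (layer, audio_index, video_index): layer 0 holds match states, layer 1 holds gap states
+  # pred_map codes 0-3 give predecessors of match-layer nodes, codes 4-7 of gap-layer nodes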
+ pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
+ 1:lambda node: (0, node[1]-2, node[2]-1),
+ 2:lambda node: (0, node[1]-1, node[2]-2),
+ 3:lambda node: (1, node[1]-1, node[2]-1),
+ 4:lambda node: (0, node[1] , node[2] ),
+ 5:lambda node: (1, node[1]-1, node[2] ),
+ 6:lambda node: (1, node[1]-1, node[2]-1),
+ 7:lambda node: (1, node[1] , node[2]-1)}
+ pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
+ pred_matrix[0,1:,:2] = 0
+ pred_matrix[1,1:,:2] = 4
+ pred_matrix[:,0,:2] = [0,5]
+ path_corrs_match = np.zeros((3, video_spec.shape[0]))
+ path_corrs_gap = np.zeros((3, video_spec.shape[0]))
+ corrs = np.zeros((3, video_spec.shape[0]))
+ corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
+ for i in range(audio_desc_spec.shape[0]):
+ i_mod = i % 3
+ match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
+ path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
+ path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
+ path_corrs_gap[ i_mod-1][1:-1][:,None]])
+ pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
+ path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
+ corrs = np.roll(corrs, -1, axis=1)
+ corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
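+    # the second difference of the correlation approximates Fisher information:
+    # sharp correlation peaks (sudden noises) localize timing better than droning sounds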
+ fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
+ fisher_infos[fisher_infos < 0] = 0
+ fisher_infos[fisher_infos > 10] = 10
+ row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
+ path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
+ gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2: ][:,None] - GAP_START_COST,
+ path_corrs_gap[i_mod-1][2: ][:,None],
+ path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
+ GAP_EXTEND_COST])
+ pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
+ path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
+ pred_matrix[1][i][2:] += 4
+ path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
+ GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
+ GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
+ pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
+ path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
+
+ # reconstruct optimal path by following predecessors backwards through the table
+ end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
+ path_corrs_gap[ i_mod,-1]])
+ cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
+ get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
+ path = []
+ visited = set()
+ while min(cur_node[1:]) >= 0:
+ cur_node, last_node = get_predecessor(cur_node), cur_node
+ # failsafe to prevent an infinite loop that should never happen anyways
+ if cur_node in visited:
+ break
+ visited.add(cur_node)
+ if last_node[0] == 0:
+ path.append(last_node[1:])
+ path = path[::-1]
+
+ # determine how much information this node gives about the alignment
+ # a larger double derivative means more precise timing information
+ # sudden noises give more timing information than droning sounds
+ def get_fisher_info(node):
+ i,j = node
+ if node[0] >= audio_desc_spec.shape[0]-1 or \
+ node[1] >= video_spec.shape[0]-1 or \
+ min(node) <= 0:
+ return 0
+ info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
+ np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
+ np.dot(audio_desc_spec[i+1],video_spec[j-1])
+ info /= min(.2, TIMESTEP_SIZE_SECONDS)
+ return info
+
+ # the quality of a node combines the correlation of its tokens
+ # with how precisely the match is localized in time
+ def get_match_quality(node):
+ # correlations are between -1 and 1, as all tokens have unit norm
+ token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
+ fisher_info = min(max(0, get_fisher_info(node)), 10)
+ return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
+
+ # filter out low match quality nodes from LCS path
+ quals = [get_match_quality(node) for node in path]
+ if len(quals) == 0 or max(quals) <= 0:
+ raise RuntimeError("Rough alignment failed, are the input files mismatched?")
+ path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
+
+ # convert units of path nodes from timesteps to seconds
+ path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
+
+ return path, quals
+
+# chunk path segments of similar slope into clips
+# a clip has the form: (start_index, end_index)
+def chunk_path(smooth_path, tol):
+ x,y = zip(*smooth_path)
+ slopes = np.diff(y) / np.diff(x)
+ median_slope = np.median(slopes)
+ slope_changes = np.diff(slopes)
+ breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
+ breaks = [0] + list(breaks) + [len(x)-1]
+ clips = list(zip(breaks[:-1], breaks[1:]))
+ return clips, median_slope, slopes
+
+# find piece-wise linear alignment that minimizes the weighted combination of
+# total absolute error at each node and total absolute slope change of the fit
+# distance between nodes and the fit (i.e. errors) are weighted by node quality
+# absolute slope changes are differences between the slopes of adjacent fit lines
+# slope changes are weighted much more than node errors to smooth out noise
+# the main source of noise is rough alignment drift while the describer is speaking
+def smooth_align(path, quals, smoothness):
+ # rotate basis to make vertical and horizontal slopes "cost" the same
+ # the new horizontal axis is x+y and the new vertical is -x+y
+ # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
+ # after this transformation, we instead have -1 <= slope < 1
+ # perfectly matching audio has pre-transformation slope = 1
+ # after this transformation, it instead has slope = 0
+ rotated_path = [(x+y,-x+y) for x,y in path]
+
+ # stretch the x axis to make all slopes "cost" nearly the same
+ # without this, small changes to the slope at slope = +/-1
+ # cost sqrt(2) times as much as small changes at slope = 0
+ # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
+ # the small angle approximation means these slopes all cost roughly the same
+ x_stretch_factor = 10.
+ rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
+
+ # L1-Minimization to solve the alignment problem using a linear program
+ # the absolute value functions needed for "absolute error" can be represented
+ # in a linear program by splitting variables into positive and negative pieces
+ # and constraining each to be positive (done by default in scipy's linprog)
+ # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
+ # fit_err[i] = path[i][1] - y_fit[i]
+ # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
+ # (y_fit[i+1] - y_fit[i ])/(path[i+1][0] - path[i ][0])
+ # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
+ # y_fit[i] = path[i][1] - fit_err[i]
+ # this gives:
+ # slope_change[i] = path_half[i] - fit_err_half[i]
+ # where each half is just the original equation but y_fit is swapped out
+ # the slope_change variables can then be set using equality constraints
+ num_fit_points = len(rotated_stretched_path)
+ x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
+ x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
+ y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
+ slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
+ slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
+ slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
+ slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
+ slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
+ c = np.hstack([quals,
+ quals,
+ slope_change_costs * x_stretch_factor,
+ slope_change_costs * x_stretch_factor])
+ fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
+ -1. / x_diffs[:-1] - 1. / x_diffs[1:],
+ 1. / x_diffs[1:]],
+ offsets=[0,1,2],
+ shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
+ A_eq = scipy.sparse.hstack([ fit_err_coeffs,
+ -fit_err_coeffs,
+ scipy.sparse.eye(num_fit_points),
+ -scipy.sparse.eye(num_fit_points)])
+ b_eq = y_diffs[1: ] / x_diffs[1: ] - \
+ y_diffs[ :-1] / x_diffs[ :-1]
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
+ if not fit.success:
+ print(fit)
+ raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
+
+ # combine fit_err_pos and fit_err_neg
+ fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
+
+ # subtract fit errors from nodes to retrieve the smooth fit's coordinates
+ # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
+ smooth_path = [(((x / x_stretch_factor) - y) / 2.,
+ ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
+
+ # clip off start/end of replacement audio if it doesn't match or isn't aligned
+ # without this, describer intro/outro skips can cause mismatches at the start/end
+ # the problem would be localized and just means audio might not match video at the start/end
+ # instead we just keep the original video's audio in those segments if mismatches are detected
+ # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
+ # during audio replacement, synced edges will be extended backwards/forwards as far as possible
+ # this is useful when the describer begins talking immediately (or before any alignable audio)
+ # or when the describer continues speaking until the end (or no more alignable audio remains)
+ # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
+ max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
+ smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
+ smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
+ smooth_err_path = zip(smoothed_fit_err, smooth_path)
+ old_length = num_fit_points
+ smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
+ is_synced_at_start = len(smooth_err_path) == old_length
+ old_length = len(smooth_err_path)
+ smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
+ is_synced_at_end = len(smooth_err_path) == old_length
+ _, smooth_path = zip(*smooth_err_path)
+ smooth_path = list(smooth_path)
+ if is_synced_at_start:
+ slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
+ smooth_path.insert(0, (-10e10, -10e10 * slope))
+ if is_synced_at_end:
+ slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
+ smooth_path.append((10e10, 10e10 * slope))
+
+ clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
+
+ # assemble clips with slopes within the rate tolerance into runs
+ runs, run = [], []
+ bad_clips = []
+ for clip in clips:
+ if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
+ if len(run) > 0:
+ runs.append(run)
+ run = []
+ bad_clips.append(clip)
+ continue
+ run.append(clip)
+ if len(run) > 0:
+ runs.append(run)
+
+ return smooth_path, runs, bad_clips, clips
+
+# if the start or end were marked as synced during smooth alignment then
+# extend that alignment to the edge (i.e. to the start/end of the audio)
+def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
+ if smooth_path[0][0] < -10e9:
+ slope = smooth_path[0][1] / smooth_path[0][0]
+ new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
+ if new_start_point[1] < 0:
+ new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
+ smooth_path[0] = new_start_point
+ if smooth_path[-1][0] > 10e9:
+ video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
+ audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
+ slope = smooth_path[-1][1] / smooth_path[-1][0]
+ new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
+ if new_end_point[1] > video_runtime:
+ new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
+ smooth_path[-1] = new_end_point
+
+# visualize both the rough and smooth alignments
+def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings):
+ scatter_color = [.2,.4,.8]
+ lcs_rgba = np.zeros((len(quals),4))
+ lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
+ lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
+ audio_times, video_times = np.array(path).T.reshape((2,-1))
+ audio_offsets = audio_times - video_times
+ def expand_limits(start, end, ratio=.01):
+ average = (end + start) / 2.
+ half_diff = (end - start) / 2.
+ half_diff *= (1 + ratio)
+ return (average - half_diff, average + half_diff)
+ plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
+ plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
+ np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
+ plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
+ audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
+ audio_offsets = audio_times - video_times
+ if ad_timings is None:
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
+ bad_path = []
+ for clip in bad_clips:
+ bad_path.extend(smooth_path[clip[0]:clip[1]+1])
+ bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
+ audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
+ audio_offsets = audio_times - video_times
+ if len(audio_offsets) > 0:
+ plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
+ else:
+ interp = scipy.interpolate.interp1d(video_times, audio_offsets,
+ fill_value = np.inf,
+ bounds_error = False, assume_sorted = True)
+ plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
+ video_times = ad_timings
+ audio_offsets = interp(ad_timings)
+ if len(audio_offsets) > 0:
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
+ plt.xlabel('Video Time (minutes)')
+ plt.ylabel('Audio Description Offset (seconds)')
+ plt.title('Alignment')
+ plt.legend().legendHandles[0].set_color(scatter_color)
+ plt.tight_layout()
+ plt.savefig(plot_filename_no_ext + '.png', dpi=400)
+ plt.clf()
+
+ with open(plot_filename_no_ext + '.txt', 'w') as file:
+ rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
+ video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
+ print("Main changes needed to video to align it to audio input:", file=file)
+ print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
+ print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
+ for clip_start, clip_end in rough_clips:
+ audio_desc_start, video_start = smooth_path[clip_start]
+ audio_desc_end, video_end = smooth_path[clip_end]
+ slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
+ def str_from_time(seconds):
+ minutes, seconds = divmod(seconds, 60)
+ hours, minutes = divmod(minutes, 60)
+ return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
+ print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
+ f"{str_from_time(video_end)} aligning with audio from " + \
+ f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
+
+# use the smooth alignment to replace runs of video sound with corresponding described audio
+def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
+ # perform quadratic interpolation of the audio description's waveform
+ # this allows it to be stretched to match the corresponding video segment
+ def audio_desc_arr_interp(samples):
+ chunk_size = 10**7
+ interpolated_chunks = []
+ for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
+ interp_bounds = (max(int(chunk[0]-2), 0),
+ min(int(chunk[-1]+2), audio_desc_arr.shape[1]))
+ interp = scipy.interpolate.interp1d(np.arange(*interp_bounds),
+ audio_desc_arr[:,slice(*interp_bounds)],
+ copy=False, bounds_error=False, fill_value=0,
+ kind='quadratic', assume_sorted=True)
+ interpolated_chunks.append(interp(chunk).astype(np.float32))
+ return np.hstack(interpolated_chunks)
+
+ # construct a stretched audio description waveform using the quadratic interpolator
+ def get_interped_segment(run, interp):
+ segment = []
+ for clip in run:
+ num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
+ int(y[clip[0]] * AUDIO_SAMPLE_RATE)
+ clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
+ sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
+ segment.append(interp(sample_points))
+ segment = np.hstack(segment)
+ return segment
+
+ x,y = zip(*smooth_path)
+ for run in runs:
+ run_length_seconds = y[run[-1][1]] - y[run[0][0]]
+ if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
+ continue
+ anchor_point_path_indices = [clip[0] for clip in run]
+ anchor_point_path_indices.append(run[-1][1])
+ anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
+ np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
+ slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
+ for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
+ # only apply pitch correction if the difference would be noticeable
+ if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
+ stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
+ else:
+ anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
+ # account for quirks of pytsmod's wsola anchor point implementation
+ anchor_point_pair[1][-1] -= 1
+ anchor_y_offset = anchor_point_pair[1][0]
+ anchor_point_pair[1,:] -= anchor_y_offset
+ stretched_audio = pytsmod.wsola(audio_desc_arr, anchor_point_pair)
+ video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
+
+# identify which segments of the replaced audio actually have the describer speaking
+# uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
+def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
+ smooth_path, detect_sensitivity, boost_sensitivity):
+ # retokenize the audio description, which has been stretched to match the video
+ audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw)
+
+ # avoid boosting or training on mismatched segments, like those close to skips
+ # assumes matching segments all have the same, constant play rate
+ # could be modified to handle a multi-modal distribution of rates
+ aligned_audio_times, aligned_video_times = zip(*smooth_path)
+ interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
+ fill_value = 'extrapolate',
+ bounds_error = False, assume_sorted = True)
+ slopes = (interp(video_timings + 1e-5) - \
+ interp(video_timings - 1e-5)) / 2e-5
+ median_slope = np.median(slopes)
+ aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
+ well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
+
+ # first pass identification by assuming poorly matched tokens are describer speech
+ # also assumes the describer doesn't speak very quietly
+ corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
+ smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
+ audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
+ speech_mask = (corrs < .2) * audio_desc_loud
+
+ # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
+ audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
+ video_spec = normalize_spec(video_spec_raw, axes=(0,))
+ video_spec = np.clip(video_spec / 6., -1, 1)
+
+ # convert sampled features (e.g. spectrogram) to probability densities of each feature
+ # when given a spectrogram, finds the distributions of the MFC coefficients
+ def make_log_pdfs(arr):
+ resolution = 100
+ bins_per_spot = 4
+ num_bins = int(resolution * bins_per_spot)
+ uniform_prior_strength_per_spot = 1
+ uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
+ bin_range = (-1 - 1e-10, 1 + 1e-10)
+ get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
+ pdfs = np.apply_along_axis(get_hist, 1, arr.T)
+ pdfs = pdfs + uniform_prior_strength_per_bin
+ smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
+ pdfs = np.apply_along_axis(smooth, 1, pdfs)
+ pdfs = pdfs / np.sum(pdfs[0,:])
+ log_pdfs = np.log(pdfs)
+ bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
+ return log_pdfs, bin_edges
+
+ diff_spec = audio_desc_spec - video_spec
+ diff_spec = np.clip(diff_spec, -1, 1)
+
+ # Naive Bayes classifier to roughly estimate whether each token is describer speech
+ desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
+ nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
+ lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
+ lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
+ np.digitize(diff_spec, bin_edges, right=True)-1]
+ ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
+ (np.sum((~speech_mask) * well_aligned_mask) + 1.)
+ relative_probs = np.sum(lratios, axis=1)
+ relative_probs /= np.std(relative_probs)
+ relative_probs -= np.mean(relative_probs)
+
+ # L1-Minimization to smoothly identify audio descriptions using a linear program
+ # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
+ # fit_err[i] = relative_probs[i] - y_fit[i]
+ # delta_fit[i] = y_fit[i] - y_fit[i-1]
+ # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
+ # y_fit[i] = relative_probs[i] - fit_err[i]
+ # this gives:
+ # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
+ # (fit_err[i] - fit_err[i-1])
+ # the delta_fit variables can then be set using equality constraints
+ num_fit_points = len(relative_probs)
+ y_diffs = np.diff(relative_probs)
+ pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
+ neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
+ c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
+ np.ones(num_fit_points) / neg_err_cost_factor,
+ np.ones(num_fit_points - 1) / 2.,
+ np.ones(num_fit_points - 1) / 2.])
+ fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
+ np.ones(num_fit_points)],
+ offsets=[0,1],
+ shape=(num_fit_points - 1, num_fit_points)).tocsc()
+ A_eq = scipy.sparse.hstack([ fit_err_coeffs,
+ -fit_err_coeffs,
+ scipy.sparse.eye(num_fit_points-1),
+ -scipy.sparse.eye(num_fit_points-1)])
+ b_eq = y_diffs
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
+ if not fit.success:
+ print(fit)
+ raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
+
+ # combine fit_err_pos and fit_err_neg
+ fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
+
+ # subtract fit errors from nodes to retrieve the smoothed fit
+ smooth_desc_locations = relative_probs - fit_err
+
+ # hard threshold to classify each token as describer speech or not
+ speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
+ speech_mask *= aligned_mask
+
+ # a separate mask is created for describer volume boosting
+ # as losing the describer's voice entirely is usually worse than it just being quiet
+ # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
+ boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
+ boost_mask *= well_aligned_mask
+
+ # convert a token classification into a mask that can be applied directly to samples
+ # unlike the input, the output isn't a boolean array but an array of floats
+ def token_mask_to_sample_mask(token_mask):
+ description_timings = video_timings[1:-1][token_mask[1:-1]]
+ sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
+ window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
+    window_size_samples = 2 * window_radius + 1
+    bump = scipy.signal.windows.hann(window_size_samples)
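+    # each flagged token contributes a Hann bump, so the mask ramps in and out smoothly instead of switching abruptly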
+ for description_timing in description_timings:
+ window_center = int(description_timing * AUDIO_SAMPLE_RATE)
+ sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
+ return sample_mask
+
+ speech_sample_mask = token_mask_to_sample_mask(speech_mask)
+ boost_sample_mask = token_mask_to_sample_mask(boost_mask)
+ ad_timings = video_timings.copy()
+ ad_timings[~speech_mask] = np.inf
+
+ return speech_sample_mask, boost_sample_mask, ad_timings
+
+# Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
+def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
+ # PTS is the input frame's presentation timestamp, which is when frames are displayed
+ # TB is the timebase, which is how many seconds each unit of PTS corresponds to
+ # the output value of the expression will be the frame's new PTS
+ setts_cmd = ['TS']
+ start_skip = max(0, video_offset - start_key_frame)
+ if start_skip > 0:
+ # lossless cutting can only happen at key frames, so we cut the video before the audio starts
+ # but that means the video is behind the audio and needs to catch up by playing quicker
+ # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
+ catchup_spread = 1./CATCHUP_RATE
+ setts_cmd.append(f'+clip(TS-STARTPTS,0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
+ elif video_offset < 0:
+ # if the audio starts before the video, stretch the first frame of the video back to meet it
+ setts_cmd.append(f'+clip(TS-STARTPTS,0,{-video_offset/10000.}/TB)*10000')
+ # each segment of the linear fit can be encoded as a single clip function
+ setts_cmd.append('+(0')
+ for clip_start, clip_end in clips:
+ audio_desc_start, video_start = smooth_path[clip_start]
+ audio_desc_end, video_end = smooth_path[clip_end]
+ video_start -= start_key_frame
+ video_end -= start_key_frame
+ audio_desc_length = audio_desc_end - audio_desc_start
+ video_length = video_end - video_start
+ slope = audio_desc_length / video_length
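+    # e.g. if the audio segment is 2% longer (slope = 1.02), frames inside the segment are delayed by
+    # 0.02 s per second of video, and every later frame by the segment's full length difference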
+ setts_cmd.append(f'+clip(TS-STARTPTS-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
+ setts_cmd.append(')')
+ setts_cmd = ''.join(setts_cmd)
+ return setts_cmd
+
+def get_ffmpeg():
+ return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[0]
+
+def get_ffprobe():
+ return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
+
+def get_closest_key_frame_time(video_file, time):
+ if time <= 0:
+ return 0
+ key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='v',
+ show_frames=None, skip_frame='nokey')['frames']
+ key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
+ return np.max(key_frame_times[key_frame_times <= time])
+
+# outputs a new media file with the replaced audio (which includes audio descriptions)
+def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
+ setts_cmd=None, start_key_frame=None):
+ if audio_desc_file is None:
+ media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
+ ac=2, ar=AUDIO_SAMPLE_RATE)
+ if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
+ write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
+ else:
+ original_video = ffmpeg.input(video_file)
+ # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
+ # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
+ # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
+ acodec='copy', vcodec='copy', scodec='copy',
+ max_interleave_delta='0', loglevel='fatal',
+ **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
+ ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
+ ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
+ ffmpeg_caller.stdin.close()
+ ffmpeg_caller.wait()
+ else:
+ media_input = ffmpeg.input(audio_desc_file)
+ audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
+ show_entries='format=duration')['streams']
+ audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
+ original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
+ if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
+ # wav files don't have codecs compatible with most video containers, so we convert to aac
+ audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
+ acodec=audio_codec, vcodec='copy', scodec='copy',
+ max_interleave_delta='0', loglevel='fatal',
+ **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
+ 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
+ write_command.run(cmd=get_ffmpeg())
+ else:
+      # workaround for a bug that sometimes breaks setts when output and input formats differ
+ # the trick is separating the input and output by piping from one ffmpeg process into another
+ # mkv files break if 'nut' is used, while other files break when 'matroska' is used
+ format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
+ write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
+ c='copy', loglevel='fatal')
+ ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
+ pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
+ write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
+ max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
+ **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
+ 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
+ ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
+ while True:
+ in_bytes = ffmpeg_caller.stdout.read(100000)
+ if not in_bytes:
+ break
+ ffmpeg_caller2.stdin.write(in_bytes)
+ ffmpeg_caller2.stdin.close()
+ ffmpeg_caller.wait()
+ ffmpeg_caller2.wait()
+
+
+# check whether static_ffmpeg has already installed ffmpeg and ffprobe
+def is_ffmpeg_installed():
+ ffmpeg_dir = static_ffmpeg.run.get_platform_dir()
+ indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
+ return os.path.exists(indicator_file)
+
+# combines videos with matching audio files (e.g. audio descriptions)
+# this is the main function of this script; it calls the other functions in order
+def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
+ boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
+ prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
+ alignment_dir=default_alignment_dir, extension="copy", display_func=None):
+ video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
+
+ if yes == False and sum(video_file_types) > 0:
+ print("")
+ print("One or more audio files found in video input. Was this intentional?")
+ print("If not, press ctrl+c to kill this script.")
+ input("If this was intended, press Enter to continue...")
+ print("")
+ audio_desc_files, _ = get_sorted_filenames(audio, AUDIO_EXTENSIONS)
+ if len(video_files) != len(audio_desc_files):
+ error_msg = ["Number of valid files in input paths are not the same.",
+ f"The video path has {len(video_files)} files",
+ f"The audio path has {len(audio_desc_files)} files"]
+ raise RuntimeError("\n".join(error_msg))
+
+ ensure_folders_exist([output_dir], display_func)
+ if PLOT_ALIGNMENT_TO_FILE:
+ ensure_folders_exist([alignment_dir], display_func)
+
+ display("", display_func)
+ for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
+ display(os.path.split(video_file)[1], display_func)
+ display(os.path.split(audio_desc_file)[1], display_func)
+ display("", display_func)
+ if yes == False:
+ print("Are the above input file pairings correct?")
+ print("If not, press ctrl+c to kill this script.")
+ input("If they are correct, press Enter to continue...")
+ print("")
+
+ # if ffmpeg isn't installed, install it
+ if not is_ffmpeg_installed():
+ display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
+ get_ffmpeg()
+ if not is_ffmpeg_installed():
+      raise RuntimeError("Failed to install ffmpeg.")
+ display("Successfully installed ffmpeg.", display_func)
+
+ display("Processing files:", display_func)
+
+ for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
+ video_file_types):
+ # Default is to use the input video's extension for the output video
+ if extension is None or extension in ["", "copy"]:
+ ext = os.path.splitext(video_file)[1]
+ else:
+ # add a dot to the extension if it's missing
+ ext = ('' if extension[0] == '.' else '.') + extension
+ output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
+ output_filename = os.path.join(output_dir, output_filename)
+ display(" " + output_filename, display_func)
+
+ if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
+ display(" output file already exists, skipping...", display_func)
+ continue
+
+ video_arr = parse_audio_from_file(video_file)
+ audio_desc_arr = parse_audio_from_file(audio_desc_file)
+ video_spec_raw, video_timings = tokenize_audio(video_arr)
+ video_spec = normalize_spec(video_spec_raw)
+ audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw)
+
+ # rescale RMS intensity of audio to match video
+ audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
+
+ path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
+
+ smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
+
+ cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
+
+ ad_timings = None
+ if stretch_audio:
+ if keep_non_ad:
+ video_arr_original = video_arr.copy()
+
+ replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
+ del audio_desc_arr
+
+ if keep_non_ad or boost != 0:
+ outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
+ smooth_path, ad_detect_sensitivity, boost_sensitivity)
+ speech_sample_mask, boost_sample_mask, ad_timings = outputs
+ if keep_non_ad:
+ video_arr *= speech_sample_mask
+ video_arr += video_arr_original * (1 - speech_sample_mask)
+ del video_arr_original
+ del speech_sample_mask
+ else:
+ ad_timings = None
+ if boost != 0:
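+          # e.g. boost=3 scales masked samples by 10**(3/10) ~= 2.0, matching the "2x louder" wording in the help text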
+ video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
+ del boost_sample_mask
+
+ # prevent peaking by rescaling to within +/- 16,382
+ video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
+
+ if video_filetype == 0:
+ write_replaced_media_to_disk(output_filename, video_arr, video_file)
+ else:
+ write_replaced_media_to_disk(output_filename, video_arr)
+ else:
+ if video_filetype == 1:
+ raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
+ if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
+ raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
+ video_offset = np.diff(smooth_path[clips[0][0]])[0]
+ start_key_frame = get_closest_key_frame_time(video_file, video_offset)
+ setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
+ write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
+ setts_cmd, start_key_frame)
+
+ del video_arr
+ if PLOT_ALIGNMENT_TO_FILE:
+ plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
+ plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings)
+ display("All files processed.", display_func)
+
+def write_config_file(config_path, settings):
+ config = configparser.ConfigParser()
+ config.add_section('alignment')
+ config['alignment'] = {}
+ for key, value in settings.items():
+ config['alignment'][key] = str(value)
+ with open(config_path, 'w') as f:
+ config.write(f)
+
+def read_config_file(config_path):
+ config = configparser.ConfigParser()
+ config.read(config_path)
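+  # missing settings fall back to the same defaults as the command-line flags;
+  # if the config file has no 'alignment' section yet, one is written out with those defaults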
+ settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
+ 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
+ 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
+ 'boost': config.getfloat('alignment', 'boost', fallback=0),
+ 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
+ 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
+ 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
+ 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
+ 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
+ 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
+ 'extension': config.get('alignment', 'extension', fallback='copy')}
+ if not config.has_section('alignment'):
+ write_config_file(config_path, settings)
+ return settings
+
+def settings_gui(config_path):
+ settings = read_config_file(config_path)
+ layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
+ [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
+ tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
+ 'file type of the corresponding input video. Default is "copy".')]])],
+ [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
+ tooltip='Output file name prepend text. Default is "ad_"')]])],
+ [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
+ tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
+ sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
+ [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
+ tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
+ sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
+ [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
+ sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
+ tooltip='Lower values make the alignment more accurate when there are skips ' + \
+ '(e.g. describer pauses), but also make it more likely to misalign. ' + \
+ 'Default is 50.')]])],
+ [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
+ tooltip='Stretches the input audio to fit the input video. ' + \
+ 'Default is to stretch the video to fit the audio.')],
+ [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
+ disabled=not settings['stretch_audio'],
+ tooltip='Tries to only replace segments with audio description. Useful if ' + \
+ 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')],
+ [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
+ sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
+ key='boost', disabled=not settings['stretch_audio'],
+ tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
+ '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
+ [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
+ sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
+ key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
+ tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
+ '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
+ [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
+ key='boost_sensitivity', disabled=not settings['stretch_audio'],
+ tooltip='Higher values make --boost less likely to miss a description, but ' + \
+ 'also make it more likely to boost non-description audio. Default is 0.4')]])],
+ [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
+ disabled=not settings['stretch_audio'],
+ tooltip='Skips pitch correction step when stretching audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')],
+ [sg.Column([[sg.Submit('Save', pad=(40,3)),
+ sg.Button('Cancel')]], pad=((135,3),10))]]
+ settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
+ settings_window['extension'].set_focus()
+ while True:
+ event, values = settings_window.read()
+ if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
+ break
+ if event == 'stretch_audio':
+ # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
+ if IS_RUNNING_WINDOWS:
+ settings_window['boost'].Update(disabled = values['stretch_audio'])
+ settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
+ settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
+ else:
+ settings_window['boost'].Update(disabled = not values['stretch_audio'])
+ settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
+ settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
+ settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
+ settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
+ if event == 'Save':
+ settings = values.copy()
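+ # remove the FolderBrowse helper keys so only real settings get written to the config file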
+ del settings['output_browse']
+ del settings['alignment_browse']
+ write_config_file(config_path, settings)
+ break
+ settings_window.close()
+
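+# runs combine() and forwards any exception traceback to the GUI through the print queue
+# (used as the target of the worker process spawned by combine_gui)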
+def combine_print_exceptions(print_queue, *args, **kwargs):
+ try:
+ combine(*args, **kwargs)
+ except Exception:
+ print_queue.put(traceback.format_exc())
+ # raise
+
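+# runs the combiner in a separate process and streams its console output into a scrolling text window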
+def combine_gui(video_files, audio_files, config_path):
+ output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
+ layout = [[output_textbox],
+ [sg.Button('Close', pad=(360,5))]]
+ combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
+ disable_close=True, finalize=True)
+ output_textbox.update('Combining media files:', append=True)
+ print_queue = multiprocessing.Queue()
+
+ settings = read_config_file(config_path)
+ settings.update({'display_func':print_queue.put, 'yes':True})
+ proc = multiprocessing.Process(target=combine_print_exceptions,
+ args=(print_queue, video_files, audio_files),
+ kwargs=settings, daemon=True)
+ proc.start()
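+ # poll the worker's output queue and the window's events (100 ms timeout) until the worker
+ # finishes or the user closes the window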
+ while True:
+ # if the script isn't running anymore, re-enable the default close window button
+ if not proc.is_alive():
+ combine_window.DisableClose = False
+ if not print_queue.empty():
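+ # on the Wx backend, save and restore the text caret's insertion point across the append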
+ if IS_RUNNING_WINDOWS:
+ cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
+ output_textbox.update('\n' + print_queue.get(), append=True)
+ if IS_RUNNING_WINDOWS:
+ output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
+ event, values = combine_window.read(timeout=100)
+ # window closed event isn't always emitted, so also manually check window status
+ if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
+ if proc.is_alive():
+ proc.terminate()
+ break
+ if event == 'Close':
+ if not proc.is_alive():
+ combine_window.DisableClose = False
+ break
+ selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
+ if selection != 'Yes':
+ continue
+ proc.terminate()
+ combine_window.DisableClose = False
+ break
+ combine_window.close()
+
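+# builds the main window: pick video and audio files, then Combine or adjust Settings
+# (settings persist in config.ini next to this script)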
+def main_gui():
+ config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')
+ sg.theme('Light Blue 2')
+
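+ # build file-type filters for the browse dialogs; the pattern separator differs between
+ # the Wx (Windows) and Qt backends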
+ filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
+ all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
+ all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
+ all_video_and_audio_file_types = [('All Video and Audio File Types',
+ '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
+ audio_file_types = [(ext, "*." + ext) for ext in AUDIO_EXTENSIONS]
+ video_and_audio_file_types = [(ext, "*." + ext) for ext in VIDEO_EXTENSIONS] + audio_file_types
+ audio_file_types = all_audio_file_types + audio_file_types
+ video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
+ # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
+ if IS_RUNNING_WINDOWS:
+ file_fix = lambda file_types: file_types[:1] + [('|' + ftype[0], ftype[1]) for ftype in file_types[1:]]
+ audio_file_types = file_fix(audio_file_types)
+ video_and_audio_file_types = file_fix(video_and_audio_file_types)
+
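+ # main window layout: one row each for video and audio file selection, plus Combine and Settings buttons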
+ layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
+ [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
+ sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
+ tooltip='List video filenames here, in order, separated by semicolons'),
+ sg.FilesBrowse(button_text="Browse Video",
+ file_types=video_and_audio_file_types,
+ tooltip='Select one or more video files')]], pad=(2,7))],
+ [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
+ sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
+ tooltip='List audio filenames here, in order, separated by semicolons'),
+ sg.FilesBrowse(button_text="Browse Audio",
+ file_types=audio_file_types,
+ tooltip='Select one or more audio files')]], pad=(2,7))],
+ [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
+ sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
+ pad=((135,3),10))]]
+ window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
+ window['-VIDEO_FILES-'].set_focus()
+ while True:
+ event, values = window.read()
+ if event == 'Combine':
+ if len(values['-VIDEO_FILES-']) == 0 or \
+ len(values['-AUDIO_FILES-']) == 0:
+ window.disable()
+ sg.Popup('Error: empty input field.', font=('Arial', 20))
+ window.enable()
+ continue
+ video_files = values['-VIDEO_FILES-'].split(';')
+ audio_files = values['-AUDIO_FILES-'].split(';')
+ combine_gui(video_files, audio_files, config_path)
+ if event == 'Settings':
+ window.disable()
+ settings_gui(config_path)
+ window.enable()
+ if event == sg.WIN_CLOSED:
+ break
+ window.close()
+
+# Entry point for command line interaction, for example:
+# > describealign video.mp4 audio_desc.mp3
+def command_line_interface():
+ # override the argument parser's error handler so that running with no input arguments
+ # (e.g. launching the executable directly) starts the GUI instead of printing a usage error
+ class ArgumentParser(argparse.ArgumentParser):
+ def error(self, message):
+ if 'required: video, audio' in message:
+ print('No input arguments detected, starting GUI...')
+ main_gui()
+ self.exit()
+ else:
+ self.exit(2, f'{self.prog}: error: {message}\n')
+ parser = ArgumentParser(description="Replaces a video's sound with an audio description.",
+ usage="describealign video_file.mp4 audio_file.mp3")
+ parser.add_argument("video", help='A video file or directory containing video files.')
+ parser.add_argument("audio", help='An audio file or directory containing audio files.')
+ parser.add_argument('--smoothness', type=float, default=50,
+ help='Lower values make the alignment more accurate when there are skips ' + \
+ '(e.g. describer pauses), but also make it more likely to misalign. ' + \
+ 'Default is 50.')
+ parser.add_argument('--stretch_audio', action='store_true',
+ help='Stretches the input audio to fit the input video. ' + \
+ 'Default is to stretch the video to fit the audio.')
+ parser.add_argument('--keep_non_ad', action='store_true',
+ help='Tries to replace only the segments containing audio description. Useful if the ' + \
+ 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
+ parser.add_argument('--boost', type=float, default=0,
+ help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
+ '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
+ parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
+ help='Audio description detection sensitivity ratio. Higher values make ' + \
+ '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
+ parser.add_argument('--boost_sensitivity', type=float, default=.4,
+ help='Higher values make --boost less likely to miss a description, but ' + \
+ 'also make it more likely to boost non-description audio. Default is 0.4')
+ parser.add_argument('--yes', action='store_true',
+ help='Auto-skips user prompts asking to verify information.')
+ parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
+ parser.add_argument('--no_pitch_correction', action='store_true',
+ help='Skips pitch correction step when stretching audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
+ parser.add_argument("--output_dir", default=default_output_dir,
+ help='Directory combined output media is saved to. Default is "videos_with_ad"')
+ parser.add_argument("--alignment_dir", default=default_alignment_dir,
+ help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
+ parser.add_argument("--extension", default="copy",
+ help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
+ 'file type of the corresponding input video. Default is "copy".')
+ args = parser.parse_args()
+
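+ # hand the parsed options to combine(), which performs the alignment and writes the output files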
+ combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
+ args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
+ args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
+ args.extension)
+
+# allows the script to be run on its own, rather than through the package, for example:
+# python3 describealign.py video.mp4 audio_desc.mp3
+if __name__ == "__main__":
+ multiprocessing.freeze_support()
+ command_line_interface()
+
+
+
+
diff --git a/pyproject.toml b/pyproject.toml
index 3a8ccb9..48d21e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,27 +1,27 @@
-[build-system]
-requires = ["setuptools>=61.0"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "describealign"
-authors = [{ name = "Julian Brown", email = "julbean@proton.me" }]
-description = "Combines videos with matching audio files (e.g. audio descriptions)"
-readme = "README.md"
-requires-python = ">=3.8"
-classifiers = [
- "Programming Language :: Python :: 3",
- "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
- "Operating System :: OS Independent",
-]
-dynamic = ["version", "dependencies"]
-
-[tool.setuptools.dynamic]
-version = { file = "version" }
-dependencies = { file = "requirements.txt" }
-
-[project.scripts]
-describealign = "describealign:command_line_interface"
-
-[project.urls]
-"Homepage" = "/~https://github.com/julbean/describealign"
-"Bug Tracker" = "/~https://github.com/julbean/describealign/issues"
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "describealign"
+authors = [{ name = "Julian Brown", email = "julbean@proton.me" }]
+description = "Combines videos with matching audio files (e.g. audio descriptions)"
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+ "Operating System :: OS Independent",
+]
+dynamic = ["version", "dependencies"]
+
+[tool.setuptools.dynamic]
+version = { file = "version" }
+dependencies = { file = "requirements.txt" }
+
+[project.scripts]
+describealign = "describealign:command_line_interface"
+
+[project.urls]
+"Homepage" = "/~https://github.com/julbean/describealign"
+"Bug Tracker" = "/~https://github.com/julbean/describealign/issues"
diff --git a/requirements.txt b/requirements.txt
index 3bd4dd0..af751a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
-ffmpeg_python~=0.2.0
-static-ffmpeg~=2.5
-matplotlib~=3.5.0
-numpy~=1.21.4
-python_speech_features~=0.6
-scipy~=1.10.1
-pytsmod~=0.3.7
-PySimpleGUIWx~=0.17.2; platform_system == 'Windows'
-PySimpleGUIQt~=0.35.0; platform_system != 'Windows'
-PySide2~=5.15.2.1; platform_system != "Windows"
+ffmpeg_python~=0.2.0
+static-ffmpeg~=2.5
+matplotlib~=3.5.0
+numpy~=1.21.4
+python_speech_features~=0.6
+scipy~=1.10.1
+pytsmod~=0.3.7
+PySimpleGUIWx~=0.17.2; platform_system == 'Windows'
+PySimpleGUIQt~=0.35.0; platform_system != 'Windows'
+PySide2~=5.15.2.1; platform_system != "Windows"
diff --git a/setup.cfg b/setup.cfg
index 51749fc..96fadd5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
-[egg_info]
-tag_build =
-tag_date = 0
+[egg_info]
+tag_build =
+tag_date = 0