diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..9d6f1d0
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,6 @@
+# Git Attributes (https://git-scm.com/docs/gitattributes)
+# Default git attributes
+* text=auto
+
+# Overrides
+*.png -text
diff --git a/describealign.py b/describealign.py
index 9fdc621..717baea 100644
--- a/describealign.py
+++ b/describealign.py
@@ -1,1250 +1,1250 @@
-# combines videos with matching audio files (e.g. audio descriptions)
-# input: video or folder of videos and an audio file or folder of audio files
-# output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
-# this script aligns the new audio to the video using the video's old audio
-# first, the video's sound and the audio file are both converted to spectrograms
-# second, the two spectrograms are roughly aligned by finding their longest common subsequence
-# third, the rough alignment is denoised through L1-Minimization
-# fourth, the spectrogram alignments determine where the new audio replaces the old
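-# (when --stretch_audio is not set, the video is instead retimed losslessly to fit the new audio)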
-
-'''
-Copyright (C) 2023 Julian Brown
-
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <https://www.gnu.org/licenses/>.
-'''
-
-# Nuitka build options:
-# nuitka-project-if: {OS} != "Windows":
-# nuitka-project: --enable-plugins=pyside2
-#
-# Compilation mode: standalone everywhere, except on macOS, where an app bundle is created.
-# nuitka-project-if: {OS} == "Darwin":
-# nuitka-project: --standalone
-# nuitka-project: --macos-create-app-bundle
-# Mac apparently needs onefile too, because the pyside2 plugin requires it.
-# All other platforms need it too, so set it universally.
-# nuitka-project: --onefile
-#
-# Debugging options, controlled via environment variable at compile time.
-# nuitka-project-if: os.getenv("DEBUG_COMPILATION", "no") == "yes":
-# nuitka-project: --enable-console
-# nuitka-project-else:
-# nuitka-project: --disable-console
-
-# Set app icon
-# nuitka-project-if: {OS} == "Windows":
-# nuitka-project: --windows-icon-from-ico=describealign.png
-# nuitka-project-else:
-# nuitka-project-if: {OS} == "Darwin":
-# nuitka-project: --macos-app-icon=describealign.png
-# nuitka-project-else:
-# nuitka-project: --linux-icon=describealign.png
-# End Nuitka build options
-
-VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob'])
-AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
-PLOT_ALIGNMENT_TO_FILE = True
-
-TIMESTEP_SIZE_SECONDS = .16
-TIMESTEP_OVERLAP_RATIO = .5
-AUDIO_SAMPLE_RATE = 44100
-MEL_COEFFS_PER_TIMESTEP = 25
-DITHER_PERIOD_STEPS = 60
-MIN_CORR_FOR_TOKEN_MATCH = .6
-GAP_START_COST = 1.0
-GAP_EXTEND_COST = -.01
-GAP_EXTEND_DIAG_BONUS = -.01
-SKIP_MATCH_COST = .1
-MAX_RATE_RATIO_DIFF_ALIGN = .1
-PREF_CUT_AT_GAPS_FACTOR = 5
-MIN_DURATION_TO_REPLACE_SECONDS = 2
-MIN_START_END_SYNC_TIME_SECONDS = 2
-MAX_START_END_SYNC_ERR_SECONDS = .2
-MAX_RATE_RATIO_DIFF_BOOST = .003
-MIN_DESC_DURATION = .5
-MAX_GAP_IN_DESC_SEC = 1.5
-JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
-CATCHUP_RATE = 5
-
-if PLOT_ALIGNMENT_TO_FILE:
- import matplotlib.pyplot as plt
-import argparse
-import os
-import glob
-import itertools
-import datetime
-import numpy as np
-import ffmpeg
-import static_ffmpeg
-import python_speech_features as psf
-import scipy.signal
-import scipy.optimize
-import scipy.interpolate
-import scipy.ndimage as nd
-import scipy.sparse
-import pytsmod
-import configparser
-import traceback
-import multiprocessing
-import platform
-
-IS_RUNNING_WINDOWS = platform.system() == 'Windows'
-if IS_RUNNING_WINDOWS:
- import PySimpleGUIWx as sg
- default_output_dir = 'videos_with_ad'
- default_alignment_dir = 'alignment_plots'
-else:
- import PySimpleGUIQt as sg
- default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
- default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
-
-def display(text, func=None):
- if func:
- func(text)
- print(text)
-
-def throw_runtime_error(text, func=None):
- if func:
- func(text)
- raise RuntimeError(text)
-
-def ensure_folders_exist(dirs, display_func=None):
- for dir in dirs:
- if not os.path.isdir(dir):
- display("Directory not found, creating it: " + dir, display_func)
- os.makedirs(dir)
-
-def get_sorted_filenames(path, extensions, alt_extensions=set([])):
- # path could be three different things: a file, a directory, a list of files
- if type(path) is list:
- files = [os.path.abspath(file) for file in path]
- for file in files:
- if not os.path.isfile(file):
- raise RuntimeError(f"No file found at input path:\n {file}")
- else:
- path = os.path.abspath(path)
- if os.path.isdir(path):
- files = glob.glob(glob.escape(path) + "/*")
- if len(files) == 0:
- raise RuntimeError(f"Empty input directory:\n {path}")
- else:
- if not os.path.isfile(path):
- raise RuntimeError(f"No file or directory found at input path:\n {path}")
- files = [path]
- files = [file for file in files if os.path.splitext(file)[1][1:] in extensions | alt_extensions]
- if len(files) == 0:
- error_msg = [f"No files with valid extensions found at input path:\n {path}",
- "Did you accidentally put the audio filepath before the video filepath?",
- "The video path should be the first positional input, audio second.",
- "Or maybe you need to add a new extension to this script's list of valid extensions?",
- f"valid extensions for this input are:\n {extensions}"]
- raise RuntimeError("\n".join(error_msg))
- files = sorted(files)
- file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
- return files, file_types
-
-# read audio from file with ffmpeg and convert to numpy array
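-# returns a float32 array of shape (2, num_samples): stereo samples at AUDIO_SAMPLE_RATE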
-def parse_audio_from_file(media_file):
- media_stream, _ = (ffmpeg
- .input(media_file)
- .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
- .run(capture_stdout=True, cmd=get_ffmpeg())
- )
- media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
- return media_arr
-
-# tokenize audio by transforming with a mel-frequency cepstrum (MFC)
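-# returns (media_spec, timings_seconds): media_spec holds MEL_COEFFS_PER_TIMESTEP
-# coefficients per timestep, timings_seconds gives each analysis window's center time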
-def tokenize_audio(media_arr, rate=1):
- step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
- window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
- window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
- fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
- get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
- samplerate=AUDIO_SAMPLE_RATE,
- winlen=window_size_seconds,
- winstep=TIMESTEP_SIZE_SECONDS * rate,
- numcep=MEL_COEFFS_PER_TIMESTEP,
- nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
- nfft=fft_size_samples,
- winfunc=scipy.signal.windows.hann)
- num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
- media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
- chunk_size = 1000
- for chunk_index in np.arange(0, num_timesteps, chunk_size):
- chunk_bounds_samples = ((chunk_index ) * step_size_samples,
- (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
- media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
- '''
- # alternate python library's MFC implementation
- import librosa
- media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
- sr=AUDIO_SAMPLE_RATE,
- n_mfcc=MEL_COEFFS_PER_TIMESTEP,
- lifter=22,
- n_fft=fft_size_samples,
- hop_length=step_size_samples,
- win_length=window_size_samples,
- window=scipy.signal.windows.hann).T
- num_timesteps = media_spec.shape[0]
- '''
- timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
- timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
- return media_spec, timings_seconds
-
-# same as tokenize_audio, but dithering the MFC window timings
-# this allows for finer alignment by ameliorating discretization error
-def tokenize_audio_dither(media_arr, slow_timings):
- # choose a relative step size slightly less than 1 to ameliorate quantization error
- # maximize alignment accuracy by using least approximable number with desired period
- # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
- fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
- fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
-
- # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
- # by approximately equalizing the number of tokens per unit time between dithered and undithered
- # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
- # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
- fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
- fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
- return fast_spec, fast_timings
-
-# normalize along both time and frequency axes to allow comparing tokens by correlation
-def normalize_spec(media_spec_raw, axes=(0,1)):
- media_spec = media_spec_raw.copy()
- for axis in axes:
- norm_func = np.std if axis == 0 else np.linalg.norm
- media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
- media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
- return media_spec
-
-# vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
-# modified to include affine gap penalties and skip+match options (i.e. knight's moves)
-# gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
-# or when the audio description includes a commercial break or an extra scene
-# the skip+match option allows for micro-adjustments without eating the full gap penalty
-# skip+match is primarily useful in maintaining alignment when the rates differ slightly
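-# implementation notes: pred_matrix layer 0 is the match state, layer 1 the gap state;
-# token correlations are computed one audio-description row at a time against all video
-# tokens, and only a rolling window of three rows of path scores is kept in memory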
-def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
- pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
- 1:lambda node: (0, node[1]-2, node[2]-1),
- 2:lambda node: (0, node[1]-1, node[2]-2),
- 3:lambda node: (1, node[1]-1, node[2]-1),
- 4:lambda node: (0, node[1] , node[2] ),
- 5:lambda node: (1, node[1]-1, node[2] ),
- 6:lambda node: (1, node[1]-1, node[2]-1),
- 7:lambda node: (1, node[1] , node[2]-1)}
- pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
- pred_matrix[0,1:,:2] = 0
- pred_matrix[1,1:,:2] = 4
- pred_matrix[:,0,:2] = [0,5]
- path_corrs_match = np.zeros((3, video_spec.shape[0]))
- path_corrs_gap = np.zeros((3, video_spec.shape[0]))
- corrs = np.zeros((3, video_spec.shape[0]))
- corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
- for i in range(audio_desc_spec.shape[0]):
- i_mod = i % 3
- match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
- path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
- path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
- path_corrs_gap[ i_mod-1][1:-1][:,None]])
- pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
- path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
- corrs = np.roll(corrs, -1, axis=1)
- corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
- fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
- fisher_infos[fisher_infos < 0] = 0
- fisher_infos[fisher_infos > 10] = 10
- row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
- path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
- gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2: ][:,None] - GAP_START_COST,
- path_corrs_gap[i_mod-1][2: ][:,None],
- path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
- GAP_EXTEND_COST])
- pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
- path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
- pred_matrix[1][i][2:] += 4
- path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
- GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
- GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
- pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
- path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
-
- # reconstruct optimal path by following predecessors backwards through the table
- end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
- path_corrs_gap[ i_mod,-1]])
- cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
- get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
- path = []
- visited = set()
- while min(cur_node[1:]) >= 0:
- cur_node, last_node = get_predecessor(cur_node), cur_node
- # failsafe to prevent an infinite loop that should never happen anyway
- if cur_node in visited:
- break
- visited.add(cur_node)
- if last_node[0] == 0:
- path.append(last_node[1:])
- path = path[::-1]
-
- # determine how much information this node gives about the alignment
- # a larger double derivative means more precise timing information
- # sudden noises give more timing information than droning sounds
- def get_fisher_info(node):
- i,j = node
- if node[0] >= audio_desc_spec.shape[0]-1 or \
- node[1] >= video_spec.shape[0]-1 or \
- min(node) <= 0:
- return 0
- info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
- np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
- np.dot(audio_desc_spec[i+1],video_spec[j-1])
- info /= min(.2, TIMESTEP_SIZE_SECONDS)
- return info
-
- # the quality of a node combines the correlation of its tokens
- # with how precisely the match is localized in time
- def get_match_quality(node):
- # correlations are between -1 and 1, as all tokens have unit norm
- token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
- fisher_info = min(max(0, get_fisher_info(node)), 10)
- return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
-
- # filter out low match quality nodes from LCS path
- quals = [get_match_quality(node) for node in path]
- if len(quals) == 0 or max(quals) <= 0:
- raise RuntimeError("Rough alignment failed, are the input files mismatched?")
- path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
-
- # convert units of path nodes from timesteps to seconds
- path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
-
- return path, quals
-
-# chunk path segments of similar slope into clips
-# a clip has the form: (start_index, end_index)
-def chunk_path(smooth_path, tol):
- x,y = zip(*smooth_path)
- slopes = np.diff(y) / np.diff(x)
- median_slope = np.median(slopes)
- slope_changes = np.diff(slopes)
- breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
- breaks = [0] + list(breaks) + [len(x)-1]
- clips = list(zip(breaks[:-1], breaks[1:]))
- return clips, median_slope, slopes
-
-# find piece-wise linear alignment that minimizes the weighted combination of
-# total absolute error at each node and total absolute slope change of the fit
-# distance between nodes and the fit (i.e. errors) are weighted by node quality
-# absolute slope changes are differences between the slopes of adjacent fit lines
-# slope changes are weighted much more than node errors to smooth out noise
-# the main source of noise is rough alignment drift while the describer is speaking
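-# in effect this minimizes sum_i qual_i * |fit_err_i| + sum_i cost_i * |slope_change_i|,
-# where cost_i scales with the smoothness parameter and is reduced at likely gap locations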
-def smooth_align(path, quals, smoothness):
- # rotate basis to make vertical and horizontal slopes "cost" the same
- # the new horizontal axis is x+y and the new vertical is -x+y
- # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
- # after this transformation, we instead have -1 <= slope < 1
- # perfectly matching audio has pre-transformation slope = 1
- # after this transformation, it instead has slope = 0
- rotated_path = [(x+y,-x+y) for x,y in path]
-
- # stretch the x axis to make all slopes "cost" nearly the same
- # without this, small changes to the slope at slope = +/-1
- # cost sqrt(2) times as much as small changes at slope = 0
- # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
- # the small angle approximation means these slopes all cost roughly the same
- x_stretch_factor = 10.
- rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
-
- # L1-Minimization to solve the alignment problem using a linear program
- # the absolute value functions needed for "absolute error" can be represented
- # in a linear program by splitting variables into positive and negative pieces
- # and constraining each to be positive (done by default in scipy's linprog)
- # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
- # fit_err[i] = path[i][1] - y_fit[i]
- # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
- # (y_fit[i+1] - y_fit[i ])/(path[i+1][0] - path[i ][0])
- # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
- # y_fit[i] = path[i][1] - fit_err[i]
- # this gives:
- # slope_change[i] = path_half[i] - fit_err_half[i]
- # where each half is just the original equation but y_fit is swapped out
- # the slope_change variables can then be set using equality constraints
- num_fit_points = len(rotated_stretched_path)
- x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
- x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
- y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
- slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
- slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
- slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
- slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
- slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
- c = np.hstack([quals,
- quals,
- slope_change_costs * x_stretch_factor,
- slope_change_costs * x_stretch_factor])
- fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
- -1. / x_diffs[:-1] - 1. / x_diffs[1:],
- 1. / x_diffs[1:]],
- offsets=[0,1,2],
- shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
- A_eq = scipy.sparse.hstack([ fit_err_coeffs,
- -fit_err_coeffs,
- scipy.sparse.eye(num_fit_points),
- -scipy.sparse.eye(num_fit_points)])
- b_eq = y_diffs[1: ] / x_diffs[1: ] - \
- y_diffs[ :-1] / x_diffs[ :-1]
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
- if not fit.success:
- print(fit)
- raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
-
- # combine fit_err_pos and fit_err_neg
- fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
- # subtract fit errors from nodes to retrieve the smooth fit's coordinates
- # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
- smooth_path = [(((x / x_stretch_factor) - y) / 2.,
- ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
-
- # clip off start/end of replacement audio if it doesn't match or isn't aligned
- # without this, describer intro/outro skips can cause mismatches at the start/end
- # the problem would be localized and just means audio might not match video at the start/end
- # instead we just keep the original video's audio in those segments if mismatches are detected
- # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
- # during audio replacement, synced edges will be extended backwards/forwards as far as possible
- # this is useful when the describer begins talking immediately (or before any alignable audio)
- # or when the describer continues speaking until the end (or no more alignable audio remains)
- # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
- max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
- smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
- smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
- smooth_err_path = zip(smoothed_fit_err, smooth_path)
- old_length = num_fit_points
- smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
- is_synced_at_start = len(smooth_err_path) == old_length
- old_length = len(smooth_err_path)
- smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
- is_synced_at_end = len(smooth_err_path) == old_length
- _, smooth_path = zip(*smooth_err_path)
- smooth_path = list(smooth_path)
- if is_synced_at_start:
- slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
- smooth_path.insert(0, (-10e10, -10e10 * slope))
- if is_synced_at_end:
- slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
- smooth_path.append((10e10, 10e10 * slope))
-
- clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
-
- # assemble clips with slopes within the rate tolerance into runs
- runs, run = [], []
- bad_clips = []
- for clip in clips:
- if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
- if len(run) > 0:
- runs.append(run)
- run = []
- bad_clips.append(clip)
- continue
- run.append(clip)
- if len(run) > 0:
- runs.append(run)
-
- return smooth_path, runs, bad_clips, clips
-
-# if the start or end were marked as synced during smooth alignment then
-# extend that alignment to the edge (i.e. to the start/end of the audio)
-def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
- if smooth_path[0][0] < -10e9:
- slope = smooth_path[0][1] / smooth_path[0][0]
- new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
- if new_start_point[1] < 0:
- new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
- smooth_path[0] = new_start_point
- if smooth_path[-1][0] > 10e9:
- video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
- audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
- slope = smooth_path[-1][1] / smooth_path[-1][0]
- new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
- if new_end_point[1] > video_runtime:
- new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
- smooth_path[-1] = new_end_point
-
-# visualize both the rough and smooth alignments
-def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings):
- scatter_color = [.2,.4,.8]
- lcs_rgba = np.zeros((len(quals),4))
- lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
- lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
- audio_times, video_times = np.array(path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- def expand_limits(start, end, ratio=.01):
- average = (end + start) / 2.
- half_diff = (end - start) / 2.
- half_diff *= (1 + ratio)
- return (average - half_diff, average + half_diff)
- plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
- plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
- np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
- plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
- audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- if ad_timings is None:
- plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
- bad_path = []
- for clip in bad_clips:
- bad_path.extend(smooth_path[clip[0]:clip[1]+1])
- bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
- audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
- audio_offsets = audio_times - video_times
- if len(audio_offsets) > 0:
- plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
- else:
- interp = scipy.interpolate.interp1d(video_times, audio_offsets,
- fill_value = np.inf,
- bounds_error = False, assume_sorted = True)
- plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
- video_times = ad_timings
- audio_offsets = interp(ad_timings)
- if len(audio_offsets) > 0:
- plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
- plt.xlabel('Video Time (minutes)')
- plt.ylabel('Audio Description Offset (seconds)')
- plt.title('Alignment')
- plt.legend().legendHandles[0].set_color(scatter_color)
- plt.tight_layout()
- plt.savefig(plot_filename_no_ext + '.png', dpi=400)
- plt.clf()
-
- with open(plot_filename_no_ext + '.txt', 'w') as file:
- rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
- video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
- print("Main changes needed to video to align it to audio input:", file=file)
- print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
- print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
- for clip_start, clip_end in rough_clips:
- audio_desc_start, video_start = smooth_path[clip_start]
- audio_desc_end, video_end = smooth_path[clip_end]
- slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
- def str_from_time(seconds):
- minutes, seconds = divmod(seconds, 60)
- hours, minutes = divmod(minutes, 60)
- return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
- print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
- f"{str_from_time(video_end)} aligning with audio from " + \
- f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
-
-# use the smooth alignment to replace runs of video sound with corresponding described audio
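-# each clip is either resampled via quadratic interpolation (when the rate difference is
-# imperceptible or pitch correction is disabled) or time-stretched with pytsmod's WSOLA,
-# which preserves pitch at the cost of extra computation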
-def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
- # perform quadratic interpolation of the audio description's waveform
- # this allows it to be stretched to match the corresponding video segment
- def audio_desc_arr_interp(samples):
- chunk_size = 10**7
- interpolated_chunks = []
- for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
- interp_bounds = (max(int(chunk[0]-2), 0),
- min(int(chunk[-1]+2), audio_desc_arr.shape[1]))
- interp = scipy.interpolate.interp1d(np.arange(*interp_bounds),
- audio_desc_arr[:,slice(*interp_bounds)],
- copy=False, bounds_error=False, fill_value=0,
- kind='quadratic', assume_sorted=True)
- interpolated_chunks.append(interp(chunk).astype(np.float32))
- return np.hstack(interpolated_chunks)
-
- # construct a stretched audio description waveform using the quadratic interpolator
- def get_interped_segment(run, interp):
- segment = []
- for clip in run:
- num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
- int(y[clip[0]] * AUDIO_SAMPLE_RATE)
- clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
- sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
- segment.append(interp(sample_points))
- segment = np.hstack(segment)
- return segment
-
- x,y = zip(*smooth_path)
- for run in runs:
- run_length_seconds = y[run[-1][1]] - y[run[0][0]]
- if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
- continue
- anchor_point_path_indices = [clip[0] for clip in run]
- anchor_point_path_indices.append(run[-1][1])
- anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
- np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
- slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
- for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
- # only apply pitch correction if the difference would be noticeable
- if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
- stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
- else:
- anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
- # account for quirks of pytsmod's wsola anchor point implementation
- anchor_point_pair[1][-1] -= 1
- anchor_y_offset = anchor_point_pair[1][0]
- anchor_point_pair[1,:] -= anchor_y_offset
- stretched_audio = pytsmod.wsola(audio_desc_arr, anchor_point_pair)
- video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
-
-# identify which segments of the replaced audio actually have the describer speaking
-# uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
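-# returns (speech_sample_mask, boost_sample_mask, ad_timings): the first two are per-sample
-# float masks built from Hann bumps around detected description tokens, while ad_timings
-# copies video_timings with non-description tokens set to infinity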
-def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
- smooth_path, detect_sensitivity, boost_sensitivity):
- # retokenize the audio description, which has been stretched to match the video
- audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
- # avoid boosting or training on mismatched segments, like those close to skips
- # assumes matching segments all have the same, constant play rate
- # could be modified to handle a multi-modal distribution of rates
- aligned_audio_times, aligned_video_times = zip(*smooth_path)
- interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
- fill_value = 'extrapolate',
- bounds_error = False, assume_sorted = True)
- slopes = (interp(video_timings + 1e-5) - \
- interp(video_timings - 1e-5)) / 2e-5
- median_slope = np.median(slopes)
- aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
- well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
-
- # first pass identification by assuming poorly matched tokens are describer speech
- # also assumes the describer doesn't speak very quietly
- corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
- smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
- audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
- speech_mask = (corrs < .2) * audio_desc_loud
-
- # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
- audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
- audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
- video_spec = normalize_spec(video_spec_raw, axes=(0,))
- video_spec = np.clip(video_spec / 6., -1, 1)
-
- # convert sampled features (e.g. spectrogram) to probability densities of each feature
- # when given a spectrogram, finds the distributions of the MFC coefficients
- def make_log_pdfs(arr):
- resolution = 100
- bins_per_spot = 4
- num_bins = int(resolution * bins_per_spot)
- uniform_prior_strength_per_spot = 1
- uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
- bin_range = (-1 - 1e-10, 1 + 1e-10)
- get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
- pdfs = np.apply_along_axis(get_hist, 1, arr.T)
- pdfs = pdfs + uniform_prior_strength_per_bin
- smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
- pdfs = np.apply_along_axis(smooth, 1, pdfs)
- pdfs = pdfs / np.sum(pdfs[0,:])
- log_pdfs = np.log(pdfs)
- bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
- return log_pdfs, bin_edges
-
- diff_spec = audio_desc_spec - video_spec
- diff_spec = np.clip(diff_spec, -1, 1)
-
- # Naive Bayes classifier to roughly estimate whether each token is describer speech
- desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
- nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
- lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
- lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
- np.digitize(diff_spec, bin_edges, right=True)-1]
- ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
- (np.sum((~speech_mask) * well_aligned_mask) + 1.)
- relative_probs = np.sum(lratios, axis=1)
- relative_probs /= np.std(relative_probs)
- relative_probs -= np.mean(relative_probs)
-
- # L1-Minimization to smoothly identify audio descriptions using a linear program
- # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
- # fit_err[i] = relative_probs[i] - y_fit[i]
- # delta_fit[i] = y_fit[i] - y_fit[i-1]
- # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
- # y_fit[i] = relative_probs[i] - fit_err[i]
- # this gives:
- # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
- # (fit_err[i] - fit_err[i-1])
- # the delta_fit variables can then be set using equality constraints
- num_fit_points = len(relative_probs)
- y_diffs = np.diff(relative_probs)
- pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
- neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
- c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
- np.ones(num_fit_points) / neg_err_cost_factor,
- np.ones(num_fit_points - 1) / 2.,
- np.ones(num_fit_points - 1) / 2.])
- fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
- np.ones(num_fit_points)],
- offsets=[0,1],
- shape=(num_fit_points - 1, num_fit_points)).tocsc()
- A_eq = scipy.sparse.hstack([ fit_err_coeffs,
- -fit_err_coeffs,
- scipy.sparse.eye(num_fit_points-1),
- -scipy.sparse.eye(num_fit_points-1)])
- b_eq = y_diffs
- fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
- if not fit.success:
- print(fit)
- raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
-
- # combine fit_err_pos and fit_err_neg
- fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
-
- # subtract fit errors from nodes to retrieve the smoothed fit
- smooth_desc_locations = relative_probs - fit_err
-
- # hard threshold to classify each token as describer speech or not
- speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
- speech_mask *= aligned_mask
-
- # a separate mask is created for describer volume boosting
- # as losing the describer's voice entirely is usually worse than it just being quiet
- # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
- boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
- boost_mask *= well_aligned_mask
-
- # convert a token classification into a mask that can be applied directly to samples
- # unlike the input, the output isn't a boolean array but an array of floats
- def token_mask_to_sample_mask(token_mask):
- description_timings = video_timings[1:-1][token_mask[1:-1]]
- sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
- window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
- window_size_samples = 2 * window_radius + 1
- bump = scipy.signal.windows.hann(window_size_samples)
- for description_timing in description_timings:
- window_center = int(description_timing * AUDIO_SAMPLE_RATE)
- sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
- return sample_mask
-
- speech_sample_mask = token_mask_to_sample_mask(speech_mask)
- boost_sample_mask = token_mask_to_sample_mask(boost_mask)
- ad_timings = video_timings.copy()
- ad_timings[~speech_mask] = np.inf
-
- return speech_sample_mask, boost_sample_mask, ad_timings
-
-# Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
-def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
- # PTS is the input frame's presentation timestamp, which is when frames are displayed
- # TB is the timebase, which is how many seconds each unit of PTS corresponds to
- # the output value of the expression will be the frame's new PTS
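- # illustrative shape of the result (made-up numbers), for a single segment with no start skip:
- #   TS+(0+clip(TS-STARTPTS-0.0000/TB,0,3600.0000/TB)*0.001234567)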
- setts_cmd = ['TS']
- start_skip = max(0, video_offset - start_key_frame)
- if start_skip > 0:
- # lossless cutting can only happen at key frames, so we cut the video before the audio starts
- # but that means the video is behind the audio and needs to catch up by playing quicker
- # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
- catchup_spread = 1./CATCHUP_RATE
- setts_cmd.append(f'+clip(TS-STARTPTS,0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
- elif video_offset < 0:
- # if the audio starts before the video, stretch the first frame of the video back to meet it
- setts_cmd.append(f'+clip(TS-STARTPTS,0,{-video_offset/10000.}/TB)*10000')
- # each segment of the linear fit can be encoded as a single clip function
- setts_cmd.append('+(0')
- for clip_start, clip_end in clips:
- audio_desc_start, video_start = smooth_path[clip_start]
- audio_desc_end, video_end = smooth_path[clip_end]
- video_start -= start_key_frame
- video_end -= start_key_frame
- audio_desc_length = audio_desc_end - audio_desc_start
- video_length = video_end - video_start
- slope = audio_desc_length / video_length
- setts_cmd.append(f'+clip(TS-STARTPTS-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
- setts_cmd.append(')')
- setts_cmd = ''.join(setts_cmd)
- return setts_cmd
-
-def get_ffmpeg():
- return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[0]
-
-def get_ffprobe():
- return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
-
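-# returns the time of the latest key frame at or before the given time (0 if none), via ffprobe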
-def get_closest_key_frame_time(video_file, time):
- if time <= 0:
- return 0
- key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='v',
- show_frames=None, skip_frame='nokey')['frames']
- key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
- return np.max(key_frame_times[key_frame_times <= time])
-
-# outputs a new media file with the replaced audio (which includes audio descriptions)
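-# two modes: when audio_desc_file is None, the replacement audio is piped in as raw samples
-# and muxed with the original video's streams; otherwise the original video is retimed with
-# ffmpeg's setts bitstream filter and muxed with the audio description file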
-def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
- setts_cmd=None, start_key_frame=None):
- if audio_desc_file is None:
- media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
- ac=2, ar=AUDIO_SAMPLE_RATE)
- if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
- write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
- else:
- original_video = ffmpeg.input(video_file)
- # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
- # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
- # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
- write_command = ffmpeg.output(media_input, original_video, output_filename,
- acodec='copy', vcodec='copy', scodec='copy',
- max_interleave_delta='0', loglevel='fatal',
- **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
- ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
- ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
- ffmpeg_caller.stdin.close()
- ffmpeg_caller.wait()
- else:
- media_input = ffmpeg.input(audio_desc_file)
- audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
- show_entries='format=duration')['streams']
- audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
- original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
- if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
- # wav files don't have codecs compatible with most video containers, so we convert to aac
- audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
- write_command = ffmpeg.output(media_input, original_video, output_filename,
- acodec=audio_codec, vcodec='copy', scodec='copy',
- max_interleave_delta='0', loglevel='fatal',
- **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
- 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
- write_command.run(cmd=get_ffmpeg())
- else:
- # workaround for a bug that sometimes breaks setts when output and input formats differ
- # the trick is separating the input and output by piping from one ffmpeg process into another
- # mkv files break if 'nut' is used, while other files break when 'matroska' is used
- format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
- write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
- c='copy', loglevel='fatal')
- ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
- pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
- write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
- max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
- **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
- 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
- ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
- while True:
- in_bytes = ffmpeg_caller.stdout.read(100000)
- if not in_bytes:
- break
- ffmpeg_caller2.stdin.write(in_bytes)
- ffmpeg_caller2.stdin.close()
- ffmpeg_caller.wait()
- ffmpeg_caller2.wait()
-
-
-# check whether static_ffmpeg has already installed ffmpeg and ffprobe
-def is_ffmpeg_installed():
- ffmpeg_dir = static_ffmpeg.run.get_platform_dir()
- indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
- return os.path.exists(indicator_file)
-
-# combines videos with matching audio files (e.g. audio descriptions)
-# this is the main function of this script; it calls the other functions in order
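-# example (hypothetical filenames): combine('movie.mkv', 'movie_ad.mp3', stretch_audio=True)
-# would write an aligned "ad_movie.mkv" into output_dir and an alignment plot into alignment_dir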
-def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
- boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
- prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
- alignment_dir=default_alignment_dir, extension="copy", display_func=None):
- video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
-
- if not yes and sum(video_file_types) > 0:
- print("")
- print("One or more audio files found in video input. Was this intentional?")
- print("If not, press ctrl+c to kill this script.")
- input("If this was intended, press Enter to continue...")
- print("")
- audio_desc_files, _ = get_sorted_filenames(audio, AUDIO_EXTENSIONS)
- if len(video_files) != len(audio_desc_files):
- error_msg = ["Number of valid files in input paths are not the same.",
- f"The video path has {len(video_files)} files",
- f"The audio path has {len(audio_desc_files)} files"]
- raise RuntimeError("\n".join(error_msg))
-
- ensure_folders_exist([output_dir], display_func)
- if PLOT_ALIGNMENT_TO_FILE:
- ensure_folders_exist([alignment_dir], display_func)
-
- display("", display_func)
- for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
- display(os.path.split(video_file)[1], display_func)
- display(os.path.split(audio_desc_file)[1], display_func)
- display("", display_func)
- if not yes:
- print("Are the above input file pairings correct?")
- print("If not, press ctrl+c to kill this script.")
- input("If they are correct, press Enter to continue...")
- print("")
-
- # if ffmpeg isn't installed, install it
- if not is_ffmpeg_installed():
- display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
- get_ffmpeg()
- if not is_ffmpeg_installed():
- raise RuntimeError("Failed to install ffmpeg.")
- display("Successfully installed ffmpeg.", display_func)
-
- display("Processing files:", display_func)
-
- for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
- video_file_types):
- # Default is to use the input video's extension for the output video
- if extension is None or extension in ["", "copy"]:
- ext = os.path.splitext(video_file)[1]
- else:
- # add a dot to the extension if it's missing
- ext = ('' if extension[0] == '.' else '.') + extension
- output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
- output_filename = os.path.join(output_dir, output_filename)
- display(" " + output_filename, display_func)
-
- if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
- display(" output file already exists, skipping...", display_func)
- continue
-
- video_arr = parse_audio_from_file(video_file)
- audio_desc_arr = parse_audio_from_file(audio_desc_file)
- video_spec_raw, video_timings = tokenize_audio(video_arr)
- video_spec = normalize_spec(video_spec_raw)
- audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
- audio_desc_spec = normalize_spec(audio_desc_spec_raw)
-
- # rescale RMS intensity of audio to match video
- audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
-
- path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
-
- smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
-
- cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
-
- ad_timings = None
- if stretch_audio:
- if keep_non_ad:
- video_arr_original = video_arr.copy()
-
- replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
- del audio_desc_arr
-
- if keep_non_ad or boost != 0:
- outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
- smooth_path, ad_detect_sensitivity, boost_sensitivity)
- speech_sample_mask, boost_sample_mask, ad_timings = outputs
- if keep_non_ad:
- video_arr *= speech_sample_mask
- video_arr += video_arr_original * (1 - speech_sample_mask)
- del video_arr_original
- del speech_sample_mask
- else:
- ad_timings = None
- if boost != 0:
- video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
- del boost_sample_mask
-
- # prevent peaking by rescaling to within +/- 32,766
- video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
-
- if video_filetype == 0:
- write_replaced_media_to_disk(output_filename, video_arr, video_file)
- else:
- write_replaced_media_to_disk(output_filename, video_arr)
- else:
- if video_filetype == 1:
- raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
- if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
- raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
- video_offset = np.diff(smooth_path[clips[0][0]])[0]
- start_key_frame = get_closest_key_frame_time(video_file, video_offset)
- setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
- write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
- setts_cmd, start_key_frame)
-
- del video_arr
- if PLOT_ALIGNMENT_TO_FILE:
- plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
- plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings)
- display("All files processed.", display_func)
-
-def write_config_file(config_path, settings):
- config = configparser.ConfigParser()
- config.add_section('alignment')
- config['alignment'] = {}
- for key, value in settings.items():
- config['alignment'][key] = str(value)
- with open(config_path, 'w') as f:
- config.write(f)
-
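-# the config file has a single [alignment] section of key = value pairs, e.g. (illustrative):
-#   [alignment]
-#   smoothness = 50
-#   stretch_audio = False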
-def read_config_file(config_path):
- config = configparser.ConfigParser()
- config.read(config_path)
- settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
- 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
- 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
- 'boost': config.getfloat('alignment', 'boost', fallback=0),
- 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
- 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
- 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
- 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
- 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
- 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
- 'extension': config.get('alignment', 'extension', fallback='copy')}
- if not config.has_section('alignment'):
- write_config_file(config_path, settings)
- return settings
-
-def settings_gui(config_path):
- settings = read_config_file(config_path)
- layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
- [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
- tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
- 'file type of the corresponding input video. Default is "copy".')]])],
- [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
- tooltip='Output file name prepend text. Default is "ad_"')]])],
- [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
- tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
- sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
- [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
- tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
- sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
- [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
- sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
- tooltip='Lower values make the alignment more accurate when there are skips ' + \
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
- 'Default is 50.')]])],
- [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
- tooltip='Stretches the input audio to fit the input video. ' + \
- 'Default is to stretch the video to fit the audio.')],
- [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
- disabled=not settings['stretch_audio'],
- tooltip='Tries to only replace segments with audio description. Useful if ' + \
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
- [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
- sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
- key='boost', disabled=not settings['stretch_audio'],
- tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
- [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
- sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
- key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
- tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
- [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
- sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
- key='boost_sensitivity', disabled=not settings['stretch_audio'],
- tooltip='Higher values make --boost less likely to miss a description, but ' + \
- 'also make it more likely to boost non-description audio. Default is 0.4')]])],
- [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
- disabled=not settings['stretch_audio'],
- tooltip='Skips pitch correction step when stretching audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')],
- [sg.Column([[sg.Submit('Save', pad=(40,3)),
- sg.Button('Cancel')]], pad=((135,3),10))]]
- settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
- settings_window['extension'].set_focus()
- while True:
- event, values = settings_window.read()
- if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
- break
- if event == 'stretch_audio':
- # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
- if IS_RUNNING_WINDOWS:
- settings_window['boost'].Update(disabled = values['stretch_audio'])
- settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
- settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
- else:
- settings_window['boost'].Update(disabled = not values['stretch_audio'])
- settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
- settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
- settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
- settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
- if event == 'Save':
- settings = values.copy()
- del settings['output_browse']
- del settings['alignment_browse']
- write_config_file(config_path, settings)
- break
- settings_window.close()
-
-def combine_print_exceptions(print_queue, *args, **kwargs):
- try:
- combine(*args, **kwargs)
- except Exception:
- print_queue.put(traceback.format_exc())
- # raise
-
-def combine_gui(video_files, audio_files, config_path):
- output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
- layout = [[output_textbox],
- [sg.Button('Close', pad=(360,5))]]
- combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
- disable_close=True, finalize=True)
- output_textbox.update('Combining media files:', append=True)
- print_queue = multiprocessing.Queue()
-
- settings = read_config_file(config_path)
- settings.update({'display_func':print_queue.put, 'yes':True})
- proc = multiprocessing.Process(target=combine_print_exceptions,
- args=(print_queue, video_files, audio_files),
- kwargs=settings, daemon=True)
- proc.start()
- while True:
- # if the script isn't running anymore, re-enable the default close window button
- if not proc.is_alive():
- combine_window.DisableClose = False
- if not print_queue.empty():
- if IS_RUNNING_WINDOWS:
- cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
- output_textbox.update('\n' + print_queue.get(), append=True)
- if IS_RUNNING_WINDOWS:
- output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
- event, values = combine_window.read(timeout=100)
- # window closed event isn't always emitted, so also manually check window status
- if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
- if proc.is_alive():
- proc.terminate()
- break
- if event == 'Close':
- if not proc.is_alive():
- combine_window.DisableClose = False
- break
- selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
- if selection != 'Yes':
- continue
- proc.terminate()
- combine_window.DisableClose = False
- break
- combine_window.close()
-
-def main_gui():
- config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')
- sg.theme('Light Blue 2')
-
- filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
- all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
- all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
- all_video_and_audio_file_types = [('All Video and Audio File Types',
- '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
- audio_file_types = [(ext, "*." + ext) for ext in AUDIO_EXTENSIONS]
- video_and_audio_file_types = [(ext, "*." + ext) for ext in VIDEO_EXTENSIONS] + audio_file_types
- audio_file_types = all_audio_file_types + audio_file_types
- video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
- # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
- if IS_RUNNING_WINDOWS:
- file_fix = lambda file_types: file_types[:1] + [('|' + type[0], type[1]) for type in file_types[1:]]
- audio_file_types = file_fix(audio_file_types)
- video_and_audio_file_types = file_fix(video_and_audio_file_types)
-
- layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
- [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
- sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
- tooltip='List video filenames here, in order, separated by semicolons'),
- sg.FilesBrowse(button_text="Browse Video",
- file_types=video_and_audio_file_types,
- tooltip='Select one or more video files')]], pad=(2,7))],
- [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
- sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
- tooltip='List audio filenames here, in order, separated by semicolons'),
- sg.FilesBrowse(button_text="Browse Audio",
- file_types=audio_file_types,
- tooltip='Select one or more audio files')]], pad=(2,7))],
- [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
- sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
- pad=((135,3),10))]]
- window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
- window['-VIDEO_FILES-'].set_focus()
- while True:
- event, values = window.read()
- if event == 'Combine':
- if len(values['-VIDEO_FILES-']) == 0 or \
- len(values['-AUDIO_FILES-']) == 0:
- window.disable()
- sg.Popup('Error: empty input field.', font=('Arial', 20))
- window.enable()
- continue
- video_files = values['-VIDEO_FILES-'].split(';')
- audio_files = values['-AUDIO_FILES-'].split(';')
- combine_gui(video_files, audio_files, config_path)
- if event == 'Settings':
- window.disable()
- settings_gui(config_path)
- window.enable()
- if event == sg.WIN_CLOSED:
- break
- window.close()
-
-# Entry point for command line interaction, for example:
-# > describealign video.mp4 audio_desc.mp3
-def command_line_interface():
- # override command line argument parser's error handler to make it pause before exiting
- # this allows users to see the error message when accidentally not running from command line
- class ArgumentParser(argparse.ArgumentParser):
- def error(self, message):
- if 'required: video, audio' in message:
- print('No input arguments detected, starting GUI...')
- main_gui()
- self.exit()
- else:
- self.exit(2, f'{self.prog}: error: {message}\n')
- parser = ArgumentParser(description="Replaces a video's sound with an audio description.",
- usage="describealign video_file.mp4 audio_file.mp3")
- parser.add_argument("video", help='A video file or directory containing video files.')
- parser.add_argument("audio", help='An audio file or directory containing audio files.')
- parser.add_argument('--smoothness', type=float, default=50,
- help='Lower values make the alignment more accurate when there are skips ' + \
- '(e.g. describer pauses), but also make it more likely to misalign. ' + \
- 'Default is 50.')
- parser.add_argument('--stretch_audio', action='store_true',
- help='Stretches the input audio to fit the input video. ' + \
- 'Default is to stretch the video to fit the audio.')
- parser.add_argument('--keep_non_ad', action='store_true',
- help='Tries to only replace segments with audio description. Useful if ' + \
- 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')
- parser.add_argument('--boost', type=float, default=0,
- help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
- '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')
- parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
- help='Audio description detection sensitivity ratio. Higher values make ' + \
- '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
- parser.add_argument('--boost_sensitivity', type=float, default=.4,
- help='Higher values make --boost less likely to miss a description, but ' + \
- 'also make it more likely to boost non-description audio. Default is 0.4')
- parser.add_argument('--yes', action='store_true',
- help='Auto-skips user prompts asking to verify information.')
- parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
- parser.add_argument('--no_pitch_correction', action='store_true',
- help='Skips pitch correction step when stretching audio. ' + \
- 'Requires --stretch_audio to be set, otherwise does nothing.')
- parser.add_argument("--output_dir", default=default_output_dir,
- help='Directory combined output media is saved to. Default is "videos_with_ad"')
- parser.add_argument("--alignment_dir", default=default_alignment_dir,
- help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
- parser.add_argument("--extension", default="copy",
- help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
- 'file type of the corresponding input video. Default is "copy".')
- args = parser.parse_args()
-
- combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
- args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
- args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
- args.extension)
-
-# allows the script to be run on its own, rather than through the package, for example:
-# python3 describealign.py video.mp4 audio_desc.mp3
-if __name__ == "__main__":
- multiprocessing.freeze_support()
- command_line_interface()
-
-
-
-
+# combines videos with matching audio files (e.g. audio descriptions)
+# input: video or folder of videos and an audio file or folder of audio files
+# output: videos in a folder "videos_with_ad", with aligned segments of the audio replaced
+# this script aligns the new audio to the video using the video's old audio
+# first, the video's sound and the audio file are both converted to spectrograms
+# second, the two spectrograms are roughly aligned by finding their longest common subsequence
+# third, the rough alignment is denoised through L1-Minimization
+# fourth, the spectrogram alignments determine where the new audio replaces the old
+
+'''
+Copyright (C) 2023 Julian Brown
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+'''
+
+# Nuitka build options:
+# nuitka-project-if: {OS} != "Windows":
+# nuitka-project: --enable-plugins=pyside2
+#
+# Compilation mode: standalone everywhere, except on macOS, where an app bundle is used
+# nuitka-project-if: {OS} == "Darwin":
+# nuitka-project: --standalone
+# nuitka-project: --macos-create-app-bundle
+# Mac apparently needs onefile too, because the pyside2 plugin requires it.
+# All other platforms need it too, so set it universally.
+# nuitka-project: --onefile
+#
+# Debugging options, controlled via environment variable at compile time.
+# nuitka-project-if: os.getenv("DEBUG_COMPILATION", "no") == "yes":
+# nuitka-project: --enable-console
+# nuitka-project-else:
+# nuitka-project: --disable-console
+
+# Set app icon
+# nuitka-project-if: {OS} == "Windows":
+# nuitka-project: --windows-icon-from-ico=describealign.png
+# nuitka-project-else:
+# nuitka-project-if: {OS} == "Darwin":
+# nuitka-project: --macos-app-icon=describealign.png
+# nuitka-project-else:
+# nuitka-project: --linux-icon=describealign.png
+# End Nuitka build options
+
+VIDEO_EXTENSIONS = set(['mp4', 'mkv', 'avi', 'mov', 'webm', 'm4v', 'flv', 'vob'])
+AUDIO_EXTENSIONS = set(['mp3', 'm4a', 'opus', 'wav', 'aac', 'flac', 'ac3', 'mka'])
+PLOT_ALIGNMENT_TO_FILE = True
+
+TIMESTEP_SIZE_SECONDS = .16
+TIMESTEP_OVERLAP_RATIO = .5
+AUDIO_SAMPLE_RATE = 44100
+MEL_COEFFS_PER_TIMESTEP = 25
+DITHER_PERIOD_STEPS = 60
+MIN_CORR_FOR_TOKEN_MATCH = .6
+GAP_START_COST = 1.0
+GAP_EXTEND_COST = -.01
+GAP_EXTEND_DIAG_BONUS = -.01
+SKIP_MATCH_COST = .1
+MAX_RATE_RATIO_DIFF_ALIGN = .1
+PREF_CUT_AT_GAPS_FACTOR = 5
+MIN_DURATION_TO_REPLACE_SECONDS = 2
+MIN_START_END_SYNC_TIME_SECONDS = 2
+MAX_START_END_SYNC_ERR_SECONDS = .2
+MAX_RATE_RATIO_DIFF_BOOST = .003
+MIN_DESC_DURATION = .5
+MAX_GAP_IN_DESC_SEC = 1.5
+JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO = .005
+CATCHUP_RATE = 5
+
+if PLOT_ALIGNMENT_TO_FILE:
+ import matplotlib.pyplot as plt
+import argparse
+import os
+import glob
+import itertools
+import datetime
+import numpy as np
+import ffmpeg
+import static_ffmpeg
+import python_speech_features as psf
+import scipy.signal
+import scipy.optimize
+import scipy.interpolate
+import scipy.ndimage as nd
+import scipy.sparse
+import pytsmod
+import configparser
+import traceback
+import multiprocessing
+import platform
+
+IS_RUNNING_WINDOWS = platform.system() == 'Windows'
+if IS_RUNNING_WINDOWS:
+ import PySimpleGUIWx as sg
+ default_output_dir = 'videos_with_ad'
+ default_alignment_dir = 'alignment_plots'
+else:
+ import PySimpleGUIQt as sg
+ default_output_dir = os.path.expanduser('~') + '/videos_with_ad'
+ default_alignment_dir = os.path.expanduser('~') + '/alignment_plots'
+
+def display(text, func=None):
+ if func:
+ func(text)
+ print(text)
+
+def throw_runtime_error(text, func=None):
+ if func:
+ func(text)
+ raise RuntimeError(text)
+
+def ensure_folders_exist(dirs, display_func=None):
+ for dir in dirs:
+ if not os.path.isdir(dir):
+ display("Directory not found, creating it: " + dir, display_func)
+ os.makedirs(dir)
+
+def get_sorted_filenames(path, extensions, alt_extensions=set([])):
+ # path could be three different things: a file, a directory, a list of files
+ if type(path) is list:
+ files = [os.path.abspath(file) for file in path]
+ for file in files:
+ if not os.path.isfile(file):
+ raise RuntimeError(f"No file found at input path:\n {file}")
+ else:
+ path = os.path.abspath(path)
+ if os.path.isdir(path):
+ files = glob.glob(glob.escape(path) + "/*")
+ if len(files) == 0:
+ raise RuntimeError(f"Empty input directory:\n {path}")
+ else:
+ if not os.path.isfile(path):
+ raise RuntimeError(f"No file or directory found at input path:\n {path}")
+ files = [path]
+ files = [file for file in files if os.path.splitext(file)[1][1:] in extensions | alt_extensions]
+ if len(files) == 0:
+ error_msg = [f"No files with valid extensions found at input path:\n {path}",
+ "Did you accidentally put the audio filepath before the video filepath?",
+ "The video path should be the first positional input, audio second.",
+                 "Or maybe you need to add a new extension to this script's list of recognized extensions?",
+ f"valid extensions for this input are:\n {extensions}"]
+ raise RuntimeError("\n".join(error_msg))
+ files = sorted(files)
+ file_types = [0 if os.path.splitext(file)[1][1:] in extensions else 1 for file in files]
+ return files, file_types
+
+# read audio from file with ffmpeg and convert to numpy array
+def parse_audio_from_file(media_file):
+ media_stream, _ = (ffmpeg
+ .input(media_file)
+ .output('-', format='s16le', acodec='pcm_s16le', ac=2, ar=AUDIO_SAMPLE_RATE, loglevel='fatal')
+ .run(capture_stdout=True, cmd=get_ffmpeg())
+ )
+ media_arr = np.frombuffer(media_stream, np.int16).astype(np.float32).reshape((-1,2)).T
+ return media_arr
+
+# tokenize audio by transforming with a mel-frequency cepstrum (MFC)
+def tokenize_audio(media_arr, rate=1):
+ step_size_samples = psf.sigproc.round_half_up(TIMESTEP_SIZE_SECONDS * rate * AUDIO_SAMPLE_RATE)
+ window_size_seconds = TIMESTEP_SIZE_SECONDS / TIMESTEP_OVERLAP_RATIO
+ window_size_samples = psf.sigproc.round_half_up(window_size_seconds * AUDIO_SAMPLE_RATE)
+ fft_size_samples = 2**int(np.ceil(np.log2(window_size_samples)))
+ get_mfcc = lambda arr: psf.mfcc(np.mean(arr, axis=0),
+ samplerate=AUDIO_SAMPLE_RATE,
+ winlen=window_size_seconds,
+ winstep=TIMESTEP_SIZE_SECONDS * rate,
+ numcep=MEL_COEFFS_PER_TIMESTEP,
+ nfilt=MEL_COEFFS_PER_TIMESTEP * 2,
+ nfft=fft_size_samples,
+ winfunc=scipy.signal.windows.hann)
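+  # number of MFCC frames covering the signal; python_speech_features zero-pads any final partial window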
+ num_timesteps = max(1, ((media_arr.shape[1] - window_size_samples - 1) // step_size_samples) + 2)
+ media_spec = np.zeros((num_timesteps, MEL_COEFFS_PER_TIMESTEP))
+ chunk_size = 1000
+ for chunk_index in np.arange(0, num_timesteps, chunk_size):
+ chunk_bounds_samples = ((chunk_index ) * step_size_samples,
+ (chunk_index + chunk_size - 1) * step_size_samples + window_size_samples)
+ media_spec[chunk_index:chunk_index+chunk_size] = get_mfcc(media_arr[:,slice(*chunk_bounds_samples)])
+ '''
+ # alternate python library's MFC implementation
+ import librosa
+ media_spec = librosa.feature.mfcc(y=np.mean(media_arr, axis=0),
+ sr=AUDIO_SAMPLE_RATE,
+ n_mfcc=MEL_COEFFS_PER_TIMESTEP,
+ lifter=22,
+ n_fft=fft_size_samples,
+ hop_length=step_size_samples,
+ win_length=window_size_samples,
+ window=scipy.signal.windows.hann).T
+ num_timesteps = media_spec.shape[0]
+ '''
+ timings_samples = window_size_samples/2. + step_size_samples * np.arange(num_timesteps)
+ timings_seconds = timings_samples / AUDIO_SAMPLE_RATE
+ return media_spec, timings_seconds
+
+# same as tokenize_audio, but dithering the MFC window timings
+# this allows for finer alignment by ameliorating discretization error
+def tokenize_audio_dither(media_arr, slow_timings):
+ # choose a relative step size slightly less than 1 to ameliorate quantization error
+ # maximize alignment accuracy by using least approximable number with desired period
+ # this is the continued fraction [0;1,N-2,1,1,1,...], where the trailing ones give phi
+ fast_rate = 1. / (1 + 1. / (DITHER_PERIOD_STEPS - 2 + (np.sqrt(5) + 1) / 2.))
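+  # with DITHER_PERIOD_STEPS = 60 this gives fast_rate ~= 0.9835, i.e. roughly one extra token per 60 undithered steps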
+ fast_spec, fast_timings = tokenize_audio(media_arr, fast_rate)
+
+ # prevent drift in difficult to align segments (e.g. describer speaking or quiet/droning segments)
+ # by approximately equalizing the number of tokens per unit time between dithered and undithered
+ # the dithered audio will have ~(1 + 1 / DITHER_PERIOD_STEPS) times as many tokens, so
+ # this can be accomplished by simply deleting a token every DITHER_PERIOD_STEPS tokens
+ fast_spec = np.delete(fast_spec, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS), axis=0)
+ fast_timings = np.delete(fast_timings, slice(DITHER_PERIOD_STEPS // 2, None, DITHER_PERIOD_STEPS))
+ return fast_spec, fast_timings
+
+# normalize along both time and frequency axes to allow comparing tokens by correlation
+def normalize_spec(media_spec_raw, axes=(0,1)):
+ media_spec = media_spec_raw.copy()
+ for axis in axes:
+ norm_func = np.std if axis == 0 else np.linalg.norm
+ media_spec = media_spec - np.mean(media_spec, axis=axis, keepdims=True)
+ media_spec = media_spec/(norm_func(media_spec,axis=axis,keepdims=True)+1e-10)
+ return media_spec
+
+# vectorized implementation of the Wagner–Fischer (Longest Common Subsequence) algorithm
+# modified to include affine gap penalties and skip+match options (i.e. knight's moves)
+# gaps are necessary when parts are cut out of the audio description (e.g. cut credits)
+# or when the audio description includes a commercial break or an extra scene
+# the skip+match option allows for micro-adjustments without eating the full gap penalty
+# skip+match is primarily useful in maintaining alignment when the rates differ slightly
+def rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings):
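+  # DP nodes are (layer, audio_index, video_index): layer 0 holds match states, layer 1 holds gap states
+  # pred_map codes 0-3 give predecessors of match-layer nodes, codes 4-7 of gap-layer nodes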
+ pred_map = {0:lambda node: (0, node[1]-1, node[2]-1),
+ 1:lambda node: (0, node[1]-2, node[2]-1),
+ 2:lambda node: (0, node[1]-1, node[2]-2),
+ 3:lambda node: (1, node[1]-1, node[2]-1),
+ 4:lambda node: (0, node[1] , node[2] ),
+ 5:lambda node: (1, node[1]-1, node[2] ),
+ 6:lambda node: (1, node[1]-1, node[2]-1),
+ 7:lambda node: (1, node[1] , node[2]-1)}
+ pred_matrix = np.zeros((2, audio_desc_spec.shape[0], video_spec.shape[0]), dtype=np.uint8)
+ pred_matrix[0,1:,:2] = 0
+ pred_matrix[1,1:,:2] = 4
+ pred_matrix[:,0,:2] = [0,5]
+ path_corrs_match = np.zeros((3, video_spec.shape[0]))
+ path_corrs_gap = np.zeros((3, video_spec.shape[0]))
+ corrs = np.zeros((3, video_spec.shape[0]))
+ corrs[:,:] = np.roll(np.dot(video_spec, audio_desc_spec[0]), 1)[None,:]
+ for i in range(audio_desc_spec.shape[0]):
+ i_mod = i % 3
+ match_pred_corrs = np.hstack([path_corrs_match[i_mod-1][1:-1][:,None],
+ path_corrs_match[i_mod-2][1:-1][:,None] - SKIP_MATCH_COST,
+ path_corrs_match[i_mod-1][0:-2][:,None] - SKIP_MATCH_COST,
+ path_corrs_gap[ i_mod-1][1:-1][:,None]])
+ pred_matrix[0][i][2:] = np.argmax(match_pred_corrs, axis=1)
+ path_corrs_match[i_mod][2:] = np.take_along_axis(match_pred_corrs, pred_matrix[0][i][2:,None], axis=1).T
+ corrs = np.roll(corrs, -1, axis=1)
+ corrs[(i_mod+1)%3,:] = np.roll(np.dot(video_spec, audio_desc_spec[min(audio_desc_spec.shape[0]-1,i+1)]), 1)
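+    # the second difference of the correlation approximates Fisher information:
+    # sharp correlation peaks (sudden noises) localize timing better than droning sounds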
+ fisher_infos = (2 * corrs[i_mod] - corrs[i_mod-1] - corrs[(i_mod+1)%3]) / min(.2, TIMESTEP_SIZE_SECONDS)
+ fisher_infos[fisher_infos < 0] = 0
+ fisher_infos[fisher_infos > 10] = 10
+ row_corrs = np.maximum(0, corrs[i_mod][2:] - MIN_CORR_FOR_TOKEN_MATCH)
+ path_corrs_match[i_mod][2:] += row_corrs * (fisher_infos[2:] / 5)
+ gap_pred_corrs = np.hstack([path_corrs_match[i_mod][2: ][:,None] - GAP_START_COST,
+ path_corrs_gap[i_mod-1][2: ][:,None],
+ path_corrs_gap[i_mod-1][1:-1][:,None] - GAP_EXTEND_DIAG_BONUS - \
+ GAP_EXTEND_COST])
+ pred_matrix[1][i][2:] = np.argmax(gap_pred_corrs, axis=1)
+ path_corrs_gap_no_col_skip = np.take_along_axis(gap_pred_corrs, pred_matrix[1][i][2:,None], axis=1).flat
+ pred_matrix[1][i][2:] += 4
+ path_corrs_gap[i_mod][2:] = np.maximum.accumulate(path_corrs_gap_no_col_skip + \
+ GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)) - \
+ GAP_EXTEND_COST * np.arange(video_spec.shape[0]-2)
+ pred_matrix[1][i][2:][path_corrs_gap[i_mod][2:] > path_corrs_gap_no_col_skip] = 7
+ path_corrs_gap[i_mod][2:] -= GAP_EXTEND_COST
+
+ # reconstruct optimal path by following predecessors backwards through the table
+ end_node_layer = np.argmax([path_corrs_match[i_mod,-1],
+ path_corrs_gap[ i_mod,-1]])
+ cur_node = (end_node_layer, audio_desc_spec.shape[0]-1, video_spec.shape[0]-1)
+ get_predecessor = lambda node: pred_map[pred_matrix[node]](node)
+ path = []
+ visited = set()
+ while min(cur_node[1:]) >= 0:
+ cur_node, last_node = get_predecessor(cur_node), cur_node
+ # failsafe to prevent an infinite loop that should never happen anyways
+ if cur_node in visited:
+ break
+ visited.add(cur_node)
+ if last_node[0] == 0:
+ path.append(last_node[1:])
+ path = path[::-1]
+
+ # determine how much information this node gives about the alignment
+ # a larger double derivative means more precise timing information
+ # sudden noises give more timing information than droning sounds
+ def get_fisher_info(node):
+ i,j = node
+ if node[0] >= audio_desc_spec.shape[0]-1 or \
+ node[1] >= video_spec.shape[0]-1 or \
+ min(node) <= 0:
+ return 0
+ info = 2*np.dot(audio_desc_spec[i ],video_spec[j ]) - \
+ np.dot(audio_desc_spec[i-1],video_spec[j+1]) - \
+ np.dot(audio_desc_spec[i+1],video_spec[j-1])
+ info /= min(.2, TIMESTEP_SIZE_SECONDS)
+ return info
+
+ # the quality of a node combines the correlation of its tokens
+ # with how precisely the match is localized in time
+ def get_match_quality(node):
+ # correlations are between -1 and 1, as all tokens have unit norm
+ token_correlation = np.dot(audio_desc_spec[node[0]],video_spec[node[1]])
+ fisher_info = min(max(0, get_fisher_info(node)), 10)
+ return max(0, token_correlation - MIN_CORR_FOR_TOKEN_MATCH) * (fisher_info / 5)
+
+ # filter out low match quality nodes from LCS path
+ quals = [get_match_quality(node) for node in path]
+ if len(quals) == 0 or max(quals) <= 0:
+ raise RuntimeError("Rough alignment failed, are the input files mismatched?")
+ path, quals = zip(*[(path, qual) for (path, qual) in zip(path, quals) if qual > 0])
+
+ # convert units of path nodes from timesteps to seconds
+ path = [(audio_desc_timings[i], video_timings[j]) for (i,j) in path]
+
+ return path, quals
+
+# chunk path segments of similar slope into clips
+# a clip has the form: (start_index, end_index)
+def chunk_path(smooth_path, tol):
+ x,y = zip(*smooth_path)
+ slopes = np.diff(y) / np.diff(x)
+ median_slope = np.median(slopes)
+ slope_changes = np.diff(slopes)
+ breaks = np.where(np.abs(slope_changes) > tol)[0] + 1
+ breaks = [0] + list(breaks) + [len(x)-1]
+ clips = list(zip(breaks[:-1], breaks[1:]))
+ return clips, median_slope, slopes
+
+# find piece-wise linear alignment that minimizes the weighted combination of
+# total absolute error at each node and total absolute slope change of the fit
+# distance between nodes and the fit (i.e. errors) are weighted by node quality
+# absolute slope changes are differences between the slopes of adjacent fit lines
+# slope changes are weighted much more than node errors to smooth out noise
+# the main source of noise is rough alignment drift while the describer is speaking
+def smooth_align(path, quals, smoothness):
+ # rotate basis to make vertical and horizontal slopes "cost" the same
+ # the new horizontal axis is x+y and the new vertical is -x+y
+ # Wagner–Fischer gives monotonically increasing nodes, so 0 <= slope < inf
+ # after this transformation, we instead have -1 <= slope < 1
+ # perfectly matching audio has pre-transformation slope = 1
+ # after this transformation, it instead has slope = 0
+ rotated_path = [(x+y,-x+y) for x,y in path]
+
+ # stretch the x axis to make all slopes "cost" nearly the same
+ # without this, small changes to the slope at slope = +/-1
+ # cost sqrt(2) times as much as small changes at slope = 0
+ # by stretching, we limit the range of slopes to within +/- 1/x_stretch_factor
+ # the small angle approximation means these slopes all cost roughly the same
+ x_stretch_factor = 10.
+ rotated_stretched_path = [(x_stretch_factor*x,y) for x,y in rotated_path]
+
+ # L1-Minimization to solve the alignment problem using a linear program
+ # the absolute value functions needed for "absolute error" can be represented
+ # in a linear program by splitting variables into positive and negative pieces
+ # and constraining each to be positive (done by default in scipy's linprog)
+ # x is fit_err_pos, fit_err_neg, slope_change_pos, slope_change_neg
+ # fit_err[i] = path[i][1] - y_fit[i]
+ # slope_change[i] = (y_fit[i+2] - y_fit[i+1])/(path[i+2][0] - path[i+1][0]) - \
+ # (y_fit[i+1] - y_fit[i ])/(path[i+1][0] - path[i ][0])
+ # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
+ # y_fit[i] = path[i][1] - fit_err[i]
+ # this gives:
+ # slope_change[i] = path_half[i] - fit_err_half[i]
+ # where each half is just the original equation but y_fit is swapped out
+ # the slope_change variables can then be set using equality constraints
+ num_fit_points = len(rotated_stretched_path)
+ x,y = [np.array(arr) for arr in zip(*rotated_stretched_path)]
+ x_diffs = np.diff(x, prepend=[-10**10], append=[10**10])
+ y_diffs = np.diff(y, prepend=[ 0 ], append=[ 0 ])
+ slope_change_magnitudes = np.abs(np.diff(y_diffs/x_diffs)) * x_stretch_factor
+ slope_change_locations = (slope_change_magnitudes > MAX_RATE_RATIO_DIFF_ALIGN)
+ slope_change_locations[1:-1] *= (np.abs(y[2:] - y[:-2]) > 5)
+ slope_change_costs = np.full(num_fit_points, smoothness / float(TIMESTEP_SIZE_SECONDS))
+ slope_change_costs[slope_change_locations] /= PREF_CUT_AT_GAPS_FACTOR
+ c = np.hstack([quals,
+ quals,
+ slope_change_costs * x_stretch_factor,
+ slope_change_costs * x_stretch_factor])
+ fit_err_coeffs = scipy.sparse.diags([ 1. / x_diffs[:-1],
+ -1. / x_diffs[:-1] - 1. / x_diffs[1:],
+ 1. / x_diffs[1:]],
+ offsets=[0,1,2],
+ shape=(num_fit_points, num_fit_points + 2)).tocsc()[:,1:-1]
+ A_eq = scipy.sparse.hstack([ fit_err_coeffs,
+ -fit_err_coeffs,
+ scipy.sparse.eye(num_fit_points),
+ -scipy.sparse.eye(num_fit_points)])
+ b_eq = y_diffs[1: ] / x_diffs[1: ] - \
+ y_diffs[ :-1] / x_diffs[ :-1]
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
+ if not fit.success:
+ print(fit)
+ raise RuntimeError("Smooth Alignment L1-Min Optimization Failed!")
+
+ # combine fit_err_pos and fit_err_neg
+ fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
+
+ # subtract fit errors from nodes to retrieve the smooth fit's coordinates
+ # also, unstretch x axis and rotate basis back, reversing the affine pre-processing
+ smooth_path = [(((x / x_stretch_factor) - y) / 2.,
+ ((x / x_stretch_factor) + y) / 2.) for x,y in zip(x, y - fit_err)]
+
+ # clip off start/end of replacement audio if it doesn't match or isn't aligned
+ # without this, describer intro/outro skips can cause mismatches at the start/end
+ # the problem would be localized and just means audio might not match video at the start/end
+ # instead we just keep the original video's audio in those segments if mismatches are detected
+ # if instead the first few or last few nodes are well-aligned, that edge is marked as synced
+ # during audio replacement, synced edges will be extended backwards/forwards as far as possible
+ # this is useful when the describer begins talking immediately (or before any alignable audio)
+ # or when the describer continues speaking until the end (or no more alignable audio remains)
+ # otherwise, the mismatch would result in the describer's voice not replacing audio in that part
+ max_sync_err = MAX_START_END_SYNC_ERR_SECONDS
+ smoothing_std = MIN_START_END_SYNC_TIME_SECONDS / (2. * TIMESTEP_SIZE_SECONDS)
+ smoothed_fit_err = nd.gaussian_filter(np.abs(fit_err), sigma=smoothing_std)
+ smooth_err_path = zip(smoothed_fit_err, smooth_path)
+ old_length = num_fit_points
+ smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
+ is_synced_at_start = len(smooth_err_path) == old_length
+ old_length = len(smooth_err_path)
+ smooth_err_path = list(itertools.dropwhile(lambda x: x[0] > max_sync_err, smooth_err_path))[::-1]
+ is_synced_at_end = len(smooth_err_path) == old_length
+ _, smooth_path = zip(*smooth_err_path)
+ smooth_path = list(smooth_path)
+ if is_synced_at_start:
+ slope = (smooth_path[1][1] - smooth_path[0][1]) / (smooth_path[1][0] - smooth_path[0][0])
+ smooth_path.insert(0, (-10e10, -10e10 * slope))
+ if is_synced_at_end:
+ slope = (smooth_path[-1][1] - smooth_path[-2][1]) / (smooth_path[-1][0] - smooth_path[-2][0])
+ smooth_path.append((10e10, 10e10 * slope))
+
+ clips, median_slope, slopes = chunk_path(smooth_path, tol=1e-7)
+
+ # assemble clips with slopes within the rate tolerance into runs
+ runs, run = [], []
+ bad_clips = []
+ for clip in clips:
+ if np.abs(median_slope-slopes[clip[0]]) > MAX_RATE_RATIO_DIFF_ALIGN:
+ if len(run) > 0:
+ runs.append(run)
+ run = []
+ bad_clips.append(clip)
+ continue
+ run.append(clip)
+ if len(run) > 0:
+ runs.append(run)
+
+ return smooth_path, runs, bad_clips, clips
+
+# if the start or end were marked as synced during smooth alignment then
+# extend that alignment to the edge (i.e. to the start/end of the audio)
+def cap_synced_end_points(smooth_path, video_arr, audio_desc_arr):
+ if smooth_path[0][0] < -10e9:
+ slope = smooth_path[0][1] / smooth_path[0][0]
+ new_start_point = (0, smooth_path[1][1] - smooth_path[1][0] * slope)
+ if new_start_point[1] < 0:
+ new_start_point = (smooth_path[1][0] - smooth_path[1][1] / slope, 0)
+ smooth_path[0] = new_start_point
+ if smooth_path[-1][0] > 10e9:
+ video_runtime = (video_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
+ audio_runtime = (audio_desc_arr.shape[1] - 2.) / AUDIO_SAMPLE_RATE
+ slope = smooth_path[-1][1] / smooth_path[-1][0]
+ new_end_point = (audio_runtime, smooth_path[-2][1] + (audio_runtime - smooth_path[-2][0]) * slope)
+ if new_end_point[1] > video_runtime:
+ new_end_point = (smooth_path[-2][0] + (video_runtime - smooth_path[-2][1]) / slope, video_runtime)
+ smooth_path[-1] = new_end_point
+
+# visualize both the rough and smooth alignments
+def plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings):
+ scatter_color = [.2,.4,.8]
+ lcs_rgba = np.zeros((len(quals),4))
+ lcs_rgba[:,:3] = np.array(scatter_color)[None,:]
+ lcs_rgba[:,3] = np.minimum(1, np.array(quals) * 500. / len(quals))
+ audio_times, video_times = np.array(path).T.reshape((2,-1))
+ audio_offsets = audio_times - video_times
+ def expand_limits(start, end, ratio=.01):
+ average = (end + start) / 2.
+ half_diff = (end - start) / 2.
+ half_diff *= (1 + ratio)
+ return (average - half_diff, average + half_diff)
+ plt.xlim(expand_limits(*(0, np.max(video_times) / 60.)))
+ plt.ylim(expand_limits(*(np.min(audio_offsets) - TIMESTEP_SIZE_SECONDS / 2.,
+ np.max(audio_offsets) + TIMESTEP_SIZE_SECONDS / 2.)))
+ plt.scatter(video_times / 60., audio_offsets, s=3, c=lcs_rgba, label='LCS Matches')
+ audio_times, video_times = np.array(smooth_path).T.reshape((2,-1))
+ audio_offsets = audio_times - video_times
+ if ad_timings is None:
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=.5, label='Replaced Audio')
+ bad_path = []
+ for clip in bad_clips:
+ bad_path.extend(smooth_path[clip[0]:clip[1]+1])
+ bad_path.append((smooth_path[clip[1]][0] + 1e-10, np.nan))
+ audio_times, video_times = np.array(bad_path).T.reshape((2,-1))
+ audio_offsets = audio_times - video_times
+ if len(audio_offsets) > 0:
+ plt.plot(video_times / 60., audio_offsets, 'c-', lw=1, label='Original Audio')
+ else:
+ interp = scipy.interpolate.interp1d(video_times, audio_offsets,
+ fill_value = np.inf,
+ bounds_error = False, assume_sorted = True)
+ plt.plot(video_times / 60., audio_offsets, 'c-', lw=.5, label='Original Audio')
+ video_times = ad_timings
+ audio_offsets = interp(ad_timings)
+ if len(audio_offsets) > 0:
+ plt.plot(video_times / 60., audio_offsets, 'r-', lw=1, label='Replaced Audio')
+ plt.xlabel('Video Time (minutes)')
+ plt.ylabel('Audio Description Offset (seconds)')
+ plt.title('Alignment')
+ plt.legend().legendHandles[0].set_color(scatter_color)
+ plt.tight_layout()
+ plt.savefig(plot_filename_no_ext + '.png', dpi=400)
+ plt.clf()
+
+ with open(plot_filename_no_ext + '.txt', 'w') as file:
+ rough_clips, median_slope, _ = chunk_path(smooth_path, tol=2e-2)
+ video_offset = np.diff(smooth_path[rough_clips[0][0]])[0]
+ print("Main changes needed to video to align it to audio input:", file=file)
+ print(f"Start Offset: {-video_offset:.2f} seconds", file=file)
+ print(f"Median Rate Change: {(median_slope-1.)*100:.2f}%", file=file)
+ for clip_start, clip_end in rough_clips:
+ audio_desc_start, video_start = smooth_path[clip_start]
+ audio_desc_end, video_end = smooth_path[clip_end]
+ slope = (video_end - video_start) / (audio_desc_end - audio_desc_start)
+ def str_from_time(seconds):
+ minutes, seconds = divmod(seconds, 60)
+ hours, minutes = divmod(minutes, 60)
+ return f"{hours:2.0f}:{minutes:02.0f}:{seconds:05.2f}"
+ print(f"Rate change of {(slope-1.)*100:6.1f}% from {str_from_time(video_start)} to " + \
+ f"{str_from_time(video_end)} aligning with audio from " + \
+ f"{str_from_time(audio_desc_start)} to {str_from_time(audio_desc_end)}", file=file)
+
+# use the smooth alignment to replace runs of video sound with corresponding described audio
+def replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction=False):
+ # perform quadratic interpolation of the audio description's waveform
+ # this allows it to be stretched to match the corresponding video segment
+ def audio_desc_arr_interp(samples):
+ chunk_size = 10**7
+ interpolated_chunks = []
+ for chunk in (samples[i:i+chunk_size] for i in range(0, len(samples), chunk_size)):
+ interp_bounds = (max(int(chunk[0]-2), 0),
+ min(int(chunk[-1]+2), audio_desc_arr.shape[1]))
+ interp = scipy.interpolate.interp1d(np.arange(*interp_bounds),
+ audio_desc_arr[:,slice(*interp_bounds)],
+ copy=False, bounds_error=False, fill_value=0,
+ kind='quadratic', assume_sorted=True)
+ interpolated_chunks.append(interp(chunk).astype(np.float32))
+ return np.hstack(interpolated_chunks)
+
+ # construct a stretched audio description waveform using the quadratic interpolator
+ def get_interped_segment(run, interp):
+ segment = []
+ for clip in run:
+ num_samples = int(y[clip[1]] * AUDIO_SAMPLE_RATE) - \
+ int(y[clip[0]] * AUDIO_SAMPLE_RATE)
+ clip_bounds = np.array((x[clip[0]], x[clip[1]])) * AUDIO_SAMPLE_RATE
+ sample_points = np.linspace(*clip_bounds, num=num_samples, endpoint=False)
+ segment.append(interp(sample_points))
+ segment = np.hstack(segment)
+ return segment
+
+ x,y = zip(*smooth_path)
+ for run in runs:
+ run_length_seconds = y[run[-1][1]] - y[run[0][0]]
+ if run_length_seconds < MIN_DURATION_TO_REPLACE_SECONDS:
+ continue
+ anchor_point_path_indices = [clip[0] for clip in run]
+ anchor_point_path_indices.append(run[-1][1])
+ anchor_points = (np.array((np.array(x)[anchor_point_path_indices],
+ np.array(y)[anchor_point_path_indices])) * AUDIO_SAMPLE_RATE).astype(int)
+ slopes = np.diff(anchor_points[1]) / np.diff(anchor_points[0])
+ for clip_index, (clip, slope) in enumerate(zip(run, slopes)):
+ # only apply pitch correction if the difference would be noticeable
+ if no_pitch_correction or np.abs(1 - slope) <= JUST_NOTICEABLE_DIFF_IN_FREQ_RATIO:
+ stretched_audio = get_interped_segment([clip], audio_desc_arr_interp)
+ else:
+ anchor_point_pair = anchor_points[:,clip_index:clip_index+2].copy()
+ # account for quirks of pytsmod's wsola anchor point implementation
+ anchor_point_pair[1][-1] -= 1
+ anchor_y_offset = anchor_point_pair[1][0]
+ anchor_point_pair[1,:] -= anchor_y_offset
+ stretched_audio = pytsmod.wsola(audio_desc_arr, anchor_point_pair)
+ video_arr[:,slice(*anchor_points[1,clip_index:clip_index+2])] = stretched_audio
+
+# identify which segments of the replaced audio actually have the describer speaking
+# uses a Naive Bayes classifier smoothed with L1-Minimization to identify the describer
+def detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
+ smooth_path, detect_sensitivity, boost_sensitivity):
+ # retokenize the audio description, which has been stretched to match the video
+ audio_desc_spec_raw, audio_timings = tokenize_audio(video_arr)
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw)
+
+ # avoid boosting or training on mismatched segments, like those close to skips
+ # assumes matching segments all have the same, constant play rate
+ # could be modified to handle a multi-modal distribution of rates
+ aligned_audio_times, aligned_video_times = zip(*smooth_path)
+ interp = scipy.interpolate.interp1d(aligned_video_times, aligned_audio_times,
+ fill_value = 'extrapolate',
+ bounds_error = False, assume_sorted = True)
+ slopes = (interp(video_timings + 1e-5) - \
+ interp(video_timings - 1e-5)) / 2e-5
+ median_slope = np.median(slopes)
+ aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_ALIGN
+ well_aligned_mask = np.abs(slopes - median_slope) < MAX_RATE_RATIO_DIFF_BOOST
+
+ # first pass identification by assuming poorly matched tokens are describer speech
+ # also assumes the describer doesn't speak very quietly
+ corrs = np.sum(audio_desc_spec * video_spec, axis=-1)
+ smooth_volume = nd.gaussian_filter(audio_desc_spec[:,0], sigma=1)
+ audio_desc_loud = smooth_volume > np.percentile(smooth_volume, 30)
+ speech_mask = (corrs < .2) * audio_desc_loud
+
+ # normalize spectrogram coefficients along time axis to prep for conversion to PDFs
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw, axes=(0,))
+ audio_desc_spec = np.clip(audio_desc_spec / 6., -1, 1)
+ video_spec = normalize_spec(video_spec_raw, axes=(0,))
+ video_spec = np.clip(video_spec / 6., -1, 1)
+
+ # convert sampled features (e.g. spectrogram) to probability densities of each feature
+ # when given a spectrogram, finds the distributions of the MFC coefficients
+ def make_log_pdfs(arr):
+ resolution = 100
+ bins_per_spot = 4
+ num_bins = int(resolution * bins_per_spot)
+ uniform_prior_strength_per_spot = 1
+ uniform_prior_strength_per_bin = uniform_prior_strength_per_spot / float(bins_per_spot)
+ bin_range = (-1 - 1e-10, 1 + 1e-10)
+ get_hist = lambda x: np.histogram(x, bins=num_bins, range=bin_range)[0]
+ pdfs = np.apply_along_axis(get_hist, 1, arr.T)
+ pdfs = pdfs + uniform_prior_strength_per_bin
+ smooth = lambda x: nd.gaussian_filter(x, sigma=bins_per_spot)
+ pdfs = np.apply_along_axis(smooth, 1, pdfs)
+ pdfs = pdfs / np.sum(pdfs[0,:])
+ log_pdfs = np.log(pdfs)
+ bin_edges = np.histogram([], bins=num_bins, range=bin_range)[1]
+ return log_pdfs, bin_edges
+
+ diff_spec = audio_desc_spec - video_spec
+ diff_spec = np.clip(diff_spec, -1, 1)
+
+ # Naive Bayes classifier to roughly estimate whether each token is describer speech
+ desc_log_pdfs, _ = make_log_pdfs(diff_spec[speech_mask * well_aligned_mask])
+ nondesc_log_pdfs, bin_edges = make_log_pdfs(diff_spec[(~speech_mask) * well_aligned_mask])
+ lratio_lookup = desc_log_pdfs - nondesc_log_pdfs
+ lratios = lratio_lookup[np.fromfunction(lambda i,j: j, diff_spec.shape, dtype=int),
+ np.digitize(diff_spec, bin_edges, right=True)-1]
+ ratio_desc_to_nondesc = np.sum(speech_mask * well_aligned_mask) /\
+ (np.sum((~speech_mask) * well_aligned_mask) + 1.)
+ relative_probs = np.sum(lratios, axis=1)
+ relative_probs /= np.std(relative_probs)
+ relative_probs -= np.mean(relative_probs)
+
+ # L1-Minimization to smoothly identify audio descriptions using a linear program
+ # x is fit_err_pos, fit_err_neg, delta_fit_pos, delta_fit_neg
+ # fit_err[i] = relative_probs[i] - y_fit[i]
+ # delta_fit[i] = y_fit[i] - y_fit[i-1]
+ # this can be rewritten in terms of fit_err by re-arranging the 1st equation:
+ # y_fit[i] = relative_probs[i] - fit_err[i]
+ # this gives:
+ # delta_fit[i] = (relative_probs[i] - relative_probs[i-1]) -\
+ # (fit_err[i] - fit_err[i-1])
+ # the delta_fit variables can then be set using equality constraints
+ num_fit_points = len(relative_probs)
+ y_diffs = np.diff(relative_probs)
+ pos_err_cost_factor = MIN_DESC_DURATION / float(TIMESTEP_SIZE_SECONDS)
+ neg_err_cost_factor = MAX_GAP_IN_DESC_SEC / float(TIMESTEP_SIZE_SECONDS)
+ c = np.hstack([np.ones(num_fit_points) / pos_err_cost_factor,
+ np.ones(num_fit_points) / neg_err_cost_factor,
+ np.ones(num_fit_points - 1) / 2.,
+ np.ones(num_fit_points - 1) / 2.])
+ fit_err_coeffs = scipy.sparse.diags([-np.ones(num_fit_points),
+ np.ones(num_fit_points)],
+ offsets=[0,1],
+ shape=(num_fit_points - 1, num_fit_points)).tocsc()
+ A_eq = scipy.sparse.hstack([ fit_err_coeffs,
+ -fit_err_coeffs,
+ scipy.sparse.eye(num_fit_points-1),
+ -scipy.sparse.eye(num_fit_points-1)])
+ b_eq = y_diffs
+ fit = scipy.optimize.linprog(c, A_eq=A_eq, b_eq=b_eq)
+ if not fit.success:
+ print(fit)
+ raise RuntimeError("Describer Voice Detection L1-Min Optimization Failed!")
+
+ # combine fit_err_pos and fit_err_neg
+ fit_err = fit.x[:num_fit_points] - fit.x[num_fit_points:2*num_fit_points]
+
+ # subtract fit errors from nodes to retrieve the smoothed fit
+ smooth_desc_locations = relative_probs - fit_err
+
+ # hard threshold to classify each token as describer speech or not
+ speech_mask = smooth_desc_locations > 1. - 1.5 * detect_sensitivity
+ speech_mask *= aligned_mask
+
+ # a separate mask is created for describer volume boosting
+ # as losing the describer's voice entirely is usually worse than it just being quiet
+ # and imperfectly aligned segments may have descriptions, but shouldn't be boosted
+ boost_mask = smooth_desc_locations > 1. - 1.5 * boost_sensitivity
+ boost_mask *= well_aligned_mask
+
+ # convert a token classification into a mask that can be applied directly to samples
+ # unlike the input, the output isn't a boolean array but an array of floats
+ def token_mask_to_sample_mask(token_mask):
+ description_timings = video_timings[1:-1][token_mask[1:-1]]
+ sample_mask = np.zeros(video_arr.shape[1], dtype=np.float32)
+ window_radius = int(AUDIO_SAMPLE_RATE * TIMESTEP_SIZE_SECONDS)
+    window_size_samples = 2 * window_radius + 1
+    bump = scipy.signal.windows.hann(window_size_samples)
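+    # each flagged token contributes a Hann bump, so the mask ramps in and out smoothly instead of switching abruptly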
+ for description_timing in description_timings:
+ window_center = int(description_timing * AUDIO_SAMPLE_RATE)
+ sample_mask[window_center-window_radius:window_center+window_radius+1] += bump
+ return sample_mask
+
+ speech_sample_mask = token_mask_to_sample_mask(speech_mask)
+ boost_sample_mask = token_mask_to_sample_mask(boost_mask)
+ ad_timings = video_timings.copy()
+ ad_timings[~speech_mask] = np.inf
+
+ return speech_sample_mask, boost_sample_mask, ad_timings
+
+# Convert piece-wise linear fit to ffmpeg expression for editing video frame timestamps
+def encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame):
+ # PTS is the input frame's presentation timestamp, which is when frames are displayed
+ # TB is the timebase, which is how many seconds each unit of PTS corresponds to
+ # the output value of the expression will be the frame's new PTS
+ setts_cmd = ['TS']
+ start_skip = max(0, video_offset - start_key_frame)
+ if start_skip > 0:
+ # lossless cutting can only happen at key frames, so we cut the video before the audio starts
+ # but that means the video is behind the audio and needs to catch up by playing quicker
+ # catchup_spread is the ratio of time to spend catching up to the amount of catching up needed
+ catchup_spread = 1./CATCHUP_RATE
+ setts_cmd.append(f'+clip(TS-STARTPTS,0,{start_skip*(1+catchup_spread)}/TB)*{-1./(1+catchup_spread)}')
+ elif video_offset < 0:
+ # if the audio starts before the video, stretch the first frame of the video back to meet it
+ setts_cmd.append(f'+clip(TS-STARTPTS,0,{-video_offset/10000.}/TB)*10000')
+ # each segment of the linear fit can be encoded as a single clip function
+ setts_cmd.append('+(0')
+ for clip_start, clip_end in clips:
+ audio_desc_start, video_start = smooth_path[clip_start]
+ audio_desc_end, video_end = smooth_path[clip_end]
+ video_start -= start_key_frame
+ video_end -= start_key_frame
+ audio_desc_length = audio_desc_end - audio_desc_start
+ video_length = video_end - video_start
+ slope = audio_desc_length / video_length
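+    # e.g. if the audio segment is 2% longer (slope = 1.02), frames inside the segment are delayed by
+    # 0.02 s per second of video, and every later frame by the segment's full length difference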
+ setts_cmd.append(f'+clip(TS-STARTPTS-{video_start:.4f}/TB,0,{max(0,video_length):.4f}/TB)*{slope-1:.9f}')
+ setts_cmd.append(')')
+ setts_cmd = ''.join(setts_cmd)
+ return setts_cmd
+
+def get_ffmpeg():
+ return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[0]
+
+def get_ffprobe():
+ return static_ffmpeg.run._get_or_fetch_platform_executables_else_raise_no_lock()[1]
+
+def get_closest_key_frame_time(video_file, time):
+ if time <= 0:
+ return 0
+ key_frames = ffmpeg.probe(video_file, cmd=get_ffprobe(), select_streams='v',
+ show_frames=None, skip_frame='nokey')['frames']
+ key_frame_times = np.array([float(frame['pts_time']) for frame in key_frames] + [0])
+ return np.max(key_frame_times[key_frame_times <= time])
+
+# outputs a new media file with the replaced audio (which includes audio descriptions)
+def write_replaced_media_to_disk(output_filename, media_arr, video_file=None, audio_desc_file=None,
+ setts_cmd=None, start_key_frame=None):
+ if audio_desc_file is None:
+ media_input = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le',
+ ac=2, ar=AUDIO_SAMPLE_RATE)
+ if video_file is None or os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
+ write_command = ffmpeg.output(media_input, output_filename, loglevel='fatal').overwrite_output()
+ else:
+ original_video = ffmpeg.input(video_file)
+ # "-max_interleave_delta 0" is sometimes necessary to fix an .mkv bug that freezes audio/video:
+ # ffmpeg bug warning: [matroska @ 0000000002c814c0] Starting new cluster due to timestamp
+ # more info about the bug and fix: https://reddit.com/r/ffmpeg/comments/efddfs/
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
+ acodec='copy', vcodec='copy', scodec='copy',
+ max_interleave_delta='0', loglevel='fatal',
+ **{"c:a:0": "aac", "disposition:a:0": "default"}).overwrite_output()
+ ffmpeg_caller = write_command.run_async(pipe_stdin=True, cmd=get_ffmpeg())
+ ffmpeg_caller.stdin.write(media_arr.astype(np.int16).T.tobytes())
+ ffmpeg_caller.stdin.close()
+ ffmpeg_caller.wait()
+ else:
+ media_input = ffmpeg.input(audio_desc_file)
+ audio_desc_streams = ffmpeg.probe(audio_desc_file, cmd=get_ffprobe(), select_streams='a',
+ show_entries='format=duration')['streams']
+ audio_desc_duration = max([float(stream['duration']) for stream in audio_desc_streams])
+ original_video = ffmpeg.input(video_file, an=None, ss=start_key_frame)
+ if os.path.splitext(output_filename)[1] == os.path.splitext(video_file)[1]:
+ # wav files don't have codecs compatible with most video containers, so we convert to aac
+ audio_codec = 'copy' if os.path.splitext(audio_desc_file)[1] != '.wav' else 'aac'
+ write_command = ffmpeg.output(media_input, original_video, output_filename,
+ acodec=audio_codec, vcodec='copy', scodec='copy',
+ max_interleave_delta='0', loglevel='fatal',
+ **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
+ 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
+ write_command.run(cmd=get_ffmpeg())
+ else:
+      # workaround for a bug that sometimes breaks setts when output and input formats differ
+ # the trick is separating the input and output by piping from one ffmpeg process into another
+ # mkv files break if 'nut' is used, while other files break when 'matroska' is used
+ format = 'matroska' if os.path.splitext(output_filename)[1] == '.mkv' else 'nut'
+ write_command = ffmpeg.output(original_video, 'pipe:', format=format, vsync='passthrough',
+ c='copy', loglevel='fatal')
+ ffmpeg_caller = write_command.run_async(pipe_stdout=True, cmd=get_ffmpeg())
+ pipe_input = ffmpeg.input('pipe:', format=format, thread_queue_size='512')
+ write_command2 = ffmpeg.output(media_input, pipe_input, output_filename, c='copy',
+ max_interleave_delta='0', loglevel='fatal', vsync='passthrough',
+ **{'bsf:v': 'setts=ts=\'' + setts_cmd + '\'',
+ 'bsf:s': 'setts=ts=\'' + setts_cmd + '\''}).overwrite_output()
+ ffmpeg_caller2 = write_command2.run_async(pipe_stdin=True, cmd=get_ffmpeg())
+ while True:
+ in_bytes = ffmpeg_caller.stdout.read(100000)
+ if not in_bytes:
+ break
+ ffmpeg_caller2.stdin.write(in_bytes)
+ ffmpeg_caller2.stdin.close()
+ ffmpeg_caller.wait()
+ ffmpeg_caller2.wait()
+
+
+# check whether static_ffmpeg has already installed ffmpeg and ffprobe
+def is_ffmpeg_installed():
+ ffmpeg_dir = static_ffmpeg.run.get_platform_dir()
+ indicator_file = os.path.join(ffmpeg_dir, "installed.crumb")
+ return os.path.exists(indicator_file)
+
+# combines videos with matching audio files (e.g. audio descriptions)
+# this is the main function of this script; it calls the other functions in order
+def combine(video, audio, smoothness=50, stretch_audio=False, keep_non_ad=False,
+ boost=0, ad_detect_sensitivity=.6, boost_sensitivity=.4, yes=False,
+ prepend="ad_", no_pitch_correction=False, output_dir=default_output_dir,
+ alignment_dir=default_alignment_dir, extension="copy", display_func=None):
+ video_files, video_file_types = get_sorted_filenames(video, VIDEO_EXTENSIONS, AUDIO_EXTENSIONS)
+
+ if yes == False and sum(video_file_types) > 0:
+ print("")
+ print("One or more audio files found in video input. Was this intentional?")
+ print("If not, press ctrl+c to kill this script.")
+ input("If this was intended, press Enter to continue...")
+ print("")
+ audio_desc_files, _ = get_sorted_filenames(audio, AUDIO_EXTENSIONS)
+ if len(video_files) != len(audio_desc_files):
+ error_msg = ["Number of valid files in input paths are not the same.",
+ f"The video path has {len(video_files)} files",
+ f"The audio path has {len(audio_desc_files)} files"]
+ raise RuntimeError("\n".join(error_msg))
+
+ ensure_folders_exist([output_dir], display_func)
+ if PLOT_ALIGNMENT_TO_FILE:
+ ensure_folders_exist([alignment_dir], display_func)
+
+ display("", display_func)
+ for (video_file, audio_desc_file) in zip(video_files, audio_desc_files):
+ display(os.path.split(video_file)[1], display_func)
+ display(os.path.split(audio_desc_file)[1], display_func)
+ display("", display_func)
+ if yes == False:
+ print("Are the above input file pairings correct?")
+ print("If not, press ctrl+c to kill this script.")
+ input("If they are correct, press Enter to continue...")
+ print("")
+
+ # if ffmpeg isn't installed, install it
+ if not is_ffmpeg_installed():
+ display("Downloading and installing ffmpeg (media editor, 50 MB download)...", display_func)
+ get_ffmpeg()
+ if not is_ffmpeg_installed():
+      raise RuntimeError("Failed to install ffmpeg.")
+ display("Successfully installed ffmpeg.", display_func)
+
+ display("Processing files:", display_func)
+
+ for (video_file, audio_desc_file, video_filetype) in zip(video_files, audio_desc_files,
+ video_file_types):
+ # Default is to use the input video's extension for the output video
+ if extension is None or extension in ["", "copy"]:
+ ext = os.path.splitext(video_file)[1]
+ else:
+ # add a dot to the extension if it's missing
+ ext = ('' if extension[0] == '.' else '.') + extension
+ output_filename = prepend + os.path.splitext(os.path.split(video_file)[1])[0] + ext
+ output_filename = os.path.join(output_dir, output_filename)
+ display(" " + output_filename, display_func)
+
+ if os.path.exists(output_filename) and os.path.getsize(output_filename) > 0:
+ display(" output file already exists, skipping...", display_func)
+ continue
+
+ video_arr = parse_audio_from_file(video_file)
+ audio_desc_arr = parse_audio_from_file(audio_desc_file)
+ video_spec_raw, video_timings = tokenize_audio(video_arr)
+ video_spec = normalize_spec(video_spec_raw)
+ audio_desc_spec_raw, audio_desc_timings = tokenize_audio_dither(audio_desc_arr, video_timings)
+ audio_desc_spec = normalize_spec(audio_desc_spec_raw)
+
+ # rescale RMS intensity of audio to match video
+ audio_desc_arr *= (np.std(video_arr) / np.std(audio_desc_arr))
+
+ path, quals = rough_align(video_spec, audio_desc_spec, video_timings, audio_desc_timings)
+
+ smooth_path, runs, bad_clips, clips = smooth_align(path, quals, smoothness)
+
+ cap_synced_end_points(smooth_path, video_arr, audio_desc_arr)
+
+ ad_timings = None
+ if stretch_audio:
+ if keep_non_ad:
+ video_arr_original = video_arr.copy()
+
+ replace_aligned_segments(video_arr, audio_desc_arr, smooth_path, runs, no_pitch_correction)
+ del audio_desc_arr
+
+ if keep_non_ad or boost != 0:
+ outputs = detect_describer(video_arr, video_spec, video_spec_raw, video_timings,
+ smooth_path, ad_detect_sensitivity, boost_sensitivity)
+ speech_sample_mask, boost_sample_mask, ad_timings = outputs
+ if keep_non_ad:
+ video_arr *= speech_sample_mask
+ video_arr += video_arr_original * (1 - speech_sample_mask)
+ del video_arr_original
+ del speech_sample_mask
+ else:
+ ad_timings = None
+ if boost != 0:
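+          # e.g. boost=3 scales masked samples by 10**(3/10) ~= 2.0, matching the "2x louder" wording in the help text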
+ video_arr = video_arr * (1. + (10**(boost / 10.) - 1.) * boost_sample_mask)
+ del boost_sample_mask
+
+ # prevent peaking by rescaling to within +/- 16,382
+ video_arr *= (2**15 - 2.) / np.max(np.abs(video_arr))
+
+ if video_filetype == 0:
+ write_replaced_media_to_disk(output_filename, video_arr, video_file)
+ else:
+ write_replaced_media_to_disk(output_filename, video_arr)
+ else:
+ if video_filetype == 1:
+ raise RuntimeError("Argument --stretch_audio is required when both inputs are audio files.")
+ if os.path.splitext(output_filename)[1][1:] in AUDIO_EXTENSIONS:
+ raise RuntimeError("Argument --stretch_audio is required when output file extension is an audio filetype.")
+ video_offset = np.diff(smooth_path[clips[0][0]])[0]
+ start_key_frame = get_closest_key_frame_time(video_file, video_offset)
+ setts_cmd = encode_fit_as_ffmpeg_expr(smooth_path, clips, video_offset, start_key_frame)
+ write_replaced_media_to_disk(output_filename, None, video_file, audio_desc_file,
+ setts_cmd, start_key_frame)
+
+ del video_arr
+ if PLOT_ALIGNMENT_TO_FILE:
+ plot_filename_no_ext = os.path.join(alignment_dir, os.path.splitext(os.path.split(video_file)[1])[0])
+ plot_alignment(plot_filename_no_ext, path, smooth_path, quals, runs, bad_clips, ad_timings)
+ display("All files processed.", display_func)
+
+def write_config_file(config_path, settings):
+ config = configparser.ConfigParser()
+ config.add_section('alignment')
+ config['alignment'] = {}
+ for key, value in settings.items():
+ config['alignment'][key] = str(value)
+ with open(config_path, 'w') as f:
+ config.write(f)
+
+def read_config_file(config_path):
+ config = configparser.ConfigParser()
+ config.read(config_path)
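+  # missing settings fall back to the same defaults as the command-line flags;
+  # if the config file has no 'alignment' section yet, one is written out with those defaults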
+ settings = {'smoothness': config.getfloat('alignment', 'smoothness', fallback=50),
+ 'stretch_audio': config.getboolean('alignment', 'stretch_audio', fallback=False),
+ 'keep_non_ad': config.getboolean('alignment', 'keep_non_ad', fallback=False),
+ 'boost': config.getfloat('alignment', 'boost', fallback=0),
+ 'ad_detect_sensitivity':config.getfloat('alignment', 'ad_detect_sensitivity', fallback=.6),
+ 'boost_sensitivity': config.getfloat('alignment', 'boost_sensitivity', fallback=.4),
+ 'prepend': config.get('alignment', 'prepend', fallback='ad_'),
+ 'no_pitch_correction': config.getboolean('alignment', 'no_pitch_correction', fallback=False),
+ 'output_dir': config.get('alignment', 'output_dir', fallback=default_output_dir),
+ 'alignment_dir': config.get('alignment', 'alignment_dir', fallback=default_alignment_dir),
+ 'extension': config.get('alignment', 'extension', fallback='copy')}
+ if not config.has_section('alignment'):
+ write_config_file(config_path, settings)
+ return settings
+
+def settings_gui(config_path):
+ settings = read_config_file(config_path)
+ layout = [[sg.Text('Check tooltips (i.e. mouse-over text) for descriptions:')],
+ [sg.Column([[sg.Text('extension:', size=(10, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['extension']), size=(8, 1.2), pad=(10,5), key='extension',
+ tooltip='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
+ 'file type of the corresponding input video. Default is "copy".')]])],
+ [sg.Column([[sg.Text('prepend:', size=(8, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['prepend']), size=(8, 1.2), pad=(10,5), key='prepend',
+ tooltip='Output file name prepend text. Default is "ad_"')]])],
+ [sg.Column([[sg.Text('output_dir:', size=(10, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['output_dir']), size=(22, 1.2), pad=(10,5), key='output_dir',
+ tooltip='Directory combined output media is saved to. Default is "videos_with_ad"'),
+ sg.FolderBrowse(button_text="Browse Folder", key='output_browse')]])],
+ [sg.Column([[sg.Text('alignment_dir:', size=(13, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['alignment_dir']), size=(22, 1.2), pad=(10,5), key='alignment_dir',
+ tooltip='Directory alignment data and plots are saved to. Default is "alignment_plots"'),
+ sg.FolderBrowse(button_text="Browse Folder", key='alignment_browse')]], pad=(2,7))],
+ [sg.Column([[sg.Text('smoothness:', size=(12, 1), pad=(1,5)),
+ sg.Input(default_text=str(settings['smoothness']), size=(8, 1.2), pad=(10,5), key='smoothness',
+ tooltip='Lower values make the alignment more accurate when there are skips ' + \
+ '(e.g. describer pauses), but also make it more likely to misalign. ' + \
+ 'Default is 50.')]])],
+ [sg.Checkbox('stretch_audio', default=settings['stretch_audio'], key='stretch_audio', change_submits=True,
+ tooltip='Stretches the input audio to fit the input video. ' + \
+ 'Default is to stretch the video to fit the audio.')],
+ [sg.Checkbox('keep_non_ad', default=settings['keep_non_ad'], key='keep_non_ad',
+ disabled=not settings['stretch_audio'],
+ tooltip='Tries to only replace segments with audio description. Useful if ' + \
+ 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')],
+ [sg.Column([[sg.Text('boost:', size=(6, 1), pad=(1,5)),
+ sg.Input(default_text=str(settings['boost']), size=(8, 1.2), pad=(10,5),
+ key='boost', disabled=not settings['stretch_audio'],
+ tooltip='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
+ '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')]])],
+ [sg.Column([[sg.Text('ad_detect_sensitivity:', size=(21, 1.2), pad=(2,5)),
+ sg.Input(default_text=str(settings['ad_detect_sensitivity']), size=(8, 1.2), pad=(10,5),
+ key='ad_detect_sensitivity', disabled=not settings['stretch_audio'],
+ tooltip='Audio description detection sensitivity ratio. Higher values make ' + \
+ '--keep_non_ad more likely to replace aligned audio. Default is 0.6')]])],
+ [sg.Column([[sg.Text('boost_sensitivity:', size=(17, 1.2), pad=(1,5)),
+ sg.Input(default_text=str(settings['boost_sensitivity']), size=(8, 1.2), pad=(10,5),
+ key='boost_sensitivity', disabled=not settings['stretch_audio'],
+ tooltip='Higher values make --boost less likely to miss a description, but ' + \
+ 'also make it more likely to boost non-description audio. Default is 0.4')]])],
+ [sg.Checkbox('no_pitch_correction', default=settings['no_pitch_correction'], key='no_pitch_correction',
+ disabled=not settings['stretch_audio'],
+ tooltip='Skips pitch correction step when stretching audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')],
+ [sg.Column([[sg.Submit('Save', pad=(40,3)),
+ sg.Button('Cancel')]], pad=((135,3),10))]]
+ settings_window = sg.Window('Settings - describealign', layout, font=('Arial', 16), finalize=True)
+ settings_window['extension'].set_focus()
+ while True:
+ event, values = settings_window.read()
+ if event in (sg.WIN_CLOSED, 'Cancel') or settings_window.TKrootDestroyed:
+ break
+ if event == 'stretch_audio':
+ # work around bug in PySimpleGUIWx's InputText Update function where enabling/disabling are flipped
+ if IS_RUNNING_WINDOWS:
+ settings_window['boost'].Update(disabled = values['stretch_audio'])
+ settings_window['ad_detect_sensitivity'].Update(disabled = values['stretch_audio'])
+ settings_window['boost_sensitivity'].Update(disabled = values['stretch_audio'])
+ else:
+ settings_window['boost'].Update(disabled = not values['stretch_audio'])
+ settings_window['ad_detect_sensitivity'].Update(disabled = not values['stretch_audio'])
+ settings_window['boost_sensitivity'].Update(disabled = not values['stretch_audio'])
+ settings_window['keep_non_ad'].Update(disabled = not values['stretch_audio'])
+ settings_window['no_pitch_correction'].Update(disabled = not values['stretch_audio'])
+ if event == 'Save':
+ settings = values.copy()
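+ # remove the FolderBrowse helper keys so only real settings get written to the config file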
+ del settings['output_browse']
+ del settings['alignment_browse']
+ write_config_file(config_path, settings)
+ break
+ settings_window.close()
+
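+# runs combine() and forwards any exception traceback to the GUI through the print queue
+# (used as the target of the worker process spawned by combine_gui)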
+def combine_print_exceptions(print_queue, *args, **kwargs):
+ try:
+ combine(*args, **kwargs)
+ except Exception:
+ print_queue.put(traceback.format_exc())
+ # raise
+
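+# runs the combiner in a separate process and streams its console output into a scrolling text window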
+def combine_gui(video_files, audio_files, config_path):
+ output_textbox = sg.Multiline(size=(80,30), key='-OUTPUT-')
+ layout = [[output_textbox],
+ [sg.Button('Close', pad=(360,5))]]
+ combine_window = sg.Window('Combining - describealign', layout, font=('Arial', 16),
+ disable_close=True, finalize=True)
+ output_textbox.update('Combining media files:', append=True)
+ print_queue = multiprocessing.Queue()
+
+ settings = read_config_file(config_path)
+ settings.update({'display_func':print_queue.put, 'yes':True})
+ proc = multiprocessing.Process(target=combine_print_exceptions,
+ args=(print_queue, video_files, audio_files),
+ kwargs=settings, daemon=True)
+ proc.start()
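+ # poll the worker's output queue and the window's events (100 ms timeout) until the worker
+ # finishes or the user closes the window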
+ while True:
+ # if the script isn't running anymore, re-enable the default close window button
+ if not proc.is_alive():
+ combine_window.DisableClose = False
+ if not print_queue.empty():
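+ # on the Wx backend, save and restore the text caret's insertion point across the append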
+ if IS_RUNNING_WINDOWS:
+ cursor_position = output_textbox.WxTextCtrl.GetInsertionPoint()
+ output_textbox.update('\n' + print_queue.get(), append=True)
+ if IS_RUNNING_WINDOWS:
+ output_textbox.WxTextCtrl.SetInsertionPoint(cursor_position)
+ event, values = combine_window.read(timeout=100)
+ # window closed event isn't always emitted, so also manually check window status
+ if event == sg.WIN_CLOSED or combine_window.TKrootDestroyed:
+ if proc.is_alive():
+ proc.terminate()
+ break
+ if event == 'Close':
+ if not proc.is_alive():
+ combine_window.DisableClose = False
+ break
+ selection = sg.PopupYesNo('Combiner is still running, stop it and close anyway?')
+ if selection != 'Yes':
+ continue
+ proc.terminate()
+ combine_window.DisableClose = False
+ break
+ combine_window.close()
+
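+# builds the main window: pick video and audio files, then Combine or adjust Settings
+# (settings persist in config.ini next to this script)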
+def main_gui():
+ config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.ini')
+ sg.theme('Light Blue 2')
+
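+ # build file-type filters for the browse dialogs; the pattern separator differs between
+ # the Wx (Windows) and Qt backends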
+ filetype_sep = ';' if IS_RUNNING_WINDOWS else ' '
+ all_audio_file_types = [('All Audio File Types', '*.' + f'{filetype_sep}*.'.join(AUDIO_EXTENSIONS)),]
+ all_video_file_types = [('All Video File Types', '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS)),]
+ all_video_and_audio_file_types = [('All Video and Audio File Types',
+ '*.' + f'{filetype_sep}*.'.join(VIDEO_EXTENSIONS | AUDIO_EXTENSIONS)),]
+ audio_file_types = [(ext, "*." + ext) for ext in AUDIO_EXTENSIONS]
+ video_and_audio_file_types = [(ext, "*." + ext) for ext in VIDEO_EXTENSIONS] + audio_file_types
+ audio_file_types = all_audio_file_types + audio_file_types
+ video_and_audio_file_types = all_video_file_types + all_video_and_audio_file_types + video_and_audio_file_types
+ # work around bug in PySimpleGUIWx's convert_tkinter_filetypes_to_wx function
+ if IS_RUNNING_WINDOWS:
+ file_fix = lambda file_types: file_types[:1] + [('|' + ftype[0], ftype[1]) for ftype in file_types[1:]]
+ audio_file_types = file_fix(audio_file_types)
+ video_and_audio_file_types = file_fix(video_and_audio_file_types)
+
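+ # main window layout: one row each for video and audio file selection, plus Combine and Settings buttons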
+ layout = [[sg.Text('Select media files to combine:', size=(40, 2), font=('Arial', 20), pad=(3,15))],
+ [sg.Column([[sg.Text('Video Input:', size=(11, 2), pad=(1,5)),
+ sg.Input(size=(35, 1.2), pad=(10,5), key='-VIDEO_FILES-',
+ tooltip='List video filenames here, in order, separated by semicolons'),
+ sg.FilesBrowse(button_text="Browse Video",
+ file_types=video_and_audio_file_types,
+ tooltip='Select one or more video files')]], pad=(2,7))],
+ [sg.Column([[sg.Text('Audio Input:', size=(11, 2), pad=(1,5)),
+ sg.Input(size=(35, 1.2), pad=(10,5), key='-AUDIO_FILES-',
+ tooltip='List audio filenames here, in order, separated by semicolons'),
+ sg.FilesBrowse(button_text="Browse Audio",
+ file_types=audio_file_types,
+ tooltip='Select one or more audio files')]], pad=(2,7))],
+ [sg.Column([[sg.Submit('Combine', pad=(40,3), tooltip='Combine selected video and audio files'),
+ sg.Button('Settings', tooltip='Edit settings for the GUI and algorithm.')]],
+ pad=((135,3),10))]]
+ window = sg.Window('describealign', layout, font=('Arial', 16), resizable=False, finalize=True)
+ window['-VIDEO_FILES-'].set_focus()
+ while True:
+ event, values = window.read()
+ if event == 'Combine':
+ if len(values['-VIDEO_FILES-']) == 0 or \
+ len(values['-AUDIO_FILES-']) == 0:
+ window.disable()
+ sg.Popup('Error: empty input field.', font=('Arial', 20))
+ window.enable()
+ continue
+ video_files = values['-VIDEO_FILES-'].split(';')
+ audio_files = values['-AUDIO_FILES-'].split(';')
+ combine_gui(video_files, audio_files, config_path)
+ if event == 'Settings':
+ window.disable()
+ settings_gui(config_path)
+ window.enable()
+ if event == sg.WIN_CLOSED:
+ break
+ window.close()
+
+# Entry point for command line interaction, for example:
+# > describealign video.mp4 audio_desc.mp3
+def command_line_interface():
+ # override the argument parser's error handler so that running with no input arguments
+ # (e.g. launching the executable directly) starts the GUI instead of printing a usage error
+ class ArgumentParser(argparse.ArgumentParser):
+ def error(self, message):
+ if 'required: video, audio' in message:
+ print('No input arguments detected, starting GUI...')
+ main_gui()
+ self.exit()
+ else:
+ self.exit(2, f'{self.prog}: error: {message}\n')
+ parser = ArgumentParser(description="Replaces a video's sound with an audio description.",
+ usage="describealign video_file.mp4 audio_file.mp3")
+ parser.add_argument("video", help='A video file or directory containing video files.')
+ parser.add_argument("audio", help='An audio file or directory containing audio files.')
+ parser.add_argument('--smoothness', type=float, default=50,
+ help='Lower values make the alignment more accurate when there are skips ' + \
+ '(e.g. describer pauses), but also make it more likely to misalign. ' + \
+ 'Default is 50.')
+ parser.add_argument('--stretch_audio', action='store_true',
+ help='Stretches the input audio to fit the input video. ' + \
+ 'Default is to stretch the video to fit the audio.')
+ parser.add_argument('--keep_non_ad', action='store_true',
+ help='Tries to replace only the segments containing audio description. Useful if the ' + \
+ 'video\'s audio quality is better. Default is to replace all aligned audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
+ parser.add_argument('--boost', type=float, default=0,
+ help='Boost (or quieten) description volume. Units are decibels (dB), so ' + \
+ '-3 makes the describer about 2x quieter, while 3 makes them 2x louder. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
+ parser.add_argument('--ad_detect_sensitivity', type=float, default=.6,
+ help='Audio description detection sensitivity ratio. Higher values make ' + \
+ '--keep_non_ad more likely to replace aligned audio. Default is 0.6')
+ parser.add_argument('--boost_sensitivity', type=float, default=.4,
+ help='Higher values make --boost less likely to miss a description, but ' + \
+ 'also make it more likely to boost non-description audio. Default is 0.4')
+ parser.add_argument('--yes', action='store_true',
+ help='Auto-skips user prompts asking to verify information.')
+ parser.add_argument("--prepend", default="ad_", help='Output file name prepend text. Default is "ad_"')
+ parser.add_argument('--no_pitch_correction', action='store_true',
+ help='Skips pitch correction step when stretching audio. ' + \
+ 'Requires --stretch_audio to be set, otherwise does nothing.')
+ parser.add_argument("--output_dir", default=default_output_dir,
+ help='Directory combined output media is saved to. Default is "videos_with_ad"')
+ parser.add_argument("--alignment_dir", default=default_alignment_dir,
+ help='Directory alignment data and plots are saved to. Default is "alignment_plots"')
+ parser.add_argument("--extension", default="copy",
+ help='File type of output video (e.g. mkv). When set to "copy", copies the ' + \
+ 'file type of the corresponding input video. Default is "copy".')
+ args = parser.parse_args()
+
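+ # hand the parsed options to combine(), which performs the alignment and writes the output files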
+ combine(args.video, args.audio, args.smoothness, args.stretch_audio, args.keep_non_ad,
+ args.boost, args.ad_detect_sensitivity, args.boost_sensitivity, args.yes,
+ args.prepend, args.no_pitch_correction, args.output_dir, args.alignment_dir,
+ args.extension)
+
+# allows the script to be run on its own, rather than through the package, for example:
+# python3 describealign.py video.mp4 audio_desc.mp3
+if __name__ == "__main__":
+ multiprocessing.freeze_support()
+ command_line_interface()
+
+
+
+
diff --git a/pyproject.toml b/pyproject.toml
index 3a8ccb9..48d21e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,27 +1,27 @@
-[build-system]
-requires = ["setuptools>=61.0"]
-build-backend = "setuptools.build_meta"
-
-[project]
-name = "describealign"
-authors = [{ name = "Julian Brown", email = "julbean@proton.me" }]
-description = "Combines videos with matching audio files (e.g. audio descriptions)"
-readme = "README.md"
-requires-python = ">=3.8"
-classifiers = [
- "Programming Language :: Python :: 3",
- "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
- "Operating System :: OS Independent",
-]
-dynamic = ["version", "dependencies"]
-
-[tool.setuptools.dynamic]
-version = { file = "version" }
-dependencies = { file = "requirements.txt" }
-
-[project.scripts]
-describealign = "describealign:command_line_interface"
-
-[project.urls]
-"Homepage" = "/~https://github.com/julbean/describealign"
-"Bug Tracker" = "/~https://github.com/julbean/describealign/issues"
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "describealign"
+authors = [{ name = "Julian Brown", email = "julbean@proton.me" }]
+description = "Combines videos with matching audio files (e.g. audio descriptions)"
+readme = "README.md"
+requires-python = ">=3.8"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+ "Operating System :: OS Independent",
+]
+dynamic = ["version", "dependencies"]
+
+[tool.setuptools.dynamic]
+version = { file = "version" }
+dependencies = { file = "requirements.txt" }
+
+[project.scripts]
+describealign = "describealign:command_line_interface"
+
+[project.urls]
+"Homepage" = "/~https://github.com/julbean/describealign"
+"Bug Tracker" = "/~https://github.com/julbean/describealign/issues"
diff --git a/requirements.txt b/requirements.txt
index 3bd4dd0..af751a0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
-ffmpeg_python~=0.2.0
-static-ffmpeg~=2.5
-matplotlib~=3.5.0
-numpy~=1.21.4
-python_speech_features~=0.6
-scipy~=1.10.1
-pytsmod~=0.3.7
-PySimpleGUIWx~=0.17.2; platform_system == 'Windows'
-PySimpleGUIQt~=0.35.0; platform_system != 'Windows'
-PySide2~=5.15.2.1; platform_system != "Windows"
+ffmpeg_python~=0.2.0
+static-ffmpeg~=2.5
+matplotlib~=3.5.0
+numpy~=1.21.4
+python_speech_features~=0.6
+scipy~=1.10.1
+pytsmod~=0.3.7
+PySimpleGUIWx~=0.17.2; platform_system == 'Windows'
+PySimpleGUIQt~=0.35.0; platform_system != 'Windows'
+PySide2~=5.15.2.1; platform_system != "Windows"
diff --git a/setup.cfg b/setup.cfg
index 51749fc..96fadd5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,3 +1,3 @@
-[egg_info]
-tag_build =
-tag_date = 0
+[egg_info]
+tag_build =
+tag_date = 0