Switch to audio fingerprinting based syncing (#393)

* rename merge-auto-tolerance -> merge-time-tolerance

* move format_time_delta to own file

* switch to audio fingerprinting based syncing

* move format_time_delta to own file

* simpler approach to determine negative time deltas

* add missing readme part for --sync-precision

* fix all clippy "errors"

* Use rust-native chromaprint port instead of ffmpeg

* buffer with 128kb instead of 32kb

* improve helps

* improve help

---------

Co-authored-by: bytedream <bytedream@protonmail.com>
This commit is contained in:
Simon 2024-05-02 00:35:13 +02:00 committed by GitHub
parent f237033aff
commit 72c574c883
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 555 additions and 325 deletions

View file

@ -0,0 +1,432 @@
use std::io::Read;
use std::process::Stdio;
use std::{
cmp,
collections::{HashMap, HashSet},
mem,
ops::Not,
path::Path,
process::Command,
};
use chrono::TimeDelta;
use crunchyroll_rs::Locale;
use log::debug;
use tempfile::TempPath;
use anyhow::{bail, Result};
use rusty_chromaprint::{Configuration, Fingerprinter};
use super::fmt::format_time_delta;
pub struct SyncAudio {
pub format_id: usize,
pub path: TempPath,
pub locale: Locale,
pub sample_rate: u32,
pub video_idx: usize,
}
#[derive(Debug, Clone, Copy)]
struct TimeRange {
start: f64,
end: f64,
}
pub fn sync_audios(
available_audios: &Vec<SyncAudio>,
sync_tolerance: u32,
sync_precision: u32,
) -> Result<Option<HashMap<usize, TimeDelta>>> {
let mut result: HashMap<usize, TimeDelta> = HashMap::new();
let mut sync_audios = vec![];
let mut chromaprints = HashMap::new();
let mut formats = HashSet::new();
for audio in available_audios {
if formats.contains(&audio.format_id) {
continue;
}
formats.insert(audio.format_id);
sync_audios.push((audio.format_id, &audio.path, audio.sample_rate));
chromaprints.insert(
audio.format_id,
generate_chromaprint(
&audio.path,
audio.sample_rate,
&TimeDelta::zero(),
&TimeDelta::zero(),
&TimeDelta::zero(),
)?,
);
}
sync_audios.sort_by_key(|sync_audio| chromaprints.get(&sync_audio.0).unwrap().len());
let base_audio = sync_audios.remove(0);
let mut start = f64::MAX;
let mut end = f64::MIN;
let mut initial_offsets = HashMap::new();
for audio in &sync_audios {
debug!(
"Initial comparison of format {} to {}",
audio.0, &base_audio.0
);
let (lhs_ranges, rhs_ranges) = compare_chromaprints(
chromaprints.get(&base_audio.0).unwrap(),
chromaprints.get(&audio.0).unwrap(),
sync_tolerance,
);
if lhs_ranges.is_empty() || rhs_ranges.is_empty() {
bail!(
"Failed to sync videos, couldn't find matching audio parts between format {} and {}",
base_audio.0 + 1,
audio.0 + 1
);
}
let lhs_range = lhs_ranges[0];
let rhs_range = rhs_ranges[0];
start = start.min(lhs_range.start);
end = end.max(lhs_range.end);
start = start.min(rhs_range.start);
end = end.max(rhs_range.end);
let offset = TimeDelta::milliseconds(((rhs_range.start - lhs_range.start) * 1000.0) as i64);
initial_offsets.insert(audio.0, TimeDelta::zero().checked_sub(&offset).unwrap());
debug!(
"Found initial offset of {}ms ({} - {} {}s) ({} - {} {}s) for format {} to {}",
offset.num_milliseconds(),
lhs_range.start,
lhs_range.end,
lhs_range.end - lhs_range.start,
rhs_range.start,
rhs_range.end,
rhs_range.end - rhs_range.start,
audio.0,
base_audio.0
);
}
debug!(
"Found matching audio parts at {} - {}, narrowing search",
start, end
);
let start = TimeDelta::milliseconds((start * 1000.0) as i64 - 20000);
let end = TimeDelta::milliseconds((end * 1000.0) as i64 + 20000);
for sync_audio in &sync_audios {
let chromaprint = generate_chromaprint(
sync_audio.1,
sync_audio.2,
&start,
&end,
initial_offsets.get(&sync_audio.0).unwrap(),
)?;
chromaprints.insert(sync_audio.0, chromaprint);
}
let mut runs: HashMap<usize, i64> = HashMap::new();
let iterator_range_limits: i64 = 2 ^ sync_precision as i64;
for i in -iterator_range_limits..=iterator_range_limits {
let base_offset = TimeDelta::milliseconds(
((0.128 / iterator_range_limits as f64 * i as f64) * 1000.0) as i64,
);
chromaprints.insert(
base_audio.0,
generate_chromaprint(base_audio.1, base_audio.2, &start, &end, &base_offset)?,
);
for audio in &sync_audios {
let initial_offset = initial_offsets.get(&audio.0).copied().unwrap();
let offset = find_offset(
(&base_audio.0, chromaprints.get(&base_audio.0).unwrap()),
&base_offset,
(&audio.0, chromaprints.get(&audio.0).unwrap()),
&initial_offset,
&start,
sync_tolerance,
);
if offset.is_none() {
continue;
}
let offset = offset.unwrap();
result.insert(
audio.0,
result
.get(&audio.0)
.copied()
.unwrap_or_default()
.checked_add(&offset)
.unwrap(),
);
runs.insert(audio.0, runs.get(&audio.0).copied().unwrap_or_default() + 1);
}
}
let mut result: HashMap<usize, TimeDelta> = result
.iter()
.map(|(format_id, offset)| {
(
*format_id,
TimeDelta::milliseconds(
offset.num_milliseconds() / runs.get(format_id).copied().unwrap(),
),
)
})
.collect();
result.insert(base_audio.0, TimeDelta::milliseconds(0));
Ok(Some(result))
}
fn find_offset(
lhs: (&usize, &Vec<u32>),
lhs_shift: &TimeDelta,
rhs: (&usize, &Vec<u32>),
rhs_shift: &TimeDelta,
start: &TimeDelta,
sync_tolerance: u32,
) -> Option<TimeDelta> {
let (lhs_ranges, rhs_ranges) = compare_chromaprints(lhs.1, rhs.1, sync_tolerance);
if lhs_ranges.is_empty() || rhs_ranges.is_empty() {
return None;
}
let lhs_range = lhs_ranges[0];
let rhs_range = rhs_ranges[0];
let offset = rhs_range.end - lhs_range.end;
let offset = TimeDelta::milliseconds((offset * 1000.0) as i64)
.checked_add(lhs_shift)?
.checked_sub(rhs_shift)?;
debug!(
"Found offset of {}ms ({} - {} {}s) ({} - {} {}s) for format {} to {}",
offset.num_milliseconds(),
lhs_range.start + start.num_milliseconds() as f64 / 1000.0,
lhs_range.end + start.num_milliseconds() as f64 / 1000.0,
lhs_range.end - lhs_range.start,
rhs_range.start + start.num_milliseconds() as f64 / 1000.0,
rhs_range.end + start.num_milliseconds() as f64 / 1000.0,
rhs_range.end - rhs_range.start,
rhs.0,
lhs.0
);
Some(offset)
}
fn generate_chromaprint(
input_file: &Path,
sample_rate: u32,
start: &TimeDelta,
end: &TimeDelta,
offset: &TimeDelta,
) -> Result<Vec<u32>> {
let mut ss_argument: &TimeDelta = &start.checked_sub(offset).unwrap();
let mut offset_argument = &TimeDelta::zero();
if *offset < TimeDelta::zero() {
ss_argument = start;
offset_argument = offset;
};
let mut printer = Fingerprinter::new(&Configuration::preset_test1());
printer.start(sample_rate, 2)?;
let mut command = Command::new("ffmpeg");
command
.arg("-hide_banner")
.arg("-y")
.args(["-ss", format_time_delta(ss_argument).as_str()]);
if end.is_zero().not() {
command.args(["-to", format_time_delta(end).as_str()]);
}
command
.args(["-itsoffset", format_time_delta(offset_argument).as_str()])
.args(["-i", input_file.to_string_lossy().to_string().as_str()])
.args(["-ac", "2"])
.args([
"-f",
if cfg!(target_endian = "big") {
"s16be"
} else {
"s16le"
},
])
.arg("-");
let mut handle = command
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()?;
// the stdout is read in chunks because keeping all the raw audio data in memory would take up
// a significant amount of space
let mut stdout = handle.stdout.take().unwrap();
let mut buf: [u8; 128_000] = [0; 128_000];
while handle.try_wait()?.is_none() {
loop {
let read_bytes = stdout.read(&mut buf)?;
if read_bytes == 0 {
break;
}
let data: [i16; 64_000] = unsafe { mem::transmute(buf) };
printer.consume(&data[0..(read_bytes / 2)])
}
}
if !handle.wait()?.success() {
bail!("{}", std::io::read_to_string(handle.stderr.unwrap())?)
}
printer.finish();
return Ok(printer.fingerprint().into());
}
fn compare_chromaprints(
lhs_chromaprint: &Vec<u32>,
rhs_chromaprint: &Vec<u32>,
sync_tolerance: u32,
) -> (Vec<TimeRange>, Vec<TimeRange>) {
let lhs_inverse_index = create_inverse_index(lhs_chromaprint);
let rhs_inverse_index = create_inverse_index(rhs_chromaprint);
let mut possible_shifts = HashSet::new();
for lhs_pair in lhs_inverse_index {
let original_point = lhs_pair.0;
for i in -2..=2 {
let modified_point = (original_point as i32 + i) as u32;
if rhs_inverse_index.contains_key(&modified_point) {
let rhs_index = rhs_inverse_index.get(&modified_point).copied().unwrap();
possible_shifts.insert(rhs_index as i32 - lhs_pair.1 as i32);
}
}
}
let mut all_lhs_time_ranges = vec![];
let mut all_rhs_time_ranges = vec![];
for shift_amount in possible_shifts {
let time_range_pair = find_time_ranges(
lhs_chromaprint,
rhs_chromaprint,
shift_amount,
sync_tolerance,
);
if time_range_pair.is_none() {
continue;
}
let (mut lhs_time_ranges, mut rhs_time_ranges) = time_range_pair.unwrap();
let mut lhs_time_ranges: Vec<TimeRange> = lhs_time_ranges
.drain(..)
.filter(|time_range| {
(20.0 < (time_range.end - time_range.start))
&& ((time_range.end - time_range.start) < 180.0)
&& time_range.end > 0.0
})
.collect();
lhs_time_ranges.sort_by(|a, b| (b.end - b.start).total_cmp(&(a.end - a.start)));
let mut rhs_time_ranges: Vec<TimeRange> = rhs_time_ranges
.drain(..)
.filter(|time_range| {
(20.0 < (time_range.end - time_range.start))
&& ((time_range.end - time_range.start) < 180.0)
&& time_range.end > 0.0
})
.collect();
rhs_time_ranges.sort_by(|a, b| (b.end - b.start).total_cmp(&(a.end - a.start)));
if lhs_time_ranges.is_empty() || rhs_time_ranges.is_empty() {
continue;
}
all_lhs_time_ranges.push(lhs_time_ranges[0]);
all_rhs_time_ranges.push(rhs_time_ranges[0]);
}
all_lhs_time_ranges.sort_by(|a, b| (a.end - a.start).total_cmp(&(b.end - b.start)));
all_lhs_time_ranges.reverse();
all_rhs_time_ranges.sort_by(|a, b| (a.end - a.start).total_cmp(&(b.end - b.start)));
all_rhs_time_ranges.reverse();
(all_lhs_time_ranges, all_rhs_time_ranges)
}
fn create_inverse_index(chromaprint: &Vec<u32>) -> HashMap<u32, usize> {
let mut inverse_index = HashMap::with_capacity(chromaprint.capacity());
for (i, fingerprint) in chromaprint.iter().enumerate().take(chromaprint.capacity()) {
inverse_index.insert(*fingerprint, i);
}
inverse_index
}
fn find_time_ranges(
lhs_chromaprint: &[u32],
rhs_chromaprint: &[u32],
shift_amount: i32,
sync_tolerance: u32,
) -> Option<(Vec<TimeRange>, Vec<TimeRange>)> {
let mut lhs_shift: i32 = 0;
let mut rhs_shift: i32 = 0;
if shift_amount < 0 {
lhs_shift -= shift_amount;
} else {
rhs_shift += shift_amount;
}
let mut lhs_matching_timestamps = vec![];
let mut rhs_matching_timestamps = vec![];
let upper_limit =
cmp::min(lhs_chromaprint.len(), rhs_chromaprint.len()) as i32 - shift_amount.abs();
for i in 0..upper_limit {
let lhs_position = i + lhs_shift;
let rhs_position = i + rhs_shift;
let difference = (lhs_chromaprint[lhs_position as usize]
^ rhs_chromaprint[rhs_position as usize])
.count_ones();
if difference > sync_tolerance {
continue;
}
lhs_matching_timestamps.push(lhs_position as f64 * 0.128);
rhs_matching_timestamps.push(rhs_position as f64 * 0.128);
}
lhs_matching_timestamps.push(f64::MAX);
rhs_matching_timestamps.push(f64::MAX);
let lhs_time_ranges = timestamps_to_ranges(lhs_matching_timestamps);
lhs_time_ranges.as_ref()?;
let lhs_time_ranges = lhs_time_ranges.unwrap();
let rhs_time_ranges = timestamps_to_ranges(rhs_matching_timestamps).unwrap();
Some((lhs_time_ranges, rhs_time_ranges))
}
fn timestamps_to_ranges(mut timestamps: Vec<f64>) -> Option<Vec<TimeRange>> {
if timestamps.is_empty() {
return None;
}
timestamps.sort_by(|a, b| a.total_cmp(b));
let mut time_ranges = vec![];
let mut current_range = TimeRange {
start: timestamps[0],
end: timestamps[0],
};
for i in 0..timestamps.len() - 1 {
let current = timestamps[i];
let next = timestamps[i + 1];
if next - current <= 1.0 {
current_range.end = next;
continue;
}
time_ranges.push(current_range);
current_range.start = next;
current_range.end = next;
}
if !time_ranges.is_empty() {
Some(time_ranges)
} else {
None
}
}