Switch to audio fingerprinting based syncing (#393)

* rename merge-auto-tolerance -> merge-time-tolerance

* move format_time_delta to own file

* switch to audio fingerprinting based syncing

* move format_time_delta to own file

* simpler approach to determine negative time deltas

* add missing readme part for --sync-precision

* fix all clippy "errors"

* Use rust-native chromaprint port instead of ffmpeg

* buffer with 128kb instead of 32kb

* improve helps

* improve help

---------

Co-authored-by: bytedream <bytedream@protonmail.com>
Simon authored on 2024-05-02 00:35:13 +02:00; committed by GitHub
parent f237033aff
commit 72c574c883
10 changed files with 555 additions and 325 deletions

Cargo.lock (generated)

@@ -179,18 +179,6 @@ version = "3.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
 
-[[package]]
-name = "bytemuck"
-version = "1.15.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15"
-
-[[package]]
-name = "byteorder"
-version = "1.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
-
 [[package]]
 name = "bytes"
 version = "1.6.0"
@@ -381,8 +369,6 @@ dependencies = [
  "fs2",
  "futures-util",
  "http",
- "image",
- "image_hasher",
  "indicatif",
  "lazy_static",
  "log",
@@ -391,6 +377,7 @@ dependencies = [
  "regex",
  "reqwest",
  "rustls-native-certs",
+ "rusty-chromaprint",
  "serde",
  "serde_json",
  "serde_plain",
@@ -951,32 +938,6 @@ dependencies = [
  "unicode-normalization",
 ]
 
-[[package]]
-name = "image"
-version = "0.25.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fd54d660e773627692c524beaad361aca785a4f9f5730ce91f42aabe5bce3d11"
-dependencies = [
- "bytemuck",
- "byteorder",
- "num-traits",
- "zune-core",
- "zune-jpeg",
-]
-
-[[package]]
-name = "image_hasher"
-version = "2.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9481465fe767d92494987319b0b447a5829edf57f09c52bf8639396abaaeaf78"
-dependencies = [
- "base64 0.22.0",
- "image",
- "rustdct",
- "serde",
- "transpose",
-]
-
 [[package]]
 name = "indexmap"
 version = "1.9.3"
@@ -1417,6 +1378,15 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "realfft"
+version = "3.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "953d9f7e5cdd80963547b456251296efc2626ed4e3cbf36c869d9564e0220571"
+dependencies = [
+ "rustfft",
+]
+
 [[package]]
 name = "redox_users"
 version = "0.4.5"
@@ -1531,21 +1501,24 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b833d8d034ea094b1ea68aa6d5c740e0d04bad9d16568d08ba6f76823a114316"
 
+[[package]]
+name = "rubato"
+version = "0.14.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6dd52e80cfc21894deadf554a5673002938ae4625f7a283e536f9cf7c17b0d5"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-traits",
+ "realfft",
+]
+
 [[package]]
 name = "rustc-demangle"
 version = "0.1.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76"
 
-[[package]]
-name = "rustdct"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8b61555105d6a9bf98797c063c362a1d24ed8ab0431655e38f1cf51e52089551"
-dependencies = [
- "rustfft",
-]
-
 [[package]]
 name = "rustfft"
 version = "6.2.0"
@@ -1628,6 +1601,16 @@ dependencies = [
  "untrusted",
 ]
 
+[[package]]
+name = "rusty-chromaprint"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1755646867c36ecb391776deaa0b557a76d3badf20c142de7282630c34b20440"
+dependencies = [
+ "rubato",
+ "rustfft",
+]
+
 [[package]]
 name = "ryu"
 version = "1.0.17"
@@ -2501,18 +2484,3 @@ name = "zeroize"
 version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "63381fa6624bf92130a6b87c0d07380116f80b565c42cf0d754136f0238359ef"
-
-[[package]]
-name = "zune-core"
-version = "0.4.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f423a2c17029964870cfaabb1f13dfab7d092a62a29a89264f4d36990ca414a"
-
-[[package]]
-name = "zune-jpeg"
-version = "0.4.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec866b44a2a1fd6133d363f073ca1b179f438f99e7e5bfb1e33f7181facfe448"
-dependencies = [
- "zune-core",
-]


@@ -462,7 +462,7 @@ The `archive` command lets you download episodes with multiple audios and subtitles.
 In the best case, when multiple audio & subtitle tracks are used, there is only one *video* track and all other languages can be stored as audio-only.
 But, as said, this is not always the case.
 With the `-m` / `--merge` flag you can define the behaviour when an episodes' video tracks differ in length.
-Valid options are `audio` - store one video and all other languages as audio only; `video` - store the video + audio for every language; `auto` - detect if videos differ in length: if so, behave like `video` - otherwise like `audio`.
+Valid options are `audio` - store one video and all other languages as audio only; `video` - store the video + audio for every language; `auto` - detect if videos differ in length: if so, behave like `video` - otherwise like `audio`; `sync` - detect if videos differ in length: if so, try to find the offset of matching audio parts and remove it from the beginning, otherwise behave like `audio`.
 Subtitles will always match the primary audio and video.
 
 ```shell
@@ -482,15 +482,18 @@ The `archive` command lets you download episodes with multiple audios and subtitles.
   Default are `200` milliseconds.
 
-- <span id="archive-sync-start">Sync start</span>
+- <span id="archive-sync-tolerance">Sync tolerance</span>
 
-  If you want that all videos of the same episode should start at the same time and `--merge` doesn't fit your needs (e.g. one video has an intro, all other doesn't), you might consider using the `--sync-start`.
-  It tries to sync the timing of all downloaded audios to match one video.
-  This is done by downloading the first few segments/frames of all video tracks that differ in length and comparing them frame by frame.
-  The flag takes an optional value determines how accurate the syncing is, generally speaking everything over 15 begins to be more inaccurate and everything below 6 is too accurate (and won't succeed).
-  When the syncing fails, the command is continued as if `--sync-start` wasn't provided for this episode.
-  Default is `7.5`.
+  Sometimes two video tracks are downloaded with `--merge` set to `sync` because the audio fingerprinting fails to identify matching audio parts (e.g. the opening).
+  To prevent this, you can use the `--sync-tolerance` flag to specify the difference by which two fingerprints are considered equal.
+  Default is `6`.
+
+- <span id="archive-sync-precision">Sync precision</span>
+
+  If you use `--merge` set to `sync` and the syncing doesn't seem accurate enough or takes too long, you can use the `--sync-precision` flag to specify the number of offset determination runs from which the final offset is calculated.
+  Default is `4`.
 
 - <span id="archive-language-tagging">Language tagging</span>
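For orientation, here is how the new options combine on the command line. This is an illustrative sketch, not taken from the commit: the binary name `crunchy-cli`, the chosen values and the placeholder URL are assumptions.

```shell
# merge by audio sync, with a slightly relaxed fingerprint tolerance and the default precision
crunchy-cli archive -m sync --sync-tolerance 8 --sync-precision 4 "https://www.crunchyroll.com/series/<series-id>"
```

Higher `--sync-tolerance` values let fingerprint frames match more easily (useful when the syncing fails), while higher `--sync-precision` values trade extra runtime for a more precise offset.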


@@ -24,14 +24,13 @@ derive_setters = "0.1"
 futures-util = { version = "0.3", features = ["io"] }
 fs2 = "0.4"
 http = "1.1"
-image = { version = "0.25", features = ["jpeg"], default-features = false }
-image_hasher = "2.0"
 indicatif = "0.17"
 lazy_static = "1.4"
 log = { version = "0.4", features = ["std"] }
 num_cpus = "1.16"
 regex = "1.10"
 reqwest = { version = "0.12", features = ["socks", "stream"] }
+rusty-chromaprint = "0.2"
 serde = "1.0"
 serde_json = "1.0"
 serde_plain = "1.0"


@@ -90,32 +90,31 @@ pub struct Archive {
     pub(crate) resolution: Resolution,
 
     #[arg(
-        help = "Sets the behavior of the stream merging. Valid behaviors are 'auto', 'audio' and 'video'"
+        help = "Sets the behavior of the stream merging. Valid behaviors are 'auto', 'sync', 'audio' and 'video'"
     )]
     #[arg(
        long_help = "Because of local restrictions (or other reasons) some episodes with different languages does not have the same length (e.g. when some scenes were cut out). \
        With this flag you can set the behavior when handling multiple language.
-       Valid options are 'audio' (stores one video and all other languages as audio only), 'video' (stores the video + audio for every language) and 'auto' (detects if videos differ in length: if so, behave like 'video' else like 'audio')"
+       Valid options are 'audio' (stores one video and all other languages as audio only), 'video' (stores the video + audio for every language), 'auto' (detects if videos differ in length: if so, behave like 'video' else like 'audio') and 'sync' (detects if videos differ in length: if so, tries to find the offset of matching audio parts and removes it from the beginning, otherwise it behaves like 'audio')"
     )]
     #[arg(short, long, default_value = "auto")]
     #[arg(value_parser = MergeBehavior::parse)]
     pub(crate) merge: MergeBehavior,
 
     #[arg(
-        help = "If the merge behavior is 'auto', only download multiple video tracks if their length difference is higher than the given milliseconds"
+        help = "If the merge behavior is 'auto' or 'sync', consider videos to be of equal lengths if the difference in length is smaller than the specified milliseconds"
     )]
     #[arg(long, default_value_t = 200)]
     pub(crate) merge_time_tolerance: u32,
 
-    #[arg(help = "Tries to sync the timing of all downloaded audios to match one video")]
     #[arg(
-        long_help = "Tries to sync the timing of all downloaded audios to match one video. \
-        This is done by downloading the first few segments/frames of all video tracks that differ in length and comparing them frame by frame. \
-        The value of this flag determines how accurate the syncing is, generally speaking everything over 15 begins to be more inaccurate and everything below 6 is too accurate (and won't succeed). \
-        If you want to provide a custom value to this flag, you have to set it with an equals (e.g. `--sync-start=10` instead of `--sync-start 10`). \
-        When the syncing fails, the command is continued as if `--sync-start` wasn't provided for this episode
-        "
+        help = "If the merge behavior is 'sync', specify the difference by which two fingerprints are considered equal, higher values can help when the algorithm fails"
     )]
-    #[arg(long, require_equals = true, num_args = 0..=1, default_missing_value = "7.5")]
-    pub(crate) sync_start: Option<f64>,
+    #[arg(long, default_value_t = 6)]
+    pub(crate) sync_tolerance: u32,
+    #[arg(
+        help = "If the merge behavior is 'sync', specify the amount of offset determination runs from which the final offset is calculated, higher values will increase the time required but lead to more precise offsets"
+    )]
+    #[arg(long, default_value_t = 4)]
+    pub(crate) sync_precision: u32,
 
     #[arg(
        help = "Specified which language tagging the audio and subtitle tracks and language specific format options should have. \
@@ -229,18 +228,10 @@ impl Execute for Archive {
         }
 
         if self.include_chapters
+            && !matches!(self.merge, MergeBehavior::Sync)
             && !matches!(self.merge, MergeBehavior::Audio)
-            && self.sync_start.is_none()
         {
-            bail!("`--include-chapters` can only be used if `--merge` is set to 'audio' or `--sync-start` is set")
-        }
-
-        if !matches!(self.merge, MergeBehavior::Auto) && self.sync_start.is_some() {
-            bail!("`--sync-start` can only be used if `--merge` is set to `auto`")
-        }
-
-        if self.sync_start.is_some() && self.ffmpeg_preset.is_none() {
-            warn!("Using `--sync-start` without `--ffmpeg-preset` might produce worse sync results than with `--ffmpeg-preset` set")
+            bail!("`--include-chapters` can only be used if `--merge` is set to 'audio' or 'sync'")
         }
 
         self.audio = all_locale_in_locales(self.audio.clone());
@@ -317,7 +308,14 @@ impl Execute for Archive {
             .audio_sort(Some(self.audio.clone()))
             .subtitle_sort(Some(self.subtitle.clone()))
             .no_closed_caption(self.no_closed_caption)
-            .sync_start_value(self.sync_start)
+            .sync_tolerance(match self.merge {
+                MergeBehavior::Sync => Some(self.sync_tolerance),
+                _ => None,
+            })
+            .sync_precision(match self.merge {
+                MergeBehavior::Sync => Some(self.sync_precision),
+                _ => None,
+            })
             .threads(self.threads)
             .audio_locale_output_map(
                 zip(self.audio.clone(), self.output_audio_locales.clone()).collect(),
@@ -560,7 +558,7 @@ async fn get_format(
                 },
             },
         }),
-        MergeBehavior::Auto => {
+        MergeBehavior::Auto | MergeBehavior::Sync => {
             let mut d_formats: Vec<(Duration, DownloadFormat)> = vec![];
 
             for (single_format, video, audio, subtitles) in format_pairs {


@@ -2,15 +2,13 @@ use crate::utils::ffmpeg::FFmpegPreset;
 use crate::utils::filter::real_dedup_vec;
 use crate::utils::fmt::format_time_delta;
 use crate::utils::log::progress;
-use crate::utils::os::{
-    cache_dir, is_special_file, temp_directory, temp_named_pipe, tempdir, tempfile,
-};
+use crate::utils::os::{cache_dir, is_special_file, temp_directory, temp_named_pipe, tempfile};
 use crate::utils::rate_limit::RateLimiterService;
+use crate::utils::sync::{sync_audios, SyncAudio};
 use anyhow::{bail, Result};
 use chrono::{NaiveTime, TimeDelta};
 use crunchyroll_rs::media::{SkipEvents, SkipEventsEvent, StreamData, StreamSegment, Subtitle};
 use crunchyroll_rs::Locale;
-use image_hasher::{Hasher, HasherConfig, ImageHash};
 use indicatif::{ProgressBar, ProgressDrawTarget, ProgressFinish, ProgressStyle};
 use log::{debug, warn, LevelFilter};
 use regex::Regex;
@@ -39,6 +37,7 @@ pub enum MergeBehavior {
     Video,
     Audio,
     Auto,
+    Sync,
 }
 
 impl MergeBehavior {
@@ -47,6 +46,7 @@ impl MergeBehavior {
             "video" => MergeBehavior::Video,
             "audio" => MergeBehavior::Audio,
             "auto" => MergeBehavior::Auto,
+            "sync" => MergeBehavior::Sync,
             _ => return Err(format!("'{}' is not a valid merge behavior", s)),
         })
     }
@@ -64,7 +64,8 @@ pub struct DownloadBuilder {
     force_hardsub: bool,
     download_fonts: bool,
     no_closed_caption: bool,
-    sync_start_value: Option<f64>,
+    sync_tolerance: Option<u32>,
+    sync_precision: Option<u32>,
     threads: usize,
     ffmpeg_threads: Option<usize>,
     audio_locale_output_map: HashMap<Locale, String>,
@@ -84,7 +85,8 @@ impl DownloadBuilder {
             force_hardsub: false,
             download_fonts: false,
             no_closed_caption: false,
-            sync_start_value: None,
+            sync_tolerance: None,
+            sync_precision: None,
             threads: num_cpus::get(),
             ffmpeg_threads: None,
             audio_locale_output_map: HashMap::new(),
@@ -106,7 +108,8 @@ impl DownloadBuilder {
             download_fonts: self.download_fonts,
             no_closed_caption: self.no_closed_caption,
-            sync_start_value: self.sync_start_value,
+            sync_tolerance: self.sync_tolerance,
+            sync_precision: self.sync_precision,
             download_threads: self.threads,
             ffmpeg_threads: self.ffmpeg_threads,
@@ -165,7 +168,8 @@ pub struct Downloader {
     download_fonts: bool,
     no_closed_caption: bool,
-    sync_start_value: Option<f64>,
+    sync_tolerance: Option<u32>,
+    sync_precision: Option<u32>,
     download_threads: usize,
     ffmpeg_threads: Option<usize>,
@@ -245,6 +249,7 @@ impl Downloader {
         let mut video_offset = None;
         let mut audio_offsets = HashMap::new();
         let mut subtitle_offsets = HashMap::new();
+        let mut raw_audios = vec![];
         let mut videos = vec![];
        let mut audios = vec![];
        let mut subtitles = vec![];
@@ -263,40 +268,33 @@ impl Downloader {
             .max()
             .unwrap();
 
-        if self.formats.len() > 1 && self.sync_start_value.is_some() {
-            let all_segments_count: Vec<usize> = self
-                .formats
-                .iter()
-                .map(|f| f.video.0.segments().len())
-                .collect();
-            let sync_segments = 11.max(
-                all_segments_count.iter().max().unwrap() - all_segments_count.iter().min().unwrap(),
-            );
-            let mut sync_vids = vec![];
-            for (i, format) in self.formats.iter().enumerate() {
+        // downloads all audios
+        for (i, format) in self.formats.iter().enumerate() {
+            for (stream_data, locale) in &format.audios {
                 let path = self
-                    .download_video(
-                        &format.video.0,
-                        format!("Downloading video #{} sync segments", i + 1),
-                        Some(sync_segments),
+                    .download_audio(
+                        stream_data,
+                        format!("{:<1$}", format!("Downloading {} audio", locale), fmt_space),
                     )
                     .await?;
-                sync_vids.push(SyncVideo {
+                raw_audios.push(SyncAudio {
+                    format_id: i,
                     path,
-                    length: len_from_segments(&format.video.0.segments()),
-                    available_frames: (len_from_segments(
-                        &format.video.0.segments()[0..sync_segments],
-                    )
-                    .num_milliseconds() as f64
-                        * format.video.0.fps().unwrap()
-                        / 1000.0) as u64,
-                    idx: i,
+                    locale: locale.clone(),
+                    sample_rate: stream_data.sampling_rate().unwrap(),
+                    video_idx: i,
                 })
             }
+        }
 
+        if self.formats.len() > 1 && self.sync_tolerance.is_some() {
             let _progress_handler =
                 progress!("Syncing video start times (this might take some time)");
-            let mut offsets = sync_videos(sync_vids, self.sync_start_value.unwrap())?;
+            let mut offsets = sync_audios(
+                &raw_audios,
+                self.sync_tolerance.unwrap(),
+                self.sync_precision.unwrap(),
+            )?;
             drop(_progress_handler);
 
             let mut offset_pre_checked = false;
@@ -307,19 +305,14 @@ impl Downloader {
                         .enumerate()
                         .map(|(i, f)| {
                             len_from_segments(&f.video.0.segments())
-                                - TimeDelta::milliseconds(
-                                    tmp_offsets
-                                        .get(&i)
-                                        .map(|o| (*o as f64 / f.video.0.fps().unwrap() * 1000.0) as i64)
-                                        .unwrap_or_default(),
-                                )
+                                - tmp_offsets.get(&i).copied().unwrap_or_default()
                         })
                         .collect();
                 let min = formats_with_offset.iter().min().unwrap();
                 let max = formats_with_offset.iter().max().unwrap();
                 if max.num_seconds() - min.num_seconds() > 15 {
-                    warn!("Found difference of >15 seconds after sync, skipping applying it");
+                    warn!("Found difference of >15 seconds after sync, so the application was skipped");
                     offsets = None;
                     offset_pre_checked = true
                 }
@@ -331,7 +324,7 @@ impl Downloader {
                 let mut audio_count: usize = 0;
                 let mut subtitle_count: usize = 0;
                 for (i, format) in self.formats.iter().enumerate() {
-                    let format_fps = format.video.0.fps().unwrap();
+                    let offset = offsets.get(&i).copied().unwrap_or_default();
                     let format_len = format
                         .video
                         .0
@@ -339,7 +332,7 @@ impl Downloader {
                         .iter()
                         .map(|s| s.length.as_millis())
                         .sum::<u128>() as u64
-                        - offsets.get(&i).map_or(0, |o| *o);
+                        - offset.num_milliseconds() as u64;
                     if format_len > root_format_length {
                         root_format_idx = i;
                         root_format_length = format_len;
@@ -347,23 +340,13 @@ impl Downloader {
                     for _ in &format.audios {
                         if let Some(offset) = &offsets.get(&i) {
-                            audio_offsets.insert(
-                                audio_count,
-                                TimeDelta::milliseconds(
-                                    (**offset as f64 / format_fps * 1000.0) as i64,
-                                ),
-                            );
+                            audio_offsets.insert(audio_count, **offset);
                         }
                         audio_count += 1
                     }
                     for _ in &format.subtitles {
                         if let Some(offset) = &offsets.get(&i) {
-                            subtitle_offsets.insert(
-                                subtitle_count,
-                                TimeDelta::milliseconds(
-                                    (**offset as f64 / format_fps * 1000.0) as i64,
-                                ),
-                            );
+                            subtitle_offsets.insert(subtitle_count, **offset);
                         }
                         subtitle_count += 1
                     }
@@ -390,20 +373,28 @@ impl Downloader {
                 root_format.subtitles.extend(subtitle_append);
                 self.formats = vec![root_format];
 
-                video_offset = offsets.get(&root_format_idx).map(|o| {
-                    TimeDelta::milliseconds(
-                        (*o as f64 / self.formats[0].video.0.fps().unwrap() * 1000.0) as i64,
-                    )
-                })
+                video_offset = offsets.get(&root_format_idx).copied();
+                for raw_audio in raw_audios.iter_mut() {
+                    raw_audio.video_idx = root_format_idx;
+                }
             } else {
                 for format in &mut self.formats {
                     format.metadata.skip_events = None
                 }
-            }
 
-            if !offset_pre_checked {
-                warn!("Couldn't find reliable sync positions")
+                if !offset_pre_checked {
+                    warn!("Couldn't find reliable sync positions")
+                }
             }
         }
 
+        // add audio metadata
+        for raw_audio in raw_audios {
+            audios.push(FFmpegAudioMeta {
+                path: raw_audio.path,
+                locale: raw_audio.locale,
+                start_time: audio_offsets.get(&raw_audio.format_id).copied(),
+                video_idx: raw_audio.video_idx,
+            })
+        }
+
         // downloads all videos
@@ -435,24 +426,6 @@ impl Downloader {
             })
         }
 
-        // downloads all audios
-        for (i, format) in self.formats.iter().enumerate() {
-            for (j, (stream_data, locale)) in format.audios.iter().enumerate() {
-                let path = self
-                    .download_audio(
-                        stream_data,
-                        format!("{:<1$}", format!("Downloading {} audio", locale), fmt_space),
-                    )
-                    .await?;
-                audios.push(FFmpegAudioMeta {
-                    path,
-                    locale: locale.clone(),
-                    start_time: audio_offsets.get(&j).cloned(),
-                    video_idx: i,
-                })
-            }
-        }
-
         for (i, format) in self.formats.iter().enumerate() {
             if format.subtitles.is_empty() {
                 continue;
@@ -1538,134 +1511,6 @@ async fn ffmpeg_progress<R: AsyncReadExt + Unpin>(
 
     Ok(())
 }
 
-struct SyncVideo {
-    path: TempPath,
-    length: TimeDelta,
-    available_frames: u64,
-    idx: usize,
-}
-
-fn sync_videos(mut sync_videos: Vec<SyncVideo>, value: f64) -> Result<Option<HashMap<usize, u64>>> {
-    let mut result = HashMap::new();
-
-    let hasher = HasherConfig::new().preproc_dct().to_hasher();
-    let start_frame = 300;
-
-    sync_videos.sort_by_key(|sv| sv.length);
-    let sync_base = sync_videos.remove(0);
-    let sync_hashes = extract_frame_hashes(&sync_base.path, start_frame, 50, &hasher)?;
-
-    for sync_video in sync_videos {
-        let mut highest_frame_match = f64::INFINITY;
-        let mut frame = start_frame;
-        let mut hashes = vec![];
-        loop {
-            if frame == sync_video.available_frames {
-                debug!(
-                    "Failed to sync videos, end of stream {} reached (highest frame match: {})",
-                    sync_video.idx + 1,
-                    highest_frame_match
-                );
-                return Ok(None);
-            }
-
-            hashes.drain(0..(hashes.len() as i32 - sync_hashes.len() as i32).max(0) as usize);
-            hashes.extend(extract_frame_hashes(
-                &sync_video.path,
-                frame,
-                300 - hashes.len() as u64,
-                &hasher,
-            )?);
-
-            let mut check_frame_windows_result: Vec<(usize, f64)> =
-                check_frame_windows(&sync_hashes, &hashes)
-                    .into_iter()
-                    .enumerate()
-                    .collect();
-            check_frame_windows_result.sort_by(|(_, a), (_, b)| a.partial_cmp(&b).unwrap());
-
-            if check_frame_windows_result[0].1 <= value {
-                result.insert(
-                    sync_video.idx,
-                    frame + check_frame_windows_result[0].0 as u64 - start_frame,
-                );
-                break;
-            } else if check_frame_windows_result[0].1 < highest_frame_match {
-                highest_frame_match = check_frame_windows_result[0].1
-            }
-
-            frame = (frame + 300 - sync_hashes.len() as u64).min(sync_video.available_frames)
-        }
-    }
-
-    Ok(Some(result))
-}
-
-fn extract_frame_hashes(
-    input_file: &Path,
-    start_frame: u64,
-    frame_count: u64,
-    hasher: &Hasher,
-) -> Result<Vec<ImageHash>> {
-    let frame_dir = tempdir(format!(
-        "{}_sync_frames",
-        input_file
-            .file_name()
-            .unwrap_or_default()
-            .to_string_lossy()
-            .trim_end_matches(
-                &input_file
-                    .file_stem()
-                    .unwrap_or_default()
-                    .to_string_lossy()
-                    .to_string()
-            )
-    ))?;
-
-    let extract_output = Command::new("ffmpeg")
-        .arg("-hide_banner")
-        .arg("-y")
-        .args(["-i", input_file.to_string_lossy().to_string().as_str()])
-        .args([
-            "-vf",
-            format!(
-                r#"select=between(n\,{}\,{}),setpts=PTS-STARTPTS,scale=-1:240"#,
-                start_frame,
-                start_frame + frame_count
-            )
-            .as_str(),
-        ])
-        .args(["-vframes", frame_count.to_string().as_str()])
-        .arg(format!("{}/%03d.jpg", frame_dir.path().to_string_lossy()))
-        .output()?;
-    if !extract_output.status.success() {
-        bail!(
-            "{}",
-            String::from_utf8_lossy(extract_output.stderr.as_slice())
-        )
-    }
-
-    let mut hashes = vec![];
-    for file in frame_dir.path().read_dir()? {
-        let file = file?;
-        let img = image::open(file.path())?;
-        hashes.push(hasher.hash_image(&img))
-    }
-
-    Ok(hashes)
-}
-
-fn check_frame_windows(base_hashes: &[ImageHash], check_hashes: &[ImageHash]) -> Vec<f64> {
-    let mut results = vec![];
-    for i in 0..(check_hashes.len() - base_hashes.len()) {
-        let check_window = &check_hashes[i..(base_hashes.len() + i)];
-        let sum = std::iter::zip(base_hashes, check_window)
-            .map(|(a, b)| a.dist(b))
-            .sum::<u32>();
-        results.push(sum as f64 / check_window.len() as f64);
-    }
-    results
-}
-
 fn len_from_segments(segments: &[StreamSegment]) -> TimeDelta {
     TimeDelta::milliseconds(segments.iter().map(|s| s.length.as_millis()).sum::<u128>() as i64)
 }
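The `sync_precision` value wired up above feeds the precision loop in the new sync module (shown further below), which collects one candidate offset per run and averages them at the end. A minimal standalone sketch of that averaging step; the helper name is made up for illustration and is not part of the commit:

```rust
use chrono::TimeDelta;

// Average per-run offsets the way `sync_audios` does: sum the millisecond
// values and divide by the number of runs that actually produced an offset.
fn average_offset(run_offsets: &[TimeDelta]) -> Option<TimeDelta> {
    if run_offsets.is_empty() {
        return None;
    }
    let sum: i64 = run_offsets.iter().map(|o| o.num_milliseconds()).sum();
    Some(TimeDelta::milliseconds(sum / run_offsets.len() as i64))
}
```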


@@ -544,7 +544,7 @@ impl Format {
                 path.set_file_name(format!("{}.{}", &name[..(255 - ext.len() - 1)], ext))
             }
         }
-        path.into_iter()
+        path.iter()
             .map(|s| {
                 if s.len() > 255 {
                     s.to_string_lossy()[..255].to_string()


@@ -11,4 +11,5 @@ pub mod log;
 pub mod os;
 pub mod parse;
 pub mod rate_limit;
+pub mod sync;
 pub mod video;


@@ -7,7 +7,7 @@ use std::pin::Pin;
 use std::process::{Command, Stdio};
 use std::task::{Context, Poll};
 use std::{env, fs, io};
-use tempfile::{Builder, NamedTempFile, TempDir, TempPath};
+use tempfile::{Builder, NamedTempFile, TempPath};
 use tokio::io::{AsyncRead, ReadBuf};
 
 pub fn has_ffmpeg() -> bool {
@@ -46,22 +46,6 @@ pub fn tempfile<S: AsRef<str>>(suffix: S) -> io::Result<NamedTempFile> {
     Ok(tempfile)
 }
 
-/// Any tempdir should be created with this function. The prefix and directory of every directory
-/// created with this function stays the same which is helpful to query all existing tempdirs and
-/// e.g. remove them in a case of ctrl-c. Having one function also good to prevent mistakes like
-/// setting the wrong prefix if done manually.
-pub fn tempdir<S: AsRef<str>>(suffix: S) -> io::Result<TempDir> {
-    let tempdir = Builder::default()
-        .prefix(".crunchy-cli_")
-        .suffix(suffix.as_ref())
-        .tempdir_in(temp_directory())?;
-    debug!(
-        "Created temporary directory: {}",
-        tempdir.path().to_string_lossy()
-    );
-    Ok(tempdir)
-}
-
 pub fn cache_dir<S: AsRef<str>>(name: S) -> io::Result<PathBuf> {
     let cache_dir = temp_directory().join(format!(".crunchy-cli_{}_cache", name.as_ref()));
     fs::create_dir_all(&cache_dir)?;


@@ -0,0 +1,432 @@
use std::io::Read;
use std::process::Stdio;
use std::{
cmp,
collections::{HashMap, HashSet},
mem,
ops::Not,
path::Path,
process::Command,
};
use chrono::TimeDelta;
use crunchyroll_rs::Locale;
use log::debug;
use tempfile::TempPath;
use anyhow::{bail, Result};
use rusty_chromaprint::{Configuration, Fingerprinter};
use super::fmt::format_time_delta;
pub struct SyncAudio {
pub format_id: usize,
pub path: TempPath,
pub locale: Locale,
pub sample_rate: u32,
pub video_idx: usize,
}
#[derive(Debug, Clone, Copy)]
struct TimeRange {
start: f64,
end: f64,
}
pub fn sync_audios(
available_audios: &Vec<SyncAudio>,
sync_tolerance: u32,
sync_precision: u32,
) -> Result<Option<HashMap<usize, TimeDelta>>> {
let mut result: HashMap<usize, TimeDelta> = HashMap::new();
let mut sync_audios = vec![];
let mut chromaprints = HashMap::new();
let mut formats = HashSet::new();
for audio in available_audios {
if formats.contains(&audio.format_id) {
continue;
}
formats.insert(audio.format_id);
sync_audios.push((audio.format_id, &audio.path, audio.sample_rate));
chromaprints.insert(
audio.format_id,
generate_chromaprint(
&audio.path,
audio.sample_rate,
&TimeDelta::zero(),
&TimeDelta::zero(),
&TimeDelta::zero(),
)?,
);
}
sync_audios.sort_by_key(|sync_audio| chromaprints.get(&sync_audio.0).unwrap().len());
let base_audio = sync_audios.remove(0);
let mut start = f64::MAX;
let mut end = f64::MIN;
let mut initial_offsets = HashMap::new();
for audio in &sync_audios {
debug!(
"Initial comparison of format {} to {}",
audio.0, &base_audio.0
);
let (lhs_ranges, rhs_ranges) = compare_chromaprints(
chromaprints.get(&base_audio.0).unwrap(),
chromaprints.get(&audio.0).unwrap(),
sync_tolerance,
);
if lhs_ranges.is_empty() || rhs_ranges.is_empty() {
bail!(
"Failed to sync videos, couldn't find matching audio parts between format {} and {}",
base_audio.0 + 1,
audio.0 + 1
);
}
let lhs_range = lhs_ranges[0];
let rhs_range = rhs_ranges[0];
start = start.min(lhs_range.start);
end = end.max(lhs_range.end);
start = start.min(rhs_range.start);
end = end.max(rhs_range.end);
let offset = TimeDelta::milliseconds(((rhs_range.start - lhs_range.start) * 1000.0) as i64);
initial_offsets.insert(audio.0, TimeDelta::zero().checked_sub(&offset).unwrap());
debug!(
"Found initial offset of {}ms ({} - {} {}s) ({} - {} {}s) for format {} to {}",
offset.num_milliseconds(),
lhs_range.start,
lhs_range.end,
lhs_range.end - lhs_range.start,
rhs_range.start,
rhs_range.end,
rhs_range.end - rhs_range.start,
audio.0,
base_audio.0
);
}
debug!(
"Found matching audio parts at {} - {}, narrowing search",
start, end
);
let start = TimeDelta::milliseconds((start * 1000.0) as i64 - 20000);
let end = TimeDelta::milliseconds((end * 1000.0) as i64 + 20000);
for sync_audio in &sync_audios {
let chromaprint = generate_chromaprint(
sync_audio.1,
sync_audio.2,
&start,
&end,
initial_offsets.get(&sync_audio.0).unwrap(),
)?;
chromaprints.insert(sync_audio.0, chromaprint);
}
let mut runs: HashMap<usize, i64> = HashMap::new();
let iterator_range_limits: i64 = 2 ^ sync_precision as i64;
for i in -iterator_range_limits..=iterator_range_limits {
let base_offset = TimeDelta::milliseconds(
((0.128 / iterator_range_limits as f64 * i as f64) * 1000.0) as i64,
);
chromaprints.insert(
base_audio.0,
generate_chromaprint(base_audio.1, base_audio.2, &start, &end, &base_offset)?,
);
for audio in &sync_audios {
let initial_offset = initial_offsets.get(&audio.0).copied().unwrap();
let offset = find_offset(
(&base_audio.0, chromaprints.get(&base_audio.0).unwrap()),
&base_offset,
(&audio.0, chromaprints.get(&audio.0).unwrap()),
&initial_offset,
&start,
sync_tolerance,
);
if offset.is_none() {
continue;
}
let offset = offset.unwrap();
result.insert(
audio.0,
result
.get(&audio.0)
.copied()
.unwrap_or_default()
.checked_add(&offset)
.unwrap(),
);
runs.insert(audio.0, runs.get(&audio.0).copied().unwrap_or_default() + 1);
}
}
let mut result: HashMap<usize, TimeDelta> = result
.iter()
.map(|(format_id, offset)| {
(
*format_id,
TimeDelta::milliseconds(
offset.num_milliseconds() / runs.get(format_id).copied().unwrap(),
),
)
})
.collect();
result.insert(base_audio.0, TimeDelta::milliseconds(0));
Ok(Some(result))
}
fn find_offset(
lhs: (&usize, &Vec<u32>),
lhs_shift: &TimeDelta,
rhs: (&usize, &Vec<u32>),
rhs_shift: &TimeDelta,
start: &TimeDelta,
sync_tolerance: u32,
) -> Option<TimeDelta> {
let (lhs_ranges, rhs_ranges) = compare_chromaprints(lhs.1, rhs.1, sync_tolerance);
if lhs_ranges.is_empty() || rhs_ranges.is_empty() {
return None;
}
let lhs_range = lhs_ranges[0];
let rhs_range = rhs_ranges[0];
let offset = rhs_range.end - lhs_range.end;
let offset = TimeDelta::milliseconds((offset * 1000.0) as i64)
.checked_add(lhs_shift)?
.checked_sub(rhs_shift)?;
debug!(
"Found offset of {}ms ({} - {} {}s) ({} - {} {}s) for format {} to {}",
offset.num_milliseconds(),
lhs_range.start + start.num_milliseconds() as f64 / 1000.0,
lhs_range.end + start.num_milliseconds() as f64 / 1000.0,
lhs_range.end - lhs_range.start,
rhs_range.start + start.num_milliseconds() as f64 / 1000.0,
rhs_range.end + start.num_milliseconds() as f64 / 1000.0,
rhs_range.end - rhs_range.start,
rhs.0,
lhs.0
);
Some(offset)
}
fn generate_chromaprint(
input_file: &Path,
sample_rate: u32,
start: &TimeDelta,
end: &TimeDelta,
offset: &TimeDelta,
) -> Result<Vec<u32>> {
let mut ss_argument: &TimeDelta = &start.checked_sub(offset).unwrap();
let mut offset_argument = &TimeDelta::zero();
if *offset < TimeDelta::zero() {
ss_argument = start;
offset_argument = offset;
};
let mut printer = Fingerprinter::new(&Configuration::preset_test1());
printer.start(sample_rate, 2)?;
let mut command = Command::new("ffmpeg");
command
.arg("-hide_banner")
.arg("-y")
.args(["-ss", format_time_delta(ss_argument).as_str()]);
if end.is_zero().not() {
command.args(["-to", format_time_delta(end).as_str()]);
}
command
.args(["-itsoffset", format_time_delta(offset_argument).as_str()])
.args(["-i", input_file.to_string_lossy().to_string().as_str()])
.args(["-ac", "2"])
.args([
"-f",
if cfg!(target_endian = "big") {
"s16be"
} else {
"s16le"
},
])
.arg("-");
let mut handle = command
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()?;
// the stdout is read in chunks because keeping all the raw audio data in memory would take up
// a significant amount of space
let mut stdout = handle.stdout.take().unwrap();
let mut buf: [u8; 128_000] = [0; 128_000];
while handle.try_wait()?.is_none() {
loop {
let read_bytes = stdout.read(&mut buf)?;
if read_bytes == 0 {
break;
}
let data: [i16; 64_000] = unsafe { mem::transmute(buf) };
printer.consume(&data[0..(read_bytes / 2)])
}
}
if !handle.wait()?.success() {
bail!("{}", std::io::read_to_string(handle.stderr.unwrap())?)
}
printer.finish();
return Ok(printer.fingerprint().into());
}
fn compare_chromaprints(
lhs_chromaprint: &Vec<u32>,
rhs_chromaprint: &Vec<u32>,
sync_tolerance: u32,
) -> (Vec<TimeRange>, Vec<TimeRange>) {
let lhs_inverse_index = create_inverse_index(lhs_chromaprint);
let rhs_inverse_index = create_inverse_index(rhs_chromaprint);
let mut possible_shifts = HashSet::new();
for lhs_pair in lhs_inverse_index {
let original_point = lhs_pair.0;
for i in -2..=2 {
let modified_point = (original_point as i32 + i) as u32;
if rhs_inverse_index.contains_key(&modified_point) {
let rhs_index = rhs_inverse_index.get(&modified_point).copied().unwrap();
possible_shifts.insert(rhs_index as i32 - lhs_pair.1 as i32);
}
}
}
let mut all_lhs_time_ranges = vec![];
let mut all_rhs_time_ranges = vec![];
for shift_amount in possible_shifts {
let time_range_pair = find_time_ranges(
lhs_chromaprint,
rhs_chromaprint,
shift_amount,
sync_tolerance,
);
if time_range_pair.is_none() {
continue;
}
let (mut lhs_time_ranges, mut rhs_time_ranges) = time_range_pair.unwrap();
let mut lhs_time_ranges: Vec<TimeRange> = lhs_time_ranges
.drain(..)
.filter(|time_range| {
(20.0 < (time_range.end - time_range.start))
&& ((time_range.end - time_range.start) < 180.0)
&& time_range.end > 0.0
})
.collect();
lhs_time_ranges.sort_by(|a, b| (b.end - b.start).total_cmp(&(a.end - a.start)));
let mut rhs_time_ranges: Vec<TimeRange> = rhs_time_ranges
.drain(..)
.filter(|time_range| {
(20.0 < (time_range.end - time_range.start))
&& ((time_range.end - time_range.start) < 180.0)
&& time_range.end > 0.0
})
.collect();
rhs_time_ranges.sort_by(|a, b| (b.end - b.start).total_cmp(&(a.end - a.start)));
if lhs_time_ranges.is_empty() || rhs_time_ranges.is_empty() {
continue;
}
all_lhs_time_ranges.push(lhs_time_ranges[0]);
all_rhs_time_ranges.push(rhs_time_ranges[0]);
}
all_lhs_time_ranges.sort_by(|a, b| (a.end - a.start).total_cmp(&(b.end - b.start)));
all_lhs_time_ranges.reverse();
all_rhs_time_ranges.sort_by(|a, b| (a.end - a.start).total_cmp(&(b.end - b.start)));
all_rhs_time_ranges.reverse();
(all_lhs_time_ranges, all_rhs_time_ranges)
}
fn create_inverse_index(chromaprint: &Vec<u32>) -> HashMap<u32, usize> {
let mut inverse_index = HashMap::with_capacity(chromaprint.capacity());
for (i, fingerprint) in chromaprint.iter().enumerate().take(chromaprint.capacity()) {
inverse_index.insert(*fingerprint, i);
}
inverse_index
}
fn find_time_ranges(
lhs_chromaprint: &[u32],
rhs_chromaprint: &[u32],
shift_amount: i32,
sync_tolerance: u32,
) -> Option<(Vec<TimeRange>, Vec<TimeRange>)> {
let mut lhs_shift: i32 = 0;
let mut rhs_shift: i32 = 0;
if shift_amount < 0 {
lhs_shift -= shift_amount;
} else {
rhs_shift += shift_amount;
}
let mut lhs_matching_timestamps = vec![];
let mut rhs_matching_timestamps = vec![];
let upper_limit =
cmp::min(lhs_chromaprint.len(), rhs_chromaprint.len()) as i32 - shift_amount.abs();
for i in 0..upper_limit {
let lhs_position = i + lhs_shift;
let rhs_position = i + rhs_shift;
let difference = (lhs_chromaprint[lhs_position as usize]
^ rhs_chromaprint[rhs_position as usize])
.count_ones();
if difference > sync_tolerance {
continue;
}
lhs_matching_timestamps.push(lhs_position as f64 * 0.128);
rhs_matching_timestamps.push(rhs_position as f64 * 0.128);
}
lhs_matching_timestamps.push(f64::MAX);
rhs_matching_timestamps.push(f64::MAX);
let lhs_time_ranges = timestamps_to_ranges(lhs_matching_timestamps);
lhs_time_ranges.as_ref()?;
let lhs_time_ranges = lhs_time_ranges.unwrap();
let rhs_time_ranges = timestamps_to_ranges(rhs_matching_timestamps).unwrap();
Some((lhs_time_ranges, rhs_time_ranges))
}
fn timestamps_to_ranges(mut timestamps: Vec<f64>) -> Option<Vec<TimeRange>> {
if timestamps.is_empty() {
return None;
}
timestamps.sort_by(|a, b| a.total_cmp(b));
let mut time_ranges = vec![];
let mut current_range = TimeRange {
start: timestamps[0],
end: timestamps[0],
};
for i in 0..timestamps.len() - 1 {
let current = timestamps[i];
let next = timestamps[i + 1];
if next - current <= 1.0 {
current_range.end = next;
continue;
}
time_ranges.push(current_range);
current_range.start = next;
current_range.end = next;
}
if !time_ranges.is_empty() {
Some(time_ranges)
} else {
None
}
}
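To make the `sync_tolerance` semantics above concrete: two fingerprint items are treated as matching when their XOR has at most `sync_tolerance` bits set, which is the check `find_time_ranges` performs. A small self-contained sketch (the helper name is illustrative, not part of the commit):

```rust
fn items_match(lhs: u32, rhs: u32, sync_tolerance: u32) -> bool {
    // count the differing bits between the two fingerprint items
    (lhs ^ rhs).count_ones() <= sync_tolerance
}

fn main() {
    // one differing bit: matches with the default tolerance of 6
    assert!(items_match(0b1111, 0b0111, 6));
    // eight differing bits: no longer considered equal
    assert!(!items_match(0xFF, 0x00, 6));
}
```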