Add options to specify audio & subtitle locales as IETF language tags and add a --language-tagging flag to archive and download to control the language tagging of the output file (#330)

This commit is contained in:
bytedream 2024-03-10 04:04:58 +01:00
parent 3f33db6728
commit f1d266c940
5 changed files with 260 additions and 15 deletions

View file

@ -6,7 +6,7 @@ use crate::utils::download::{
use crate::utils::ffmpeg::FFmpegPreset;
use crate::utils::filter::Filter;
use crate::utils::format::{Format, SingleFormat};
use crate::utils::locale::all_locale_in_locales;
use crate::utils::locale::{all_locale_in_locales, resolve_locales, LanguageTagging};
use crate::utils::log::progress;
use crate::utils::os::{free_file, has_ffmpeg, is_special_file};
use crate::utils::parse::parse_url;
@ -20,6 +20,7 @@ use crunchyroll_rs::Locale;
use log::{debug, warn};
use regex::Regex;
use std::fmt::{Display, Formatter};
use std::iter::zip;
use std::ops::Sub;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
@ -31,15 +32,19 @@ pub struct Archive {
#[arg(help = format!("Audio languages. Can be used multiple times. \
Available languages are: {}", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
#[arg(long_help = format!("Audio languages. Can be used multiple times. \
Available languages are:\n {}", Locale::all().into_iter().map(|l| format!("{:<6} {}", l.to_string(), l.to_human_readable())).collect::<Vec<String>>().join("\n ")))]
Available languages are:\n {}\nIETF tagged language codes for the shown available locales can be used too", Locale::all().into_iter().map(|l| format!("{:<6} {}", l.to_string(), l.to_human_readable())).collect::<Vec<String>>().join("\n ")))]
#[arg(short, long, default_values_t = vec![Locale::ja_JP, crate::utils::locale::system_locale()])]
pub(crate) audio: Vec<Locale>,
#[arg(skip)]
output_audio_locales: Vec<String>,
#[arg(help = format!("Subtitle languages. Can be used multiple times. \
Available languages are: {}", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
#[arg(long_help = format!("Subtitle languages. Can be used multiple times. \
Available languages are: {}", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
Available languages are: {}\nIETF tagged language codes for the shown available locales can be used too", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
#[arg(short, long, default_values_t = Locale::all())]
pub(crate) subtitle: Vec<Locale>,
#[arg(skip)]
output_subtitle_locales: Vec<String>,
#[arg(help = "Name of the output file")]
#[arg(long_help = "Name of the output file. \
@ -95,12 +100,22 @@ pub struct Archive {
#[arg(short, long, default_value = "auto")]
#[arg(value_parser = MergeBehavior::parse)]
pub(crate) merge: MergeBehavior,
#[arg(
help = "If the merge behavior is 'auto', only download multiple video tracks if their length difference is higher than the given milliseconds"
)]
#[arg(long, default_value_t = 200)]
pub(crate) merge_auto_tolerance: u32,
#[arg(
long,
help = "Specified which language tagging the audio and subtitle tracks and language specific format options should have. \
Valid options are: 'default' (how Crunchyroll uses it internally), 'ietf' (according to the IETF standard)"
)]
#[arg(
long_help = "Specified which language tagging the audio and subtitle tracks and language specific format options should have. \
Valid options are: 'default' (how Crunchyroll uses it internally), 'ietf' (according to the IETF standard; you might run in issues as there are multiple locales which resolve to the same IETF language code, e.g. 'es-LA' and 'es-ES' are both resolving to 'es')"
)]
#[arg(value_parser = LanguageTagging::parse)]
pub(crate) language_tagging: Option<LanguageTagging>,
#[arg(help = format!("Presets for converting the video to a specific coding format. \
Available presets: \n {}", FFmpegPreset::available_matches_human_readable().join("\n ")))]
@ -217,6 +232,26 @@ impl Execute for Archive {
self.audio = all_locale_in_locales(self.audio.clone());
self.subtitle = all_locale_in_locales(self.subtitle.clone());
if let Some(language_tagging) = &self.language_tagging {
self.audio = resolve_locales(&self.audio);
self.subtitle = resolve_locales(&self.subtitle);
self.output_audio_locales = language_tagging.convert_locales(&self.audio);
self.output_subtitle_locales = language_tagging.convert_locales(&self.subtitle);
} else {
self.output_audio_locales = self
.audio
.clone()
.into_iter()
.map(|l| l.to_string())
.collect();
self.output_subtitle_locales = self
.subtitle
.clone()
.into_iter()
.map(|l| l.to_string())
.collect();
}
Ok(())
}
@ -259,7 +294,13 @@ impl Execute for Archive {
.audio_sort(Some(self.audio.clone()))
.subtitle_sort(Some(self.subtitle.clone()))
.no_closed_caption(self.no_closed_caption)
.threads(self.threads);
.threads(self.threads)
.audio_locale_output_map(
zip(self.audio.clone(), self.output_audio_locales.clone()).collect(),
)
.subtitle_locale_output_map(
zip(self.subtitle.clone(), self.output_subtitle_locales.clone()).collect(),
);
for single_formats in single_format_collection.into_iter() {
let (download_formats, mut format) = get_format(&self, &single_formats).await?;
@ -275,9 +316,14 @@ impl Execute for Archive {
.as_ref()
.map_or((&self.output).into(), |so| so.into()),
self.universal_output,
self.language_tagging.as_ref(),
)
} else {
format.format_path((&self.output).into(), self.universal_output)
format.format_path(
(&self.output).into(),
self.universal_output,
self.language_tagging.as_ref(),
)
};
let (mut path, changed) = free_file(formatted_path.clone());

View file

@ -4,6 +4,7 @@ use crate::utils::download::{DownloadBuilder, DownloadFormat, DownloadFormatMeta
use crate::utils::ffmpeg::{FFmpegPreset, SOFTSUB_CONTAINERS};
use crate::utils::filter::Filter;
use crate::utils::format::{Format, SingleFormat};
use crate::utils::locale::{resolve_locales, LanguageTagging};
use crate::utils::log::progress;
use crate::utils::os::{free_file, has_ffmpeg, is_special_file};
use crate::utils::parse::parse_url;
@ -14,6 +15,7 @@ use anyhow::Result;
use crunchyroll_rs::media::Resolution;
use crunchyroll_rs::Locale;
use log::{debug, warn};
use std::collections::HashMap;
use std::path::Path;
#[derive(Clone, Debug, clap::Parser)]
@ -23,14 +25,18 @@ pub struct Download {
#[arg(help = format!("Audio language. Can only be used if the provided url(s) point to a series. \
Available languages are: {}", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
#[arg(long_help = format!("Audio language. Can only be used if the provided url(s) point to a series. \
Available languages are:\n {}", Locale::all().into_iter().map(|l| format!("{:<6} {}", l.to_string(), l.to_human_readable())).collect::<Vec<String>>().join("\n ")))]
Available languages are:\n {}\nIETF tagged language codes for the shown available locales can be used too", Locale::all().into_iter().map(|l| format!("{:<6} {}", l.to_string(), l.to_human_readable())).collect::<Vec<String>>().join("\n ")))]
#[arg(short, long, default_value_t = crate::utils::locale::system_locale())]
pub(crate) audio: Locale,
#[arg(skip)]
output_audio_locale: String,
#[arg(help = format!("Subtitle language. Available languages are: {}", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
#[arg(long_help = format!("Subtitle language. If set, the subtitle will be burned into the video and cannot be disabled. \
Available languages are: {}", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
Available languages are: {}\nIETF tagged language codes for the shown available locales can be used too", Locale::all().into_iter().map(|l| l.to_string()).collect::<Vec<String>>().join(", ")))]
#[arg(short, long)]
pub(crate) subtitle: Option<Locale>,
#[arg(skip)]
output_subtitle_locale: String,
#[arg(help = "Name of the output file")]
#[arg(long_help = "Name of the output file. \
@ -75,6 +81,18 @@ pub struct Download {
#[arg(value_parser = crate::utils::clap::clap_parse_resolution)]
pub(crate) resolution: Resolution,
#[arg(
long,
help = "Specified which language tagging the audio and subtitle tracks and language specific format options should have. \
Valid options are: 'default' (how Crunchyroll uses it internally), 'ietf' (according to the IETF standard)"
)]
#[arg(
long_help = "Specified which language tagging the audio and subtitle tracks and language specific format options should have. \
Valid options are: 'default' (how Crunchyroll uses it internally), 'ietf' (according to the IETF standard; you might run in issues as there are multiple locales which resolve to the same IETF language code, e.g. 'es-LA' and 'es-ES' are both resolving to 'es')"
)]
#[arg(value_parser = LanguageTagging::parse)]
pub(crate) language_tagging: Option<LanguageTagging>,
#[arg(help = format!("Presets for converting the video to a specific coding format. \
Available presets: \n {}", FFmpegPreset::available_matches_human_readable().join("\n ")))]
#[arg(long_help = format!("Presets for converting the video to a specific coding format. \
@ -178,6 +196,27 @@ impl Execute for Download {
warn!("The '{{resolution}}' format option is deprecated and will be removed in a future version. Please use '{{width}}' and '{{height}}' instead")
}
if let Some(language_tagging) = &self.language_tagging {
self.audio = resolve_locales(&[self.audio.clone()]).remove(0);
self.subtitle = self
.subtitle
.as_ref()
.map(|s| resolve_locales(&[s.clone()]).remove(0));
self.output_audio_locale = language_tagging.for_locale(&self.audio);
self.output_subtitle_locale = self
.subtitle
.as_ref()
.map(|s| language_tagging.for_locale(s))
.unwrap_or_default()
} else {
self.output_audio_locale = self.audio.to_string();
self.output_subtitle_locale = self
.subtitle
.as_ref()
.map(|s| s.to_string())
.unwrap_or_default();
}
Ok(())
}
@ -240,7 +279,16 @@ impl Execute for Download {
})
.ffmpeg_preset(self.ffmpeg_preset.clone().unwrap_or_default())
.ffmpeg_threads(self.ffmpeg_threads)
.threads(self.threads);
.threads(self.threads)
.audio_locale_output_map(HashMap::from([(
self.audio.clone(),
self.output_audio_locale.clone(),
)]))
.subtitle_locale_output_map(
self.subtitle.as_ref().map_or(HashMap::new(), |s| {
HashMap::from([(s.clone(), self.output_subtitle_locale.clone())])
}),
);
for mut single_formats in single_format_collection.into_iter() {
// the vec contains always only one item
@ -268,9 +316,14 @@ impl Execute for Download {
.as_ref()
.map_or((&self.output).into(), |so| so.into()),
self.universal_output,
self.language_tagging.as_ref(),
)
} else {
format.format_path((&self.output).into(), self.universal_output)
format.format_path(
(&self.output).into(),
self.universal_output,
self.language_tagging.as_ref(),
)
};
let (path, changed) = free_file(formatted_path.clone());

View file

@ -12,7 +12,7 @@ use regex::Regex;
use reqwest::Client;
use std::borrow::Borrow;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
@ -61,6 +61,8 @@ pub struct DownloadBuilder {
no_closed_caption: bool,
threads: usize,
ffmpeg_threads: Option<usize>,
audio_locale_output_map: HashMap<Locale, String>,
subtitle_locale_output_map: HashMap<Locale, String>,
}
impl DownloadBuilder {
@ -78,6 +80,8 @@ impl DownloadBuilder {
no_closed_caption: false,
threads: num_cpus::get(),
ffmpeg_threads: None,
audio_locale_output_map: HashMap::new(),
subtitle_locale_output_map: HashMap::new(),
}
}
@ -99,6 +103,9 @@ impl DownloadBuilder {
ffmpeg_threads: self.ffmpeg_threads,
formats: vec![],
audio_locale_output_map: self.audio_locale_output_map,
subtitle_locale_output_map: self.subtitle_locale_output_map,
}
}
}
@ -138,6 +145,9 @@ pub struct Downloader {
ffmpeg_threads: Option<usize>,
formats: Vec<DownloadFormat>,
audio_locale_output_map: HashMap<Locale, String>,
subtitle_locale_output_map: HashMap<Locale, String>,
}
impl Downloader {
@ -426,7 +436,12 @@ impl Downloader {
maps.extend(["-map".to_string(), (i + videos.len()).to_string()]);
metadata.extend([
format!("-metadata:s:a:{}", i),
format!("language={}", meta.language),
format!(
"language={}",
self.audio_locale_output_map
.get(&meta.language)
.unwrap_or(&meta.language.to_string())
),
]);
metadata.extend([
format!("-metadata:s:a:{}", i),
@ -457,7 +472,12 @@ impl Downloader {
]);
metadata.extend([
format!("-metadata:s:s:{}", i),
format!("language={}", meta.language),
format!(
"language={}",
self.subtitle_locale_output_map
.get(&meta.language)
.unwrap_or(&meta.language.to_string())
),
]);
metadata.extend([
format!("-metadata:s:s:{}", i),

View file

@ -1,4 +1,5 @@
use crate::utils::filter::real_dedup_vec;
use crate::utils::locale::LanguageTagging;
use crate::utils::log::tab_info;
use crate::utils::os::{is_special_file, sanitize};
use anyhow::Result;
@ -417,7 +418,12 @@ impl Format {
}
/// Formats the given string if it has specific pattern in it. It also sanitizes the filename.
pub fn format_path(&self, path: PathBuf, universal: bool) -> PathBuf {
pub fn format_path(
&self,
path: PathBuf,
universal: bool,
language_tagging: Option<&LanguageTagging>,
) -> PathBuf {
let path = path
.to_string_lossy()
.to_string()
@ -427,7 +433,7 @@ impl Format {
&sanitize(
self.locales
.iter()
.map(|(a, _)| a.to_string())
.map(|(a, _)| language_tagging.map_or(a.to_string(), |t| t.for_locale(a)))
.collect::<Vec<String>>()
.join(
&env::var("CRUNCHY_CLI_FORMAT_DELIMITER")

View file

@ -1,4 +1,124 @@
use crunchyroll_rs::Locale;
use log::warn;
/// Selects how language codes are written to the output (track metadata and
/// language specific format options).
#[derive(Clone, Debug, PartialEq, Eq)]
// `IETF` is kept fully upper-case because it is the established spelling of the
// acronym and the variant name is part of the crate-internal API.
#[allow(clippy::upper_case_acronyms)]
pub enum LanguageTagging {
    /// Use the locale codes the way Crunchyroll uses them internally.
    Default,
    /// Use the IETF language code the locale belongs to.
    IETF,
}
impl LanguageTagging {
    /// Parses the command line value of the language tagging option.
    ///
    /// The comparison is case-insensitive. Accepted values are `default` and `ietf`.
    ///
    /// # Errors
    ///
    /// Returns a human readable message if `s` is neither of the accepted values.
    pub fn parse(s: &str) -> Result<Self, String> {
        match s.to_lowercase().as_str() {
            "default" => Ok(Self::Default),
            "ietf" => Ok(Self::IETF),
            _ => Err(format!("'{}' is not a valid language tagging", s)),
        }
    }

    /// Converts every locale in `locales` to the string it should be tagged with.
    ///
    /// The result has the same length and order as `locales`, so both can be
    /// zipped together. See [`LanguageTagging::for_locale`] for the conversion
    /// rules of a single locale.
    pub fn convert_locales(&self, locales: &[Locale]) -> Vec<String> {
        match self {
            Self::Default => locales.iter().map(|locale| locale.to_string()).collect(),
            Self::IETF => {
                // build the lookup table once for the whole slice
                let codes = ietf_language_codes();
                locales
                    .iter()
                    .map(|locale| Self::ietf_tag(&codes, locale))
                    .collect()
            }
        }
    }

    /// Returns the string `locale` should be tagged with.
    ///
    /// * `Default` keeps the locale exactly as it is. It must not be replaced
    ///   by another member of its language group, otherwise e.g. `pt-BR` would
    ///   be written as `pt-PT`.
    /// * `IETF` returns the IETF language code of the first group which lists
    ///   the locale, or the locale itself if no group lists it.
    pub fn for_locale(&self, locale: &Locale) -> String {
        match self {
            Self::Default => locale.to_string(),
            Self::IETF => Self::ietf_tag(&ietf_language_codes(), locale),
        }
    }

    /// Looks up the IETF language code for `locale` in `codes`, falling back to
    /// the locale's own string representation if no group contains it.
    fn ietf_tag(codes: &[(&str, Vec<Locale>)], locale: &Locale) -> String {
        codes
            .iter()
            .find(|(_, group)| group.contains(locale))
            .map_or_else(|| locale.to_string(), |(tag, _)| tag.to_string())
    }
}
/// Resolves locales which were given as IETF language code to a locale known by
/// Crunchyroll.
///
/// * Locales contained in `Locale::all()` are passed through unchanged.
/// * A locale whose string form equals an IETF language code is replaced by the
///   first locale of that code's group. If the group has further members a
///   warning listing them is logged.
/// * Everything else is passed through unchanged and a warning is logged.
///
/// The returned vector has the same length and order as `locales`.
pub fn resolve_locales(locales: &[Locale]) -> Vec<Locale> {
    let ietf_language_codes = ietf_language_codes();
    let all_locales = Locale::all();

    let mut resolved = Vec::with_capacity(locales.len());
    for locale in locales {
        if all_locales.contains(locale) {
            resolved.push(locale.clone());
            continue;
        }

        let requested = locale.to_string();
        let Some((first, alternatives)) = ietf_language_codes
            .iter()
            .find(|(tag, _)| *tag == requested.as_str())
            .and_then(|(_, candidates)| candidates.split_first())
        else {
            resolved.push(locale.clone());
            warn!("Unknown locale '{}'", locale);
            continue;
        };
        resolved.push(first.clone());

        // Stay silent if the only alternative is `Locale::en_IN`: the majority of
        // users which ask for english audio / subs presumably want the regular
        // english version and not the indian english dub. The check has to look at
        // the alternatives, the chosen (first) locale is never `Locale::en_IN`.
        let only_en_in_alternative = matches!(alternatives, [Locale::en_IN]);
        if !alternatives.is_empty() && !only_en_in_alternative {
            warn!(
                "Resolving locale '{}' to '{}', but there are some alternatives: {}. If you want an alternative instead, please write it completely out instead of '{}'",
                locale,
                first,
                alternatives
                    .iter()
                    .map(|l| format!("'{l}'"))
                    .collect::<Vec<String>>()
                    .join(", "),
                locale
            )
        }
    }

    resolved
}
/// Returns the table which maps an IETF language code to the Crunchyroll locales
/// it covers.
///
/// The order matters in two ways: lookups by locale return the first entry which
/// lists the locale, so every locale must appear in exactly one group, and the
/// first locale of a group is the one an IETF language code gets resolved to.
fn ietf_language_codes<'a>() -> Vec<(&'a str, Vec<Locale>)> {
    vec![
        ("ar", vec![Locale::ar_ME, Locale::ar_SA]),
        ("ca", vec![Locale::ca_ES]),
        ("de", vec![Locale::de_DE]),
        // the second english locale is `en_IN`; `hi_IN` is hindi and belongs to
        // the "hi" group only, otherwise hindi tracks would be tagged as "en"
        ("en", vec![Locale::en_US, Locale::en_IN]),
        ("es", vec![Locale::es_ES, Locale::es_419, Locale::es_LA]),
        ("fr", vec![Locale::fr_FR]),
        ("hi", vec![Locale::hi_IN]),
        ("id", vec![Locale::id_ID]),
        ("it", vec![Locale::it_IT]),
        ("ja", vec![Locale::ja_JP]),
        ("ko", vec![Locale::ko_KR]),
        ("ms", vec![Locale::ms_MY]),
        ("pl", vec![Locale::pl_PL]),
        ("pt", vec![Locale::pt_PT, Locale::pt_BR]),
        ("ru", vec![Locale::ru_RU]),
        ("ta", vec![Locale::ta_IN]),
        ("te", vec![Locale::te_IN]),
        ("th", vec![Locale::th_TH]),
        ("tr", vec![Locale::tr_TR]),
        ("vi", vec![Locale::vi_VN]),
        ("zh", vec![Locale::zh_CN, Locale::zh_HK, Locale::zh_TW]),
    ]
}
/// Return the locale of the system.
pub fn system_locale() -> Locale {