Add options to specify audio & subtitle locales as IETF language tags and add --language_tagging flag for archive and download to modify the output file language tagging (#330)

This commit is contained in:
bytedream 2024-03-10 04:04:58 +01:00
parent 3f33db6728
commit f1d266c940
5 changed files with 260 additions and 15 deletions

View file

@ -12,7 +12,7 @@ use regex::Regex;
use reqwest::Client;
use std::borrow::Borrow;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::{BTreeMap, HashMap};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::process::{Command, Stdio};
@ -61,6 +61,8 @@ pub struct DownloadBuilder {
no_closed_caption: bool,
threads: usize,
ffmpeg_threads: Option<usize>,
audio_locale_output_map: HashMap<Locale, String>,
subtitle_locale_output_map: HashMap<Locale, String>,
}
impl DownloadBuilder {
@ -78,6 +80,8 @@ impl DownloadBuilder {
no_closed_caption: false,
threads: num_cpus::get(),
ffmpeg_threads: None,
audio_locale_output_map: HashMap::new(),
subtitle_locale_output_map: HashMap::new(),
}
}
@ -99,6 +103,9 @@ impl DownloadBuilder {
ffmpeg_threads: self.ffmpeg_threads,
formats: vec![],
audio_locale_output_map: self.audio_locale_output_map,
subtitle_locale_output_map: self.subtitle_locale_output_map,
}
}
}
@ -138,6 +145,9 @@ pub struct Downloader {
ffmpeg_threads: Option<usize>,
formats: Vec<DownloadFormat>,
audio_locale_output_map: HashMap<Locale, String>,
subtitle_locale_output_map: HashMap<Locale, String>,
}
impl Downloader {
@ -426,7 +436,12 @@ impl Downloader {
maps.extend(["-map".to_string(), (i + videos.len()).to_string()]);
metadata.extend([
format!("-metadata:s:a:{}", i),
format!("language={}", meta.language),
format!(
"language={}",
self.audio_locale_output_map
.get(&meta.language)
.unwrap_or(&meta.language.to_string())
),
]);
metadata.extend([
format!("-metadata:s:a:{}", i),
@ -457,7 +472,12 @@ impl Downloader {
]);
metadata.extend([
format!("-metadata:s:s:{}", i),
format!("language={}", meta.language),
format!(
"language={}",
self.subtitle_locale_output_map
.get(&meta.language)
.unwrap_or(&meta.language.to_string())
),
]);
metadata.extend([
format!("-metadata:s:s:{}", i),

View file

@ -1,4 +1,5 @@
use crate::utils::filter::real_dedup_vec;
use crate::utils::locale::LanguageTagging;
use crate::utils::log::tab_info;
use crate::utils::os::{is_special_file, sanitize};
use anyhow::Result;
@ -417,7 +418,12 @@ impl Format {
}
/// Formats the given string if it has specific pattern in it. It also sanitizes the filename.
pub fn format_path(&self, path: PathBuf, universal: bool) -> PathBuf {
pub fn format_path(
&self,
path: PathBuf,
universal: bool,
language_tagging: Option<&LanguageTagging>,
) -> PathBuf {
let path = path
.to_string_lossy()
.to_string()
@ -427,7 +433,7 @@ impl Format {
&sanitize(
self.locales
.iter()
.map(|(a, _)| a.to_string())
.map(|(a, _)| language_tagging.map_or(a.to_string(), |t| t.for_locale(a)))
.collect::<Vec<String>>()
.join(
&env::var("CRUNCHY_CLI_FORMAT_DELIMITER")

View file

@ -1,4 +1,124 @@
use crunchyroll_rs::Locale;
use log::warn;
/// Selects which kind of language tag is written for a locale.
///
/// The mode is chosen by name through [`LanguageTagging::parse`].
// `IETF` is an acronym and is intentionally kept fully upper-cased, hence the lint exception.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone)]
pub enum LanguageTagging {
    /// Tag with a full locale: the first locale of the group the locale belongs to.
    Default,
    /// Tag with the short IETF language code of the group the locale belongs to (e.g. `en`).
    IETF,
}
impl LanguageTagging {
    /// Parses a tagging mode from its name, ignoring case.
    ///
    /// Accepts `default` and `ietf`; anything else yields an error message naming the rejected
    /// input.
    pub fn parse(s: &str) -> Result<Self, String> {
        let normalized = s.to_lowercase();
        if normalized == "default" {
            Ok(Self::Default)
        } else if normalized == "ietf" {
            Ok(Self::IETF)
        } else {
            Err(format!("'{}' is not a valid language tagging", s))
        }
    }

    /// Converts every locale in `locales` to its tag for this mode, preserving order.
    ///
    /// A locale which is part of no group in the IETF table is passed through as its own string
    /// representation.
    pub fn convert_locales(&self, locales: &[Locale]) -> Vec<String> {
        // build the lookup table once for the whole batch
        let table = ietf_language_codes();

        locales
            .iter()
            .map(|locale| {
                let entry = table.iter().find(|(_, group)| group.contains(locale));
                match (self, entry) {
                    // no matching IETF language code was found, pass the locale as it is
                    (_, None) => locale.to_string(),
                    (LanguageTagging::Default, Some((_, group))) => {
                        group.first().unwrap().to_string()
                    }
                    (LanguageTagging::IETF, Some((tag, _))) => tag.to_string(),
                }
            })
            .collect()
    }

    /// Returns the tag of a single `locale` for this mode.
    ///
    /// Falls back to the string representation of `locale` if it is part of no group in the
    /// IETF table.
    pub fn for_locale(&self, locale: &Locale) -> String {
        let table = ietf_language_codes();
        let Some((tag, group)) = table.iter().find(|(_, group)| group.contains(locale)) else {
            return locale.to_string();
        };

        match self {
            LanguageTagging::Default => group[0].to_string(),
            LanguageTagging::IETF => tag.to_string(),
        }
    }
}
/// Resolves user supplied locales to locales known by [`Locale::all`].
///
/// Every entry of `locales` produces exactly one entry in the result, in the same order:
/// * a locale contained in [`Locale::all`] is kept as it is,
/// * a locale whose string representation equals an IETF language code of the internal table
///   is replaced by the first locale of that code's group; a warning lists the remaining
///   locales of the group so the user can write one of them out explicitly,
/// * anything else is kept unchanged and reported as unknown via a warning.
pub fn resolve_locales(locales: &[Locale]) -> Vec<Locale> {
    let ietf_language_codes = ietf_language_codes();
    let all_locales = Locale::all();

    let mut resolved = Vec::with_capacity(locales.len());
    for locale in locales {
        if all_locales.contains(locale) {
            resolved.push(locale.clone());
            continue;
        }

        // render the requested locale once instead of once per table row
        let requested = locale.to_string();
        let group = ietf_language_codes
            .iter()
            .find(|(tag, _)| *tag == requested)
            .and_then(|(_, group)| group.split_first());
        let Some((first, alternatives)) = group else {
            resolved.push(locale.clone());
            warn!("Unknown locale '{}'", locale);
            continue;
        };
        resolved.push(first.clone());

        // ignoring `Locale::en_IN` because I think the majority of users which want english
        // audio / subs want the "actual" english version and not the hindi accent dub.
        // The exemption has to be applied to the alternatives: the first locale of a group is
        // never `Locale::en_IN`, so comparing against it would never suppress anything
        let notable_alternatives: Vec<String> = alternatives
            .iter()
            .filter(|alternative| **alternative != Locale::en_IN)
            .map(|alternative| format!("'{alternative}'"))
            .collect();
        if !notable_alternatives.is_empty() {
            warn!(
                "Resolving locale '{}' to '{}', but there are some alternatives: {}. If you want an alternative instead, please write it completely out instead of '{}'",
                locale,
                first,
                notable_alternatives.join(", "),
                locale
            )
        }
    }

    resolved
}
/// Returns the table which maps IETF language codes to the locales they cover.
///
/// The first locale of every group is the one a bare language code resolves to. Lookups take
/// the first group containing a locale, so every locale must be listed in exactly one group.
fn ietf_language_codes<'a>() -> Vec<(&'a str, Vec<Locale>)> {
    vec![
        ("ar", vec![Locale::ar_ME, Locale::ar_SA]),
        ("ca", vec![Locale::ca_ES]),
        ("de", vec![Locale::de_DE]),
        // `en_IN` (not `hi_IN`) is the second english locale; `hi_IN` has its own "hi" group
        // and would otherwise be tagged as english
        ("en", vec![Locale::en_US, Locale::en_IN]),
        ("es", vec![Locale::es_ES, Locale::es_419, Locale::es_LA]),
        ("fr", vec![Locale::fr_FR]),
        ("hi", vec![Locale::hi_IN]),
        ("id", vec![Locale::id_ID]),
        ("it", vec![Locale::it_IT]),
        ("ja", vec![Locale::ja_JP]),
        ("ko", vec![Locale::ko_KR]),
        ("ms", vec![Locale::ms_MY]),
        ("pl", vec![Locale::pl_PL]),
        ("pt", vec![Locale::pt_PT, Locale::pt_BR]),
        ("ru", vec![Locale::ru_RU]),
        ("ta", vec![Locale::ta_IN]),
        ("te", vec![Locale::te_IN]),
        ("th", vec![Locale::th_TH]),
        ("tr", vec![Locale::tr_TR]),
        ("vi", vec![Locale::vi_VN]),
        ("zh", vec![Locale::zh_CN, Locale::zh_HK, Locale::zh_TW]),
    ]
}
/// Return the locale of the system.
pub fn system_locale() -> Locale {