feat: auto detect source text encoding

This commit is contained in:
poly000 2024-03-22 12:59:15 +08:00
parent 1ff222bee4
commit eed3145488
No known key found for this signature in database
3 changed files with 56 additions and 9 deletions

View File

@ -60,6 +60,7 @@ android = []
#cpal = { path = "./3rdparty/cpal" }
byteorder = "1.4"
case_insensitive_hashmap = "1.0.0"
charset-normalizer-rs = "1.0.6"
chrono = { version = "0.4", default-features = false, features = ["clock", "std"] }
cpal = { git = "https://github.com/doukutsu-rs/cpal", rev = "9d269d8724102404e73a61e9def0c0cbc921b676" }
directories = "3"

View File

@ -66,9 +66,61 @@ pub fn put_string(buffer: &mut Vec<u8>, out: &mut Vec<u8>, encoding: TextScriptE
let mut tmp_buf = Vec::new();
let encoding = match encoding {
TextScriptEncoding::UTF8 => encoding_rs::UTF_8,
TextScriptEncoding::ShiftJIS => encoding_rs::SHIFT_JIS,
TextScriptEncoding::GBK => encoding_rs::GBK,
// auto detection
_ => {
let guess = charset_normalizer_rs::from_bytes(&buffer, None);
let Some(gussed) = guess.get_best() else {
return;
};
match gussed.encoding() {
"iso-2022-jp" => encoding_rs::ISO_2022_JP,
"iso-8859-2" => encoding_rs::ISO_8859_2,
"iso-8859-3" => encoding_rs::ISO_8859_3,
"iso-8859-4" => encoding_rs::ISO_8859_4,
"iso-8859-5" => encoding_rs::ISO_8859_5,
"iso-8859-6" => encoding_rs::ISO_8859_6,
"iso-8859-7" => encoding_rs::ISO_8859_7,
"iso-8859-8" => encoding_rs::ISO_8859_8,
"iso-8859-8-i" => encoding_rs::ISO_8859_8_I,
"iso-8859-10" => encoding_rs::ISO_8859_10,
"iso-8859-13" => encoding_rs::ISO_8859_13,
"iso-8859-14" => encoding_rs::ISO_8859_14,
"iso-8859-15" => encoding_rs::ISO_8859_15,
"iso-8859-16" => encoding_rs::ISO_8859_16,
"koi8-r" => encoding_rs::KOI8_R,
"koi8-u" => encoding_rs::KOI8_U,
"macintosh" => encoding_rs::MACINTOSH,
"euc-jp" => encoding_rs::EUC_JP,
"euc-kr" => encoding_rs::EUC_KR,
"gb18030" => encoding_rs::GB18030,
"gbk" => encoding_rs::GBK,
"big5" => encoding_rs::BIG5,
"windows-1250" => encoding_rs::WINDOWS_1250,
"windows-1251" => encoding_rs::WINDOWS_1251,
"windows-1252" => encoding_rs::WINDOWS_1252,
"windows-1253" => encoding_rs::WINDOWS_1253,
"windows-1254" => encoding_rs::WINDOWS_1254,
"windows-1255" => encoding_rs::WINDOWS_1255,
"windows-1256" => encoding_rs::WINDOWS_1256,
"windows-1257" => encoding_rs::WINDOWS_1257,
"windows-1258" => encoding_rs::WINDOWS_1258,
"utf-8" => encoding_rs::UTF_8,
"utf-16be" => encoding_rs::UTF_16BE,
"utf-16le" => encoding_rs::UTF_16LE,
"x-mac-cyrillic" => encoding_rs::X_MAC_CYRILLIC,
"x-user-defined" => encoding_rs::X_USER_DEFINED,
_ => encoding_rs::UTF_8,
}
}
};
let decoded_text = encoding.decode_without_bom_handling(&buffer).0;

View File

@ -48,15 +48,12 @@ bitfield! {
/// Identifies the character encoding of text script data.
///
/// `put_string` matches on this value to pick the `encoding_rs` encoding used
/// for decoding, and `From<&str>` builds a value from an encoding name.
pub enum TextScriptEncoding {
/// UTF-8. Pinned to discriminant 0; later variants count up from it in
/// declaration order.
UTF8 = 0,
/// Shift JIS. Also the result `From<&str>` yields for unrecognized names.
ShiftJIS,
/// GBK. `From<&str>` maps both "gbk" and "gb2312" to this variant.
GBK,
}
impl From<&str> for TextScriptEncoding {
fn from(s: &str) -> Self {
match s {
"utf-8" => Self::UTF8,
// GBK is a superset to GB2312
"gbk" | "gb2312" => Self::GBK,
_ => Self::ShiftJIS,
}
}
@ -67,10 +64,7 @@ impl TextScriptEncoding {
let required_encoding = if (state.loc.code == "jp" || state.loc.code == "en") && state.constants.is_base() {
TextScriptEncoding::ShiftJIS
} else {
match state.loc.code.as_str() {
"zh" => TextScriptEncoding::GBK,
_ => TextScriptEncoding::UTF8,
}
TextScriptEncoding::UTF8
};
encoding != required_encoding