feat: auto detect source text encoding
This commit is contained in:
parent
1ff222bee4
commit
eed3145488
|
@ -60,6 +60,7 @@ android = []
|
|||
#cpal = { path = "./3rdparty/cpal" }
|
||||
byteorder = "1.4"
|
||||
case_insensitive_hashmap = "1.0.0"
|
||||
charset-normalizer-rs = "1.0.6"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "std"] }
|
||||
cpal = { git = "https://github.com/doukutsu-rs/cpal", rev = "9d269d8724102404e73a61e9def0c0cbc921b676" }
|
||||
directories = "3"
|
||||
|
|
|
@ -66,9 +66,61 @@ pub fn put_string(buffer: &mut Vec<u8>, out: &mut Vec<u8>, encoding: TextScriptE
|
|||
let mut tmp_buf = Vec::new();
|
||||
|
||||
let encoding = match encoding {
|
||||
TextScriptEncoding::UTF8 => encoding_rs::UTF_8,
|
||||
TextScriptEncoding::ShiftJIS => encoding_rs::SHIFT_JIS,
|
||||
TextScriptEncoding::GBK => encoding_rs::GBK,
|
||||
// Any other encoding: auto-detect it from the buffer contents
|
||||
_ => {
|
||||
let guess = charset_normalizer_rs::from_bytes(&buffer, None);
|
||||
let Some(gussed) = guess.get_best() else {
|
||||
return;
|
||||
};
|
||||
|
||||
match gussed.encoding() {
|
||||
"iso-2022-jp" => encoding_rs::ISO_2022_JP,
|
||||
"iso-8859-2" => encoding_rs::ISO_8859_2,
|
||||
"iso-8859-3" => encoding_rs::ISO_8859_3,
|
||||
"iso-8859-4" => encoding_rs::ISO_8859_4,
|
||||
"iso-8859-5" => encoding_rs::ISO_8859_5,
|
||||
"iso-8859-6" => encoding_rs::ISO_8859_6,
|
||||
"iso-8859-7" => encoding_rs::ISO_8859_7,
|
||||
"iso-8859-8" => encoding_rs::ISO_8859_8,
|
||||
"iso-8859-8-i" => encoding_rs::ISO_8859_8_I,
|
||||
"iso-8859-10" => encoding_rs::ISO_8859_10,
|
||||
"iso-8859-13" => encoding_rs::ISO_8859_13,
|
||||
"iso-8859-14" => encoding_rs::ISO_8859_14,
|
||||
"iso-8859-15" => encoding_rs::ISO_8859_15,
|
||||
"iso-8859-16" => encoding_rs::ISO_8859_16,
|
||||
|
||||
"koi8-r" => encoding_rs::KOI8_R,
|
||||
"koi8-u" => encoding_rs::KOI8_U,
|
||||
|
||||
"macintosh" => encoding_rs::MACINTOSH,
|
||||
|
||||
"euc-jp" => encoding_rs::EUC_JP,
|
||||
"euc-kr" => encoding_rs::EUC_KR,
|
||||
|
||||
"gb18030" => encoding_rs::GB18030,
|
||||
"gbk" => encoding_rs::GBK,
|
||||
"big5" => encoding_rs::BIG5,
|
||||
|
||||
"windows-1250" => encoding_rs::WINDOWS_1250,
|
||||
"windows-1251" => encoding_rs::WINDOWS_1251,
|
||||
"windows-1252" => encoding_rs::WINDOWS_1252,
|
||||
"windows-1253" => encoding_rs::WINDOWS_1253,
|
||||
"windows-1254" => encoding_rs::WINDOWS_1254,
|
||||
"windows-1255" => encoding_rs::WINDOWS_1255,
|
||||
"windows-1256" => encoding_rs::WINDOWS_1256,
|
||||
"windows-1257" => encoding_rs::WINDOWS_1257,
|
||||
"windows-1258" => encoding_rs::WINDOWS_1258,
|
||||
|
||||
"utf-8" => encoding_rs::UTF_8,
|
||||
"utf-16be" => encoding_rs::UTF_16BE,
|
||||
"utf-16le" => encoding_rs::UTF_16LE,
|
||||
|
||||
"x-mac-cyrillic" => encoding_rs::X_MAC_CYRILLIC,
|
||||
"x-user-defined" => encoding_rs::X_USER_DEFINED,
|
||||
_ => encoding_rs::UTF_8,
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let decoded_text = encoding.decode_without_bom_handling(&buffer).0;
|
||||
|
|
|
@ -48,15 +48,12 @@ bitfield! {
|
|||
pub enum TextScriptEncoding {
|
||||
UTF8 = 0,
|
||||
ShiftJIS,
|
||||
GBK,
|
||||
}
|
||||
|
||||
impl From<&str> for TextScriptEncoding {
|
||||
fn from(s: &str) -> Self {
|
||||
match s {
|
||||
"utf-8" => Self::UTF8,
|
||||
// GBK is a superset of GB2312
|
||||
"gbk" | "gb2312" => Self::GBK,
|
||||
_ => Self::ShiftJIS,
|
||||
}
|
||||
}
|
||||
|
@ -67,10 +64,7 @@ impl TextScriptEncoding {
|
|||
let required_encoding = if (state.loc.code == "jp" || state.loc.code == "en") && state.constants.is_base() {
|
||||
TextScriptEncoding::ShiftJIS
|
||||
} else {
|
||||
match state.loc.code.as_str() {
|
||||
"zh" => TextScriptEncoding::GBK,
|
||||
_ => TextScriptEncoding::UTF8,
|
||||
}
|
||||
TextScriptEncoding::UTF8
|
||||
};
|
||||
|
||||
encoding != required_encoding
|
||||
|
|
Loading…
Reference in New Issue