feat: support optional custom encoding

2024-03-22 11:20:48 +08:00 · 2024-03-22 11:20:48 +08:00 · 42e1e5c31d
parent 7630a9b60e
commit 42e1e5c31d
5 changed files with 134 additions and 61 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -65,6 +65,7 @@ cpal = { git = "https://github.com/doukutsu-rs/cpal", rev = "9d269d8724102404e73
 directories = "3"
 discord-rich-presence = { version = "0.2", optional = true }
 downcast = "0.11"
+encoding_rs = "0.8.33"
 fern = "0.6.2"
 glutin = { git = "https://github.com/doukutsu-rs/glutin.git", rev = "2dd95f042e6e090d36f577cbea125560dd99bd27", optional = true, default_features = false, features = ["x11"] }
 imgui = "0.8"
--- a/src/game/scripting/tsc/bytecode_utils.rs
+++ b/src/game/scripting/tsc/bytecode_utils.rs
@ -3,7 +3,6 @@ use std::io::{Cursor, Read};
 use crate::framework::error::GameError::ParseError;
 use crate::framework::error::GameResult;
 use crate::game::scripting::tsc::text_script::TextScriptEncoding;
-use crate::util::encoding::{read_cur_shift_jis, read_cur_wtf8};

 pub fn put_varint(val: i32, out: &mut Vec<u8>) {
    let mut x = ((val as u32) >> 31) ^ ((val as u32) << 1);
@ -43,7 +42,7 @@ pub fn read_cur_varint(cursor: &mut Cursor<&[u8]>) -> GameResult<i32> {
 }

 #[allow(unused)]
-pub fn read_varint<I: Iterator<Item=u8>>(iter: &mut I) -> GameResult<i32> {
+pub fn read_varint<I: Iterator<Item = u8>>(iter: &mut I) -> GameResult<i32> {
    let mut result = 0u32;

    for o in 0..5 {
@ -62,27 +61,57 @@ pub fn put_string(buffer: &mut Vec<u8>, out: &mut Vec<u8>, encoding: TextScriptE
    if buffer.is_empty() {
        return;
    }
+    let mut chars_count = 0;

-    let mut cursor: Cursor<&Vec<u8>> = Cursor::new(buffer);
    let mut tmp_buf = Vec::new();
-    let mut remaining = buffer.len() as u32;
-    let mut chars = 0;

-    while remaining > 0 {
-        let (consumed, chr) = match encoding {
-            TextScriptEncoding::UTF8 => read_cur_wtf8(&mut cursor, remaining),
-            TextScriptEncoding::ShiftJIS => read_cur_shift_jis(&mut cursor, remaining),
-        };
+    let encoding = match encoding {
+        TextScriptEncoding::ShiftJIS => encoding_rs::SHIFT_JIS,
+        TextScriptEncoding::UTF8 => encoding_rs::UTF_8,
+        TextScriptEncoding::UTF16BE => encoding_rs::UTF_16BE,
+        TextScriptEncoding::UTF16LE => encoding_rs::UTF_16LE,
+        TextScriptEncoding::ISO_2022_JP => encoding_rs::ISO_2022_JP,
+        TextScriptEncoding::ISO_8859_2 => encoding_rs::ISO_8859_2,
+        TextScriptEncoding::ISO_8859_3 => encoding_rs::ISO_8859_3,
+        TextScriptEncoding::ISO_8859_4 => encoding_rs::ISO_8859_4,
+        TextScriptEncoding::ISO_8859_5 => encoding_rs::ISO_8859_5,
+        TextScriptEncoding::ISO_8859_6 => encoding_rs::ISO_8859_6,
+        TextScriptEncoding::ISO_8859_7 => encoding_rs::ISO_8859_7,
+        TextScriptEncoding::ISO_8859_8 => encoding_rs::ISO_8859_8,
+        TextScriptEncoding::ISO_8859_8_I => encoding_rs::ISO_8859_8_I,
+        TextScriptEncoding::ISO_8859_10 => encoding_rs::ISO_8859_10,
+        TextScriptEncoding::ISO_8859_13 => encoding_rs::ISO_8859_13,
+        TextScriptEncoding::ISO_8859_14 => encoding_rs::ISO_8859_14,
+        TextScriptEncoding::ISO_8859_15 => encoding_rs::ISO_8859_15,
+        TextScriptEncoding::ISO_8859_16 => encoding_rs::ISO_8859_16,
+        TextScriptEncoding::KOI8_R => encoding_rs::KOI8_R,
+        TextScriptEncoding::KOI8_U => encoding_rs::KOI8_U,
+        TextScriptEncoding::MACINTOSH => encoding_rs::MACINTOSH,
+        TextScriptEncoding::EUC_JP => encoding_rs::EUC_JP,
+        TextScriptEncoding::EUC_KR => encoding_rs::EUC_KR,
+        TextScriptEncoding::GB18030 => encoding_rs::GB18030,
+        TextScriptEncoding::GBK => encoding_rs::GBK,
+        TextScriptEncoding::BIG5 => encoding_rs::BIG5,
+        TextScriptEncoding::WINDOWS_1250 => encoding_rs::WINDOWS_1250,
+        TextScriptEncoding::WINDOWS_1251 => encoding_rs::WINDOWS_1251,
+        TextScriptEncoding::WINDOWS_1252 => encoding_rs::WINDOWS_1252,
+        TextScriptEncoding::WINDOWS_1253 => encoding_rs::WINDOWS_1253,
+        TextScriptEncoding::WINDOWS_1254 => encoding_rs::WINDOWS_1254,
+        TextScriptEncoding::WINDOWS_1255 => encoding_rs::WINDOWS_1255,
+        TextScriptEncoding::WINDOWS_1256 => encoding_rs::WINDOWS_1256,
+        TextScriptEncoding::WINDOWS_1257 => encoding_rs::WINDOWS_1257,
+        TextScriptEncoding::WINDOWS_1258 => encoding_rs::WINDOWS_1258,
+    };

-        remaining -= consumed;
-        chars += 1;
-
-        put_varint(chr as i32, &mut tmp_buf);
+    let decoded_text = encoding.decode_without_bom_handling(&buffer).0;
+    for chr in decoded_text.chars() {
+        chars_count += 1;
+        put_varint(chr as _, &mut tmp_buf);
    }

    buffer.clear();

-    put_varint(chars, out);
+    put_varint(chars_count, out);
    out.append(&mut tmp_buf);
 }

--- a/src/game/scripting/tsc/text_script.rs
+++ b/src/game/scripting/tsc/text_script.rs
@ -44,16 +44,91 @@ bitfield! {
 }

 #[derive(Debug, PartialEq, Eq, Copy, Clone)]
+#[allow(non_camel_case_types)]
 #[repr(u8)]
 pub enum TextScriptEncoding {
    UTF8 = 0,
    ShiftJIS,
+    UTF16BE,
+    UTF16LE,
+    ISO_2022_JP,
+    ISO_8859_2,
+    ISO_8859_3,
+    ISO_8859_4,
+    ISO_8859_5,
+    ISO_8859_6,
+    ISO_8859_7,
+    ISO_8859_8,
+    ISO_8859_8_I,
+    ISO_8859_10,
+    ISO_8859_13,
+    ISO_8859_14,
+    ISO_8859_15,
+    ISO_8859_16,
+    KOI8_R,
+    KOI8_U,
+    MACINTOSH,
+    EUC_JP,
+    EUC_KR,
+    GB18030,
+    GBK,
+    BIG5,
+    WINDOWS_1250,
+    WINDOWS_1251,
+    WINDOWS_1252,
+    WINDOWS_1253,
+    WINDOWS_1254,
+    WINDOWS_1255,
+    WINDOWS_1256,
+    WINDOWS_1257,
+    WINDOWS_1258,
 }

 impl From<&str> for TextScriptEncoding {
    fn from(s: &str) -> Self {
        match s {
            "utf-8" => Self::UTF8,
+
+            "iso-2022-jp" => Self::ISO_2022_JP,
+            "iso-8859-2" => Self::ISO_8859_2,
+            "iso-8859-3" => Self::ISO_8859_3,
+            "iso-8859-4" => Self::ISO_8859_4,
+            "iso-8859-5" => Self::ISO_8859_5,
+            "iso-8859-6" => Self::ISO_8859_6,
+            "iso-8859-7" => Self::ISO_8859_7,
+            "iso-8859-8" => Self::ISO_8859_8,
+            "iso-8859-8-i" => Self::ISO_8859_8_I,
+            "iso-8859-10" => Self::ISO_8859_10,
+            "iso-8859-13" => Self::ISO_8859_13,
+            "iso-8859-14" => Self::ISO_8859_14,
+            "iso-8859-15" => Self::ISO_8859_15,
+            "iso-8859-16" => Self::ISO_8859_16,
+
+            "koi8-r" => Self::KOI8_R,
+            "koi8-u" => Self::KOI8_U,
+
+            "macintosh" => Self::MACINTOSH,
+
+            "euc-jp" => Self::EUC_JP,
+            "euc-kr" => Self::EUC_KR,
+
+            "gb18030" => Self::GB18030,
+            "gbk" => Self::GBK,
+            "big5" => Self::BIG5,
+
+            "windows-1250" => Self::WINDOWS_1250,
+            "windows-1251" => Self::WINDOWS_1251,
+            "windows-1252" => Self::WINDOWS_1252,
+            "windows-1253" => Self::WINDOWS_1253,
+            "windows-1254" => Self::WINDOWS_1254,
+            "windows-1255" => Self::WINDOWS_1255,
+            "windows-1256" => Self::WINDOWS_1256,
+            "windows-1257" => Self::WINDOWS_1257,
+            "windows-1258" => Self::WINDOWS_1258,
+
+            "utf-16be" => Self::UTF16BE,
+            "utf-16le" => Self::UTF16LE,
+
            _ => Self::ShiftJIS,
        }
    }
@ -61,6 +136,9 @@ impl From<&str> for TextScriptEncoding {

 impl TextScriptEncoding {
    pub fn invalid_encoding(encoding: TextScriptEncoding, state: &SharedGameState) -> bool {
+        if state.loc.encoding.as_ref().is_some_and(|s| TextScriptEncoding::from(s.as_str()) == encoding) {
+            return true;
+        }
        let required_encoding = if (state.loc.code == "jp" || state.loc.code == "en") && state.constants.is_base() {
            TextScriptEncoding::ShiftJIS
        } else {
@ -798,8 +876,10 @@ impl TextScriptVM {
                        // The vanilla game treats this as a 1-byte value lol
                        //if npc.event_num == (new_direction & 0xFF) as u16 {
                        if npc.event_num == new_direction as u16 {
-                            game_scene.player1.direction = if game_scene.player1.x > npc.x { Direction::Left } else { Direction::Right };
-                            game_scene.player2.direction = if game_scene.player2.x > npc.x { Direction::Left } else { Direction::Right };
+                            game_scene.player1.direction =
+                                if game_scene.player1.x > npc.x { Direction::Left } else { Direction::Right };
+                            game_scene.player2.direction =
+                                if game_scene.player2.x > npc.x { Direction::Left } else { Direction::Right };
                        }
                    }
                }
--- a/src/i18n.rs
+++ b/src/i18n.rs
@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::string;

 use crate::framework::context::Context;
 use crate::framework::filesystem;
@ -9,6 +10,7 @@ pub struct Locale {
    pub code: String,
    pub name: String,
    pub font: FontData,
+    pub encoding: Option<String>,
    strings: HashMap<String, String>,
 }

@ -22,6 +24,7 @@ impl Default for Locale {
                scale: 1.0,
                space_offset: 0.0
            },
+            encoding: None,
            strings: HashMap::new(),
        }
    }
@ -29,7 +32,7 @@ impl Default for Locale {

 impl Locale {
    pub fn new(ctx: &mut Context, base_paths: &Vec<String>, code: &str) -> Locale {
-        let file = filesystem::open_find(ctx, base_paths, &format!("locale/{}.json", code)).unwrap();
+        let file = filesystem::open_find(ctx, base_paths, &format!("locale/{code}.json")).unwrap();
        let json: serde_json::Value = serde_json::from_reader(file).unwrap();

        let strings = Locale::flatten(&json);
@ -39,8 +42,10 @@ impl Locale {
        let font_name = strings["font"].clone();
        let font_scale = strings["font_scale"].parse::<f32>().unwrap_or(1.0);
        let font = FontData::new(font_name, font_scale, 0.0);
+        
+        let encoding = strings.get("encoding").cloned();

-        Locale { code: code.to_string(), name, font, strings }
+        Locale { code: code.to_string(), name, font, encoding, strings }
    }

    fn flatten(json: &serde_json::Value) -> HashMap<String, String> {
--- a/src/util/encoding.rs
+++ b/src/util/encoding.rs
@ -2,48 +2,6 @@ use std::io::Cursor;

 use byteorder::ReadBytesExt;

-/// Decodes UTF-8 character in a less strict way.
-/// http://simonsapin.github.io/wtf-8/#decoding-wtf-8
-pub fn read_cur_wtf8<T: AsRef<[u8]>>(cursor: &mut Cursor<T>, max_bytes: u32) -> (u32, char) {
-    let result: u32;
-    let consumed: u32;
-
-    if max_bytes == 0 {
-        return (0, '\u{fffd}');
-    }
-
-    match cursor.read_u8() {
-        Ok(byte @ 0x00..=0x7f) => {
-            consumed = 1;
-            result = byte as u32;
-        }
-        Ok(byte @ 0xc2..=0xdf) if max_bytes >= 2 => {
-            let byte2 = { if let Ok(n) = cursor.read_u8() { n } else { return (1, '\u{fffd}'); } };
-
-            consumed = 2;
-            result = (byte as u32 & 0x1f) << 6 | (byte2 as u32 & 0x3f);
-        }
-        Ok(byte @ 0xe0..=0xef) if max_bytes >= 3 => {
-            let byte2 = { if let Ok(n) = cursor.read_u8() { n } else { return (1, '\u{fffd}'); } };
-            let byte3 = { if let Ok(n) = cursor.read_u8() { n } else { return (2, '\u{fffd}'); } };
-
-            consumed = 3;
-            result = (byte as u32 & 0x0f) << 12 | (byte2 as u32 & 0x3f) << 6 | (byte3 as u32 & 0x3f);
-        }
-        Ok(byte @ 0xf0..=0xf4) if max_bytes >= 4 => {
-            let byte2 = { if let Ok(n) = cursor.read_u8() { n } else { return (1, '\u{fffd}'); } };
-            let byte3 = { if let Ok(n) = cursor.read_u8() { n } else { return (2, '\u{fffd}'); } };
-            let byte4 = { if let Ok(n) = cursor.read_u8() { n } else { return (3, '\u{fffd}'); } };
-
-            consumed = 4;
-            result = (byte as u32 & 0x07) << 18 | (byte2 as u32 & 0x3f) << 12 | (byte3 as u32 & 0x3f) << 6 | (byte4 as u32 & 0x3f);
-        }
-        _ => { return (1, '\u{fffd}'); }
-    }
-
-    (consumed, std::char::from_u32(result).unwrap_or('\u{fffd}'))
-}
-
 /// Shift-JIS -> Unicode converter.
 pub fn read_cur_shift_jis<T: AsRef<[u8]>>(cursor: &mut Cursor<T>, max_bytes: u32) -> (u32, char) {
    let result: u32;