feat(import/zip): support UTF-16 LE with BOM (closes #1241)

2025-11-11 20:54:04 +08:00 · 2025-02-22 01:37:02 +02:00 · 2025-02-22 01:37:02 +02:00 · bedc61c3d0
commit bedc61c3d0
parent c925ae5f15
5 changed files with 73 additions and 47 deletions
--- a/src/services/import/samples/IREN.Reports.Q2.FY25.Results_files.zip
+++ b/src/services/import/samples/IREN.Reports.Q2.FY25.Results_files.zip
--- a/src/services/import/single.ts
+++ b/src/services/import/single.ts
@ -3,14 +3,12 @@
 import type BNote from "../../becca/entities/bnote.js";
 import type TaskContext from "../task_context.js";

-import chardet from "chardet";
-import stripBom from "strip-bom";
 import noteService from "../../services/notes.js";
 import imageService from "../../services/image.js";
 import protectedSessionService from "../protected_session.js";
 import markdownService from "./markdown.js";
 import mimeService from "./mime.js";
-import { getNoteTitle } from "../../services/utils.js";
+import { getNoteTitle, processStringOrBuffer } from "../../services/utils.js";
 import importUtils from "./utils.js";
 import htmlSanitizer from "../html_sanitizer.js";
 import type { File } from "./common.js";
@ -148,21 +146,6 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote)
    return note;
 }

-function processStringOrBuffer(data: string | Buffer) {
-    if (!Buffer.isBuffer(data)) {
-        return data;
-    }
-
-    const detectedEncoding = chardet.detect(data);
-    switch (detectedEncoding) {
-        case "UTF-16LE":
-            return stripBom(data.toString("utf-16le"));
-        case "UTF-8":
-        default:
-            return data.toString("utf-8");
-    }
-}
-
 function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
    let content = processStringOrBuffer(file.buffer);

--- a/src/services/import/zip.spec.ts
+++ b/src/services/import/zip.spec.ts
@ -1,4 +1,4 @@
-import { describe, expect, it } from "vitest";
+import { beforeAll, describe, expect, it } from "vitest";
 import fs from "fs";
 import path from "path";
 import { fileURLToPath } from "url";
@ -12,35 +12,46 @@ import sql_init from "../sql_init.js";
 import { initializeTranslations } from "../i18n.js";
 const scriptDir = dirname(fileURLToPath(import.meta.url));

-describe("processNoteContent", () => {
-    it("treats single MDX as Markdown in ZIP as text note", async () => {
-        const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", "mdx.zip"));
-        const taskContext = TaskContext.getInstance("import-mdx", "import", {
-            textImportedAsText: true
-        });
+async function testImport(fileName: string) {
+    const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", fileName));
+    const taskContext = TaskContext.getInstance("import-mdx", "import", {
+        textImportedAsText: true
+    });

-        await new Promise<void>((resolve, reject) => {
-            cls.init(async () => {
-                initializeTranslations();
-                sql_init.initializeDb();
-                await sql_init.dbReady;
+    return new Promise<{ importedNote: BNote; rootNote: BNote }>((resolve, reject) => {
+        cls.init(async () => {
+            const rootNote = becca.getNote("root");
+            if (!rootNote) {
+                expect(rootNote).toBeTruthy();
+                return;
+            }

-                const rootNote = becca.getNote("root");
-                if (!rootNote) {
-                    expect(rootNote).toBeTruthy();
-                    return;
-                }
-
-                const importedNote = await zip.importZip(taskContext, mdxSample, rootNote as BNote);
-                try {
-                    expect(importedNote.mime).toBe("text/mdx");
-                    expect(importedNote.type).toBe("text");
-                    expect(importedNote.title).toBe("Text Note");
-                } catch (e) {
-                    reject(e);
-                }
-                resolve();
+            const importedNote = await zip.importZip(taskContext, mdxSample, rootNote as BNote);
+            resolve({
+                importedNote,
+                rootNote
            });
        });
    });
+}
+
+describe("processNoteContent", () => {
+    beforeAll(async () => {
+        initializeTranslations();
+        sql_init.initializeDb();
+        await sql_init.dbReady;
+    });
+
+    it("treats single MDX as Markdown in ZIP as text note", async () => {
+        const { importedNote } = await testImport("mdx.zip");
+        expect(importedNote.mime).toBe("text/mdx");
+        expect(importedNote.type).toBe("text");
+        expect(importedNote.title).toBe("Text Note");
+    });
+
+    it("can import email from Microsoft Outlook with UTF-16 with BOM", async () => {
+        const { rootNote, importedNote } = await testImport("IREN.Reports.Q2.FY25.Results_files.zip");
+        const htmlNote = rootNote.children.find((ch) => ch.title === "IREN Reports Q2 FY25 Results");
+        expect(htmlNote?.getContent().toString().substring(0, 4)).toEqual("<div");
+    });
 })
--- a/src/services/import/zip.ts
+++ b/src/services/import/zip.ts
@ -1,7 +1,7 @@
 "use strict";

 import BAttribute from "../../becca/entities/battribute.js";
-import { removeTextFileExtension, newEntityId, getNoteTitle } from "../../services/utils.js";
+import { removeTextFileExtension, newEntityId, getNoteTitle, processStringOrBuffer } from "../../services/utils.js";
 import log from "../../services/log.js";
 import noteService from "../../services/notes.js";
 import attributeService from "../../services/attributes.js";
@ -457,7 +457,7 @@ async function importZip(taskContext: TaskContext, fileBuffer: Buffer, importRoo
        }

        if (type !== "file" && type !== "image") {
-            content = content.toString("utf-8");
+            content = processStringOrBuffer(content);
        }

        const noteTitle = getNoteTitle(filePath, taskContext.data?.replaceUnderscoresWithSpaces || false, noteMeta);
--- a/src/services/utils.ts
+++ b/src/services/utils.ts
@ -1,5 +1,7 @@
 "use strict";

+import chardet from "chardet";
+import stripBom from "strip-bom";
 import crypto from "crypto";
 import { generator } from "rand-token";
 import unescape from "unescape";
@ -330,6 +332,36 @@ function compareVersions(v1: string, v2: string): number {
    return 0;
 }

+/**
+ * Converts imported content to a string, decoding buffers according to their detected encoding.
+ *
+ * Buffers detected as UTF-16LE are decoded as such and have their BOM stripped; any other detection
+ * result (including UTF-8) is decoded as UTF-8. Strings are returned without any transformation.
+ * Nullish values (and empty strings) yield an empty string.
+ *
+ * @param data the string or buffer to process.
+ * @returns the decoded content of the buffer, or the same string if it's already a string.
+ */
+export function processStringOrBuffer(data: string | Buffer | null): string {
+    if (!data) {
+        return "";
+    }
+
+    if (!Buffer.isBuffer(data)) {
+        return data;
+    }
+
+    // Only UTF-16LE gets special handling; every other detection result is decoded as UTF-8.
+    const detectedEncoding = chardet.detect(data);
+    switch (detectedEncoding) {
+        case "UTF-16LE":
+            return stripBom(data.toString("utf-16le"));
+        case "UTF-8":
+        default:
+            return data.toString("utf-8");
+    }
+}
+
 export default {
    compareVersions,
    crash,