diff --git a/package-lock.json b/package-lock.json index 894c82f64..8eb6db485 100644 --- a/package-lock.json +++ b/package-lock.json @@ -31,6 +31,7 @@ "better-sqlite3": "11.8.1", "bootstrap": "5.3.3", "boxicons": "2.1.4", + "chardet": "2.0.0", "cheerio": "1.0.0", "chokidar": "4.0.3", "cls-hooked": "4.2.2", @@ -97,6 +98,7 @@ "source-map-support": "0.5.21", "split.js": "1.6.5", "stream-throttle": "0.1.3", + "strip-bom": "5.0.0", "striptags": "3.2.0", "swagger-ui-express": "5.0.1", "tmp": "0.2.3", @@ -6175,6 +6177,12 @@ "url": "https://github.com/chalk/chalk?sponsor=1" } }, + "node_modules/chardet": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.0.0.tgz", + "integrity": "sha512-xVgPpulCooDjY6zH4m9YW3jbkaBe3FKIAvF5sj5t7aBNsVl2ljIE+xwJ4iNgiDZHFQvNIpjdKdVOQvvk5ZfxbQ==", + "license": "MIT" + }, "node_modules/check-error": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz", @@ -11889,6 +11897,16 @@ "node": ">=4" } }, + "node_modules/load-json-file/node_modules/strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/loader-runner": { "version": "4.3.0", "resolved": "https://registry.npmjs.org/loader-runner/-/loader-runner-4.3.0.tgz", @@ -15917,13 +15935,15 @@ } }, "node_modules/strip-bom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", - "integrity": "sha512-vavAMRXOgBVNF6nyEEmL3DBK19iRpDcoIwW+swQ+CbGiu7lju6t+JklA1MHweoWtadgt4ISVUsXLyDq34ddcwA==", - "dev": true, + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-5.0.0.tgz", + "integrity": "sha512-p+byADHF7SzEcVnLvc/r3uognM1hUhObuHXxJcgLCfD194XAkaLbjq3Wzb0N5G2tgIjH0dgT708Z51QxMeu60A==", "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, "node_modules/strip-eof": { diff --git a/package.json b/package.json index e14d45979..401f3d5ad 100644 --- a/package.json +++ b/package.json @@ -26,7 +26,6 @@ "server:start-test": "npm run server:switch && rimraf ./data-test && cross-env TRILIUM_DATA_DIR=./data-test TRILIUM_SYNC_SERVER_HOST=http://tsyncserver:4000 TRILIUM_ENV=dev TRILIUM_PORT=9999 nodemon src/main.ts", "server:qstart": "npm run server:switch && npm run server:start", "server:switch": "rimraf ./node_modules/better-sqlite3 && npm install", - "electron:start": "cross-env NODE_OPTIONS=\"--import tsx\" TRILIUM_DATA_DIR=./data TRILIUM_SYNC_SERVER_HOST=http://tsyncserver:4000 TRILIUM_ENV=dev electron ./electron-main.ts --inspect=5858 .", "electron:start-no-dir": "cross-env NODE_OPTIONS=\"--import tsx\" TRILIUM_ENV=dev electron --inspect=5858 .", "electron:start-nix": "electron-rebuild --version 33.3.1 && cross-env NODE_OPTIONS=\"--import tsx\" TRILIUM_DATA_DIR=./data TRILIUM_SYNC_SERVER_HOST=http://tsyncserver:4000 TRILIUM_ENV=dev nix-shell -p electron_33 --run \"electron ./electron-main.ts --inspect=5858 .\"", @@ -37,30 +36,23 @@ "electron:start-prod-nix-no-dir": "electron-rebuild --version 33.3.1 && npm run build:prepare-dist && cross-env TRILIUM_ENV=dev nix-shell -p electron_33 --run \"electron ./dist/electron-main.js --inspect=5858 .\"", "electron:qstart": "npm run electron:switch && npm run electron:start", "electron:switch": "electron-rebuild", - "electron-forge:start": "npm run build:prepare-dist && electron-forge start", "electron-forge:make": "npm run build:prepare-dist && electron-forge make", "electron-forge:package": "npm run build:prepare-dist && electron-forge package", - "docs:build-backend": "rimraf ./docs/backend_api && typedoc ./docs/backend_api src/becca/entities/*.ts src/services/backend_script_api.ts src/services/sql.ts", "docs:build-frontend": "rimraf ./docs/frontend_api && jsdoc -c jsdoc-conf.json -d ./docs/frontend_api src/public/app/entities/*.js src/public/app/services/frontend_script_api.js src/public/app/widgets/basic_widget.js src/public/app/widgets/note_context_aware_widget.js src/public/app/widgets/right_panel_widget.js", "docs:build": "npm run docs:build-backend && npm run docs:build-frontend", - "build:webpack": "tsx node_modules/webpack/bin/webpack.js -c webpack.config.ts", "build:prepare-dist": "npm run build:webpack && rimraf ./dist && tsc && tsx ./bin/copy-dist.ts", - "test": "cross-env TRILIUM_DATA_DIR=./integration-tests/db TRILIUM_INTEGRATION_TEST=memory vitest", "test:coverage": "cross-env TRILIUM_DATA_DIR=./integration-tests/db vitest --coverage", "test:playwright": "playwright test", - "test:integration-edit-db": "cross-env TRILIUM_INTEGRATION_TEST=edit TRILIUM_PORT=8081 TRILIUM_ENV=dev TRILIUM_DATA_DIR=./integration-tests/db nodemon src/main.ts", "test:integration-mem-db": "cross-env TRILIUM_INTEGRATION_TEST=memory TRILIUM_PORT=8082 TRILIUM_DATA_DIR=./integration-tests/db nodemon src/main.ts", "test:integration-mem-db-dev": "cross-env TRILIUM_INTEGRATION_TEST=memory TRILIUM_PORT=8082 TRILIUM_ENV=dev TRILIUM_DATA_DIR=./integration-tests/db nodemon src/main.ts", - "dev:watch-dist": "tsx ./bin/watch-dist.ts", "dev:prettier-check": "prettier . --check", "dev:prettier-fix": "prettier . --write", - "chore:update-build-info": "tsx bin/update-build-info.ts", "chore:ci-update-nightly-version": "tsx ./bin/update-nightly-version.ts", "chore:generate-document": "cross-env nodemon ./bin/generate_document.ts 1000", @@ -89,6 +81,7 @@ "better-sqlite3": "11.8.1", "bootstrap": "5.3.3", "boxicons": "2.1.4", + "chardet": "2.0.0", "cheerio": "1.0.0", "chokidar": "4.0.3", "cls-hooked": "4.2.2", @@ -155,6 +148,7 @@ "source-map-support": "0.5.21", "split.js": "1.6.5", "stream-throttle": "0.1.3", + "strip-bom": "5.0.0", "striptags": "3.2.0", "swagger-ui-express": "5.0.1", "tmp": "0.2.3", diff --git a/src/services/import/samples/IREN Reports Q2 FY25 Results.htm b/src/services/import/samples/IREN Reports Q2 FY25 Results.htm new file mode 100644 index 000000000..361ceb340 Binary files /dev/null and b/src/services/import/samples/IREN Reports Q2 FY25 Results.htm differ diff --git a/src/services/import/samples/IREN.Reports.Q2.FY25.Results_files.zip b/src/services/import/samples/IREN.Reports.Q2.FY25.Results_files.zip new file mode 100644 index 000000000..86c9de9b5 Binary files /dev/null and b/src/services/import/samples/IREN.Reports.Q2.FY25.Results_files.zip differ diff --git a/src/services/import/samples/UTF-16LE Code Note.json b/src/services/import/samples/UTF-16LE Code Note.json new file mode 100644 index 000000000..a9d06ee69 Binary files /dev/null and b/src/services/import/samples/UTF-16LE Code Note.json differ diff --git a/src/services/import/samples/UTF-16LE Text Note.md b/src/services/import/samples/UTF-16LE Text Note.md new file mode 100644 index 000000000..a0fec98d6 Binary files /dev/null and b/src/services/import/samples/UTF-16LE Text Note.md differ diff --git a/src/services/import/samples/UTF-16LE Text Note.txt b/src/services/import/samples/UTF-16LE Text Note.txt new file mode 100644 index 000000000..c76e1ddd7 Binary files /dev/null and b/src/services/import/samples/UTF-16LE Text Note.txt differ diff --git a/src/services/import/single.spec.ts b/src/services/import/single.spec.ts index 716eb7b91..74b4746ab 100644 --- a/src/services/import/single.spec.ts +++ b/src/services/import/single.spec.ts @@ -1,4 +1,4 @@ -import { describe, expect, it } from "vitest"; +import { beforeAll, describe, expect, it } from "vitest"; import fs from "fs"; import path from "path"; import { fileURLToPath } from "url"; @@ -10,40 +10,72 @@ import cls from "../cls.js"; import sql_init from "../sql_init.js"; import { initializeTranslations } from "../i18n.js"; import single from "./single.js"; +import stripBom from "strip-bom"; const scriptDir = dirname(fileURLToPath(import.meta.url)); -describe("processNoteContent", () => { - it("treats single MDX as Markdown", async () => { - const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", "Text Note.mdx")); - const taskContext = TaskContext.getInstance("import-mdx", "import", { - textImportedAsText: true - }); +async function testImport(fileName: string, mimetype: string) { + const buffer = fs.readFileSync(path.join(scriptDir, "samples", fileName)); + const taskContext = TaskContext.getInstance("import-mdx", "import", { + textImportedAsText: true, + codeImportedAsCode: true + }); - await new Promise((resolve, reject) => { - cls.init(async () => { - initializeTranslations(); - sql_init.initializeDb(); - await sql_init.dbReady; + return new Promise<{ buffer: Buffer, importedNote: BNote }>((resolve, reject) => { + cls.init(async () => { + const rootNote = becca.getNote("root"); + if (!rootNote) { + reject("Missing root note."); + } - const rootNote = becca.getNote("root"); - if (!rootNote) { - reject("Missing root note."); - } - - const importedNote = single.importSingleFile(taskContext, { - originalname: "Text Note.mdx", - mimetype: "text/mdx", - buffer: mdxSample - }, rootNote as BNote); - try { - expect(importedNote.mime).toBe("text/html"); - expect(importedNote.type).toBe("text"); - expect(importedNote.title).toBe("Text Note"); - } catch (e) { - reject(e); - } - resolve(); + const importedNote = single.importSingleFile(taskContext, { + originalname: fileName, + mimetype, + buffer: buffer + }, rootNote as BNote); + resolve({ + buffer, + importedNote }); }); }); +} + +describe("processNoteContent", () => { + beforeAll(async () => { + initializeTranslations(); + sql_init.initializeDb(); + await sql_init.dbReady; + }); + + it("treats single MDX as Markdown", async () => { + const { importedNote } = await testImport("Text Note.mdx", "text/mdx"); + expect(importedNote.mime).toBe("text/html"); + expect(importedNote.type).toBe("text"); + expect(importedNote.title).toBe("Text Note"); + }); + + it("supports HTML note with UTF-16 (w/ BOM) from Microsoft Outlook", async () => { + const { importedNote } = await testImport("IREN Reports Q2 FY25 Results.htm", "text/html"); + expect(importedNote.mime).toBe("text/html"); + expect(importedNote.title).toBe("IREN Reports Q2 FY25 Results"); + expect(importedNote.getContent().toString().substring(0, 5)).toEqual(" { + const { importedNote, buffer } = await testImport("UTF-16LE Code Note.json", "application/json"); + expect(importedNote.mime).toBe("application/json"); + expect(importedNote.getContent().toString()).toStrictEqual(stripBom(buffer.toString("utf-16le"))); + }); + + it("supports plain text note with UTF-16", async () => { + const { importedNote } = await testImport("UTF-16LE Text Note.txt", "text/plain"); + expect(importedNote.mime).toBe("text/html"); + expect(importedNote.getContent().toString()).toBe("

Plain text goes here.

"); + }); + + it("supports markdown note with UTF-16", async () => { + const { importedNote } = await testImport("UTF-16LE Text Note.md", "text/markdown"); + expect(importedNote.mime).toBe("text/html"); + expect(importedNote.getContent().toString()).toBe("

Hello world

\n

Plain text goes here.

\n"); + }); }) diff --git a/src/services/import/single.ts b/src/services/import/single.ts index 79f90ca2d..b572aea7f 100644 --- a/src/services/import/single.ts +++ b/src/services/import/single.ts @@ -8,7 +8,7 @@ import imageService from "../../services/image.js"; import protectedSessionService from "../protected_session.js"; import markdownService from "./markdown.js"; import mimeService from "./mime.js"; -import { getNoteTitle } from "../../services/utils.js"; +import { getNoteTitle, processStringOrBuffer } from "../../services/utils.js"; import importUtils from "./utils.js"; import htmlSanitizer from "../html_sanitizer.js"; import type { File } from "./common.js"; @@ -69,7 +69,7 @@ function importFile(taskContext: TaskContext, file: File, parentNote: BNote) { function importCodeNote(taskContext: TaskContext, file: File, parentNote: BNote) { const title = getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); - const content = file.buffer.toString("utf-8"); + const content = processStringOrBuffer(file.buffer); const detectedMime = mimeService.getMime(file.originalname) || file.mimetype; const mime = mimeService.normalizeMimeType(detectedMime); @@ -89,7 +89,7 @@ function importCodeNote(taskContext: TaskContext, file: File, parentNote: BNote) function importPlainText(taskContext: TaskContext, file: File, parentNote: BNote) { const title = getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); - const plainTextContent = file.buffer.toString("utf-8"); + const plainTextContent = processStringOrBuffer(file.buffer); const htmlContent = convertTextToHtml(plainTextContent); const { note } = noteService.createNewNote({ @@ -125,7 +125,7 @@ function convertTextToHtml(text: string) { function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote) { const title = getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); - const markdownContent = file.buffer.toString("utf-8"); + const markdownContent = processStringOrBuffer(file.buffer); let htmlContent = markdownService.renderToHtml(markdownContent, title); if (taskContext.data?.safeImport) { @@ -147,7 +147,7 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote) } function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) { - let content = file.buffer.toString("utf-8"); + let content = processStringOrBuffer(file.buffer); // Try to get title from HTML first, fall back to filename // We do this before sanitization since that turns all

s into

diff --git a/src/services/import/zip.spec.ts b/src/services/import/zip.spec.ts index c29459f93..67f0175f3 100644 --- a/src/services/import/zip.spec.ts +++ b/src/services/import/zip.spec.ts @@ -1,4 +1,4 @@ -import { describe, expect, it } from "vitest"; +import { beforeAll, describe, expect, it } from "vitest"; import fs from "fs"; import path from "path"; import { fileURLToPath } from "url"; @@ -12,35 +12,46 @@ import sql_init from "../sql_init.js"; import { initializeTranslations } from "../i18n.js"; const scriptDir = dirname(fileURLToPath(import.meta.url)); -describe("processNoteContent", () => { - it("treats single MDX as Markdown in ZIP as text note", async () => { - const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", "mdx.zip")); - const taskContext = TaskContext.getInstance("import-mdx", "import", { - textImportedAsText: true - }); +async function testImport(fileName: string) { + const mdxSample = fs.readFileSync(path.join(scriptDir, "samples", fileName)); + const taskContext = TaskContext.getInstance("import-mdx", "import", { + textImportedAsText: true + }); - await new Promise((resolve, reject) => { - cls.init(async () => { - initializeTranslations(); - sql_init.initializeDb(); - await sql_init.dbReady; + return new Promise<{ importedNote: BNote; rootNote: BNote }>((resolve, reject) => { + cls.init(async () => { + const rootNote = becca.getNote("root"); + if (!rootNote) { + expect(rootNote).toBeTruthy(); + return; + } - const rootNote = becca.getNote("root"); - if (!rootNote) { - expect(rootNote).toBeTruthy(); - return; - } - - const importedNote = await zip.importZip(taskContext, mdxSample, rootNote as BNote); - try { - expect(importedNote.mime).toBe("text/mdx"); - expect(importedNote.type).toBe("text"); - expect(importedNote.title).toBe("Text Note"); - } catch (e) { - reject(e); - } - resolve(); + const importedNote = await zip.importZip(taskContext, mdxSample, rootNote as BNote); + resolve({ + importedNote, + rootNote }); }); }); +} + +describe("processNoteContent", () => { + beforeAll(async () => { + initializeTranslations(); + sql_init.initializeDb(); + await sql_init.dbReady; + }); + + it("treats single MDX as Markdown in ZIP as text note", async () => { + const { importedNote } = await testImport("mdx.zip"); + expect(importedNote.mime).toBe("text/mdx"); + expect(importedNote.type).toBe("text"); + expect(importedNote.title).toBe("Text Note"); + }); + + it("can import email from Microsoft Outlook with UTF-16 with BOM", async () => { + const { rootNote, importedNote } = await testImport("IREN.Reports.Q2.FY25.Results_files.zip"); + const htmlNote = rootNote.children.find((ch) => ch.title === "IREN Reports Q2 FY25 Results"); + expect(htmlNote?.getContent().toString().substring(0, 4)).toEqual("