From 47c05b2c6d591227b4ce2913809818cc605d28d1 Mon Sep 17 00:00:00 2001 From: maphew Date: Sat, 16 Nov 2024 09:06:58 -0700 Subject: [PATCH 01/11] feat: prefer HTML title tag over filename during import When importing HTML files, extract and use the title from the tag if available, falling back to the filename only when no title tag is found. This improves handling of titles with special characters that can't be represented in filenames. --- src/services/import/single.ts | 15 +++++++++------ src/services/import/utils.ts | 8 +++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/services/import/single.ts b/src/services/import/single.ts index 465d3b7a8..18b22329f 100644 --- a/src/services/import/single.ts +++ b/src/services/import/single.ts @@ -149,15 +149,18 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote) } function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) { - const title = utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); let content = file.buffer.toString("utf-8"); - + if (taskContext?.data?.safeImport) { content = htmlSanitizer.sanitize(content); } - + + // Try to get title from HTML first, fall back to filename + const htmlTitle = importUtils.extractHtmlTitle(content); + const title = htmlTitle || utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces); + content = importUtils.handleH1(content, title); - + const {note} = noteService.createNewNote({ parentNoteId: parentNote.noteId, title, @@ -166,9 +169,9 @@ function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) { mime: 'text/html', isProtected: parentNote.isProtected && protectedSessionService.isProtectedSessionAvailable(), }); - + taskContext.increaseProgressCount(); - + return note; } diff --git a/src/services/import/utils.ts b/src/services/import/utils.ts index b85700230..ea3cb075e 100644 --- a/src/services/import/utils.ts +++ b/src/services/import/utils.ts @@ -11,6 +11,12 @@ function handleH1(content: string, title: string) { return content; } +function extractHtmlTitle(content: string): string | null { + const titleMatch = content.match(/<title[^>]*>([^<]+)<\/title>/i); + return titleMatch ? titleMatch[1].trim() : null; +} + export default { - handleH1 + handleH1, + extractHtmlTitle }; From 3a7564f7332cf0070b4c7dfb9fe66cb141491581 Mon Sep 17 00:00:00 2001 From: maphew <maphew@gmail.com> Date: Sat, 16 Nov 2024 09:45:13 -0700 Subject: [PATCH 02/11] a missed .ts file --- spec/services/import/single.spec.ts | 145 ++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 spec/services/import/single.spec.ts diff --git a/spec/services/import/single.spec.ts b/spec/services/import/single.spec.ts new file mode 100644 index 000000000..05e307a6f --- /dev/null +++ b/spec/services/import/single.spec.ts @@ -0,0 +1,145 @@ +import importSingle from '../../../src/services/import/single.js'; +import importUtils from '../../../src/services/import/utils.js'; +import BNote from '../../../src/becca/entities/bnote.js'; +import TaskContext from '../../../src/services/task_context.js'; + +describe('HTML Import', () => { + let parentNote: BNote; + let taskContext: TaskContext; + + beforeEach(() => { + // Create a mock parent note + parentNote = new BNote({ + noteId: 'testParent', + title: 'Test Parent', + type: 'text', + mime: 'text/html' + }); + + // Create a mock task context + taskContext = new TaskContext('test', 'test'); + }); + + describe('extractHtmlTitle', () => { + it('should extract title from HTML content', () => { + const html = ` + <html> + <head> + <title>Test Title + + +

Content

+ + `; + + const title = importUtils.extractHtmlTitle(html); + expect(title).toBe('Test Title'); + }); + + it('should handle missing title tag', () => { + const html = ` + + + +

Content

+ + `; + + const title = importUtils.extractHtmlTitle(html); + expect(title).toBeNull(); + }); + + it('should handle title with special characters', () => { + const html = ` + + + Test/Title: With Special & Characters + + +

Content

+ + `; + + const title = importUtils.extractHtmlTitle(html); + expect(title).toBe('Test/Title: With Special & Characters'); + }); + }); + + describe('importHtml', () => { + it('should prefer title from HTML over filename', () => { + const file = { + originalname: 'test.html', + buffer: Buffer.from(` + + + HTML Title + + +

Content

+ + `), + mimetype: 'text/html' + }; + + const note = importSingle.importHtml(taskContext, file, parentNote); + expect(note.title).toBe('HTML Title'); + }); + + it('should fall back to filename when no HTML title exists', () => { + const file = { + originalname: 'test_file.html', + buffer: Buffer.from(` + + + +

Content

+ + `), + mimetype: 'text/html' + }; + + const note = importSingle.importHtml(taskContext, file, parentNote); + expect(note.title).toBe('test file'); // assuming replaceUnderscoresWithSpaces is true + }); + + it('should handle HTML with both title and H1', () => { + const file = { + originalname: 'test.html', + buffer: Buffer.from(` + + + HTML Title + + +

Different H1 Title

+

Content

+ + `), + mimetype: 'text/html' + }; + + const note = importSingle.importHtml(taskContext, file, parentNote); + expect(note.title).toBe('HTML Title'); + expect(note.content).toContain('

HTML Title

'); // H1 should be updated to match title + }); + + it('should preserve special characters in title', () => { + const file = { + originalname: 'test.html', + buffer: Buffer.from(` + + + Title/With: Special & Characters + + +

Content

+ + `), + mimetype: 'text/html' + }; + + const note = importSingle.importHtml(taskContext, file, parentNote); + expect(note.title).toBe('Title/With: Special & Characters'); + }); + }); +}); From 14d7e3e1ce812c50b6e8fdbfa5a96c6306421f2c Mon Sep 17 00:00:00 2001 From: matt wilkie Date: Sat, 16 Nov 2024 21:17:50 -0700 Subject: [PATCH 03/11] test: attempt fix test import error by using importSinglefile instead of importHtml --- spec/services/import/single.spec.ts | 138 ++++++++++++++-------------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/spec/services/import/single.spec.ts b/spec/services/import/single.spec.ts index 05e307a6f..a4d3b9131 100644 --- a/spec/services/import/single.spec.ts +++ b/spec/services/import/single.spec.ts @@ -18,6 +18,8 @@ describe('HTML Import', () => { // Create a mock task context taskContext = new TaskContext('test', 'test'); + // Set textImportedAsText to true to ensure HTML imports are processed + taskContext.data = { textImportedAsText: true }; }); describe('extractHtmlTitle', () => { @@ -30,116 +32,110 @@ describe('HTML Import', () => {

Content

- `; - + + `; + const title = importUtils.extractHtmlTitle(html); expect(title).toBe('Test Title'); }); - it('should handle missing title tag', () => { - const html = ` - - - -

Content

- - `; - - const title = importUtils.extractHtmlTitle(html); - expect(title).toBeNull(); - }); - - it('should handle title with special characters', () => { + it('should return null if no title tag is present', () => { const html = ` - Test/Title: With Special & Characters

Content

- `; - + + `; + const title = importUtils.extractHtmlTitle(html); - expect(title).toBe('Test/Title: With Special & Characters'); + expect(title).toBeNull(); }); }); - describe('importHtml', () => { - it('should prefer title from HTML over filename', () => { + describe('importSingleFile with HTML', () => { + it('should import HTML file with title from title tag', () => { const file = { originalname: 'test.html', + mimetype: 'text/html', buffer: Buffer.from(` HTML Title -

Content

+

Test content

- `), - mimetype: 'text/html' + + `) }; - const note = importSingle.importHtml(taskContext, file, parentNote); + const note = importSingle.importSingleFile(taskContext, file, parentNote); expect(note.title).toBe('HTML Title'); + expect(note.mime).toBe('text/html'); }); - it('should fall back to filename when no HTML title exists', () => { - const file = { - originalname: 'test_file.html', - buffer: Buffer.from(` - - - -

Content

- - `), - mimetype: 'text/html' - }; - - const note = importSingle.importHtml(taskContext, file, parentNote); - expect(note.title).toBe('test file'); // assuming replaceUnderscoresWithSpaces is true - }); - - it('should handle HTML with both title and H1', () => { + it('should import HTML file with title from h1 when no title tag', () => { const file = { originalname: 'test.html', + mimetype: 'text/html', + buffer: Buffer.from(` + + +

Heading Title

+

Test content

+ + + `) + }; + + const note = importSingle.importSingleFile(taskContext, file, parentNote); + expect(note.title).toBe('Heading Title'); + expect(note.mime).toBe('text/html'); + }); + + it('should import HTML file with filename as title when no title or h1', () => { + const file = { + originalname: 'test-document.html', + mimetype: 'text/html', + buffer: Buffer.from(` + + +

Test content without title

+ + + `) + }; + + const note = importSingle.importSingleFile(taskContext, file, parentNote); + expect(note.title).toBe('test-document'); + expect(note.mime).toBe('text/html'); + }); + + it('should sanitize HTML content during import', () => { + const file = { + originalname: 'test.html', + mimetype: 'text/html', buffer: Buffer.from(` - HTML Title + Test Title + -

Different H1 Title

-

Content

+

Safe content

+ - `), - mimetype: 'text/html' + + `) }; - const note = importSingle.importHtml(taskContext, file, parentNote); - expect(note.title).toBe('HTML Title'); - expect(note.content).toContain('

HTML Title

'); // H1 should be updated to match title - }); - - it('should preserve special characters in title', () => { - const file = { - originalname: 'test.html', - buffer: Buffer.from(` - - - Title/With: Special & Characters - - -

Content

- - `), - mimetype: 'text/html' - }; - - const note = importSingle.importHtml(taskContext, file, parentNote); - expect(note.title).toBe('Title/With: Special & Characters'); + const note = importSingle.importSingleFile(taskContext, file, parentNote); + expect(note.title).toBe('Test Title'); + expect(note.content).not.toContain(' - - -

Safe content

- - - - `) - }; + return cls.init(() => { + return sql.transactional(() => { + const file = { + originalname: 'test.html', + mimetype: 'text/html', + buffer: Buffer.from(` + + + Test Title + + + +

Safe content

+ + + + `) + }; - const note = importSingle.importSingleFile(taskContext, file, parentNote); - expect(note.title).toBe('Test Title'); - const content = note.getContent(); - expect(content).not.toContain(' - - -

Safe content

- - - - `) - }; - - const note = importSingle.importSingleFile(taskContext, file, parentNote); - expect(note.title).toBe('Test Title'); - const content = note.getContent(); - expect(content).not.toContain('