Merge pull request #598 from maphew/feature/prefer-html-title

Importing single HTML file: prefer HTML title over filename
2025-11-17 16:31:44 +08:00 · 2024-11-28 19:46:15 +02:00 · 2024-11-28 19:46:15 +02:00 · 21a5481691
commit 21a5481691
parent 29b062660d 98b4e36f78
3 changed files with 23 additions and 9 deletions
--- a/src/services/html_sanitizer.ts
+++ b/src/services/html_sanitizer.ts
@ -43,7 +43,10 @@ function sanitize(dirtyHtml: string) {
            'mumble', 'nfs', 'onenote', 'pop', 'rmi', 's3', 'sftp', 'skype', 'sms', 'spotify', 'steam', 'svn', 'udp',
            'view-source', 'vnc', 'ws', 'wss', 'xmpp', 'jdbc', 'slack'
        ],
-        transformTags,
+        nonTextTags: [
            'head'
        ],
        transformTags
    });
 }
--- a/src/services/import/single.ts
+++ b/src/services/import/single.ts
@ -149,14 +149,19 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote)
 }
 function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
    const title = utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
    let content = file.buffer.toString("utf-8");
    // Try to get title from HTML first, fall back to filename
    // We do this before sanitization since that turns all <h1>s into <h2>
    const htmlTitle = importUtils.extractHtmlTitle(content);
    const title = htmlTitle || utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
    content = importUtils.handleH1(content, title);
    if (taskContext?.data?.safeImport) {
        content = htmlSanitizer.sanitize(content);
    }    
    content = importUtils.handleH1(content, title);
    const {note} = noteService.createNewNote({
        parentNoteId: parentNote.noteId,
--- a/src/services/import/utils.ts
+++ b/src/services/import/utils.ts
@ -1,7 +1,7 @@
 "use strict";
 function handleH1(content: string, title: string) {
-    content = content.replace(/<h1>([^<]*)<\/h1>/gi, (match, text) => {
+    content = content.replace(/<h1[^>]*>([^<]*)<\/h1>/gi, (match, text) => {
        if (title.trim() === text.trim()) {
            return ""; // remove whole H1 tag
        } else {
@ -11,6 +11,12 @@ function handleH1(content: string, title: string) {
    return content;
 }
 /**
  * Extracts the text of the first `<title>` element found in an HTML string.
  *
  * The match is case-insensitive and tolerates attributes on the opening tag.
  * Only plain-text titles are recognised: the title text must not contain `<`.
  *
  * @param content - raw HTML markup to search.
  * @returns the title text with surrounding whitespace removed, or `null` when
  *          there is no `<title>` element or its text is empty / whitespace-only.
  */
 function extractHtmlTitle(content: string): string | null {
    const titleMatch = content.match(/<title[^>]*>([^<]+)<\/title>/i);
    const trimmedTitle = titleMatch?.[1]?.trim();
    // A whitespace-only <title> carries no usable name; report it as "no title"
    // (null) rather than an empty string so the null contract holds for callers.
    return trimmedTitle ? trimmedTitle : null;
 }
 export default {
-    handleH1
+    handleH1,
    extractHtmlTitle
 };