Merge pull request #598 from maphew/feature/prefer-html-title

When importing a single HTML file, prefer the HTML <title> over the filename as the note title
This commit is contained in:
Elian Doran 2024-11-28 19:46:15 +02:00 committed by GitHub
commit 21a5481691
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 23 additions and 9 deletions

View File

@ -43,7 +43,10 @@ function sanitize(dirtyHtml: string) {
'mumble', 'nfs', 'onenote', 'pop', 'rmi', 's3', 'sftp', 'skype', 'sms', 'spotify', 'steam', 'svn', 'udp',
'view-source', 'vnc', 'ws', 'wss', 'xmpp', 'jdbc', 'slack'
],
transformTags,
nonTextTags: [
'head'
],
transformTags
});
}

View File

@ -149,15 +149,20 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote)
}
function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
const title = utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
let content = file.buffer.toString("utf-8");
if (taskContext?.data?.safeImport) {
content = htmlSanitizer.sanitize(content);
}
// Try to get title from HTML first, fall back to filename
// We do this before sanitization since that turns all <h1>s into <h2>
const htmlTitle = importUtils.extractHtmlTitle(content);
const title = htmlTitle || utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
content = importUtils.handleH1(content, title);
if (taskContext?.data?.safeImport) {
content = htmlSanitizer.sanitize(content);
}
const {note} = noteService.createNewNote({
parentNoteId: parentNote.noteId,
title,
@ -166,9 +171,9 @@ function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
mime: 'text/html',
isProtected: parentNote.isProtected && protectedSessionService.isProtectedSessionAvailable(),
});
taskContext.increaseProgressCount();
return note;
}

View File

@ -1,7 +1,7 @@
"use strict";
function handleH1(content: string, title: string) {
content = content.replace(/<h1>([^<]*)<\/h1>/gi, (match, text) => {
content = content.replace(/<h1[^>]*>([^<]*)<\/h1>/gi, (match, text) => {
if (title.trim() === text.trim()) {
return ""; // remove whole H1 tag
} else {
@ -11,6 +11,12 @@ function handleH1(content: string, title: string) {
return content;
}
/**
 * Extracts the document title from the first `<title>` element in an HTML string.
 *
 * The match is case-insensitive and tolerates attributes on the opening tag.
 * Runs of whitespace (including line breaks) inside the title are collapsed to
 * a single space and the result is trimmed, so a title that is wrapped across
 * several lines in the markup yields a single-line string.
 *
 * @param content - raw HTML markup to search.
 * @returns the normalized title text, or `null` when no `<title>` element is
 *          found or its text is empty / whitespace-only.
 */
function extractHtmlTitle(content: string): string | null {
    const titleMatch = content.match(/<title[^>]*>([^<]+)<\/title>/i);
    const rawTitle = titleMatch?.[1];
    if (rawTitle === undefined) {
        return null;
    }

    // Collapse internal whitespace so pretty-printed markup does not leak
    // newlines or indentation into the resulting title.
    const normalizedTitle = rawTitle.replace(/\s+/g, " ").trim();

    // A blank title carries no information; report it as "no title" so callers
    // get a consistent null instead of an empty string.
    return normalizedTitle.length > 0 ? normalizedTitle : null;
}
export default {
handleH1
handleH1,
extractHtmlTitle
};