Merge pull request #598 from maphew/feature/prefer-html-title

Importing single HTML file: prefer html title over filename
This commit is contained in:
Elian Doran 2024-11-28 19:46:15 +02:00 committed by GitHub
commit 21a5481691
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 23 additions and 9 deletions

View File

@ -43,7 +43,10 @@ function sanitize(dirtyHtml: string) {
'mumble', 'nfs', 'onenote', 'pop', 'rmi', 's3', 'sftp', 'skype', 'sms', 'spotify', 'steam', 'svn', 'udp', 'mumble', 'nfs', 'onenote', 'pop', 'rmi', 's3', 'sftp', 'skype', 'sms', 'spotify', 'steam', 'svn', 'udp',
'view-source', 'vnc', 'ws', 'wss', 'xmpp', 'jdbc', 'slack' 'view-source', 'vnc', 'ws', 'wss', 'xmpp', 'jdbc', 'slack'
], ],
transformTags, nonTextTags: [
'head'
],
transformTags
}); });
} }

View File

@ -149,14 +149,19 @@ function importMarkdown(taskContext: TaskContext, file: File, parentNote: BNote)
} }
function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) { function importHtml(taskContext: TaskContext, file: File, parentNote: BNote) {
const title = utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
let content = file.buffer.toString("utf-8"); let content = file.buffer.toString("utf-8");
// Try to get title from HTML first, fall back to filename
// We do this before sanitization since that turns all <h1>s into <h2>
const htmlTitle = importUtils.extractHtmlTitle(content);
const title = htmlTitle || utils.getNoteTitle(file.originalname, !!taskContext.data?.replaceUnderscoresWithSpaces);
content = importUtils.handleH1(content, title);
if (taskContext?.data?.safeImport) { if (taskContext?.data?.safeImport) {
content = htmlSanitizer.sanitize(content); content = htmlSanitizer.sanitize(content);
} }
content = importUtils.handleH1(content, title);
const {note} = noteService.createNewNote({ const {note} = noteService.createNewNote({
parentNoteId: parentNote.noteId, parentNoteId: parentNote.noteId,

View File

@ -1,7 +1,7 @@
"use strict"; "use strict";
function handleH1(content: string, title: string) { function handleH1(content: string, title: string) {
content = content.replace(/<h1>([^<]*)<\/h1>/gi, (match, text) => { content = content.replace(/<h1[^>]*>([^<]*)<\/h1>/gi, (match, text) => {
if (title.trim() === text.trim()) { if (title.trim() === text.trim()) {
return ""; // remove whole H1 tag return ""; // remove whole H1 tag
} else { } else {
@ -11,6 +11,12 @@ function handleH1(content: string, title: string) {
return content; return content;
} }
/**
 * Extracts the text of the first `<title>` element found in an HTML string.
 *
 * The match is case-insensitive and tolerates attributes on the opening tag.
 * Runs of ASCII whitespace inside the title (including line breaks from a
 * title wrapped over several lines) are collapsed to a single space and the
 * result is trimmed, so the value is usable as a single-line title.
 *
 * Character references such as `&amp;` are NOT decoded, and a title that
 * contains a `<` character is not matched.
 *
 * @param content raw HTML source to search.
 * @returns the normalized title text, or `null` when there is no `<title>`
 *          element or its text is empty / whitespace-only.
 */
function extractHtmlTitle(content: string): string | null {
    // `(?:\s[^>]*)?` requires the tag name to end right after "title", so
    // look-alike tags such as `<titlebar>` are not mistaken for `<title>`.
    const titleMatch = content.match(/<title(?:\s[^>]*)?>([^<]+)<\/title>/i);
    if (!titleMatch) {
        return null;
    }

    // Collapse internal whitespace runs, then trim the ends.
    const title = (titleMatch[1] ?? "").replace(/[\t\n\f\r ]+/g, " ").trim();

    // A blank title carries no information; report it as "no title" so
    // callers have a single case to handle.
    return title.length > 0 ? title : null;
}
export default { export default {
handleH1 handleH1,
extractHtmlTitle
}; };