mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-09-21 19:47:13 +08:00
129 lines
3.9 KiB
JavaScript
129 lines
3.9 KiB
JavaScript
![]() |
const Canvas = require("canvas");
|
||
|
const OCRAD = require("ocrad.js");
|
||
|
const log = require("./log.js");
|
||
|
const optionService = require("./options.js");
|
||
|
|
||
|
/**
 * Runs OCR over an image given as raw, uncompressed pixel data.
 *
 * Supported values of `img.kind`:
 *   2 - RGB_24BPP  (3 bytes per pixel; an opaque alpha channel is added)
 *   3 - RGBA_32BPP (4 bytes per pixel)
 * kind 1 (GRAYSCALE_1BPP) is not supported.
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
 * @returns {string|null} the recognized text, or null when the pixel data is not a
 *                        Uint8ClampedArray, the kind is unsupported or the buffer is
 *                        too short for the declared dimensions
 */
function ocrFromByteArray(img) {
    if (!(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    const pixelCount = img.width * img.height;
    const sourceBytesPerPixel = img.kind === 2 ? 3 : 4;

    // a truncated buffer would be silently padded with zero bytes below and produce garbage
    if (img.data.length < pixelCount * sourceBytesPerPixel) {
        return null;
    }

    const start = Date.now();
    // createCanvas is a factory function, it must not be invoked with `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;
    const targetLength = pixelCount * 4;

    for (let target = 0, source = 0; target < targetLength;) {
        imageBytes[target++] = img.data[source++];
        imageBytes[target++] = img.data[source++];
        imageBytes[target++] = img.data[source++];
        // in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
        imageBytes[target++] = img.kind === 2 ? 255 : img.data[source++];
    }

    ctx.putImageData(imageData, 0, 0);
    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
|
||
|
|
||
|
/**
 * Walks the operator list of a PDF page, runs OCR on every image painted by it
 * and appends each non-empty OCR result to `strings`.
 *
 * @param {object} pdfjsLib - the pdf.js library object (its `OPS` table is used to recognize image operators)
 * @param {object} page - pdf.js page; `getOperatorList()` and `objs.get()` are used
 * @param {string[]} strings - output array, mutated in place
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const ops = await page.getOperatorList();

    const fns = ops.fnArray;
    const args = ops.argsArray;

    // fnArray and argsArray are parallel arrays, so walk them by index; looking the position up
    // by value (indexOf) is quadratic and resolves to the wrong operator whenever the same
    // argument value (e.g. null or a shared array) appears more than once
    for (let i = 0; i < args.length; i++) {
        if (fns[i] !== pdfjsLib.OPS.paintXObject && fns[i] !== pdfjsLib.OPS.paintImageXObject) {
            continue;
        }

        const imgKey = args[i][0];
        // page.objs.get() is callback based, adapt it to a promise
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
|
||
|
|
||
|
async function extractTextFromPdf(note, buffer) {
|
||
|
if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
try {
|
||
|
const pdfjsLib = require("pdfjs-dist");
|
||
|
const doc = await pdfjsLib.getDocument({data: buffer}).promise;
|
||
|
let strings = [];
|
||
|
|
||
|
for (let p = 1; p <= doc.numPages; p++) {
|
||
|
const page = await doc.getPage(p);
|
||
|
|
||
|
const content = await page.getTextContent({
|
||
|
normalizeWhitespace: true,
|
||
|
disableCombineTextItems: false
|
||
|
});
|
||
|
|
||
|
content.items.forEach(({str}) => strings.push(str));
|
||
|
|
||
|
try {
|
||
|
if (optionService.getOptionBool('ocrImages')) {
|
||
|
await ocrTextFromPdfImages(pdfjsLib, page, strings);
|
||
|
}
|
||
|
}
|
||
|
catch (e) {
|
||
|
log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
strings = strings.filter(str => str?.trim());
|
||
|
|
||
|
note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
|
||
|
}
|
||
|
catch (e) {
|
||
|
log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/**
 * Runs OCR over an encoded image.
 *
 * @param {*} buffer - image file content (JPEG, PNG etc.); its `byteLength` is reported in the log
 * @returns {Promise<string>} the recognized text
 * @throws {Error} (as a rejection) when the image cannot be loaded; the underlying error is
 *                 available as `cause`
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    // decode the image; Canvas.Image reports completion through onload/onerror callbacks
    const img = await new Promise((res, rej) => {
        const image = new Canvas.Image();
        image.onload = () => res(image);
        image.onerror = err => rej(new Error(`Can't load the image ${err}`, {cause: err}));
        image.src = buffer;
    });

    // createCanvas is a factory function, it must not be invoked with `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(img, 0, 0, img.width, img.height);
    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
    return plainText;
}
|
||
|
|
||
|
module.exports = {
|
||
|
ocrTextFromBuffer,
|
||
|
extractTextFromPdf
|
||
|
};
|