mirror of
https://github.com/TriliumNext/Notes.git
synced 2025-09-19 10:15:22 +08:00
151 lines
4.4 KiB
JavaScript
151 lines
4.4 KiB
JavaScript
const Canvas = require("canvas");
|
|
const OCRAD = require("ocrad.js");
|
|
const log = require("./log");
|
|
const optionService = require("./options");
|
|
const cls = require("./cls");
|
|
|
|
/**
 * Runs OCR over an image given as raw, uncompressed pixel data.
 *
 * Supported values of `img.kind`:
 *   1 - GRAYSCALE_1BPP (unsupported)
 *   2 - RGB_24BPP
 *   3 - RGBA_32BPP
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
 * @returns {string|null} the recognized text, or null when the image cannot be processed
 *                        (unsupported kind, data not in a Uint8ClampedArray, empty/invalid
 *                        dimensions, or fewer bytes than the dimensions require)
 */
function ocrFromByteArray(img) {
    if (!(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    const hasAlpha = img.kind === 3;
    const pixelCount = img.width * img.height;
    const expectedSourceLength = pixelCount * (hasAlpha ? 4 : 3);

    // Reject zero-sized / non-numeric dimensions and truncated buffers up front - otherwise the
    // copy loop below would read past the end of the source and OCR would run over garbage.
    if (!(expectedSourceLength > 0) || img.data.length < expectedSourceLength) {
        return null;
    }

    const start = Date.now();
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const rgba = imageData.data;
    const rgbaLength = pixelCount * 4;

    for (let dst = 0, src = 0; dst < rgbaLength;) {
        rgba[dst++] = img.data[src++];
        rgba[dst++] = img.data[src++];
        rgba[dst++] = img.data[src++];
        // in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
        rgba[dst++] = hasAlpha ? img.data[src++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);
    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
|
|
|
|
/**
 * Runs OCR on every image painted by the given PDF page and appends each non-empty
 * result to `strings`.
 *
 * @param pdfjsLib - pdf.js module; only `OPS.paintXObject` and `OPS.paintImageXObject` are read
 * @param page - pdf.js page; `getOperatorList()` and `objs.get(key, callback)` are used
 * @param {string[]} strings - output array, mutated in place
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const ops = await page.getOperatorList();

    const fns = ops.fnArray;
    const args = ops.argsArray;

    // fnArray and argsArray are parallel arrays, so walk them by position. Looking the index up
    // with indexOf() is quadratic and resolves to the wrong operator whenever an args entry is
    // not unique (e.g. repeated null entries or a shared array reference).
    for (let i = 0; i < args.length; i++) {
        const fn = fns[i];

        if (fn !== pdfjsLib.OPS.paintXObject && fn !== pdfjsLib.OPS.paintImageXObject) {
            continue;
        }

        const imgKey = args[i][0];
        // page.objs.get() reports its result through a callback - adapt it to a promise
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
|
|
|
|
/**
 * Extracts text from a PDF note and saves it as the note's 'plainText' ancillary.
 *
 * Nothing happens unless the note's mime is 'application/pdf' and the 'extractTextFromPdf'
 * option is enabled. When the 'ocrImages' option is on and OCR is not disabled through cls,
 * text recognized in the page images is appended after the text content of each page.
 * Errors are logged and not propagated to the caller.
 *
 * @param note - note object; `mime`, `noteId` and `saveNoteAncillary()` are used
 * @param buffer - PDF file content, handed to pdf.js as `data`
 * @returns {Promise<void>}
 */
async function extractTextFromPdf(note, buffer) {
    const isPdf = note.mime === 'application/pdf';

    if (!isPdf || !optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }

    try {
        const pdfjsLib = require("pdfjs-dist");
        const loadingTask = pdfjsLib.getDocument({data: buffer});
        const doc = await loadingTask.promise;
        const collected = [];

        // pdf.js pages are numbered from 1
        for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber += 1) {
            const page = await doc.getPage(pageNumber);

            const textContent = await page.getTextContent({
                normalizeWhitespace: true,
                disableCombineTextItems: false
            });

            for (const item of textContent.items) {
                collected.push(item.str);
            }

            // a failure while OCRing images must not discard the text extracted so far
            try {
                const ocrEnabled = optionService.getOptionBool('ocrImages') && !cls.isOcrDisabled();

                if (ocrEnabled) {
                    await ocrTextFromPdfImages(pdfjsLib, page, collected);
                }
            }
            catch (e) {
                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
            }
        }

        // drop missing and whitespace-only fragments before joining
        const plainText = collected
            .filter(str => str?.trim())
            .join(" ");

        note.saveNoteAncillary('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
|
|
|
|
/**
 * Decodes an encoded image (JPEG, PNG etc.), draws it onto a canvas and runs OCR on it.
 *
 * @param buffer - the encoded image; its `byteLength` is reported in the log
 * @returns {Promise<string>} the recognized text
 * @throws {Error} (as a rejection) when the image fails to load
 */
async function ocrTextFromBuffer(buffer) {
    const startedAt = Date.now();

    // Canvas.Image reports completion through onload/onerror - adapt that to a promise
    const loadImage = () => new Promise((resolve, reject) => {
        const image = new Canvas.Image();

        image.onload = () => resolve(image);
        image.onerror = err => reject(new Error("Can't load the image " + err));
        image.src = buffer;
    });

    const image = await loadImage();

    const canvas = new Canvas.createCanvas(image.width, image.height);
    const context = canvas.getContext('2d');
    context.drawImage(image, 0, 0, image.width, image.height);

    const recognized = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${recognized.length} chars of text took ${Date.now() - startedAt}ms`);

    return recognized;
}
|
|
|
|
/**
 * Runs OCR on an image note and saves the recognized text as the note's 'plainText' ancillary.
 *
 * The call is a no-op when the note is not an image, the 'ocrImages' option is off,
 * OCR is disabled through cls, or the buffer is empty. OCR failures are logged as errors
 * and not propagated.
 *
 * @param note - note object; `isImage()`, `noteId` and `saveNoteAncillary()` are used
 * @param buffer - encoded image content of the note
 * @returns {Promise<void>}
 */
async function runOcr(note, buffer) {
    if (!note.isImage()) {
        return;
    }

    if (!optionService.getOptionBool('ocrImages') || cls.isOcrDisabled()) {
        return;
    }

    if (buffer.length === 0) {
        return;
    }

    try {
        const recognized = await ocrTextFromBuffer(buffer);

        note.saveNoteAncillary('plainText', 'text/plain', recognized);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
|
|
|
|
// Public API of this module: OCR of image notes and text extraction from PDF notes.
module.exports = {runOcr, extractTextFromPdf};
|