Notes/src/services/text_extracting.js
2023-02-15 15:57:33 +01:00

151 lines
4.4 KiB
JavaScript

const Canvas = require("canvas");
const OCRAD = require("ocrad.js");
const log = require("./log");
const optionService = require("./options");
const cls = require("./cls");
/**
 * Runs OCR over a raw, uncompressed pixel buffer by painting it onto a canvas
 * and handing that canvas to OCRAD.
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
 *        raw image; `kind` selects the pixel layout (see the table below)
 * @returns {string|null} the recognized text, or null when `img.data` is not a
 *          Uint8ClampedArray or the pixel layout is not supported
 */
function ocrFromByteArray(img) {
    // byte array contains raw uncompressed pixel data
    // kind: 1 - GRAYSCALE_1BPP (unsupported)
    // kind: 2 - RGB_24BPP
    // kind: 3 - RGBA_32BPP
    if (!(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    const start = Date.now();

    // createCanvas is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;
    const source = img.data;

    if (img.kind === 3) {
        // source is already RGBA: bulk copy, clipped so an oversized source cannot overflow the target
        imageBytes.set(source.subarray(0, imageBytes.length));
    }
    else {
        // kind = 2: the alpha channel is missing in source pixels, so add an opaque one
        const targetLength = img.width * img.height * 4;

        for (let dst = 0, src = 0; dst < targetLength; dst += 4, src += 3) {
            imageBytes[dst] = source[src];
            imageBytes[dst + 1] = source[src + 1];
            imageBytes[dst + 2] = source[src + 2];
            imageBytes[dst + 3] = 255;
        }
    }

    ctx.putImageData(imageData, 0, 0);

    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}
/**
 * Walks the operator list of a PDF page, OCRs every painted image and appends
 * any recognized text to `strings`.
 *
 * @param {object} pdfjsLib - the pdfjs module; only `pdfjsLib.OPS` is read here
 * @param {object} page - pdfjs page; `getOperatorList()` and `objs.get()` are used
 * @param {string[]} strings - output array, mutated in place
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const {fnArray, argsArray} = await page.getOperatorList();
    const {paintXObject, paintImageXObject} = pdfjsLib.OPS;

    // Iterate by index: fnArray[i] and argsArray[i] describe the same operator.
    // Looking the index up with indexOf() is quadratic and resolves to the wrong
    // operator whenever two entries share the same value or array reference.
    for (let i = 0; i < fnArray.length; i++) {
        const fn = fnArray[i];

        if (fn !== paintXObject && fn !== paintImageXObject) {
            continue;
        }

        const imgKey = argsArray[i]?.[0];

        if (imgKey == null) {
            // paint operator without an object name - nothing to look up
            continue;
        }

        // NOTE(review): presumably the callback only fires once the object is resolved,
        // so a key that never resolves in page.objs would leave this await pending - confirm.
        const img = await new Promise((res) => page.objs.get(imgKey, res));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}
/**
 * Extracts the text layer (and, when enabled, OCR-ed image text) of a PDF note
 * and stores it as the note's 'plainText' ancillary.
 *
 * Does nothing unless the note's mime is 'application/pdf' and the
 * 'extractTextFromPdf' option is on. Failures are logged, never thrown.
 *
 * @param {object} note - note entity; `mime`, `noteId` and `saveNoteAncillary()` are used
 * @param {Buffer|Uint8Array} buffer - PDF file content, passed to pdfjs as `data`
 * @returns {Promise<void>}
 */
async function extractTextFromPdf(note, buffer) {
    if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }

    try {
        // loaded lazily so the library is only required when a PDF is actually processed
        const pdfjsLib = require("pdfjs-dist");
        const loadingTask = pdfjsLib.getDocument({data: buffer});

        try {
            const doc = await loadingTask.promise;
            let strings = [];

            for (let p = 1; p <= doc.numPages; p++) {
                const page = await doc.getPage(p);

                const content = await page.getTextContent({
                    normalizeWhitespace: true,
                    disableCombineTextItems: false
                });

                content.items.forEach(({str}) => strings.push(str));

                // OCR is best-effort: a failure on one page must not lose the text layer
                try {
                    if (optionService.getOptionBool('ocrImages') && !cls.isOcrDisabled()) {
                        await ocrTextFromPdfImages(pdfjsLib, page, strings);
                    }
                }
                catch (e) {
                    log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
                }
            }

            // drop missing and whitespace-only fragments
            strings = strings.filter(str => str?.trim());

            note.saveNoteAncillary('plainText', 'text/plain', strings.join(" "));
        }
        finally {
            // release the document and its worker resources on success and on failure
            await loadingTask.destroy();
        }
    }
    catch (e) {
        log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}
/**
 * Decodes an encoded image, draws it onto a canvas and runs OCRAD over it.
 *
 * @param {Buffer|Uint8Array} buffer - image file content (JPEG, PNG etc.)
 * @returns {Promise<string>} the recognized text
 * @throws {Error} when the image cannot be loaded; the loader's error is kept as `cause`
 */
async function ocrTextFromBuffer(buffer) {
    // buffer is expected to contain an image in JPEG, PNG etc.
    const start = Date.now();

    // Canvas.Image reports completion through onload/onerror callbacks, hence the manual promise
    const img = await new Promise((res, rej) => {
        const image = new Canvas.Image();
        image.onload = () => res(image);
        image.onerror = err => rej(new Error("Can't load the image " + err, {cause: err}));
        image.src = buffer;
    });

    // createCanvas is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(img, 0, 0, img.width, img.height);

    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);

    return plainText;
}
/**
 * OCRs an image note and stores the recognized text as its 'plainText' ancillary.
 *
 * Nothing happens when the note is not an image, the 'ocrImages' option is off,
 * OCR is disabled through cls, or the buffer is empty. OCR failures are logged
 * as errors and never propagate to the caller.
 *
 * @param {object} note - note entity; `isImage()`, `noteId` and `saveNoteAncillary()` are used
 * @param {Buffer} buffer - image file content
 * @returns {Promise<void>}
 */
async function runOcr(note, buffer) {
    // the checks run one at a time, in this order, stopping at the first that fails
    if (!note.isImage()) {
        return;
    }

    if (!optionService.getOptionBool('ocrImages')) {
        return;
    }

    if (cls.isOcrDisabled()) {
        return;
    }

    if (buffer.length === 0) {
        return;
    }

    try {
        const recognizedText = await ocrTextFromBuffer(buffer);

        note.saveNoteAncillary('plainText', 'text/plain', recognizedText);
    }
    catch (err) {
        log.error(`OCR on note '${note.noteId}' failed with error '${err.message}', stack ${err.stack}`);
    }
}
// Exported entry points of this module: OCR for image notes and text extraction for PDF notes.
// The other functions defined in this file are not exported.
module.exports = {
runOcr,
extractTextFromPdf
};