Notes/src/services/text_extracting.js

const Canvas = require("canvas");
const OCRAD = require("ocrad.js");
const log = require("./log");
const optionService = require("./options");
const cls = require("./cls");

/**
 * Runs OCR over an image given as raw, uncompressed pixel data.
 *
 * Recognized values of `img.kind`:
 *   1 - GRAYSCALE_1BPP (unsupported)
 *   2 - RGB_24BPP
 *   3 - RGBA_32BPP
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
 * @returns {string|null} the recognized text, or null when the image is missing,
 *                        its data is not a Uint8ClampedArray or its kind is unsupported
 */
function ocrFromByteArray(img) {
    if (!img || !(img.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    const start = Date.now();
    // createCanvas is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;

    // the target is always RGBA (4 bytes per pixel)
    const targetLength = img.width * img.height * 4;
    const sourceHasAlpha = img.kind === 3;

    for (let j = 0, k = 0; j < targetLength;) {
        imageBytes[j++] = img.data[k++];
        imageBytes[j++] = img.data[k++];
        imageBytes[j++] = img.data[k++];
        // for kind = 2 the alpha channel is missing in the source pixels, so an opaque one is added
        imageBytes[j++] = sourceHasAlpha ? img.data[k++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);
    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}

/**
 * Walks the operator list of a PDF page, runs OCR on every image painted by it
 * and appends each non-empty result to `strings`.
 *
 * @param {object} pdfjsLib - the PDF library; only its `OPS` operator codes are read here
 * @param {object} page - page object exposing `getOperatorList()` and `objs.get(key, callback)`
 * @param {string[]} strings - output array, mutated in place
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const ops = await page.getOperatorList();

    const fns = ops.fnArray;
    const args = ops.argsArray;
    const {paintXObject, paintImageXObject} = pdfjsLib.OPS;

    // fnArray and argsArray are parallel arrays, so they are walked by index; looking the
    // index up by value would be quadratic and ambiguous for null or repeated entries
    for (let i = 0; i < args.length; i++) {
        if (fns[i] !== paintXObject && fns[i] !== paintImageXObject) {
            continue;
        }

        const imgKey = args[i][0];
        // objs.get() is callback based; the promise settles once the callback is invoked
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}

/**
 * Extracts the text of a PDF note and stores it as the note's 'plainText' ancillary.
 *
 * Does nothing unless the note's mime is 'application/pdf' and the 'extractTextFromPdf'
 * option is enabled. When the 'ocrImages' option is enabled and OCR is not disabled,
 * text recognized in the images of each page is included as well. Failures are logged
 * and never propagated to the caller.
 *
 * @param {object} note - note providing `mime`, `noteId` and `saveNoteAncillary()`
 * @param {*} buffer - PDF content, handed to the PDF library as `data`
 * @returns {Promise<void>}
 */
async function extractTextFromPdf(note, buffer) {
    if (note.mime !== 'application/pdf') {
        return;
    }

    if (!optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }

    try {
        const pdfjsLib = require("pdfjs-dist");
        const loadingTask = pdfjsLib.getDocument({data: buffer});
        const doc = await loadingTask.promise;
        const collected = [];

        for (let pageNo = 1; pageNo <= doc.numPages; pageNo++) {
            const page = await doc.getPage(pageNo);

            const textContent = await page.getTextContent({
                normalizeWhitespace: true,
                disableCombineTextItems: false
            });

            textContent.items.forEach(item => {
                collected.push(item.str);
            });

            // a failure while OCR-ing images must not discard the text extracted so far
            try {
                const ocrEnabled = optionService.getOptionBool('ocrImages') && !cls.isOcrDisabled();

                if (ocrEnabled) {
                    await ocrTextFromPdfImages(pdfjsLib, page, collected);
                }
            }
            catch (e) {
                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
            }
        }

        // drop missing and whitespace-only fragments before joining
        const plainText = collected
            .filter(str => str?.trim())
            .join(" ");

        note.saveNoteAncillary('plainText', 'text/plain', plainText);
    }
    catch (e) {
        log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}

/**
 * Decodes an encoded image and runs OCR on it.
 *
 * @param {Buffer} buffer - expected to contain an image in JPEG, PNG etc.
 * @returns {Promise<string>} the recognized text
 * @throws {Error} when the image cannot be loaded; the original failure is attached as `cause`
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    // image loading is event based, so it is adapted to a promise
    const img = await new Promise((res, rej) => {
        const image = new Canvas.Image();
        image.onload = () => res(image);
        image.onerror = err => rej(new Error(`Can't load the image ${err}`, {cause: err}));
        image.src = buffer;
    });

    // createCanvas is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(img, 0, 0, img.width, img.height);

    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
    return plainText;
}

/**
 * Runs OCR on an image note and stores the recognized text as the note's
 * 'plainText' ancillary.
 *
 * Nothing happens unless the note is an image, the 'ocrImages' option is enabled,
 * OCR is not disabled and the buffer is non-empty. Failures are logged and never
 * propagated to the caller.
 *
 * @param {object} note - note providing `isImage()`, `noteId` and `saveNoteAncillary()`
 * @param {Buffer} buffer - encoded image content
 * @returns {Promise<void>}
 */
async function runOcr(note, buffer) {
    const shouldRun = note.isImage()
        && optionService.getOptionBool('ocrImages')
        && !cls.isOcrDisabled()
        && buffer.length !== 0;

    if (!shouldRun) {
        return;
    }

    try {
        const recognizedText = await ocrTextFromBuffer(buffer);

        note.saveNoteAncillary('plainText', 'text/plain', recognizedText);
    }
    catch (e) {
        log.error(`OCR on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}

// Public API of this module: OCR for image notes and text extraction for PDF notes.
module.exports = {
    runOcr,
    extractTextFromPdf
};