// Notes/src/services/text_extracting.js

const Canvas = require("canvas");
const OCRAD = require("ocrad.js");
const log = require("./log.js");
const optionService = require("./options.js");

/**
 * Runs OCR over an image given as raw, uncompressed pixel data.
 *
 * Supported values of `img.kind`:
 *   2 - RGB_24BPP  (3 bytes per pixel; an opaque alpha byte is added)
 *   3 - RGBA_32BPP (4 bytes per pixel; copied as-is)
 * kind 1 (GRAYSCALE_1BPP) is not supported.
 *
 * @param {{data: Uint8ClampedArray, kind: number, width: number, height: number}} img
 * @returns {string|null} the recognized text, or null when `img` is missing,
 *          its data is not a Uint8ClampedArray, or its kind is unsupported
 */
function ocrFromByteArray(img) {
    // `img?.` makes a missing image behave like any other unusable input instead of throwing
    if (!(img?.data instanceof Uint8ClampedArray) || ![2, 3].includes(img.kind)) {
        return null;
    }

    const start = Date.now();
    // createCanvas is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');

    const imageData = ctx.createImageData(img.width, img.height);
    const imageBytes = imageData.data;
    const sourceHasAlpha = img.kind === 3;
    const targetLength = img.width * img.height * 4;

    // the target is always RGBA; `src` advances by 3 or 4 bytes per pixel depending on the source kind
    for (let dst = 0, src = 0; dst < targetLength;) {
        imageBytes[dst++] = img.data[src++];
        imageBytes[dst++] = img.data[src++];
        imageBytes[dst++] = img.data[src++];
        // in case of kind = 2, the alpha channel is missing in source pixels and we'll add it
        imageBytes[dst++] = sourceHasAlpha ? img.data[src++] : 255;
    }

    ctx.putImageData(imageData, 0, 0);
    const text = OCRAD(canvas);

    log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

    return text;
}

/**
 * Walks the operator list of a PDF page, runs OCR on every image painted by it
 * and appends each non-empty OCR result to `strings`.
 *
 * @param {object} pdfjsLib - the pdf.js library object (its `OPS` table is used to recognize paint operators)
 * @param {object} page - page object providing `getOperatorList()` and `objs.get(key, callback)`
 * @param {string[]} strings - output array; recognized texts are pushed into it
 * @returns {Promise<void>}
 */
async function ocrTextFromPdfImages(pdfjsLib, page, strings) {
    const {fnArray, argsArray} = await page.getOperatorList();
    const {paintXObject, paintImageXObject} = pdfjsLib.OPS;

    // fnArray and argsArray are parallel arrays, so iterate with the real index;
    // looking the index up by value is quadratic and picks the wrong operator
    // when the same argument value (e.g. null) appears more than once
    for (const [i, arg] of argsArray.entries()) {
        const fn = fnArray[i];

        if (fn !== paintXObject && fn !== paintImageXObject) {
            continue;
        }

        const imgKey = arg[0];
        // NOTE(review): this promise settles only when objs.get invokes the callback -
        // confirm that it always does for keys taken from the operator list
        const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));

        if (!img) {
            continue;
        }

        const text = ocrFromByteArray(img);

        if (text) {
            strings.push(text);
        }
    }
}

async function extractTextFromPdf(note, buffer) {
    if (note.mime !== 'application/pdf' || !optionService.getOptionBool('extractTextFromPdf')) {
        return;
    }

    try {
        const pdfjsLib = require("pdfjs-dist");
        const doc = await pdfjsLib.getDocument({data: buffer}).promise;
        let strings = [];

        for (let p = 1; p <= doc.numPages; p++) {
            const page = await doc.getPage(p);

            const content = await page.getTextContent({
                normalizeWhitespace: true,
                disableCombineTextItems: false
            });

            content.items.forEach(({str}) => strings.push(str));

            try {
                if (optionService.getOptionBool('ocrImages')) {
                    await ocrTextFromPdfImages(pdfjsLib, page, strings);
                }
            }
            catch (e) {
                log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
            }
        }

        strings = strings.filter(str => str?.trim());

        note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));
    }
    catch (e) {
        log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
    }
}

/**
 * Runs OCR over an encoded image.
 *
 * @param {Buffer} buffer - image file content (JPEG, PNG etc.)
 * @returns {Promise<string>} the recognized text
 * @throws {Error} when the image cannot be loaded; the original error is attached as `cause`
 */
async function ocrTextFromBuffer(buffer) {
    const start = Date.now();

    // Canvas.Image reports completion through onload/onerror, so adapt it to a promise;
    // the handlers are attached before `src` is assigned
    const img = await new Promise((res, rej) => {
        const image = new Canvas.Image();
        image.onload = () => res(image);
        image.onerror = err => rej(new Error("Can't load the image " + err, {cause: err}));
        image.src = buffer;
    });

    // createCanvas is a factory function, so it is called without `new`
    const canvas = Canvas.createCanvas(img.width, img.height);
    const ctx = canvas.getContext('2d');
    ctx.drawImage(img, 0, 0, img.width, img.height);
    const plainText = OCRAD(canvas);

    log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
    return plainText;
}

module.exports = {
    ocrTextFromBuffer,
    extractTextFromPdf
};
added image OCR and parsing text from PDF (and OCR of PDF images) 2023-01-26 20:32:27 +01:00			`const Canvas = require("canvas");`
			`const OCRAD = require("ocrad.js");`
			`const log = require("./log.js");`
			`const optionService = require("./options.js");`

			`function ocrFromByteArray(img) {`
			`// byte array contains raw uncompressed pixel data`
			`// kind: 1 - GRAYSCALE_1BPP (unsupported)`
			`// kind: 2 - RGB_24BPP`
			`// kind: 3 - RGBA_32BPP`

			`if (!(img.data instanceof Uint8ClampedArray) \|\| ![2, 3].includes(img.kind)) {`
			`return null;`
			`}`

			`const start = Date.now();`
			`const canvas = new Canvas.createCanvas(img.width, img.height);`
			`const ctx = canvas.getContext('2d');`

			`const imageData = ctx.createImageData(img.width, img.height);`
			`const imageBytes = imageData.data;`

			`for (let j = 0, k = 0, jj = img.width * img.height * 4; j < jj;) {`
			`imageBytes[j++] = img.data[k++];`
			`imageBytes[j++] = img.data[k++];`
			`imageBytes[j++] = img.data[k++];`
			`// in case of kind = 2, the alpha channel is missing in source pixels and we'll add it`
			`imageBytes[j++] = img.kind === 2 ? 255 : img.data[k++];`
			`}`

			`ctx.putImageData(imageData, 0, 0);`
			`const text = OCRAD(canvas);`

			log.info(`OCR of ${img.data.length} canvas into ${text.length} chars of text took ${Date.now() - start}ms`);

			`return text;`
			`}`

			`async function ocrTextFromPdfImages(pdfjsLib, page, strings) {`
			`const ops = await page.getOperatorList();`

			`const fns = ops.fnArray;`
			`const args = ops.argsArray;`

			`for (const arg of args) {`
			`const i = args.indexOf(arg);`

			`if (fns[i] !== pdfjsLib.OPS.paintXObject && fns[i] !== pdfjsLib.OPS.paintImageXObject) {`
			`continue;`
			`}`

			`const imgKey = arg[0];`
			`const img = await new Promise((res) => page.objs.get(imgKey, r => res(r)));`

			`if (!img) {`
			`continue;`
			`}`

			`const text = ocrFromByteArray(img);`

			`if (text) {`
			`strings.push(text);`
			`}`
			`}`
			`}`

			`async function extractTextFromPdf(note, buffer) {`
			`if (note.mime !== 'application/pdf' \|\| !optionService.getOptionBool('extractTextFromPdf')) {`
			`return;`
			`}`

			`try {`
			`const pdfjsLib = require("pdfjs-dist");`
			`const doc = await pdfjsLib.getDocument({data: buffer}).promise;`
			`let strings = [];`

			`for (let p = 1; p <= doc.numPages; p++) {`
			`const page = await doc.getPage(p);`

			`const content = await page.getTextContent({`
			`normalizeWhitespace: true,`
			`disableCombineTextItems: false`
			`});`

			`content.items.forEach(({str}) => strings.push(str));`

			`try {`
			`if (optionService.getOptionBool('ocrImages')) {`
			`await ocrTextFromPdfImages(pdfjsLib, page, strings);`
			`}`
			`}`
			`catch (e) {`
			log.info(`Could not OCR images from PDF note '${note.noteId}': '${e.message}', stack '${e.stack}'`);
			`}`
			`}`

			`strings = strings.filter(str => str?.trim());`

			`note.saveNoteAttachment('plainText', 'text/plain', strings.join(" "));`
			`}`
			`catch (e) {`
			log.info(`Extracting text from PDF on note '${note.noteId}' failed with error '${e.message}', stack ${e.stack}`);
			`}`
			`}`

			`async function ocrTextFromBuffer(buffer) {`
			`// buffer is expected to contain an image in JPEG, PNG etc.`
			`const start = Date.now();`

			`const img = await new Promise((res, rej) => {`
			`const img = new Canvas.Image();`
			`img.onload = () => res(img);`
			`img.onerror = err => rej(new Error("Can't load the image " + err));`
			`img.src = buffer;`
			`});`

			`const canvas = new Canvas.createCanvas(img.width, img.height);`
			`const ctx = canvas.getContext('2d');`
			`ctx.drawImage(img, 0, 0, img.width, img.height);`
			`const plainText = OCRAD(canvas);`

			log.info(`OCR of ${buffer.byteLength} image bytes into ${plainText.length} chars of text took ${Date.now() - start}ms`);
			`return plainText;`
			`}`

			`module.exports = {`
			`ocrTextFromBuffer,`
			`extractTextFromPdf`
			`};`