PDF.js memory leak when rendering thumbnails

I am creating an nw.js application that needs to display a lot of PDFs. The PDF files are loaded the first time the application is launched. During this initialization phase, I also need to create a thumbnail for each PDF file, to be displayed in lists.

Thumbnail creation itself doesn't seem to be a problem, even with multiple PDFs. It works by creating a canvas element, having PDF.js draw the first page onto it, and then saving the canvas as a PNG.

The problem is that PDF.js doesn't seem to release each PDF between runs. Loading 20 PDF files of about 1 MB each leaves nw.js using roughly 500 MB of RAM. We will eventually have 100+, maybe even thousands of PDFs, so we need to figure out how to free RAM between thumbnails: at around 80 PDFs, nw.js already uses 2 GB of RAM and freezes my laptop as it runs out of memory.

I made a simple test that shows this problem:

var fs = require("fs");
var Q = require("q");
var glob = require("glob");

var canvas = document.createElement("canvas");
var ctx = canvas.getContext('2d');

PDFJS.workerSrc = "pdf.worker.js";

function pdf(pdfFile) {
    return new Q.Promise(function (fulfill, reject) {
        PDFJS.getDocument(pdfFile).then(function (pdfDoc) {

            pdfDoc.getPage(1).then(function (page) {
                // render the first page at half scale
                var viewport = page.getViewport(0.5);

                canvas.height = viewport.height;
                canvas.width = viewport.width;

                var renderContext = {
                    canvasContext: ctx,
                    viewport: viewport
                };

                page.render(renderContext).then(function () {
                    // set to draw behind current content
                    ctx.globalCompositeOperation = "destination-over";

                    // set background color
                    ctx.fillStyle = "#ffffff";

                    // draw background rect on the entire canvas
                    ctx.fillRect(0, 0, canvas.width, canvas.height);

                    // save the canvas as a PNG next to the PDF
                    var img = canvas.toDataURL("image/png");
                    img = img.replace(/^data:image\/png;base64,/, "");
                    fs.writeFile(pdfFile + ".png", img, 'base64', function (err) {
                        if (err) return reject(err);
                        console.log("Done thumbnail for: " + pdfFile);
                        fulfill();
                    });
                });
            });
        });
    });
}

glob("pdf/*.pdf", function (err, files) {
    if (err) {
        console.log(err);
    } else {
        function generate(file) {
            console.log("Generating thumb for: " + file);
            pdf(file).then(function() {
                if(files.length > 0) next();
            });
        }
        function next() {
            var file = files.pop();
            generate(file);
        }

        next();
    }
});

I've never done anything like this before. I tried reusing the same canvas for all the thumbnails (as in the test above), but that didn't seem to change anything.
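One related thing I may try next is explicitly shrinking the shared canvas between files, on the assumption (mine, not from any docs) that resizing it to 0x0 lets the browser release the pixel buffer:

// after the PNG has been written, drop the canvas backing store;
// assumption: resizing to 0x0 releases the pixel buffer
canvas.width = 0;
canvas.height = 0;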

I tried taking a few heap snapshots in the developer tools to see what is taking up all the RAM, but guess what? Taking a snapshot seems to run garbage collection first, so nw.js drops from 500 MB to 100 MB before the snapshot is captured. This leads me to believe that the objects are in fact eligible for collection, but the GC never gets a chance to run before the computer is out of RAM. Loading 20 files and then just waiting does not trigger GC either, although in that case it doesn't run out of RAM.
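Following that theory, one experiment would be to expose V8's garbage collector and force a collection between files. This is only a sketch: it assumes nw.js forwards V8 flags via a "js-flags" manifest field, which I haven't verified:

// in package.json (assumption: nw.js honors "js-flags"):
//   "js-flags": "--expose-gc"

function generate(file) {
    console.log("Generating thumb for: " + file);
    pdf(file).then(function () {
        // global.gc only exists when V8 was started with --expose-gc
        if (typeof global.gc === "function") global.gc();
        if (files.length > 0) next();
    });
}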

I checked the PDF.js API and documentation, but I couldn't find anything about how to unload one PDF before loading the next one.
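To be clear, this is the shape of the cleanup I was hoping to find: something called per document once its thumbnail is written, before loading the next file. The destroy() name below is my guess at what such an API might look like, not something I found documented:

PDFJS.getDocument(pdfFile).then(function (pdfDoc) {
    // ... render page 1 and write the PNG as in the test above ...
    // then release whatever the document holds; "destroy" is a
    // hypothetical name for the call I was looking for
    pdfDoc.destroy();
});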

Any ideas on how I should proceed? One idea I had was to call some external tool, or to write a C/C++ lib that I would call with node-ffi. But since I will have to use PDF.js to display the PDFs at a later stage anyway, I imagine I would run into the same issue again.
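For reference, the external-tool fallback might look like the sketch below, shelling out to pdftoppm from poppler-utils (assuming it is installed), so each conversion runs in a short-lived child process whose memory the OS reclaims when it exits:

var execFile = require("child_process").execFile;

function thumbExternal(pdfFile, done) {
    // render only page 1 to <pdfFile>.png in a separate process;
    // -singlefile keeps pdftoppm from appending a page number
    execFile("pdftoppm",
             ["-png", "-f", "1", "-l", "1", "-r", "72", "-singlefile",
              pdfFile, pdfFile],
             function (err) { done(err); });
}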
