Local PDF file scraping in node.js

I have downloaded a PDF through a MEAN stack web application using fs. I want to extract certain fields from the PDF and display them in the web application. I've looked at a couple of npm packages, such as pdf.js and pdf2json, but I cannot understand the documentation or the JavaScript callbacks used in the available examples. Please help!

+3


source to share


2 answers


I hope I can answer your question. pdf2json can be used to parse a PDF and extract its text. There are several steps you need to take to make it work. I adapted an example from https://github.com/modesty/pdf2json .

Setup: install pdf2json in your Node app, along with underscore. The example page did not explain that you need to define your own callback functions. It also used `self` instead of `this` to register them. So, with the appropriate changes, the code for extracting all the text from a PDF looks something like this:



// Get the dependencies that have already been installed
// to ./node_modules with `npm install <dep>` in the root directory
// of your app

var _ = require('underscore'),
    PDFParser = require('pdf2json');

// Single parser instance: the event handlers below are registered on it,
// and it is the object that loads the pdf at the end of the script.
var pdfParser = new PDFParser();

// Create a function to handle the pdf once it has been parsed.
// In this case we cycle through all the pages, extract all the
// text blocks and print the first text run of each one to the console.
// pdf2json stores the text of a run URI-encoded, so it is decoded
// before printing (the raw value is printed if it cannot be decoded).
// If you do `console.log(JSON.stringify(pdf))` you will
// see how the parsed pdf is composed. Drill down into it
// to find the data you are looking for.
var _onPDFBinDataReady = function (pdf) {
  // Decode a single text run. decodeURIComponent throws a URIError on a
  // stray '%', so fall back to the untouched value in that case.
  var decodeText = function (encoded) {
    if (typeof encoded !== 'string') {
      return encoded;
    }
    try {
      return decodeURIComponent(encoded);
    } catch (e) {
      return encoded;
    }
  };

  console.log('Loaded pdf:\n');

  // forEach visits array elements only; `for...in` would also walk any
  // enumerable property that happens to be attached to the array.
  (pdf.data.Pages || []).forEach(function (page) {
    (page.Texts || []).forEach(function (text) {
      // Skip blocks without a text run instead of crashing on R[0].
      if (text.R && text.R.length > 0) {
        console.log(decodeText(text.R[0].T));
      }
    });
  });
};

// Handler for parser failures: whatever the parser reports is echoed
// to the console unchanged.
var _onPDFBinDataError = function (parseError) {
  console.log(parseError);
};

// Location of the pdf to parse
var pdfFilePath = 'test3.pdf';

// Wrap both handlers with underscore's `bind`, fixing their `this` to the
// `this` of this script. The upstream example passed `self` here, which
// has no meaning in this context.
var boundDataReady = _.bind(_onPDFBinDataReady, this);
var boundDataError = _.bind(_onPDFBinDataError, this);

// Subscribe to the parser's events: the data ready handler first,
// then the error handler.
pdfParser.on('pdfParser_dataReady', boundDataReady);
pdfParser.on('pdfParser_dataError', boundDataError);

// Start loading the pdf. Once it is loaded, the data ready handler is called.
pdfParser.loadPDF(pdfFilePath);

      

+2


source


I am running the code from my server-side controller.

module.exports = (function() {
return {
    add: function(req, res) {
        var tmp_path = req.files.pdf.path;
        var target_path = './uploads/' + req.files.pdf.name;
        fs.rename(tmp_path, target_path, function(err) {
            if (err) throw err;
            // delete the temporary file, so that the explicitly set temporary upload dir does not get filled with unwanted files
            fs.unlink(tmp_path, function() {
                if (err) throw err;
            //edit here pdf parser

            res.redirect('#/');

            });
        })
    },
    show: function(req, res) {

    var pdfParser = new PDFParser();

    var _onPDFBinDataReady = function (pdf) {
      console.log('Loaded pdf:\n');

      for (var i in pdf.data.Pages) {

        var page = pdf.data.Pages[i];
        // console.log(page.Texts);
        for (var j in page.Texts) { 
          var text = page.Texts[j];
          // console.log(text.R[0].T);

        }
      }
      console.log(JSON.stringify(pdf));
    };
    // Create an error handling function
    var _onPDFBinDataError = function (error) {
      console.log(error);
    };
    pdfParser.on('pdfParser_dataReady', _.bind(_onPDFBinDataReady, this));
    // Register error handling function
    pdfParser.on('pdfParser_dataError', _.bind(_onPDFBinDataError, this));
    // Construct the file path of the pdf
    var pdfFilePath = './uploads/Invoice_template.pdf';
    // Load the pdf. When it is loaded your data ready function will be called.
    pdfParser.loadPDF(pdfFilePath);

},

//end controller

      



}

0


source







All Articles