Extracting a table from a PDF using pdfminer in Python

I am trying to use the following code to extract table data from a PDF file:

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox,LTChar, LTFigure
import sys

class PdfMinerWrapper(object):
    """Context manager that opens a PDF with pdfminer and iterates its pages.

    Usage:
    with PdfMinerWrapper('2009t.pdf') as doc:
        for page in doc:
           #do something with the page

    Entering the context opens the file and builds the pdfminer document;
    iterating the wrapper yields one layout object per page; leaving the
    context closes the file.
    """
    def __init__(self, pdf_doc, pdf_pwd=""):
        """Remember the path and password; nothing is opened until __enter__.

        pdf_doc -- path of the PDF file to open.
        pdf_pwd -- password handed to PDFDocument (empty string by default).
        """
        self.pdf_doc = pdf_doc
        self.pdf_pwd = pdf_pwd
        # Populated by __enter__; defined here so __exit__ is always safe.
        self.fp = None
        self.doc = None

    def __enter__(self):
        """Open the file, build the PDFDocument and return this wrapper.

        Any exception raised while building the parser or the document is
        propagated after the file handle has been closed.
        """
        # open the pdf file
        self.fp = open(self.pdf_doc, 'rb')
        try:
            # create a parser object associated with the file object
            parser = PDFParser(self.fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser, password=self.pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)
        except BaseException:
            # __exit__ is not invoked when __enter__ raises, so the handle
            # has to be released here or it would leak.
            self.fp.close()
            raise
        self.doc = doc
        return self

    def _parse_pages(self):
        """Generate the layout of each page of the opened document, in order."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(char_margin=3.5, all_texts = True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(self.doc):
            interpreter.process_page(page)
            # receive the LTPage object for this page
            layout = device.get_result()
            # layout is an LTPage object which may contain child objects like LTTextBox, LTFigure, LTImage, etc.
            yield layout

    def __iter__(self):
        """Return a fresh iterator over the page layouts."""
        return iter(self._parse_pages())

    def __exit__(self, _type, value, traceback):
        """Close the underlying file; exceptions from the body propagate."""
        if self.fp is not None:
            self.fp.close()

def main():
    with PdfMinerWrapper(sys.argv[1]) as doc:
        for page in doc:
            print 'Page no.', page.pageid, 'Size',  (page.height, page.width)
            for tbox in page:
                if not isinstance(tbox, LTTextBox):
                    continue
                print ' '*1, 'Block', 'bbox=(%0.2f, %0.2f, %0.2f, %0.2f)'% tbox.bbox
                for obj in tbox:
                    print ' '*2, obj.get_text().encode('UTF-8')[:-1], '(%0.2f, %0.2f, %0.2f, %0.2f)'% tbox.bbox
                    for c in obj:
                        if not isinstance(c, LTChar):
                            continue
                        print c.get_text().encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% c.bbox, c.fontname, c.size,
                    print



# Run the dump only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

      

When I try to run the file, I get the following error:

python parse.py "/home/hp/AlgoLeap/poextracter/learning/pdftables/pdf_set/sample1.pdf"
Traceback (most recent call last):
  File "parse.py", line 69, in <module>
    main()
  File "parse.py", line 52, in main
    for page in doc:
  File "parse.py", line 39, in _parse_pages
    interpreter.process_page(page)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 836, in process_page
    self.render_contents(page.resources, page.contents, ctm=ctm)
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 848, in render_contents
    self.execute(list_value(streams))
  File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 878, in execute
    raise PDFInterpreterError('Unknown operator: %r' % name)
pdfminer.pdfinterp.PDFInterpreterError: Unknown operator: 'x\x9c\x95Z'

      

I am racking my brains trying to work out what is going wrong here. Can anyone help me with this code?

+3


source to share





All Articles