Reading PDF fields with Python/pdfminer

pdfminer is a PDF data extraction library written entirely in Python. You can also use it to extract data from PDF form fields. However, doing so can be a headache, since form entries may have child objects that need to be searched as well. Most of the sample code I found on the net either did not do this properly or had problems with the encoding of the strings.

Here is the sample code I wrote to demonstrate getting the data:

from argparse import ArgumentParser

from pdfminer.pdfparser import PDFParser 
from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdftypes import resolve1, PDFObjRef

def load_form(filename):

    """Load pdf form contents into a nested list of name/value tuples.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A list with one entry per top-level form field, each entry being
        whatever load_fields() returns for that field (a ``(name, value)``
        tuple, or a nested list for fields with children). The list is
        empty when the document has no AcroForm dictionary or the
        dictionary lists no fields.
    """

    # All resolving has to happen while the file is still open, because
    # indirect objects are read from the file when they are resolved.
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)

        # A PDF without an interactive form has no /AcroForm entry at all.
        acro_form = resolve1(doc.catalog.get('AcroForm'))
        if not acro_form:
            return []

        # /Fields may itself be an indirect reference to the array, so it
        # needs resolving before it can be iterated.
        fields = resolve1(acro_form.get('Fields')) or []
        return [load_fields(resolve1(f)) for f in fields]
                   

def load_fields(field, parent_var=None):

    """Recursively load form fields.

    Args:
        field: An already resolved field dictionary.
        parent_var: Fully qualified name of the enclosing field, or None
            when ``field`` is a top-level field.

    Returns:
        For a field that has named child fields, a list with the result of
        load_fields() for each of those children. Otherwise a
        ``(name, value)`` tuple, where ``name`` is the dot-separated
        qualified field name and ``value`` is the resolved /V entry (None
        when the field has no value). String values and names are returned
        as the interpreter's native ``str``: UTF-8 encoded on Python 2,
        text on Python 3. Non-string values (e.g. the dictionary of a
        signature field) are returned as resolved, undecoded.
    """

    def escape_utf16(param_str):
        """Convert a PDF string to native ``str``; pass other objects on."""
        # Name objects are stringified the same way they always were, so
        # callers keep seeing the same representation.
        if type(param_str).__name__ == "PSLiteral":
            param_str = str(param_str)
        if not isinstance(param_str, bytes):
            return param_str
        if param_str[:2] == b"\xfe\xff":
            # A leading BOM marks big-endian UTF-16. Decode it properly
            # instead of dropping the NUL bytes, which only works for
            # Latin-1 text and garbles everything else.
            text = param_str[2:].decode("utf-16-be", "replace")
        else:
            # NOTE(review): strings without a BOM are PDFDocEncoding per
            # the PDF spec; ISO-8859-15 is kept as the approximation this
            # code has always used - confirm whether that is good enough.
            text = param_str.decode("iso-8859-15")
        if str is bytes:
            # Python 2: the native str is a byte string, keep it UTF-8.
            return text.encode("utf-8")
        return text

    # The partial name is a PDF string as well: resolve and decode it.
    name = escape_utf16(resolve1(field.get('T')))
    if name is None:
        # No partial name of its own: the field goes by its parent's name.
        full_name = parent_var
    elif parent_var:
        full_name = parent_var + "." + name
    else:
        full_name = name

    # /Kids may be an indirect reference to the array.
    kids = [resolve1(kid) for kid in (resolve1(field.get('Kids')) or [])]
    # Only kids carrying their own /T are recursed into. Kids without one
    # are presumably widget annotations (e.g. the buttons of a radio
    # group); they cannot be named, and the value lives on this field.
    child_fields = [kid for kid in kids
                    if isinstance(kid, dict) and kid.get('T') is not None]

    if child_fields:
        # This is a child form, recurse into
        return [load_fields(kid, full_name) for kid in child_fields]

    # Some field types, like signatures, need extra resolving
    return (full_name, escape_utf16(resolve1(field.get('V'))))


def flatten_form(deep_form):
    """Collapse the nested structure from load_form() into one dictionary.

    Every non-list entry is treated as a ``(name, value)`` pair and stored
    under its name; every list entry is flattened recursively and merged
    in. When a name occurs more than once, the entry seen last wins.
    """

    flat = {}

    for entry in deep_form:
        if not isinstance(entry, list):
            # Plain name/value pair.
            name, value = entry[0], entry[1]
            flat[name] = value
            continue
        # Nested group of fields: merge its flattened contents.
        flat.update(flatten_form(entry))

    return flat
    
def parse_cli():
    """Parse the command line and return the resulting namespace.

    The only argument is the positional ``file``: the PDF form whose
    contents are to be dumped.
    """

    cli = ArgumentParser(description='Dump the form contents of a PDF.')
    cli.add_argument(
        'file',
        metavar='pdf_form',
        help='PDF Form to dump the contents of',
    )

    options = cli.parse_args()
    return options



def main():
    """Dump the form fields of the PDF named on the command line.

    Prints one ``name: value`` line per field, sorted by field name.
    Fields without a value are shown as ``None``; values that are not
    strings are shown as ``Unprintable``.
    """
    args = parse_cli()

    # Read form
    form = load_form(args.file)
    # Make a "flat" dictionary from the form data given by load_form()
    form_flat = flatten_form(form)

    # basestring exists on Python 2 only; fall back to str elsewhere so
    # the same check works on both interpreter lines.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Print form data. sorted() is used because dict.keys() is not a
    # sortable list on every Python version.
    for this_key in sorted(form_flat):
        value = form_flat[this_key]
        if isinstance(value, string_types):
            print(this_key + ": " + value)
        elif value is None:
            print(this_key + ": None")
        else:
            print(this_key + ": Unprintable")
    
# Run the dump only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()

2 thoughts on “Reading PDF fields with Python/pdfminer”

  1. Very close to what I need. Despite the UTF-8 encoding, all I get are Japanese/Chinese kanji characters. What am I missing? Thanks.

Comments are closed.