pdfminer is a PDF data extraction library written entirely in Python. You can also use it to extract data from PDF form fields. However, doing so can be a headache, since form entries may have child objects that need to be searched as well. Most of the sample code I found on the net did not do this properly, or had problems with the encoding of the strings.
Here is the sample code I wrote to demonstrate getting the data:
from argparse import ArgumentParser

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import resolve1, PDFObjRef
from pdfminer.utils import decode_text
def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        A list with one entry per top-level form field, each entry being
        whatever ``load_fields`` returns for that field (a ``(name, value)``
        tuple, or a nested list for fields that have child fields).
        An empty list is returned when the document has no ``/AcroForm``
        dictionary or the form declares no fields.
    """
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        # A PDF without an interactive form simply has no /AcroForm entry.
        acro_form = resolve1(doc.catalog.get('AcroForm'))
        if not acro_form:
            return []
        # /Fields may itself be stored as an indirect reference.
        fields = resolve1(acro_form.get('Fields')) or []
        # Resolve everything while the file is still open: objects are
        # looked up through the parser bound to ``pdf_file``.
        return [load_fields(resolve1(f)) for f in fields]
def _pdf_text(value):
    """Convert a resolved PDF value to the interpreter's native string type.

    Byte strings are decoded with pdfminer's ``decode_text`` (UTF-16BE when
    the byte-order mark is present, PDFDocEncoding otherwise).  PSLiteral
    objects are stringified with ``str()``.  Anything else (``None``, lists,
    numbers, ...) is returned unchanged.

    On Python 2 the result is re-encoded as a UTF-8 byte string so callers
    keep receiving plain ``str`` objects; on Python 3 text is returned.
    """
    # Matched by class name so that no additional pdfminer import is needed.
    if type(value).__name__ == "PSLiteral":
        value = str(value)
    if isinstance(value, bytes):
        value = decode_text(value)
    if str is bytes and isinstance(value, type(u"")):
        # Python 2 only: hand back UTF-8 encoded byte strings.
        value = value.encode("utf-8")
    return value


def load_fields(field, parent_var=None):
    """Recursively load form fields.

    Args:
        field: A resolved field dictionary.
        parent_var: Fully qualified name of the parent field, or ``None``
            for a top-level field.

    Returns:
        For a field that has child fields: a list holding the result of
        ``load_fields`` for every child.  Otherwise a ``(name, value)``
        tuple, where ``name`` is the dot-separated qualified field name and
        ``value`` is the decoded ``/V`` entry (``None`` when absent).
    """
    name = _pdf_text(resolve1(field.get('T')))
    if name is None:
        # No partial name of its own: the entry shares its parent's name.
        full_name = parent_var
    elif parent_var:
        full_name = parent_var + "." + name
    else:
        full_name = name

    kids = resolve1(field.get('Kids')) or []
    resolved_kids = [resolve1(kid) for kid in kids]
    # Kids without /T are widget annotations of this very field (e.g. the
    # buttons of a radio group), not child fields; recursing into them
    # would lose this field's value, so only named kids are descended into.
    child_fields = [kid for kid in resolved_kids
                    if isinstance(kid, dict) and kid.get('T') is not None]
    if child_fields:
        return [load_fields(kid, full_name) for kid in child_fields]

    # Some field types, like signatures, need extra resolving
    return (full_name, _pdf_text(resolve1(field.get('V'))))
def flatten_form(deep_form):
    """Flatten given form (from load_form()) to a dictionary.

    Args:
        deep_form: An iterable whose items are either ``(name, value)``
            pairs or lists of such items, nested to any depth.

    Returns:
        A dict mapping every name found to its value.  When the same name
        occurs more than once, the last occurrence wins.
    """
    dict_form = {}
    for this_item in deep_form:
        if isinstance(this_item, list):
            # Nested group of fields: merge its flattened contents.
            dict_form.update(flatten_form(this_item))
        else:
            dict_form[this_item[0]] = this_item[1]
    return dict_form
def parse_cli(argv=None):
    """Load command line arguments.

    Args:
        argv: Optional list of argument strings to parse.  When ``None``
            (the default) the arguments are taken from ``sys.argv``.

    Returns:
        The parsed namespace; its ``file`` attribute holds the path of the
        PDF form given on the command line.
    """
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
                        help='PDF Form to dump the contents of')
    return parser.parse_args(argv)
def main():
    """Dump the form contents of the PDF named on the command line.

    Prints one ``name: value`` line per form field, sorted by field name.
    Fields without a value are shown as ``None``; values that are not
    strings are shown as ``Unprintable``.
    """
    args = parse_cli()
    # Read form
    form = load_form(args.file)
    # Make a "flat" dictionary from the form data given by load_form()
    form_flat = flatten_form(form)

    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Print form data
    for this_key in sorted(form_flat):
        value = form_flat[this_key]
        # %-formatting (rather than ``+``) so a field without a name
        # (key ``None``) is printed instead of raising TypeError.
        if isinstance(value, string_types):
            print("%s: %s" % (this_key, value))
        elif value is None:
            print("%s: None" % (this_key,))
        else:
            print("%s: Unprintable" % (this_key,))
# Run the dump only when this file is executed as a script.
if __name__ == '__main__':
    main()