pdfminer is a PDF data extraction library written entirely in Python. You can also use it to extract data from PDF form fields. However, doing so can be a headache, since form entries may have child objects that you need to search as well. Most of the sample code I found on the net either did not do this properly or had problems with the encoding of the strings.
Here is the sample code I wrote to demonstrate getting the data:
from argparse import ArgumentParser

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import resolve1, PDFObjRef
from pdfminer.psparser import PSLiteral

# Byte-order mark that introduces a UTF-16BE encoded PDF text string.
_UTF16_BE_BOM = b"\xfe\xff"


def _decode_pdf_string(raw):
    """Decode a raw PDF byte string into the interpreter's native ``str``.

    Strings starting with the UTF-16BE byte-order mark are decoded as
    UTF-16BE; everything else is decoded as ISO-8859-15.

    On Python 2 the result is re-encoded to UTF-8 so that callers keep
    receiving plain byte strings; on Python 3 the decoded text is returned.
    """
    if raw.startswith(_UTF16_BE_BOM):
        # Decode properly instead of stripping NUL bytes: dropping the high
        # bytes only works for code points below U+0100.
        text = raw[len(_UTF16_BE_BOM):].decode("utf-16-be", "replace")
    else:
        # NOTE(review): the PDF specification names PDFDocEncoding for
        # non-UTF-16 text strings; ISO-8859-15 is kept for compatibility
        # with the existing output -- confirm before changing.
        text = raw.decode("iso-8859-15")
    if str is bytes:
        # Python 2: native strings are byte strings.
        return text.encode("utf-8")
    return text


def _decode_value(value):
    """Turn a resolved PDF object into something printable where possible.

    ``PSLiteral`` names are rendered with ``str()``, byte strings are decoded
    via ``_decode_pdf_string``; any other object (``None``, numbers, lists,
    dicts, ...) is returned unchanged.
    """
    if isinstance(value, PSLiteral):
        value = str(value)
    if isinstance(value, bytes):
        return _decode_pdf_string(value)
    return value


def _qualify(parent_var, partial_name):
    """Join a parent's qualified name and a partial field name with a dot.

    A missing parent yields the partial name alone; a missing partial name
    yields the parent's name alone.
    """
    if not parent_var:
        return partial_name
    if partial_name is None:
        return parent_var
    return parent_var + "." + partial_name


def load_form(filename):
    """Load pdf form contents into a nested list of name/value tuples.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        One entry per top-level form field, each produced by
        ``load_fields``: a ``(name, value)`` tuple for a terminal field or a
        (possibly nested) list of such entries for a field with children.
        An empty list is returned when the document has no AcroForm
        dictionary or no fields.
    """
    with open(filename, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        doc = PDFDocument(parser)
        acro_form = resolve1(doc.catalog.get('AcroForm'))
        if not isinstance(acro_form, dict):
            # Not an interactive form: nothing to load.
            return []
        # The field array itself may be stored as an indirect object.
        fields = resolve1(acro_form.get('Fields')) or []
        return [load_fields(resolve1(f)) for f in fields]


def load_fields(field, parent_var=None):
    """Recursively load form fields.

    Args:
        field: Resolved field dictionary.
        parent_var: Fully qualified name of the parent field, or ``None``
            for a top-level field.

    Returns:
        A list with one ``load_fields`` result per child field when the
        field has named children, otherwise a ``(qualified_name, value)``
        tuple. ``qualified_name`` is the dot-joined chain of ``T`` entries
        and ``value`` is the decoded ``V`` entry (``None`` when absent).
    """
    name = _qualify(parent_var, _decode_value(resolve1(field.get('T'))))

    kids = resolve1(field.get('Kids')) or []
    resolved_kids = [resolve1(kid) for kid in kids]
    # Only kids carrying their own partial name ('T') are child fields.
    # Kids without one (e.g. widget annotations of a radio group) describe
    # this very field, so they must not be recursed into.
    child_fields = [
        kid for kid in resolved_kids
        if isinstance(kid, dict) and 'T' in kid
    ]
    if child_fields:
        return [load_fields(kid, name) for kid in child_fields]

    # Some field types, like signatures, need extra resolving.
    return (name, _decode_value(resolve1(field.get('V'))))


def flatten_form(deep_form):
    """Flatten given form (from load_form()) to a dictionary.

    Args:
        deep_form: Nested list of ``(name, value)`` tuples.

    Returns:
        A dict mapping each field name to its value. When a name occurs
        more than once, the last occurrence wins.
    """
    dict_form = {}
    for this_item in deep_form:
        if isinstance(this_item, list):
            dict_form.update(flatten_form(this_item))
        else:
            dict_form[this_item[0]] = this_item[1]
    return dict_form


def parse_cli():
    """Load command line arguments.

    Returns:
        The parsed argument namespace; ``file`` holds the PDF path.
    """
    parser = ArgumentParser(description='Dump the form contents of a PDF.')
    parser.add_argument('file', metavar='pdf_form',
                        help='PDF Form to dump the contents of')
    return parser.parse_args()


def main():
    """Print every form field of the given PDF as ``name: value``, sorted by name."""
    args = parse_cli()
    # Read form and make a "flat" dictionary from the nested field data.
    form_flat = flatten_form(load_form(args.file))
    # A field without any name has the key None; sort it as an empty name so
    # the ordering also works where None and str cannot be compared.
    for this_key in sorted(form_flat, key=lambda name: name or ""):
        value = form_flat[this_key]
        if isinstance(value, str):
            shown = value
        elif value is None:
            shown = "None"
        else:
            shown = "Unprintable"
        # Single-argument print() behaves the same on Python 2 and 3.
        print("%s: %s" % (this_key, shown))


if __name__ == '__main__':
    main()