|
| 1 | +import pikepdf |
| 2 | +import datetime |
| 3 | +import re |
| 4 | +from dateutil.tz import tzutc, tzoffset |
| 5 | +import sys |
| 6 | + |
| 7 | + |
# Compiled pattern for PDF date strings shaped like D:YYYYMMDDHHmmSSOHH'mm'.
# The "D:" prefix, the timezone marker O and the timezone hour/minute parts
# are all optional; the six date/time fields are required.
pdf_date_pattern = re.compile(''.join([
    r"(D:)?",
    r"(?P<year>\d\d\d\d)",
    r"(?P<month>\d\d)",
    r"(?P<day>\d\d)",
    r"(?P<hour>\d\d)",
    r"(?P<minute>\d\d)",
    r"(?P<second>\d\d)",
    # The hyphen goes first so it is a literal '-': written as "[+-zZ]" it
    # forms a range from '+' to 'z' that also matches digits and letters.
    r"(?P<tz_offset>[-+zZ])?",
    r"(?P<tz_hour>\d\d)?",
    r"'?(?P<tz_minute>\d\d)?'?"]))
| 19 | + |
| 20 | + |
def transform_date(date_str):
    """
    Convert a pdf date such as "D:20120321183444+07'00'" into a usable datetime
    http://www.verypdf.com/pdfinfoeditor/pdf-date-format.htm
    (D:YYYYMMDDHHmmSSOHH'mm')

    A missing timezone marker, or a "Z"/"z" marker, is treated as UTC.
    Missing timezone hour or minute digits after a "+" or "-" marker count
    as zero.

    :param date_str: pdf date string
    :return: timezone-aware datetime object, or None when date_str does not
             start with a date that pdf_date_pattern recognises
    """
    match = pdf_date_pattern.match(date_str)
    if match is None:
        return None

    fields = match.groupdict()

    # Pull the timezone parts out first so only datetime() keyword arguments
    # (year, month, day, hour, minute, second) are left in `fields`.
    sign = fields.pop('tz_offset')
    tz_hour = int(fields.pop('tz_hour') or 0)
    tz_minute = int(fields.pop('tz_minute') or 0)

    if sign is None or sign.lower() == 'z':  # UTC
        tzinfo = tzutc()
    else:
        multiplier = 1 if sign == '+' else -1
        tzinfo = tzoffset(None, multiplier * (3600 * tz_hour + 60 * tz_minute))

    date_parts = {name: int(text) for name, text in fields.items()}
    return datetime.datetime(tzinfo=tzinfo, **date_parts)
| 52 | + |
| 53 | + |
| 54 | + |
# get the target pdf file from the command-line arguments
pdf_filename = sys.argv[1]
# read the pdf file; the with-block closes it once the metadata is printed
with pikepdf.Pdf.open(pdf_filename) as pdf:
    docinfo = pdf.docinfo
    for key, value in docinfo.items():
        if str(value).startswith("D:"):
            # pdf datetime format, convert to python datetime.
            # Convert this entry's own value: always reading "/CreationDate"
            # would print the creation date for every date field.
            parsed = transform_date(str(value))
            if parsed is not None:
                # keep the raw value when the date cannot be parsed
                value = parsed
        print(key, ":", value)
0 commit comments