How to extract a date from a PDF file using Python


Here is a quick one-off script that I created to extract a date from PDFs. (Actually, this is an updated version. Maybe it’s not a one-off script after all?)

  • The script will look for dates in common German formats, like 31.12.2021, 31. Dezember 2021, 31 Dez 2021.
  • It will print either the first date it finds, or nothing
  • It will print the date using ISO 8601 date part, e.g. 2021-12-31
  • Using an optional second parameter, you can make sure the printed date is before a certain date

Requirements

The script uses two external libraries for PDF and date parsing: pdfminer.six and dateparser

pip3 install pdfminer.six dateparser

Usage

Print the first date in the content of the PDF, if any:

$ python3 extract_date_from_pdf.py some_pdf_with_a_date.pdf
2021-12-31

Print the first date that is before a certain date (here 2020-01-01), if any:

$ python3 extract_date_from_pdf.py some_pdf_with_a_date.pdf --before=2020-01-01

In this example nothing is printed: the PDF either did not contain a date, or none of its dates was earlier than 2020-01-01.

The script

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import argparse
import re
import warnings
from datetime import date
from typing import Union

import dateparser
from pdfminer.high_level import extract_text

# dateparser emits a pytz-related warning; silence exactly that message.
# Delete this filter once the upstream issue has been resolved:
# https://github.com/scrapinghub/dateparser/issues/1013
warnings.filterwarnings(
    action="ignore",
    message="The localize method is no longer necessary, as this time zone supports the fold attribute",
)

# Regexes for the date notations the script recognises, keyed by a descriptive
# name. In every pattern both the full match and capture group 1 hold the
# complete date text.
date_patterns = {
    # Day.month.year separated by dots, e.g. 31.12.2021 or 31.12.21.
    # The month must be written with two digits; the year with two to four.
    "dotted_with_two_or_four_digit_year": re.compile(
        r'(\d{1,2}\.\d{2}\.\d{2,4})'
    ),
    # Year-month-day, e.g. 2021-12-31. The group spans the whole date so that
    # group(1) is consistent with the other patterns.
    "ISO-8601 date part": re.compile(
        r'(\d{4}-\d{2}-\d{2})'),
    # English month name (abbreviated or written out, case-insensitive)
    # followed by day and year, e.g. "Aug 24. 2020" or "August 24 2020".
    "english_short_months": re.compile(
        r'((?i:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w{0,9}\.? \d{1,2}\.? \d{2,4})'),
    # Day, German month name (abbreviated or written out, case-insensitive)
    # and year, e.g. "24. Aug. 2020" or "1. März 2020".
    "german_short_months": re.compile(
        r'(\d{1,2}\.? (?i:Jan|Feb|Mär|Apr|Mai|Jun|Jul|Aug|Sep|Okt|Nov|Dez)\w{0,9}\.? \d{2,4})'),
}


def extract_all_dates_in_order_of_occurrence(text: str) -> dict:
    """Collect every parseable date found in *text*.

    Each regex in ``date_patterns`` is run over the text and every hit is
    handed to ``dateparser.parse``. Hits for which dateparser returns a falsy
    result are skipped.

    Returns:
        A dict mapping the start offset of each match in *text* to the value
        dateparser produced for it. The dict is filled pattern by pattern, so
        callers have to sort by key to obtain the order of occurrence. When
        two patterns match at the same offset, the later pattern's value wins.
    """
    # NOTE(review): dateparser is called without language or DATE_ORDER
    # settings -- confirm that ambiguous numeric dates such as 01.02.2020 are
    # read day-first as the German formats intend.
    found = {}
    for pattern in date_patterns.values():
        for hit in pattern.finditer(text):
            parsed = dateparser.parse(hit.group())
            if parsed:
                found[hit.start()] = parsed
    return found


def extract_first_date(text: str, before: date = None) -> Union[date, None]:
    """Return the first date occurring in *text*, optionally bounded by *before*.

    Args:
        text: The text to search for dates.
        before: Optional upper bound. When given (and truthy), only dates
            whose calendar day is strictly earlier than this day qualify.
            Both ``date`` and ``datetime`` values are accepted; any time of
            day is ignored.

    Returns:
        The qualifying value closest to the start of *text*, exactly as
        produced by ``extract_all_dates_in_order_of_occurrence``, or ``None``
        when no date qualifies.
    """
    # print(text)  # if you want to see what pdfminer extracted
    dates = extract_all_dates_in_order_of_occurrence(text)

    # Reduce both sides to plain ``date`` objects before comparing: Python
    # refuses to order a ``datetime`` against a ``date`` (TypeError), and the
    # documented contract is "before this date", not "before this instant".
    limit = date(before.year, before.month, before.day) if before else None

    for position in sorted(dates):
        candidate = dates[position]
        if limit is None:
            return candidate
        if date(candidate.year, candidate.month, candidate.day) < limit:
            return candidate
    return None


def parse_args():
    """Parse the command line of the script.

    Returns:
        The ``argparse`` namespace with ``filename`` (required positional) and
        ``before`` (optional string, ``None`` when the option is omitted).
    """
    cli = argparse.ArgumentParser(
        description="Extract a date from a PDF file and return it in ISO date-format (YYYY-MM-DD). Note: Only the first date will be returned, if any.",
    )

    # Positional: the PDF to inspect.
    cli.add_argument(
        "filename",
        type=str,
        help="The PDF file to extract the first date from.",
    )

    # Optional upper bound; kept as a raw string here and parsed by the caller.
    before_help = (
        "Return the first date that is before this date (in ISO format YYYY-MM-DD). "
        "Example: passing '2020-01-01' could return 2019-06-11 but never 2020-01-02 or later."
    )
    cli.add_argument("--before", type=str, required=False, help=before_help)

    return cli.parse_args()


def extract_first_date_from_pdf(filename: str, before: date = None) -> Union[date, None]:
    """Extract the text of a PDF and return the first date found in it.

    Args:
        filename: Path of the PDF file, passed to pdfminer's ``extract_text``.
        before: Optional upper bound forwarded to ``extract_first_date``.
            Defaults to ``None`` (no bound), mirroring ``extract_first_date``.

    Returns:
        Whatever ``extract_first_date`` returns for the extracted text: the
        first qualifying date, or ``None`` if there is none.
    """
    pdf_as_text = extract_text(filename)
    return extract_first_date(text=pdf_as_text, before=before)


def to_date_or_none(before) -> Union[date, None]:
    """Turn the raw ``--before`` value into a date via dateparser.

    A falsy input (``None`` or an empty string) yields ``None`` without
    calling dateparser; otherwise the result of ``dateparser.parse`` is
    returned unchanged.
    """
    if not before:
        return None
    return dateparser.parse(before)


# Script entry point: print the first date of the given PDF in ISO format
# (YYYY-MM-DD), or print nothing when no qualifying date exists.
if __name__ == '__main__':
    args = parse_args()
    before = to_date_or_none(args.before)
    if args.before and before is None:
        # An unparseable --before value would otherwise silently disable the
        # filter and print a date the user explicitly asked to exclude.
        raise SystemExit(f"Could not parse --before value as a date: {args.before!r}")
    first_date = extract_first_date_from_pdf(filename=args.filename, before=before)
    if first_date:
        print(f"{first_date:%Y-%m-%d}")

The tests

These are the passing tests:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import dateparser
import pytest

from extract_date_from_pdf import extract_first_date


@pytest.mark.parametrize("string_with_date,expected_date", [
    ("01.01.2020", "2020-01-01"),
    ("01.01.20", "2020-01-01"),
    ("2224.05.2020123 01.01.2020", "2020-05-24"),
    ("abc24.05.2020xyz 01.01.2020", "2020-05-24"),
    ("24.05.2020 01.01.2020", "2020-05-24"),
    ("2020-05-24 01.01.2020", "2020-05-24"),
    ("24 August 2020 01.01.2020", "2020-08-24"),
    ("24 Aug 2020 01.01.2020", "2020-08-24"),
    ("24. Aug. 2020 01.01.2020", "2020-08-24"),
    ("August 24 2020 01.01.2020", "2020-08-24"),
    ("August 24. 2020 01.01.2020", "2020-08-24"),
    ("Aug 24. 2020 01.01.2020", "2020-08-24"),
    ("24. Mai 2020 01.01.2020", "2020-05-24"),
    ("1. März 2020 01.01.2020", "2020-03-01"),
    ("1. Dezember 2020 01.01.2020", "2020-12-01"),
    ("1. December 2020 01.01.2020", "2020-01-01"),  # Unsupported combination of German order and English month
    ("December 01 2020 01.01.2020", "2020-12-01"),
    ("Dezember 01 2020 01.01.2020", "2020-01-01"),  # Unsupported combination of English order and German month
])
def test_dates(string_with_date, expected_date):
    """Without a bound, the first recognised date in the text is returned."""
    # act
    found = extract_first_date(text=string_with_date)
    as_iso = found.strftime("%Y-%m-%d")

    # assert
    assert as_iso == expected_date

@pytest.mark.parametrize("string_with_date,expected_date", [
    ("01.01.2020", None),
    ("13.03.20", None),
    ("2224.05.2020123 01.01.2020", None),
    ("abc24.05.2020xyz 01.01.2020", None),
    ("24.05.2020 01.01.2020", None),
    ("2020-05-24 01.01.2020", None),
    ("24 August 2020 01.01.2020", None),
    ("24 Aug 2020 01.01.2020", None),
    ("24. Aug. 2020 01.01.2020", None),
    ("August 24 2020 01.01.2020", None),
    ("August 24. 2020 01.01.2020", None),
    ("Aug 24. 2020 01.01.2020", None),
    ("24. Mai 2020 01.01.2020", None),
    ("1. März 2020 01.01.2020", None),
    ("1. Dezember 2020 01.01.2020", None),
    ("1. December 2020 01.01.2020", None),
    ("December 01 2020 01.01.2020", None),
    ("Dezember 01 2020 01.01.2020", None),
])
def test_If_latest_date_is_before_all_dates_Then_none_is_returned(string_with_date, expected_date):
    """With a bound of 2019-01-01, none of the 2020 dates in the inputs qualifies."""
    # arrange
    latest_date = dateparser.parse("2019-01-01")

    # act
    result = extract_first_date(text=string_with_date, before=latest_date)

    # assert
    # Every row expects None; compare by identity rather than with ``==``.
    assert result is expected_date

@pytest.mark.parametrize("string_with_date,expected_date", [
    ("2224.05.2020123 01.01.2020", "2020-01-01"),
    ("abc24.05.2020xyz 01.01.2020", "2020-01-01"),
    ("24.05.2020 01.01.2020", "2020-01-01"),
    ("2020-05-24 01.01.2020", "2020-01-01"),
    ("24 August 2020 01.01.2020", "2020-01-01"),
    ("24 Aug 2020 01.01.2020", "2020-01-01"),
    ("24. Aug. 2020 01.01.2020", "2020-01-01"),
    ("August 24 2020 01.01.2020", "2020-01-01"),
    ("August 24. 2020 01.01.2020", "2020-01-01"),
    ("Aug 24. 2020 01.01.2020", "2020-01-01"),
    ("24. Mai 2020 01.01.2020", "2020-01-01"),
    ("1. März 2020 01.01.2020", "2020-01-01"),
    ("1. Dezember 2020 01.01.2020", "2020-01-01"),
    ("1. December 2020 01.01.2020", "2020-01-01"),
    ("December 01 2020 01.01.2020", "2020-01-01"),
    ("Dezember 01 2020 01.01.2020", "2020-01-01"),
])
def test_If_latest_date_is_provided_Then_match_before_earliest_date_is_returned(string_with_date, expected_date):
    """With a bound of 2020-02-01, the leading later date is skipped in favour of 01.01.2020."""
    # arrange
    latest_date = dateparser.parse("2020-02-01")

    # act
    result = extract_first_date(text=string_with_date, before=latest_date).strftime("%Y-%m-%d")

    # assert
    # Compare against the literal directly (as test_dates does) so that the
    # expectation does not depend on the parser under test.
    assert result == expected_date

See also