Pdf Datasource

This module contains the document model, reader, parsers, manager and builders for the PDF datasource.

Document

PdfDocument

Bases: BaseDocument

Document representation for PDF file content.

Extends BaseDocument to handle PDF-specific document processing including metadata filtering for embeddings and LLM contexts.

Attributes:
  • included_embed_metadata_keys (List[str]) –

    Metadata keys to include in embeddings

  • included_llm_metadata_keys (List[str]) –

    Metadata keys to include in LLM context

  • text

    Document content in text format

  • metadata

    Extracted PDF metadata

  • attachments

    Dictionary of document attachments

  • excluded_embed_metadata_keys

    Metadata keys to exclude from embeddings

  • excluded_llm_metadata_keys

    Metadata keys to exclude from LLM context

Source code in src/embedding/datasources/pdf/document.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
class PdfDocument(BaseDocument):
    """A single document built from PDF content.

    Stores the text, metadata and attachments handed to the constructor and
    derives, per instance, which metadata keys are kept out of embeddings
    and out of the LLM context: every metadata key that is not on the
    matching class-level allow-list.

    Attributes:
        included_embed_metadata_keys: Allow-list of metadata keys for embeddings
        included_llm_metadata_keys: Allow-list of metadata keys for LLM context
        text: Document content in text format
        metadata: Metadata dictionary passed to the constructor
        attachments: Dictionary of document attachments (empty if none given)
        excluded_embed_metadata_keys: Metadata keys not on the embedding allow-list
        excluded_llm_metadata_keys: Metadata keys not on the LLM allow-list
    """

    included_embed_metadata_keys: List[str] = [
        "Title",
        "CreationDate",
        "ModDate",
        "creation_date",
        "client_name",
        "offer_name",
        "project_lead",
    ]
    # Same entries as the embedding allow-list, kept as a separate list
    # object so the two can be changed independently.
    included_llm_metadata_keys: List[str] = list(included_embed_metadata_keys)

    def __init__(self, text: str, metadata: dict, attachments: dict = None):
        """Create a PDF document and compute its excluded metadata keys.

        Args:
            text: Extracted text content
            metadata: PDF metadata dictionary
            attachments: Optional dictionary of attachments; a falsy value
                is replaced by a new empty dict
        """
        super().__init__()
        self.text = text
        self.metadata = metadata
        self.attachments = attachments if attachments else {}

        compute_excluded = self._set_excluded_metadata_keys
        self.excluded_embed_metadata_keys = compute_excluded(
            self.metadata, self.included_embed_metadata_keys
        )
        self.excluded_llm_metadata_keys = compute_excluded(
            self.metadata, self.included_llm_metadata_keys
        )

    @staticmethod
    def _set_excluded_metadata_keys(
        metadata: dict, included_keys: List[str]
    ) -> List[str]:
        """Collect the metadata keys that are absent from an allow-list.

        Args:
            metadata: Complete metadata dictionary
            included_keys: Keys that must not be excluded

        Returns:
            List[str]: Keys of ``metadata`` not found in ``included_keys``,
            in the dictionary's iteration order
        """
        excluded = []
        for key in metadata:
            if key in included_keys:
                continue
            excluded.append(key)
        return excluded

__init__(text, metadata, attachments=None)

Initialize PDF document.

Parameters:
  • text (str) –

    Extracted text content

  • metadata (dict) –

    PDF metadata dictionary

  • attachments (dict, default: None ) –

    Optional dictionary of attachments

Source code in src/embedding/datasources/pdf/document.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(self, text: str, metadata: dict, attachments: dict = None):
    """Create a PDF document and compute its excluded metadata keys.

    Args:
        text: Extracted text content
        metadata: PDF metadata dictionary
        attachments: Optional dictionary of attachments; a falsy value is
            replaced by a new empty dict
    """
    super().__init__()
    self.text = text
    self.metadata = metadata
    self.attachments = attachments if attachments else {}

    # Everything in the metadata that is not on an allow-list is excluded.
    compute_excluded = self._set_excluded_metadata_keys
    self.excluded_embed_metadata_keys = compute_excluded(
        self.metadata, self.included_embed_metadata_keys
    )
    self.excluded_llm_metadata_keys = compute_excluded(
        self.metadata, self.included_llm_metadata_keys
    )

_set_excluded_metadata_keys(metadata, included_keys) staticmethod

Determine metadata keys to exclude from processing.

Parameters:
  • metadata (dict) –

    Complete metadata dictionary

  • included_keys (List[str]) –

    Keys to include in processing

Returns:
  • List[str]

    Keys of the metadata dictionary that are not in included_keys

Note

Returns any key from metadata that isn't in included_keys

Source code in src/embedding/datasources/pdf/document.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@staticmethod
def _set_excluded_metadata_keys(
    metadata: dict, included_keys: List[str]
) -> List[str]:
    """Collect the metadata keys that are absent from an allow-list.

    Args:
        metadata: Complete metadata dictionary
        included_keys: Keys that must not be excluded

    Returns:
        List[str]: Keys of ``metadata`` not found in ``included_keys``,
        in the dictionary's iteration order
    """
    excluded = []
    for key in metadata:
        if key in included_keys:
            continue
        excluded.append(key)
    return excluded

Manager

PdfDatasourceManager

Bases: DatasourceManager

Manager for PDF content extraction and processing.

Handles document retrieval, cleaning, splitting and embedding updates for PDF documents. Implements the base DatasourceManager interface for PDF-specific processing.

Source code in src/embedding/datasources/pdf/manager.py
 4
 5
 6
 7
 8
 9
10
11
12
class PdfDatasourceManager(DatasourceManager):
    """Datasource manager for PDF documents.

    Declares no members of its own; all behaviour is inherited unchanged
    from DatasourceManager, which is expected to cover document retrieval,
    cleaning, splitting and embedding updates.
    """

Reader

DefaultPDFParser

Source code in src/embedding/datasources/pdf/reader.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
class DefaultPDFParser:
    """Parse a PDF with pypdf into a single PdfDocument.

    The text of all pages is concatenated and combined with the metadata
    produced by ``_extract_metadata``.
    """

    def parse(self, file_path: str) -> List[PdfDocument]:
        """Read one PDF file and wrap its content in a PdfDocument.

        Args:
            file_path: Path to the PDF file

        Returns:
            List[PdfDocument]: One-element list holding the whole file
        """
        with open(file_path, "rb") as f:
            reader = pypdf.PdfReader(f)
            # A page without extractable text contributes an empty string.
            text = "\n\n".join(
                page.extract_text() or "" for page in reader.pages
            )
            metadata = self._extract_metadata(
                reader=reader, file_path=file_path
            )
            return [PdfDocument(text=text, metadata=metadata)]

    def _extract_metadata(
        self, reader: pypdf.PdfReader, file_path: str
    ) -> dict:
        """Extract and process PDF metadata.

        Args:
            reader: PDF reader instance; only its ``metadata`` attribute
                is read
            file_path: Path of the PDF file; stored under ``"url"`` and,
                reduced to its base name, under ``"title"``

        Returns:
            dict: ``"datasource"``, ``"url"`` and ``"title"`` plus one entry
            per PDF metadata key (leading/trailing ``/`` removed)

        Note:
            ``CreationDate`` and ``ModDate`` are converted to ISO format
            where possible. The optional ``D:`` prefix is removed and the
            leading digits are read as ``YYYY[MM[DD[HH[mm[SS]]]]]``; any
            timezone suffix is ignored. Values that are not strings or
            cannot be parsed are stored unchanged.
        """
        # Number of leading digits -> strptime format. PDF date strings may
        # be truncated after any complete field; only the year is required.
        formats_by_length = {
            14: "%Y%m%d%H%M%S",
            12: "%Y%m%d%H%M",
            10: "%Y%m%d%H",
            8: "%Y%m%d",
            6: "%Y%m",
            4: "%Y",
        }
        metadata = {
            "datasource": "pdf",
            "url": file_path,
            "title": os.path.basename(file_path),
        }
        pdf_metadata = reader.metadata
        if pdf_metadata is None:
            return metadata

        for key, value in pdf_metadata.items():
            clean_key = key.strip("/")
            # Store the raw value first; it is only replaced below when a
            # date could be parsed.
            metadata[clean_key] = value
            if clean_key not in ("CreationDate", "ModDate"):
                continue
            if not isinstance(value, str):
                # e.g. a bytes value: str methods would raise on it
                continue

            # Remove the literal "D:" prefix only (str.strip would remove
            # any leading/trailing "D" or ":" characters instead).
            date_str = value[2:] if value.startswith("D:") else value
            digits = date_str[:14]
            for index, char in enumerate(digits):
                if not char.isdigit():
                    digits = digits[:index]
                    break

            date_format = formats_by_length.get(len(digits))
            if date_format is None:
                continue
            try:
                parsed_date = datetime.strptime(digits, date_format)
            except ValueError:
                # Out-of-range field (e.g. month 13): keep the raw value
                continue
            metadata[clean_key] = parsed_date.isoformat()
        return metadata

_extract_metadata(reader, file_path)

Extract and process PDF metadata.

Parameters:
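  • reader (PdfReader) –

    PDF reader instance

  • file_path (str) –

    Path of the PDF file; stored as "url" and, reduced to its base name, as "title"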

Returns:
  • dict( dict ) –

    Processed metadata dictionary

Note

Converts date strings to ISO format where possible

Source code in src/embedding/datasources/pdf/reader.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def _extract_metadata(
    self, reader: pypdf.PdfReader, file_path: str
) -> dict:
    """Extract and process PDF metadata.

    Args:
        reader: PDF reader instance; only its ``metadata`` attribute is read
        file_path: Path of the PDF file; stored under ``"url"`` and,
            reduced to its base name, under ``"title"``

    Returns:
        dict: ``"datasource"``, ``"url"`` and ``"title"`` plus one entry per
        PDF metadata key (leading/trailing ``/`` removed)

    Note:
        ``CreationDate`` and ``ModDate`` are converted to ISO format where
        possible. The optional ``D:`` prefix is removed and the leading
        digits are read as ``YYYY[MM[DD[HH[mm[SS]]]]]``; any timezone suffix
        is ignored. Values that are not strings or cannot be parsed are
        stored unchanged.
    """
    # Number of leading digits -> strptime format. PDF date strings may be
    # truncated after any complete field; only the year is required.
    formats_by_length = {
        14: "%Y%m%d%H%M%S",
        12: "%Y%m%d%H%M",
        10: "%Y%m%d%H",
        8: "%Y%m%d",
        6: "%Y%m",
        4: "%Y",
    }
    metadata = {
        "datasource": "pdf",
        "url": file_path,
        "title": os.path.basename(file_path),
    }
    pdf_metadata = reader.metadata
    if pdf_metadata is None:
        return metadata

    for key, value in pdf_metadata.items():
        clean_key = key.strip("/")
        # Store the raw value first; it is only replaced below when a date
        # could be parsed.
        metadata[clean_key] = value
        if clean_key not in ("CreationDate", "ModDate"):
            continue
        if not isinstance(value, str):
            # e.g. a bytes value: str methods would raise on it
            continue

        # Remove the literal "D:" prefix only (str.strip would remove any
        # leading/trailing "D" or ":" characters instead).
        date_str = value[2:] if value.startswith("D:") else value
        digits = date_str[:14]
        for index, char in enumerate(digits):
            if not char.isdigit():
                digits = digits[:index]
                break

        date_format = formats_by_length.get(len(digits))
        if date_format is None:
            continue
        try:
            parsed_date = datetime.strptime(digits, date_format)
        except ValueError:
            # Out-of-range field (e.g. month 13): keep the raw value
            continue
        metadata[clean_key] = parsed_date.isoformat()
    return metadata

NLMPDFParser

Source code in src/embedding/datasources/pdf/reader.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
class NLMPDFParser:
    """Parse a PDF into per-chunk PdfDocuments via a LayoutPDFReader.

    Every chunk's metadata is enriched with fields that are searched for,
    by regular expression, in the text of the first two pages.
    """

    # One entry per metadata field: the key to store the value under and
    # the pattern whose first capture group holds the value.
    FIELDS_TO_EXTRACT = [
        {
            "name": "valid_until",
            "search_patterns": r"(?:Gültig bis|Valid until)\s*[:\s]*([\d/]+)",
        },
        {
            "name": "client_name",
            "search_patterns": r"(?:Client|Kunde)\s*[:\s]*([\S ]+?)(?=\s*\b(?:Quote No\.|Quote|Angebotsnummer|Date|Contact|Conta ct|Contents|Project Lead|Projektnummer)\b)",
        },
        {
            "name": "offer_name",
            "search_patterns": r"(?:Angebot|Quote)\s*[:\s]*([\S ]+?)(?=\s*\b(?:Datum|Date|Valid until|Contact|Project Lead|Projektleiter|Projektnummer)\b)",
        },
        {
            "name": "project_lead",
            "search_patterns": r"(?:Project\s*Lead|Projektleiter)\s*[:\s]*([\w\s.]+?)(?=\s*(Contact|Kontakt|Project Number|Quote Number|Valid until|$))",
        },
    ]

    def __init__(self, api_base: str):
        """Create the underlying layout reader.

        Args:
            api_base: Value passed straight to LayoutPDFReader
        """
        self.reader = LayoutPDFReader(api_base)

    def parse(self, file_path: str) -> List[PdfDocument]:
        """Split a PDF into chunks and return one document per chunk.

        Args:
            file_path (str): Path to the PDF file.

        Returns:
            List[PdfDocument]: One document per chunk, each carrying the
            file metadata, the chunk's ``page_label`` and the extracted
            page fields.
        """
        layout_doc = self.reader.read_pdf(file_path)
        base_metadata = default_file_metadata_func(file_path)
        page_fields = self._extract_page_metadata(file_path)

        enriched_docs = []
        for chunk in layout_doc.chunks():
            # Later entries win: page fields override file metadata.
            merged = {
                **base_metadata,
                "page_label": chunk.page_idx,
                **page_fields,
            }
            enriched_docs.append(
                PdfDocument(text=chunk.to_context_text(), metadata=merged)
            )
        return enriched_docs

    def _extract_page_metadata(self, file_path: str) -> dict:
        """Extract metadata fields from the first two pages of a PDF.

        Args:
            file_path: Path to PDF file

        Returns:
            dict: Extracted metadata fields
        """
        first_pages = pypdf.PdfReader(file_path).pages[:2]
        page_texts = [page.extract_text() or "" for page in first_pages]
        cleaned = preprocess_text("".join(page_texts))
        return self._extract_fields(cleaned, self.FIELDS_TO_EXTRACT)

    def _extract_fields(self, text: str, fields_to_extract: List[dict]) -> dict:
        """Search ``text`` for each field and fill in defaults for misses.

        Args:
            text: Text to search (case-insensitively)
            fields_to_extract: Dicts with a ``"name"`` and a
                ``"search_patterns"`` regex; capture group 1 is the value

        Returns:
            dict: Stripped match per found field, plus fixed fallback
            values for ``valid_until``, ``client_name``, ``offer_name``
            and ``project_lead`` when they were not found
        """
        found = {}
        for spec in fields_to_extract:
            hit = re.search(spec["search_patterns"], text, re.IGNORECASE)
            if hit is None:
                continue
            found[spec["name"]] = hit.group(1).strip()

        # Fallback/default values for anything the patterns did not match
        fallbacks = (
            ("valid_until", "01/01/2024"),
            ("client_name", "Unknown Client"),
            ("offer_name", "Generic Offer"),
            ("project_lead", "Not Assigned"),
        )
        for name, fallback in fallbacks:
            if name not in found:
                found[name] = fallback
        return found

_extract_page_metadata(file_path)

Extract metadata from first pages of PDF.

Parameters:
  • file_path (str) –

    Path to PDF file

Returns:
  • dict( dict ) –

    Extracted metadata fields

Source code in src/embedding/datasources/pdf/reader.py
146
147
148
149
150
151
152
153
154
155
156
157
158
def _extract_page_metadata(self, file_path: str) -> dict:
    """Extract metadata fields from the first two pages of a PDF.

    Args:
        file_path: Path to PDF file

    Returns:
        dict: Extracted metadata fields
    """
    first_pages = pypdf.PdfReader(file_path).pages[:2]
    # A page without extractable text contributes an empty string.
    page_texts = [page.extract_text() or "" for page in first_pages]
    cleaned = preprocess_text("".join(page_texts))
    return self._extract_fields(cleaned, self.FIELDS_TO_EXTRACT)

parse(file_path)

Parses the given PDF file and enriches its metadata with additional fields.

Parameters:
  • file_path (str) –

    Path to the PDF file.

Returns:
  • List[PdfDocument]

    Enriched PdfDocument objects, one per chunk of the parsed PDF.

Source code in src/embedding/datasources/pdf/reader.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def parse(self, file_path: str) -> List[PdfDocument]:
    """Split a PDF into chunks and return one document per chunk.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        List[PdfDocument]: One document per chunk, each carrying the file
        metadata, the chunk's ``page_label`` and the extracted page fields.
    """
    layout_doc = self.reader.read_pdf(file_path)
    base_metadata = default_file_metadata_func(file_path)
    page_fields = self._extract_page_metadata(file_path)

    enriched_docs = []
    for chunk in layout_doc.chunks():
        # Later entries win: page fields override file metadata.
        merged = {
            **base_metadata,
            "page_label": chunk.page_idx,
            **page_fields,
        }
        enriched_docs.append(
            PdfDocument(text=chunk.to_context_text(), metadata=merged)
        )
    return enriched_docs

PdfReader

Bases: BaseReader[PdfDocument]

Reader for extracting content from PDF files.

Implements document extraction from PDF files with support for text and metadata extraction.

Attributes:
  • export_limit

    Maximum number of documents to process

  • base_path

    Root directory containing PDF files

Source code in src/embedding/datasources/pdf/reader.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
class PdfReader(BaseReader[PdfDocument]):
    """Reader for extracting content from PDF files.

    Lists the PDF files directly inside ``base_path`` and turns each one
    into documents with the configured parser.

    Attributes:
        export_limit: Maximum number of PDF files to load (``None`` = all)
        base_path: Directory containing PDF files
        parser: NLMPDFParser when enabled in the configuration, otherwise
            DefaultPDFParser
    """

    def __init__(self, configuration: PdfDatasourceConfiguration):
        """Initialize PDF reader.

        Args:
            configuration: Settings for PDF processing
        """
        super().__init__()
        self.export_limit = configuration.export_limit
        self.base_path = configuration.base_path

        if configuration.nlm_parser_enabled:
            self.parser = NLMPDFParser(configuration.nlm_parser_api_base)
        else:
            self.parser = DefaultPDFParser()

    def get_all_documents(self) -> List[PdfDocument]:
        """Load documents from every PDF file in ``base_path``.

        Regular files whose name ends in ``.pdf`` (any letter case) are
        processed in sorted name order; ``export_limit`` caps how many of
        them are loaded. A file that fails to parse is logged and skipped.

        Returns:
            List[PdfDocument]: Documents of all successfully parsed files
        """
        # os.listdir returns entries in arbitrary order; sorting makes the
        # export_limit cut select the same files on every run. Non-files
        # are dropped before the cut so they cannot use up limit slots.
        pdf_files = sorted(
            name
            for name in os.listdir(self.base_path)
            if name.lower().endswith(".pdf")
            and os.path.isfile(os.path.join(self.base_path, name))
        )
        files_to_load = (
            pdf_files
            if self.export_limit is None
            else pdf_files[: self.export_limit]
        )

        documents = []
        for file_name in tqdm(files_to_load, desc="Loading PDFs"):
            file_path = os.path.join(self.base_path, file_name)
            try:
                documents.extend(self.parser.parse(file_path))
            except Exception as e:
                # Best effort: one unreadable PDF must not abort the run.
                logging.error("Failed to load PDF %s: %s", file_name, e)

        return documents

    async def get_all_documents_async(self) -> List[PdfDocument]:
        """Load documents asynchronously from configured path.

        Returns:
            List[PdfDocument]: Collection of processed documents

        Note:
            Currently calls synchronous implementation
        """
        return self.get_all_documents()

__init__(configuration)

Initialize PDF reader.

Parameters:
  • configuration (PdfDatasourceConfiguration) –

    Settings for PDF processing

Source code in src/embedding/datasources/pdf/reader.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def __init__(self, configuration: PdfDatasourceConfiguration):
    """Set up the reader and pick the parser named by the configuration.

    Args:
        configuration: Settings for PDF processing; ``export_limit``,
            ``base_path``, ``nlm_parser_enabled`` and (when enabled)
            ``nlm_parser_api_base`` are read
    """
    super().__init__()
    self.export_limit = configuration.export_limit
    self.base_path = configuration.base_path

    if not configuration.nlm_parser_enabled:
        self.parser = DefaultPDFParser()
    else:
        self.parser = NLMPDFParser(configuration.nlm_parser_api_base)

get_all_documents_async() async

Load documents asynchronously from configured path.

Returns:
  • List[PdfDocument]

    Collection of processed documents

Note

Currently calls synchronous implementation

Source code in src/embedding/datasources/pdf/reader.py
225
226
227
228
229
230
231
232
233
234
async def get_all_documents_async(self) -> List[PdfDocument]:
    """Return all documents from the configured path.

    Returns:
        List[PdfDocument]: Collection of processed documents

    Note:
        Delegates to the synchronous ``get_all_documents``; nothing is
        awaited here.
    """
    documents = self.get_all_documents()
    return documents

preprocess_text(text)

Preprocess text to clean split labels and values while preserving structure.

Parameters:
  • text (str) –

    Raw extracted text.

Returns:
  • str( str ) –

    Cleaned and normalized text.

Source code in src/embedding/datasources/pdf/reader.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def preprocess_text(text: str) -> str:
    """
    Normalize extracted text so that labels and their values can be matched.

    Args:
        text (str): Raw extracted text.

    Returns:
        str: Text with known split labels repaired, labels joined to the
        value on the following line, and whitespace runs collapsed.
    """
    # Applied strictly in this order; each step sees the previous result.
    rewrites = (
        # Normalize known splits or errors
        (r"Conta\s*ct", "Contact"),
        (r"Projektl\s*eiter", "Projektleiter"),
        (r"Proje\s*ct\s*Lead", "Project Lead"),
        # Join lines where a label is split from its value
        (
            r"(Client|Kunde|Projektleiter|Project Lead|Gültig bis|Valid until)\s*\n\s*",
            r"\1 ",
        ),
        # Collapse runs of two or more whitespace characters
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    return text

Builders

PdfCleanerBuilder

Builder for creating PDF content cleaner instances.

Provides factory method to create Cleaner objects for PDF content.

Source code in src/embedding/datasources/pdf/builders.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
class PdfCleanerBuilder:
    """Factory for the cleaner used on PDF content.

    Exposes a single static ``build`` method that returns a Cleaner.
    """

    @staticmethod
    @inject
    def build() -> Cleaner:
        """Create a cleaner for PDF content.

        Returns:
            Cleaner: A new Cleaner constructed without arguments
        """
        cleaner = Cleaner()
        return cleaner

build() staticmethod

Creates a content cleaner for PDFs.

Returns:
  • Cleaner( Cleaner ) –

    Configured cleaner instance

Source code in src/embedding/datasources/pdf/builders.py
78
79
80
81
82
83
84
85
86
@staticmethod
@inject
def build() -> Cleaner:
    """Create a cleaner for PDF content.

    Returns:
        Cleaner: A new Cleaner constructed without arguments
    """
    cleaner = Cleaner()
    return cleaner

PdfDatasourceManagerBuilder

Builder for creating PDF datasource manager instances.

Provides factory method to create configured PdfDatasourceManager with required components for content processing.

Source code in src/embedding/datasources/pdf/builders.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class PdfDatasourceManagerBuilder:
    """Factory for PdfDatasourceManager instances.

    Exposes a single static ``build`` method that wires the given
    components into a PdfDatasourceManager.
    """

    @staticmethod
    @inject
    def build(
        configuration: PdfDatasourceConfiguration,
        reader: PdfReader,
        cleaner: Cleaner,
        splitter: BoundEmbeddingModelMarkdownSplitter,
    ) -> PdfDatasourceManager:
        """Assemble a PDF datasource manager from its components.

        Args:
            configuration: Settings for PDF processing
            reader: Component for reading PDF content
            cleaner: Component for cleaning extracted content
            splitter: Component for splitting content into chunks

        Returns:
            PdfDatasourceManager: Manager built from the four arguments,
            each passed under the keyword of the same name
        """
        components = {
            "configuration": configuration,
            "reader": reader,
            "cleaner": cleaner,
            "splitter": splitter,
        }
        return PdfDatasourceManager(**components)

build(configuration, reader, cleaner, splitter) staticmethod

Creates a configured PDF datasource manager.

Parameters:
  • configuration (PdfDatasourceConfiguration) –

    Settings for PDF processing

  • reader (PdfReader) –

    Component for reading PDF content

  • cleaner (Cleaner) –

    Component for cleaning extracted content

  • splitter (BoundEmbeddingModelMarkdownSplitter) –

    Component for splitting content into chunks

Returns:
  • PdfDatasourceManager( PdfDatasourceManager ) –

    Configured manager instance

Source code in src/embedding/datasources/pdf/builders.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
@staticmethod
@inject
def build(
    configuration: PdfDatasourceConfiguration,
    reader: PdfReader,
    cleaner: Cleaner,
    splitter: BoundEmbeddingModelMarkdownSplitter,
) -> PdfDatasourceManager:
    """Assemble a PDF datasource manager from its components.

    Args:
        configuration: Settings for PDF processing
        reader: Component for reading PDF content
        cleaner: Component for cleaning extracted content
        splitter: Component for splitting content into chunks

    Returns:
        PdfDatasourceManager: Manager built from the four arguments, each
        passed under the keyword of the same name
    """
    components = {
        "configuration": configuration,
        "reader": reader,
        "cleaner": cleaner,
        "splitter": splitter,
    }
    return PdfDatasourceManager(**components)

PdfReaderBuilder

Builder for creating PDF reader instances.

Provides factory method to create configured PdfReader objects.

Source code in src/embedding/datasources/pdf/builders.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
class PdfReaderBuilder:
    """Factory for PdfReader instances.

    Exposes a single static ``build`` method that returns a PdfReader
    created from the given configuration.
    """

    @staticmethod
    @inject
    def build(
        configuration: PdfDatasourceConfiguration,
    ) -> PdfReader:
        """Create a PDF reader for the given configuration.

        Args:
            configuration: Settings for PDF processing

        Returns:
            PdfReader: Reader constructed with ``configuration``
        """
        reader = PdfReader(configuration=configuration)
        return reader

build(configuration) staticmethod

Creates a configured PDF reader.

Parameters:
  • configuration (PdfDatasourceConfiguration) –

    Settings for PDF processing

Returns:
  • PdfReader( PdfReader ) –

    Configured reader instance

Source code in src/embedding/datasources/pdf/builders.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
@staticmethod
@inject
def build(
    configuration: PdfDatasourceConfiguration,
) -> PdfReader:
    """Create a PDF reader for the given configuration.

    Args:
        configuration: Settings for PDF processing

    Returns:
        PdfReader: Reader constructed with ``configuration``
    """
    reader = PdfReader(configuration=configuration)
    return reader